From 7b73db9ba65c006b16dc100954a633b9a6bf29c1 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 1 Dec 2018 19:17:38 +0100 Subject: [PATCH 001/166] added EnsembleLda --- docs/notebooks/Opinosis.ipynb | 248 ++++++ gensim/corpora/opinosiscorpus.py | 88 ++ gensim/models/__init__.py | 1 + gensim/models/ensemblelda.py | 1368 ++++++++++++++++++++++++++++++ 4 files changed, 1705 insertions(+) create mode 100644 docs/notebooks/Opinosis.ipynb create mode 100644 gensim/corpora/opinosiscorpus.py create mode 100644 gensim/models/ensemblelda.py diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/Opinosis.ipynb new file mode 100644 index 0000000000..bbcc2f6525 --- /dev/null +++ b/docs/notebooks/Opinosis.ipynb @@ -0,0 +1,248 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T17:57:35.858751Z", + "start_time": "2018-12-01T17:57:34.507686Z" + }, + "scrolled": false + }, + "outputs": [], + "source": [ + "import logging\n", + "from gensim.models import EnsembleLda, LdaMulticore\n", + "from gensim.corpora import OpinosisCorpus\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "enable the ensemble logger to show what it is doing currently" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T17:57:36.539523Z", + "start_time": "2018-12-01T17:57:36.534986Z" + } + }, + "outputs": [], + "source": [ + "elda_logger = logging.getLogger(EnsembleLda.__module__)\n", + "elda_logger.setLevel(logging.INFO)\n", + "elda_logger.addHandler(logging.StreamHandler())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Experiments on the Opinosis Dataset\n", + "\n", + "Opinosis is an extremely small corpus that contains 289 product reviews for 51 products, which is why it is hard to extract topics from it. https://github.com/kavgan/opinosis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing the corpus\n", + "\n", + "First, download the opinosis dataset. On linux it can be done like this for example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-29T20:19:17.891961Z", + "start_time": "2018-11-29T20:19:16.731678Z" + } + }, + "outputs": [], + "source": [ + "!mkdir ~/opinosis\n", + "!wget -P ~/opinosis https://github.com/kavgan/opinosis/raw/master/OpinosisDataset1.0_0.zip\n", + "!unzip ~/opinosis/OpinosisDataset1.0_0.zip -d ~/opinosis" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T17:57:39.575650Z", + "start_time": "2018-12-01T17:57:39.571588Z" + } + }, + "outputs": [], + "source": [ + "path = os.path.expanduser('~/opinosis/')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Corpus and id2word mapping can be created using the load_opinosis_data function provided in the package.\n", + "It preprocesses the data using the PorterStemmer and stopwords from the nltk package.\n", + "\n", + "The parameter of the function is the relative path to the folder, into which the zip file was extracted before. That folder contains a 'summaries-gold' subfolder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T17:57:40.842440Z", + "start_time": "2018-12-01T17:57:39.905273Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data source:\n", + "title:\t\tOpinosis: a graph-based approach to abstractive summarization of highly redundant opinions\n", + "authors:\tGanesan, Kavita and Zhai, ChengXiang and Han, Jiawei\n", + "booktitle:\tProceedings of the 23rd International Conference on Computational Linguistics\n", + "pages:\t\t340-348\n", + "year:\t\t2010\n", + "organization:\tAssociation for Computational Linguistics\n" + ] + } + ], + "source": [ + "opinosis = OpinosisCorpus(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**parameters**\n", + "\n", + "**topic_model_kind** ldamulticore is highly recommended for EnsembleLda. ensemble_workers and **distance_workers** are used to improve the time needed to train the models, as well as the **masking_method** 'rank'. ldamulticore is not able to fully utilize all cores on this small corpus, so **ensemble_workers** can be set to 3 to get 95 - 100% cpu usage on my i5 3470.\n", + "\n", + "Since the corpus is so small, a high number of **num_models** is needed to extract stable topics. The Opinosis corpus contains 51 categories, however, some of them are quite similar. For example there are 3 categories about the batteries of portable products. There are also multiple categories about cars. So I chose 20 for num_topics, which is smaller than the number of categories.\n", + "\n", + "The default for **min_samples** would be 64, half of the number of models. But since this does not return any topics, or at most 2, I set this to 32." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T18:10:13.690983Z", + "start_time": "2018-12-01T17:57:41.089661Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Spawned worker to generate 42 topic models...\n", + "Spawned worker to generate 43 topic models...\n", + "Generating 42 topic models...\n", + "Generating 43 topic models...\n", + "Spawned worker to generate 43 topic models...\n", + "Generating 43 topic models...\n", + "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Fitting the clustering model\n", + "Generating stable topics\n", + "Generating classic gensim model representation based on results from the ensemble\n" + ] + } + ], + "source": [ + "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", + " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", + " topic_model_kind='ldamulticore', masking_method='rank', min_samples=32)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T18:10:13.712818Z", + "start_time": "2018-12-01T18:10:13.695844Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- 0.145 free, 0.043 park, 0.033 coffe, 0.030 wine, 0.027 even, 0.025 morn, 0.022 internet \n", + "\n", + "- 0.161 screen, 0.062 bright, 0.043 clear, 0.027 easi, 0.021 read, 0.019 touch, 0.011 size \n", + "\n", + "- 0.127 staff, 0.075 friendli, 0.074 help, 0.070 servic, 0.016 quick, 0.012 profession, 0.012 good \n", + "\n", + "- 0.123 seat, 0.067 comfort, 0.050 uncomfort, 0.039 front, 0.037 back, 0.036 firm, 0.032 drive \n", + "\n", + "- 0.113 room, 0.104 clean, 0.041 small, 0.037 bathroom, 0.036 comfort, 0.023 size, 0.016 well \n", + "\n" + ] + } + ], + "source": [ + "# pretty print, note that the words are stemmed so they appear chopped off\n", + "for t in elda.print_topics(num_words=7):\n", + " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py new file mode 100644 index 0000000000..eee2229849 --- /dev/null +++ b/gensim/corpora/opinosiscorpus.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Tobias B + +import os +from pathlib import Path +import re +from gensim.corpora import Dictionary +from nltk.stem import PorterStemmer +from nltk.corpus import stopwords + + +class OpinosisCorpus(): + """Creates a corpus and dictionary from the opinosis dataset: + + http://kavita-ganesan.com/opinosis-opinion-dataset/#.WyF_JNWxW00 + + This data is organized in folders, each folder 
containing a few short docs.
+
+    The data can be obtained quickly using the following commands in bash:
+
+    mkdir opinosis && cd opinosis
+    wget https://github.com/kavgan/opinosis/raw/master/OpinosisDataset1.0_0.zip
+    unzip OpinosisDataset1.0_0.zip
+
+    Corpus and dictionary can be accessed through the .corpus and .id2word members.
+
+    """
+
+    def __init__(self, path):
+        """
+
+        Parameters
+        ----------
+        path : string
+            Path to the extracted zip file. If 'summaries-gold' is in a folder
+            called 'opinosis', then the path parameter would be 'opinosis',
+            either relative to your current working directory or absolute.
+
+        """
+
+        # citation
+        print("data source:")
+        print("title:\t\tOpinosis: a graph-based approach to abstractive summarization of highly redundant opinions")
+        print("authors:\tGanesan, Kavita and Zhai, ChengXiang and Han, Jiawei")
+        print("booktitle:\tProceedings of the 23rd International Conference on Computational Linguistics")
+        print("pages:\t\t340-348")
+        print("year:\t\t2010")
+        print("organization:\tAssociation for Computational Linguistics")
+
+        path = Path(path).joinpath("summaries-gold")
+        dictionary = Dictionary()
+        corpus = []
+        stemmer = PorterStemmer()
+        # look the stopwords up in a set; calling stopwords.words() once per token would be slow
+        stopword_set = frozenset(stopwords.words('english'))
+
+        for directory, _dirnames, filenames in os.walk(path):
+            # os.walk iterates over folders, so in this
+            # scope a new topic is prepared
+
+            # folders without files (such as the root directory) are skipped
+            if len(filenames) == 0:
+                continue
+
+            # now get the corpus/documents
+            for filename in filenames:
+
+                filepath = directory + os.sep + filename
+                # read one document per file, then tokenize, stem and remove stopwords
+                with open(filepath) as file:
+
+                    doc = file.read()
+                    # a raw string for the regex avoids an invalid escape sequence warning
+                    processed = [
+                        stemmer.stem(token) for token in re.findall(r'\w+', doc.lower())
+                        if token not in stopword_set
+                    ]
+
+                    # add_documents expects a list of documents, hence the wrapping list
+                    dictionary.add_documents([processed])
+                    corpus += [dictionary.doc2bow(processed)]
+
+        # expose the results the same way the other corpus generating classes do
+        self.corpus = corpus
+        self.id2word = dictionary
diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py
index 96ca698b27..382e585c79 100644
--- a/gensim/models/__init__.py
+++ b/gensim/models/__init__.py
@@ -21,6 +21,7 @@
 from .ldaseqmodel import LdaSeqModel  # noqa:F401
 from .fasttext import FastText  # noqa:F401
 from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix  # noqa:F401
+from .ensemblelda import EnsembleLda  # noqa:F401
 from . import wrappers  # noqa:F401
 from . import deprecated  # noqa:F401
diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
new file mode 100644
index 0000000000..6abf70f5e3
--- /dev/null
+++ b/gensim/models/ensemblelda.py
@@ -0,0 +1,1368 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Authors: Tobias B, Alex Loosley,
+# Stephan Sahm, Alex Salles, Data Reply Munich
+
+"""Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models.
+Their topics are clustered with a DBSCAN variant in which nearby topic mixtures - undesired combinations
+of two or more true topics - may support other topics in becoming cores, but not vice versa.
+
+Usage examples
+--------------
+
+Train an ensemble of LdaModels using a Gensim corpus
+
+..
sourcecode:: pycon + + >>> from gensim.test.utils import common_texts + >>> from gensim.corpora.dictionary import Dictionary + >>> from gensim.models import EnsembleLda, LdaModel + >>> + >>> # Create a corpus from a list of texts + >>> common_dictionary = Dictionary(common_texts) + >>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts] + >>> + >>> # Train the model on the corpus. corpus has to be provided as a + >>> # keyword argument, as they are passed through to the children. + >>> elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=10, num_models=4) + +Save a model to disk, or reload a pre-trained model + +.. sourcecode:: pycon + + >>> from gensim.test.utils import datapath + >>> + >>> # Save model to disk. + >>> temp_file = datapath("model") + >>> elda.save(temp_file) + >>> + >>> # Load a potentially pretrained model from disk. + >>> elda = EnsembleLda.load(temp_file) + +Query, the model using new, unseen documents + +.. sourcecode:: pycon + + >>> # Create a new corpus, made of previously unseen documents. + >>> other_texts = [ + ... ['computer', 'time', 'graph'], + ... ['survey', 'response', 'eps'], + ... ['human', 'system', 'computer'] + ... ] + >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts] + >>> + >>> unseen_doc = other_corpus[0] + >>> vector = elda[unseen_doc] # get topic probability distribution for a document + +Increase the ensemble size by adding a new model + +.. sourcecode:: pycon + + >>> elda.add_model(LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)) + >>> vector = elda[unseen_doc] + +To optimize the ensemble for your specific case, the children can be clustered again using +different hyperparameters + +.. sourcecode:: pycon + + >>> elda.recluster(eps=0.2) + +References +---------- +.. [1] REHUREK, Radim and Sojka, PETR, 2010, Software framework for topic + modelling with large corpora. In : THE LREC 2010 WORKSHOP ON NEW + CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. + 2010. p. 45-50. Available from: + http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 + +""" + +import logging +import os +from multiprocessing import Process, Pipe, ProcessError +import copy +try: + import cPickle as _pickle +except ImportError: + import pickle as _pickle +import numpy as np +import pandas as pd +from scipy.spatial.distance import cosine +from gensim import utils +from gensim.models import ldamodel, ldamulticore + +logger = logging.getLogger(__name__) + +GENSIM_MODEL = dict( + lda=ldamodel.LdaModel, + ldamulticore=ldamulticore.LdaMulticore +) + +MAX_RANDOM_STATE = 2**32 - 1 + + +class EnsembleLda(): + """Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. + They are clustered using DBSCAN, for which nearby topic mixtures - which are mixtures of two or more + true topics and therefore undesired - are being used to support topics in becoming cores, + but not vice versa. + + """ + def __init__(self, topic_model_kind="lda", num_models=3, + min_cores=None, # default value from generate_stable_topics() + epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, + min_samples=None, masking_method="mass", masking_threshold=None, + distance_workers=1, random_state=None, **gensim_kw_args): + """ + + Parameters + ---------- + topic_model_kind : str, optional + Examples: + 'ldamulticore' (recommended) or 'lda' (default) + num_models : number, optional + How many LDA models to train in this ensemble. 
+ Default: 3 + min_cores : number, optional + Minimum cores a cluster of topics has to contain + so that it is recognized as stable topic. + epsilon : float, optional + Defaults to 0.1. Epsilon for the cbdbscan clustering + that generates the stable topics. + ensemble_workers : number, optional + Spawns that many processes and distributes the + models from the ensemble to those as evenly as + possible. num_models should be a multiple of + ensemble_workers. + + Setting it to 0 or 1 will both use the non- + multiprocessing version. Default:1 + memory_friendly_ttda : boolean, optional + If True, the models in the ensemble are + deleted after training and only the + total topic word distribution is kept + to save memory. + + Defaults to True. When False, trained + models are stored in a list in self.tms, + and no models that are not of a gensim + model type can be added to this ensemble + using the add_model function. + + If False, any topic term matrix can be + supllied to add_model. + min_samples : number, optional + Required number of nearby topics for + a topic to be considered as 'core' in + the CBDBSCAN clustering. + masking_method : {'mass', 'rank}, optional + One of "mass" (default) or "rank" (faster). + masking_threshold : float, optional + Default: None, which uses 0.11 for + masking_method "rank", and 0.95 + for "mass". + distance_workers : number, optional + When distance_workers is None, it defaults to + os.cpu_count() for maximum performance. + Default is 1, which is not multiprocessed. + Set to > 1 to enable multiprocessing. + **gensim_kw_args + All the parameters for the gensim model, + that is used for each model in the ensemble, + can be provided as additional keyword arguments. + https://radimrehurek.com/gensim/models/ldamodel.html + + """ + + if "id2word" not in gensim_kw_args: + gensim_kw_args["id2word"] = None + if "corpus" not in gensim_kw_args: + gensim_kw_args["corpus"] = None + + # dictionary. modified version of from gensim/models/ldamodel.py + # error messages are copied from the original gensim module [1]: + # will create a fake dict if no dict is provided. + if gensim_kw_args["id2word"] is None and not gensim_kw_args["corpus"] is None: + logger.warning("no word id mapping provided; initializing from corpus, assuming identity") + gensim_kw_args["id2word"] = utils.dict_from_corpus(gensim_kw_args["corpus"]) + if gensim_kw_args["id2word"] is None and gensim_kw_args["corpus"] is None: + raise ValueError("at least one of corpus/id2word must be specified, to establish " + "input space dimensionality. Corpus should be provided using the " + "keyword argument corpus.") + + # Store some of the parameters: + # - store topic_model_kind for generate_gensim_representation, + # so that it is made sure that the gensim_kw_args fit into + # the constructor of the gensim model. + self.topic_model_kind = topic_model_kind + self.num_models = num_models + self.gensim_kw_args = gensim_kw_args + + # some settings affecting performance + self.memory_friendly_ttda = memory_friendly_ttda + self.distance_workers = distance_workers + + # this will provide the gensim api to the ensemble basically + self.classic_model_representation = None + + # the ensembles state + self.random_state = utils.get_random_state(random_state) + self.sstats_sum = 0 + self.eta = None + self.tms = [] + self.ttda = None + + # in case the model will not train due to some + # parameters, stop here and don't train. 
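+        # (an ensemble without models, without a corpus, or with zero
+        # iterations/passes could only ever produce an empty ttda)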
+ if num_models <= 0: + return + if ("corpus" not in gensim_kw_args): + return + if gensim_kw_args["corpus"] is None: + return + if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0: + return + if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: + return + + # training + if ensemble_workers > 1: + # multiprocessed + self.generate_ttda_multiprocessed(num_models, ensemble_workers) + else: + # singlecore + self.generate_topic_models(num_models) + + if not memory_friendly_ttda: + self.generate_ttda() + + self.generate_asymmetric_distance_matrix(workers=self.distance_workers, + threshold=masking_threshold, + method=masking_method) + self.generate_topic_clusters(epsilon, min_samples) + self.generate_stable_topics(min_cores) + + # create model that can provide the usual gensim api to the stable topics from the ensemble + self.generate_gensim_representation() + + def generate_gensim_representation(self): + """creates a gensim-model from the stable topics + + The prior for the gensim model, eta, Can be anything, + (a parameter in object constructor) and won't influence + get_topics(). Note that different eta means different + log_perplexity (the default is the same as the gensim + default eta, which is for example [0.5, 0.5, 0.5]). + + Note, that when the clustering of the topics + was changed using add_model, etc., and when no + topics were detected because of that, the old + classic gensim model will stay in place. + + Returns + ------- + LdaModel + A Gensim LDA Model classic_model_representation for which: + classic_model_representation.get_topics() == self.get_topics() + + """ + + logger.info("Generating classic gensim model representation based on results from the ensemble") + + numberOfWords = self.sstats_sum + # if numberOfWords should be wrong for some fantastic funny reason + # that makes you want to peel your skin off, recreate it (takes a while): + if numberOfWords == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None: + for document in self.gensim_kw_args["corpus"]: + for token in document: + numberOfWords += token[1] + self.sstats_sum = numberOfWords + assert numberOfWords != 0 + + stableTopics = self.get_topics() + + nStableTopics = len(stableTopics) + + if nStableTopics == 0: + logger.warning("The model did not detect any stable topic. You can try to adjust epsilon: " + "generate_topic_clusters(eps); generate_stable_topics()") + return + + # create a new gensim model + params = self.gensim_kw_args.copy() + params["eta"] = self.eta + params["num_topics"] = nStableTopics + # adjust params in a way that no training happens + params["iterations"] = 0 # no training + params["passes"] = 0 # no training + + classic_model_representation = GENSIM_MODEL[self.topic_model_kind](**params) + + # when eta was None, use what gensim generates as default eta for the following tasks: + eta = classic_model_representation.eta + + # the following is important for the denormalization + # to generate the proper sstats for the new gensim model: + # transform to dimensionality of stableTopics. axis=1 is summed + eta_sum = 0 + if int == type(eta) or float == type(eta): + eta_sum = [eta * len(stableTopics[0])] * nStableTopics + else: + if len(eta.shape) == 1: # [e1, e2, e3] + eta_sum = [[eta.sum()]] * nStableTopics + if len(eta.shape) > 1: # [[e11, e12, ...], [e21, e22, ...], ...] 
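+                # sum eta over the token axis; [:, None] keeps the result as a column vector with one row per topic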
+ eta_sum = np.array(eta.sum(axis=1)[:, None]) + + # the factor, that will be used when get_topics() is used, for normalization + # will never change, because the sum for eta as well as the sum for sstats is constant. + # Therefore predicting normalization_factor becomes super easy. + # corpus is a mapping of id to occurences + + # so one can also easily calculate the + # right sstats, so that get_topics() will return the stable topics no + # matter eta. + + normalization_factor = np.array([[numberOfWords / nStableTopics]] * nStableTopics) + eta_sum + + if stableTopics.shape[1] != len(eta): + raise ValueError("If load() was used: was the correct dictionary used when loading the model? " + "because stableTopics.shape[0] {} != len(eta) {}".format(stableTopics.shape[1], len(eta))) + + sstats = stableTopics * normalization_factor + sstats -= eta + + # Note that this will cause some stuff to fail if nStableTopics is 1 + # if nStableTopics is 1, the result will be (close to) uniform anyway + # inserting the new sstats + classic_model_representation.state.sstats = sstats.astype("float32") + # fix expElogbeta. + classic_model_representation.sync_state() + + self.classic_model_representation = classic_model_representation + + return classic_model_representation + + def add_model(self, target, generate=True, num_new_models=None): + """Adds the ttda of another model to the ensemble + this way, multiple topic models can be connected + to an ensemble. + + The ttda of another ensemble can also be used, + in that case set num_new_models to the num_models + parameter of the ensemble, that means the number + of classic models in the ensemble that generated + the ttda. This is important, because that information + is used to estimate "min_samples" for + generate_topic_clusters. + + Make sure that all the models use the exact same + dictionary/idword mapping and size. + + If you trained this ensemble in the past with a + certain Dictionary that you want to reuse for other + models, you can get it from: self.id2word + + This will also trigger the generation of + stable topics automatically, but it can be + disabled with the generate parameter. + + Parameters + ---------- + target : {see description} + 1. topic-term-distribution-array + + example: [[0.1, 0.1, 0.8], [...], ...] + + [topic_1, topic_2, ...] + with topic being: + [word_1, word_2, ...] + + sum of words in a single topic sum to one, + therefore, all the words sum to len(ttda) + + You can use this (option 1) only if + memory_friendly_ttda is set to True. + + 2. A single EnsembleLda object + 3. List of EnsembleLda objects + 4. A single Gensim topic model + 5. List of Gensim topic models + generate : boolean, optional + if True, will cluster and generate + stable_topics afterwards. This is the default. + + if False, you have to call: + self.generate_asymmetric_distance_matrix() + self.generate_topic_clusters() + self.generate_stable_topics() + self.generate_gensim_representation() + afterwards manually. + num_new_models : integer, optional + the model keeps track of how many models + were used in this ensemble. Set higher + if ttda contained topics from more than + one model. Default: None, which takes + care of it automatically. + + If target is a 2D-array of float values, + it assumes 1. + + If the ensemble has memory_friendly_ttda + set to False, then it will always use + the number of models in the target + parameter. + + """ + + # If the model has never seen a ttda before, initialize. + # If it has, append. + + # Be flexible. 
Can be a single element or a list of elements + # make sure it's a numpy array + if not isinstance(target, (np.ndarray, list)): + target = np.array([target]) + else: + target = np.array(target) + assert len(target) > 0 + + detected_num_models = None + + if self.memory_friendly_ttda: + # for memory friendly models/ttdas, append the ttdas to itself + + ttda = None + + # ttdas + if isinstance(target.dtype.type(), (np.number, float)): + # multiple ttdas, concatenate first + if len(target.shape) == 3: + detected_num_models = len(target) + ttda = np.concatenate(target) + + # one single ttda + elif len(target.shape) == 2: + detected_num_models = 1 + ttda = target + + # ensemble lda object + else: + if type(target[0]) == EnsembleLda: + # check if it is an ensemble with ttdas + ttda = np.concatenate([model.ttda for model in target], axis=0) + detected_num_models = sum([model.num_models for model in target]) + + # any gensim topic model object + elif isinstance(target[0], ldamodel.LdaModel): + # might be a gensim model, use get_topics() then + ttda = np.concatenate([model.get_topics() for model in target], axis=0) + detected_num_models = len(target) + + # unknown + else: + raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0]))) + + logger.info("Adding {} model(s) with {} topics".format(detected_num_models, len(ttda))) + + # check if ttda array already exists + if self.ttda is None: + self.ttda = ttda + else: + if self.ttda.shape[1] != ttda.shape[1]: + raise ValueError(("target ttda dimensions do not match. Topics must be {} but was" + "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1])) + self.ttda = np.append(self.ttda, ttda, axis=0) + + # new models were added, increase num_models + if num_new_models is None: + self.num_models += detected_num_models + else: + self.num_models += num_new_models + else: + # memory unfriendly ensembles have all the models + # stored in tms. Can be a list of ensembles or + # a list of gensim topic models. + if type(target[0]) == type(self): + detected_num_models = 0 + for ensemble in target: + self.tms += ensemble.tms + detected_num_models += len(ensemble.tms) + else: + # else it is a list of gensim models + # (np.ndarray of len 1 or more, since it was wrapped in the beginning) + self.tms += target.tolist() + detected_num_models = len(target) + + # in this case, len(self.tms) should + # always match self.num_models + self.num_models = len(self.tms) + + if generate: + self.generate_asymmetric_distance_matrix(workers=None) # multiprocessing enabled + self.generate_topic_clusters() + self.generate_stable_topics() + self.generate_gensim_representation() + + def save(self, fname): + """Save the ensemble to a file. + + Parameters + ---------- + fname : str + Path to the system file where the model will be persisted. 
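+
+        Example
+        -------
+        A minimal round trip, reusing the temporary file from the module docstring example:
+
+        .. sourcecode:: pycon
+
+            >>> from gensim.test.utils import datapath
+            >>> temp_file = datapath("model")
+            >>> elda.save(temp_file)
+            >>> elda = EnsembleLda.load(temp_file)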
+ + """ + + # parameter descriptions are similar to [1] in + # order to make the api less confusing + + gensim_kw_args_persist = self.gensim_kw_args.copy() + + payload = { + "ensemble_kw_args": { + "topic_model_kind": self.topic_model_kind, + "memory_friendly_ttda": self.memory_friendly_ttda, + "num_models": self.num_models + }, + "gensim_kw_args": gensim_kw_args_persist, + "ttda": self.ttda, + "amatrix": self.asymmetric_distance_matrix, + "stable_topics": self.stable_topics, + "cluster_model.results": self.cluster_model.results, + "sstats_sum": self.sstats_sum, + "id2word": self.id2word + } + + if not self.memory_friendly_ttda: + payload["topicModels"] = self.tms + + # Version of the save/load api, to eventually make load + # backwards compatible if the save-function changes. + payload["version"] = "1.0" + + with open(fname, 'wb') as f: + _pickle.dump(payload, f) + + logger.info("saved %s object", self.__class__.__name__) + + @staticmethod + def load(fname): + """Load a previously stored ensemble from disk. + + Parameters + ---------- + fname : str + Path to file that contains the needed object. + + Returns + ------- + A previously saved ensembleLda object + + """ + + # message copied from [1] + logger.info("loading %s object from %s", EnsembleLda.__name__, fname) + + with open(fname, 'rb') as f: + data = _pickle.load(f) + + # update() on the dict can be safely used, because they don't have + # keywords in common that overwrite each other + kwArgs = data["ensemble_kw_args"] + kwArgs.update(data["gensim_kw_args"]) + + kwArgs["id2word"] = data["id2word"] + # prevent training + kwArgs["iterations"] = 0 + + # now reconstruct an ensemble LDA object based on the data: + eLDA = EnsembleLda(**kwArgs) + # since no training will take place, num_models will be set to 0. restore: + eLDA.num_models = kwArgs["num_models"] + + # some stuff that was not created automatically, because the "empty" eLDA object did not train + eLDA.ttda = data["ttda"] + eLDA.asymmetric_distance_matrix = data["amatrix"] + eLDA.sstats_sum = data["sstats_sum"] + + # depends on memory_friendly_ttda in the save function: + if "topicModels" in data: + eLDA.tms = [] + for i in range(len(data["topicModels"])): + eLDA.tms += [data["topicModels"][i]] + + # As long as fit is not called on cluster_model, no results will be generated, + # so the CBDBSCAN constructor is safe to call without consuming cpu time. + # No need to provide the original params from the constructor, the result are + # going to be written into the model from the pickle. + eLDA.cluster_model = CBDBSCAN() + eLDA.cluster_model.results = data["cluster_model.results"] + eLDA.stable_topics = data["stable_topics"] + + eLDA.generate_gensim_representation() + + # message copied from [1] + logger.info("loaded %s", fname) + + return eLDA + + def generate_ttda_multiprocessed(self, num_models, ensemble_workers): + """Will make the ensemble multiprocess, which results + in a speedup on multicore machines. Results from the + processes will be piped to the parent and concatenated. + + Parameters + ---------- + num_models : number + how many models to train in the ensemble + ensemble_workers : number + into how many processes to split the models + will be set to max(workers, num_models), to avoid + workers that are supposed to train 0 models. + + to get maximum performance, set to the number + of your cores, if non-parallelized models + are being used in the ensemble (LdaModel). 
+ + For LdaMulticore, the performance gain is small + and gets larger for a significantly smaller corpus. + In that case, ensemble_workers=2 can be used. + + """ + + # the way random_states is handled needs to prevent getting + # different results when multiprocessing is on, or getting + # the same results in every lda children. + # so it is solved by generating a list of state seeds before + # multiprocessing is started. + random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] + + # worker, that trains a bunch of models: + def worker(id, num_models, random_states, pipe): + # print(id,"training") + # since models cannot be piped to the parent because messages need to be pickled for that: + # memory_friendly_ttda needs to be true, which will not store ensemble submodels + logger.info("Spawned worker to generate {} topic models...".format(num_models)) + self.generate_topic_models(num_models, random_states=random_states) + # send the ttda that is in the child/workers version of the memory into the pipe + # available, after generate_topic_models has been called in the worker + if self.memory_friendly_ttda: + pipe.send((id, self.ttda)) # remember that this code is inside the workers memory + else: + pipe.send((id, self.tms)) + + pipe.close() + + # each worker has to work on at least one model. + # Don't spawn idle workers: + workers = min(ensemble_workers, num_models) + + # create worker processes: + # from what I know this is basically forking with a jump to a target function in each child + # so modifying the ensemble object will not modify the one in the parent + # because of no shared memory + processes = [] + pipes = [] + num_models_unhandled = num_models # how many more models need to be trained by workers? + + for i in range(workers): + try: + parentConn, childConn = Pipe() + num_subprocess_models = 0 + if i == workers - 1: # i is a index, hence -1 + # is this the last worker that needs to be created? 
+ # then task that worker with all the remaining models + num_subprocess_models = num_models_unhandled + else: + num_subprocess_models = int(num_models_unhandled / (workers - i)) + + # get the chunk from the random states that is meant to be for those models + random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] + + p = Process(target=worker, args=(i, num_subprocess_models, random_states_for_worker, childConn)) + + processes += [p] + pipes += [(parentConn, childConn)] + p.start() + + num_models_unhandled -= num_subprocess_models + + except ProcessError: + logger.error("could not start process {}".format(i)) + # close all pipes + for p in pipes: + p[1].close() + p[0].close() + # end all processes + for p in processes: + if p.is_alive(): + p.terminate() + del p + # stop + raise + + # aggregate results + if self.memory_friendly_ttda: + + # collect results from the pipes + # will also block until workers are finished + self.ttda = None + for p in pipes: + answer = p[0].recv() # [0], because that is the parentConn + p[0].close() + # this does basically the same as the generate_topic_models function (concatenate all the ttdas): + if self.ttda is None: + self.ttda = answer[1] # [1] is the ttda ([0] is the worker id) + else: + self.ttda = np.concatenate([self.ttda, answer[1]]) + # print(answer[0], "received and ttda concatenated") + # in [0] the id of the worker that sent the data is stored + + # end all processes + for p in processes: + p.terminate() + + # again, the same thing generate_topic_models does at the end + self.ttda = self.ttda / self.ttda.sum(axis=1)[:, None] + + else: + # now do the same thing for a memory unfriendly ensemble + self.tms = [] + for p in pipes: + answer = p[0].recv() # [0], because that is the parentConn + p[0].close() + self.tms += answer[1] + + # end all processes + for p in processes: + p.terminate() + + def generate_topic_models(self, num_models, random_states=None): + """Will generate the topic models, that form the ensemble. + + Parameters + ---------- + num_models : number + number of models to be generated + random_states : list + list of numbers or np.random.RandomState objects. + Will be autogenerated based on the ensembles RandomState + if None (default). + """ + + if random_states is None: + random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] + + assert len(random_states) == num_models + + logger.info("Generating {} topic models...".format(num_models)) + + kwArgs = self.gensim_kw_args.copy() + + tm = None # remember one of the topic models from the following + # loop, in order to collect some properties from it afterwards. 
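+        # each child model is trained with identical parameters and only
+        # differs by the seed it receives from random_states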
+ + for i in range(num_models): + + kwArgs["random_state"] = random_states[i] + tm = GENSIM_MODEL[self.topic_model_kind](**kwArgs) + + # memory friendly ttda = generate ttda on the run and delete model + if self.memory_friendly_ttda: + if self.ttda is None: + self.ttda = tm.state.get_lambda() + else: + # adds the lambda (that is the unnormalized get_topics) to ttda, which is + # a list of all those lambdas + self.ttda = np.concatenate([self.ttda, tm.state.get_lambda()]) + else: + # only saves the model if it is not "memory friendly" + self.tms += [tm] + + # use one of the tms to get some info that will be needed later + self.sstats_sum = tm.state.sstats.sum() + self.eta = tm.eta + + if self.memory_friendly_ttda: + # normalize all the ttda + # this results in the same that is being displayed on .get_topics() of a gensim LDA model + self.ttda = self.ttda / self.ttda.sum(axis=1)[:, None] + + def generate_ttda(self): + """Generate the topic_term_distribution for all the topics - P(W|T)""" + + logger.info("Generating topic term distributions over all topics over all topic models in the ensemble...") + + lambda_all = np.concatenate([self.tms[x].state.get_lambda() for x in range(self.num_models)], axis=0) + + self.ttda = lambda_all / lambda_all.sum(axis=1)[:, None] + + def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): + """Makes the pairwise similarity matrix for all the ttdas + from the ensemble. + + Returns the asymmetric pairwise distance matrix that + is used in the DBSCAN clustering. + + Parameters + ---------- + threshold : float, optional + if threshold is None and method == "rank": + threshold = 0.11 + if threshold is None and method == "mass": + threshold = 0.95 + + threshold keeps by default 95% of the largest terms by + mass. Except the "fast" parameter is "rank", then it + just selects that many of the largest terms. + workers : number, optional + when workers is None, it defaults to os.cpu_count() + for maximum performance. Default is 1, which is not + multiprocessed. Set to > 1 to enable multiprocessing. + + """ + + # singlecore: + if workers is not None and workers <= 1: + logger.info("Generating a {} x {} asymmetric similarity matrix...".format(len(self.ttda), len(self.ttda))) + self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, + ttda2=self.ttda, + threshold=threshold, + method=method) + return self.asymmetric_distance_matrix + + # multicore: + + # best performance on 2-core machine: 2 workers + if workers is None: + workers = os.cpu_count() + + # worker, that computes the distance to + # all other nodes from a chunk of nodes. 
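+        # the resulting rows are piped back to the parent and concatenated in order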
+        # https://stackoverflow.com/questions/1743293/why-does-my-python-program-average-only-33-cpu-per-process-how-can-i-make-pyth#1743350
+        def worker(worker_id, ttdas_sent, n_ttdas, pipe):
+            logger.info("Spawned worker to generate {} rows of the asymmetric similarity matrix...".format(n_ttdas))
+            # the chunk of ttda that's going to be calculated:
+            ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas]
+            distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda,
+                                                                             threshold=threshold,
+                                                                             start_index=ttdas_sent,
+                                                                             method=method)
+            pipe.send((worker_id, distance_chunk))  # remember that this code is inside the workers memory
+            pipe.close()
+
+        # create worker processes:
+        processes = []
+        pipes = []
+        ttdas_sent = 0
+
+        for i in range(workers):
+
+            try:
+                parentConn, childConn = Pipe()
+
+                # figure out how many ttdas to send to the worker
+                # 9 ttdas to 4 workers: 2 2 2 3
+                n_ttdas = 0
+                if i == workers - 1:  # i is an index, hence -1
+                    # if this is the last worker that needs to be created,
+                    # task it with all the remaining models
+                    n_ttdas = len(self.ttda) - ttdas_sent
+                else:
+                    n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i))
+
+                p = Process(target=worker, args=(i, ttdas_sent, n_ttdas, childConn))
+                ttdas_sent += n_ttdas
+
+                processes += [p]
+                pipes += [(parentConn, childConn)]
+                p.start()
+
+            except ProcessError:
+                logger.error("could not start process {}".format(i))
+                # close all pipes
+                for p in pipes:
+                    p[1].close()
+                    p[0].close()
+                # end all processes
+                for p in processes:
+                    if p.is_alive():
+                        p.terminate()
+                    del p
+                raise
+
+        distances = []
+        # note that the following loop maintains the order in which the chunks are concatenated,
+        # which is very important. The ordering in ttda has to be the same as when using only one process
+        for p in pipes:
+            answer = p[0].recv()  # [0], because that is the parentConn
+            p[0].close()  # child conn will be closed from inside the worker
+            # collect the chunks in worker order:
+            distances += [answer[1]]
+
+        # end all processes
+        for p in processes:
+            p.terminate()
+
+        self.asymmetric_distance_matrix = np.concatenate(distances)
+
+        return self.asymmetric_distance_matrix
+
+    def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold=None, start_index=0, method="mass"):
+        """Iterates over ttda1 and calculates the
+        distance to every ttda in ttda2.
+
+        Parameters
+        ----------
+        ttda1, ttda2 : 2D arrays of floats
+            Two ttda matrices that are going to be used for distance calculation.
+            Each row in a ttda corresponds to one topic. Each cell in the resulting
+            matrix corresponds to the distance between a topic pair.
+        threshold : float, optional
+            threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected
+            method
+        start_index : number
+            needed when this function is used in multiprocessing, because ttda1 is then
+            only a chunk of the complete ttda. start_index would be 0
+            if ttda1 == self.ttda. When self.ttda is split into two pieces of 100 ttdas
+            each, the start_index of the second piece should be 100. Default is 0
+        method : {'mass', 'rank'}, optional
+            method can be "mass" for the original masking method or "rank" for a faster
+            masking method that selects by rank of largest elements in the topic word
+            distribution, to determine which tokens are relevant for the topic.
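+
+        Example
+        -------
+        A rough sketch of what the "rank" masking keeps, with made-up values and a
+        made-up threshold of 0.5 (the actual implementation works on pandas objects):
+
+        .. sourcecode:: pycon
+
+            >>> import numpy as np
+            >>> topic = np.array([0.6, 0.3, 0.05, 0.05])
+            >>> # keep the indices of the largest 50% of the terms:
+            >>> np.argsort(topic)[::-1][:int(len(topic) * 0.5)]
+            array([0, 1])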
+
+        """
+
+        if method not in ["mass", "rank"]:
+            raise ValueError("method {} unknown".format(method))
+
+        if threshold is None:
+            threshold = {"mass": 0.95, "rank": 0.11}[method]
+
+        # select masking method:
+        def mass_masking(a):
+            """original masking method. returns a binary mask"""
+            return (a.sort_values(by=0, ascending=False).cumsum() < threshold).sort_index()[0].values
+
+        def rank_masking(a):  # faster
+            """faster masking method. returns a mask that contains the indices of the tokens to keep"""
+            return a.sort_values(by=0, ascending=False)[0:int(len(a) * threshold)].index
+
+        create_mask = {"mass": mass_masking, "rank": rank_masking}[method]
+
+        # initialize the distance matrix. np.ndarray is faster than np.zeros
+        distances = np.ndarray((len(ttda1), len(ttda2)))
+
+        # now iterate over each topic
+        for i in range(len(ttda1)):
+
+            # prepare that topic and create the mask from it
+            a = pd.DataFrame(ttda1[i])
+            # create a mask from a, that removes noise from a and keeps the largest terms
+            mask = create_mask(a)
+            a_masked = a[0][mask].values
+
+            # now look at every possible pair for topic a:
+            for j in range(len(ttda2)):
+
+                # distance to itself is 0
+                if i + start_index == j:
+                    distances[i][j] = 0
+                    continue
+
+                # now mask b based on a, which will force the shape of a onto b
+                b_masked = ttda2[j][mask]
+
+                distance = 0
+                # is the masked b basically empty? Then forget about it, no similarity, distance is 1
+                # (not 2, because that corresponds to negative values. The maximum distance is 1 here.)
+                # b_masked must not be normalized, otherwise the following threshold would never trigger:
+                if b_masked.sum() <= 0.05:
+                    distance = 1
+                else:
+                    # if there is indeed some non-noise stuff in the masked topic b, look at
+                    # how similar the two topics are. Note that normalizing is not needed for the
+                    # cosine distance, as it only looks at the angle between vectors
+                    distance = cosine(a_masked, b_masked)
+
+                distances[i][j] = distance
+
+        return distances
+
+    def generate_topic_clusters(self, eps=0.1, min_samples=None):
+        """Runs the DBSCAN algorithm on all the detected topics from
+        the models in the ensemble and labels them with label-indices.
+
+        The final approval and generation of stable topics is done in
+        generate_stable_topics().
+
+        Parameters
+        ----------
+        eps : float
+            eps is 0.1 by default.
+        min_samples : number
+            min_samples defaults to int(self.num_models / 2)
+        """
+
+        if min_samples is None:
+            min_samples = int(self.num_models / 2)
+
+        logger.info("Fitting the clustering model")
+
+        self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples)
+        self.cluster_model.fit(self.asymmetric_distance_matrix)
+
+    def generate_stable_topics(self, min_cores=None):
+        """Generates stable topics out of the clusters.
+        This function is the last step that has to be done
+        in the ensemble.
+
+        Stable topics can be retrieved afterwards using
+        get_topics().
+
+        Parameters
+        ----------
+        min_cores : number
+            How many cores a cluster has to have
+            to be treated as a stable topic. That means, how many
+            similar-looking topics have to be present, so that their
+            average is used as a stable topic.
+
+            defaults to min_cores = min(3, max(1, int(self.num_models / 4 + 1)))
+
+        """
+
+        logger.info("Generating stable topics")
+
+        # min_cores being 0 makes no sense.
there has to be a core for a cluster + # or there is no cluster + if min_cores == 0: + min_cores = 1 + + if min_cores is None: + # min_cores is a number between 1 and 3, depending on the number of models + min_cores = min(3, max(1, int(self.num_models / 4 + 1))) + + # helper functions + def remove_label_from_colum(label, valid_parent_labels): + """ + Example + ------- + - label = 0 + - valid_parent_labels = {0, 1} + - result: {1} + will overwrite valid_parent_labels + + """ + try: + valid_parent_labels.remove(label) + except AttributeError: + # parent_labes is NaN + pass + except KeyError: + # Key does not exist + pass + + def remove_label_from_sorted_labels(label, grouped_parent_labels): + """Despite the name, note that no sorting of anything is needed + for this to work. + + Example + ------- + - label = 2 + - grouped_parent_labels = [{1, 2}, {1, 2, 3}] + - result: [{1}, {1, 3}] + will overwrite grouped_parent_labels + + """ + for i in grouped_parent_labels: + remove_label_from_colum(label, i) + + def check_content_from_parent_labels(parent_label): + """ + """ + if not isinstance(parent_label, set): + return 0 + else: + return len(parent_label) + + def keep_only_valid_labels(parent_labels): + if not isinstance(parent_labels, set): + return set() + else: + return {label for label in parent_labels if label in valid_labels} + + def validate_core(core): + """Core is an entry in the self.cluster_model.results dataframe + + Will overwrite "is_valid_core" on that core with: + - False, if it is actually not marked as core (core.is_core) + - True, if valid_parents equals the cores label. E.g.: {1} == {1} + + """ + if not core.is_core: + return False + else: + return core.valid_parents == {core["labels"]} + + # counts how many different labels a core has as parents + self.cluster_model.results.loc[self.cluster_model.results.is_core, "amount_parent_labels"] = \ + self.cluster_model.results.loc[self.cluster_model.results.is_core, "parent_labels"].apply( + check_content_from_parent_labels) + + # sorting the cores by the amount of labels in the "parent_labels" + # groups by labels, and aggregates each entry of the group. example: [{1, 2}, {1, 2, 3}] + sorted_labels = self.cluster_model.results.loc[ + self.cluster_model.results.is_core, ["labels", "parent_labels", "amount_parent_labels"]].groupby( + "labels").agg({"amount_parent_labels": max, "parent_labels": lambda x: list(x)}).sort_values( + "amount_parent_labels") + + # removing NaN + sorted_labels.parent_labels = sorted_labels.parent_labels.apply(lambda L: [x for x in L if x is not np.nan]) + + sorted_labels["is_valid"] = np.nan + # sorted_labels["is_not_valid"] = np.nan + + # APPLYING THE RULES 1 to 3 + # iterate over the cluster labels, see which clusters/labels + # are valid to cause the creation of a stable topic + for label in sorted_labels.index: + # sorted_labels.index example: Int64Index([0, 1, 2, 3, 4, 5], dtype='int64', name='labels') + # label is iterating over 0, 1, 3, ... + + # 1. rule - remove if the cores in the cluster have no parents + if sorted_labels.loc[label].amount_parent_labels == 0: + # label is NOT VALID + sorted_labels.loc[label, "is_valid"] = False + sorted_labels.parent_labels.apply( + lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels)) + + # 2. 
rule - remove if it has fewer than min_cores as parents
+                # (parents are always also cores)
+                if len(sorted_labels.loc[label, "parent_labels"]) < min_cores:
+                    sorted_labels.loc[label, "is_valid"] = False
+                    sorted_labels.parent_labels.apply(
+                        lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels))
+
+                # 3. checking for "easy_valid"s
+                # checks if the cluster has at least min_cores cores whose only parent label is the cluster itself
+                if sorted_labels.loc[label].amount_parent_labels >= 1:
+                    if sum(map(lambda x: x == {label}, sorted_labels.loc[label, "parent_labels"])) >= min_cores:
+                        sorted_labels.loc[label, "is_valid"] = True
+
+        # Reapplying rule 3
+        # this is NOT DETERMINISTIC, the order will influence the result.
+        # it happens when there is a close relationship between 2 or more clusters
+        for label in sorted_labels[sorted_labels.is_valid.isnull()].index:
+            # 3. checking for "easy_valid"s
+            # checks if the cluster has at least min_cores cores whose only parent label is the cluster itself
+            if sum(map(lambda x: x == {label}, sorted_labels.loc[label, "parent_labels"])) >= min_cores:
+                sorted_labels.loc[label, "is_valid"] = True
+            else:
+                sorted_labels.loc[label, "is_valid"] = False
+                sorted_labels.parent_labels.apply(
+                    lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels))
+
+        self.sorted_labels = sorted_labels
+
+        valid_labels = self.sorted_labels[self.sorted_labels.is_valid].index.values
+
+        self.cluster_model.results["valid_parents"] = \
+            self.cluster_model.results.parent_labels.apply(keep_only_valid_labels)
+
+        # VALIDATING CORES
+        # only cores whose valid_parents equal their own label can be valid cores
+        self.cluster_model.results["is_valid_core"] = self.cluster_model.results.apply(validate_core, axis=1)
+
+        # transforming the ttda into a pandas DataFrame and adding the labels
+        ttda_pd = pd.DataFrame(self.ttda)
+        ttda_pd["labels"] = self.cluster_model.labels_  # same as self.cluster_model.results["labels"]
+
+        # keeping only VALID cores
+        ttda_pd = ttda_pd.loc[self.cluster_model.results.is_valid_core]
+
+        # averaging the topic term distributions of the cores in the same cluster.
+        # "labels" marks the cluster-id from CBDBSCAN
+        stable_topics = ttda_pd.groupby("labels").mean()
+
+        self.stable_topics = stable_topics
+
+    def recluster(self, eps=0.1, min_samples=None, min_cores=None):
+        """Runs the CBDBSCAN algorithm on all the detected topics from
+        the children of the ensemble.
+
+        Generates stable topics out of the clusters afterwards.
+
+        Finally, stable topics can be retrieved using
+        get_topics().
+
+        Parameters
+        ----------
+        eps : float
+            epsilon for the CBDBSCAN algorithm, having the same
+            meaning as in classic DBSCAN clustering.
+            default: 0.1
+        min_samples : number
+            The minimum number of samples in the neighborhood
+            of a topic to be considered a core in CBDBSCAN.
+            default: int(self.num_models / 2)
+        min_cores : number
+            How many cores a cluster has to have
+            to be treated as a stable topic. That means, how many
+            similar-looking topics have to be present, so that their
+            average is used as a stable topic.
+ default: min(3, max(1, int(self.num_models /4 +1))) + + """ + self.generate_topic_clusters(eps, min_samples) + self.generate_stable_topics(min_cores) + self.generate_gensim_representation() + + # GENSIM API + # to make using the ensemble in place of a gensim model as easy as possible + + def get_topics(self): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + return self.stable_topics.values + + def __getitem__(self, i): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + if self.classic_model_representation is None: + raise ValueError("use generate_gensim_representation() first") + return self.classic_model_representation[i] + + def inference(self, *posargs, **kwArgs): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + if self.classic_model_representation is None: + raise ValueError("use generate_gensim_representation() first") + return self.classic_model_representation.inference(*posargs, **kwArgs) + + def log_perplexity(self, *posargs, **kwArgs): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + if self.classic_model_representation is None: + raise ValueError("use generate_gensim_representation() first") + return self.classic_model_representation.log_perplexity(*posargs, **kwArgs) + + def print_topics(self, *posargs, **kwArgs): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + if self.classic_model_representation is None: + raise ValueError("use generate_gensim_representation() first") + return self.classic_model_representation.print_topics(*posargs, **kwArgs) + + @property + def id2word(self): + return self.gensim_kw_args["id2word"] + + +class CBDBSCAN(): + + def __init__(self, eps=0.1, min_samples=4): + + self.eps = eps + self.min_samples = min_samples + + def fit(self, amatrix): + + self.next_label = 0 + self.results = pd.DataFrame(index=range(len(amatrix)), + columns=["labels", "is_core", + "parent_ids", "parent_labels", "num_samples"]) + self.results.is_core = False + + # otherwise sets cannot be written into the dataframe (will be converted to integers). + # The dtype should be object for everything to fix this. + self.results = self.results.astype(dtype="object") + + tmp_amatrix = copy.deepcopy(amatrix) + # to avoid problem about comparing the topic with itself + np.fill_diagonal(tmp_amatrix, 1) + + min_distance_per_topic = pd.Series(tmp_amatrix.min(axis=1)) + min_distance_per_topic_sorted = min_distance_per_topic.sort_values() + + ordered_min_similarity = list(min_distance_per_topic_sorted.index.values) + + def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): + + # count how many neighbors + neighbors = pd.Series(tmp_amatrix[topic_index] < self.eps) + num_samples = sum(neighbors) + + # If the number of neighbors of a topic is large enough, + # it is considered a core. + # This also takes neighbors that already are identified as core in count. + if num_samples >= self.min_samples: + # This topic is a core! + self.results.loc[topic_index, "is_core"] = True + self.results.loc[topic_index, "num_samples"] = num_samples + + # if current_label is none, then this is the first core + # of a new cluster (hence next_label) + if current_label is None: + # next_label is initialized with 0 in + # fit() for the first cluster + current_label = self.next_label + self.next_label += 1 + + else: + # In case the core has a parent, + # that means it has a label, namely the + # label from the parent. 
+ # Check the distance between the + # new core and the parent + + # parent neighbors is the list of neighbors of parent_id + # (the topic_index that called this function recursively) + all_members_of_current_cluster = list(parent_neighbors[parent_neighbors].index.values) + all_members_of_current_cluster.append(parent_id) + + # look if 25% of the members of the current cluster are also + # close to the current/new topic... + + # example: (topic_index, 0), (topic_index, 2), (topic_index, ...) + all_members_of_current_cluster_ix = np.ix_([topic_index], all_members_of_current_cluster) + # use the result of the previous step to index the matrix and see if those distances + # are smaller then epsilon. relations_to_the_cluster is a boolean array, True for close elements + relations_to_the_cluster = tmp_amatrix[all_members_of_current_cluster_ix] < self.eps + + # if less than 25% of the elements are close, then the topic index in question is not a + # core of the current_label, but rather the core of a new cluster + if relations_to_the_cluster[0].mean() < 0.25: + # start new cluster by changing current_label + current_label = self.next_label + self.next_label += 1 + + self.results.loc[topic_index, "labels"] = current_label + + for neighbor in neighbors[neighbors].index: + + if self.results.loc[neighbor, "labels"] is np.nan: + ordered_min_similarity.remove(neighbor) + # try to extend the cluster into the direction + # of the neighbor + scan_topic(neighbor, parent_id=topic_index, + current_label=current_label, + parent_neighbors=neighbors) + + if isinstance(self.results.loc[neighbor, "parent_ids"], set): + self.results.loc[neighbor, "parent_ids"].add(topic_index) + self.results.loc[neighbor, "parent_labels"].add(current_label) + else: + self.results.loc[neighbor, "parent_ids"] = set([topic_index]) + self.results.loc[neighbor, "parent_labels"] = set([current_label]) + + else: + # this topic is not a core! + if current_label is None: + self.results.loc[topic_index, "labels"] = -1 + else: + self.results.loc[topic_index, "labels"] = current_label + + # elements are going to be removed from that array in scan_topic, do until it is empty + while len(ordered_min_similarity) != 0: + next_topic_index = ordered_min_similarity.pop(0) + scan_topic(next_topic_index) + + self.labels_ = self.results["labels"] From a67d5db944e97f5c3c492a6d70cbf281a32873ac Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Fri, 5 Apr 2019 02:31:00 +0200 Subject: [PATCH 002/166] improvements to add_model, various small changes to comments and code --- gensim/corpora/__init__.py | 1 + gensim/models/ensemblelda.py | 269 ++++++++++++++++------------------- 2 files changed, 123 insertions(+), 147 deletions(-) diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index 0d51a9b903..7b7044e0b9 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -15,3 +15,4 @@ from .textcorpus import TextCorpus, TextDirectoryCorpus # noqa:F401 from .ucicorpus import UciCorpus # noqa:F401 from .malletcorpus import MalletCorpus # noqa:F401 +from .opinosiscorpus import OpinosisCorpus # noqa:F401 diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 6abf70f5e3..5e442eb62c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -55,11 +55,12 @@ >>> unseen_doc = other_corpus[0] >>> vector = elda[unseen_doc] # get topic probability distribution for a document -Increase the ensemble size by adding a new model +Increase the ensemble size by adding a new model. 
Make sure it uses the same dictionary .. sourcecode:: pycon >>> elda.add_model(LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)) + >>> elda.recluster() >>> vector = elda[unseen_doc] To optimize the ensemble for your specific case, the children can be clustered again using @@ -91,7 +92,7 @@ import pandas as pd from scipy.spatial.distance import cosine from gensim import utils -from gensim.models import ldamodel, ldamulticore +from gensim.models import ldamodel, ldamulticore, basemodel logger = logging.getLogger(__name__) @@ -212,7 +213,9 @@ def __init__(self, topic_model_kind="lda", num_models=3, self.sstats_sum = 0 self.eta = None self.tms = [] - self.ttda = None + # initialize empty topic term distribution array + self.ttda = np.empty((0, len(gensim_kw_args["id2word"])), np.float32) + self.asymmetric_distance_matrix_outdated = True # in case the model will not train due to some # parameters, stop here and don't train. @@ -230,14 +233,11 @@ def __init__(self, topic_model_kind="lda", num_models=3, # training if ensemble_workers > 1: # multiprocessed - self.generate_ttda_multiprocessed(num_models, ensemble_workers) + self.generate_topic_models_multiproc(num_models, ensemble_workers) else: # singlecore self.generate_topic_models(num_models) - if not memory_friendly_ttda: - self.generate_ttda() - self.generate_asymmetric_distance_matrix(workers=self.distance_workers, threshold=masking_threshold, method=masking_method) @@ -247,6 +247,11 @@ def __init__(self, topic_model_kind="lda", num_models=3, # create model that can provide the usual gensim api to the stable topics from the ensemble self.generate_gensim_representation() + def convert_to_memory_friendly(self): + """removes the stored gensim models and only keeps their ttdas""" + self.tms = [] + self.memory_friendly_ttda = True + def generate_gensim_representation(self): """creates a gensim-model from the stable topics @@ -344,11 +349,18 @@ def generate_gensim_representation(self): return classic_model_representation - def add_model(self, target, generate=True, num_new_models=None): + def add_model(self, target, num_new_models=None): """Adds the ttda of another model to the ensemble this way, multiple topic models can be connected to an ensemble. + Make sure that all the models use the exact same + dictionary/idword mapping. + + In order to generate new stable topics afterwards, use + self.generate_asymmetric_distance_matrix() + self.recluster() + The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter of the ensemble, that means the number @@ -357,48 +369,30 @@ def add_model(self, target, generate=True, num_new_models=None): is used to estimate "min_samples" for generate_topic_clusters. - Make sure that all the models use the exact same - dictionary/idword mapping and size. - If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other - models, you can get it from: self.id2word - - This will also trigger the generation of - stable topics automatically, but it can be - disabled with the generate parameter. + models, you can get it from: self.id2word. Parameters ---------- target : {see description} - 1. topic-term-distribution-array + 1. A single EnsembleLda object + 2. List of EnsembleLda objects + 3. A single Gensim topic model + 4. List of Gensim topic models + + if memory_friendly_ttda is True, target can also be: + 5. topic-term-distribution-array example: [[0.1, 0.1, 0.8], [...], ...] - [topic_1, topic_2, ...] 
- with topic being: - [word_1, word_2, ...] + [topic1, topic2, ...] + with topic being an array of probabilities: + [token1, token2, ...] - sum of words in a single topic sum to one, + token probabilities in a single topic sum to one, therefore, all the words sum to len(ttda) - You can use this (option 1) only if - memory_friendly_ttda is set to True. - - 2. A single EnsembleLda object - 3. List of EnsembleLda objects - 4. A single Gensim topic model - 5. List of Gensim topic models - generate : boolean, optional - if True, will cluster and generate - stable_topics afterwards. This is the default. - - if False, you have to call: - self.generate_asymmetric_distance_matrix() - self.generate_topic_clusters() - self.generate_stable_topics() - self.generate_gensim_representation() - afterwards manually. num_new_models : integer, optional the model keeps track of how many models were used in this ensemble. Set higher @@ -415,7 +409,6 @@ def add_model(self, target, generate=True, num_new_models=None): parameter. """ - # If the model has never seen a ttda before, initialize. # If it has, append. @@ -427,82 +420,77 @@ def add_model(self, target, generate=True, num_new_models=None): target = np.array(target) assert len(target) > 0 - detected_num_models = None - if self.memory_friendly_ttda: # for memory friendly models/ttdas, append the ttdas to itself - ttda = None + detected_num_models = 0 - # ttdas - if isinstance(target.dtype.type(), (np.number, float)): - # multiple ttdas, concatenate first - if len(target.shape) == 3: - detected_num_models = len(target) - ttda = np.concatenate(target) + ttda = [] - # one single ttda - elif len(target.shape) == 2: - detected_num_models = 1 - ttda = target + # 1. ttda array, because that's the only accepted input that contains numbers + if isinstance(target.dtype.type(), (np.number, float)): + ttda = target + detected_num_models = 1 - # ensemble lda object - else: - if type(target[0]) == EnsembleLda: - # check if it is an ensemble with ttdas - ttda = np.concatenate([model.ttda for model in target], axis=0) - detected_num_models = sum([model.num_models for model in target]) - - # any gensim topic model object - elif isinstance(target[0], ldamodel.LdaModel): - # might be a gensim model, use get_topics() then - ttda = np.concatenate([model.get_topics() for model in target], axis=0) - detected_num_models = len(target) - - # unknown - else: - raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0]))) + # 2. list of ensemblelda objects + elif isinstance(target[0], type(self)): + ttda = np.concatenate([ensemble.ttda for ensemble in target], axis=0) + detected_num_models = sum([ensemble.num_models for ensemble in target]) - logger.info("Adding {} model(s) with {} topics".format(detected_num_models, len(ttda))) + # 3. list of gensim models + elif isinstance(target[0], basemodel.BaseTopicModel): + ttda = np.concatenate([model.get_topics() for model in target], axis=0) + detected_num_models = len(target) - # check if ttda array already exists - if self.ttda is None: - self.ttda = ttda + # unknown else: - if self.ttda.shape[1] != ttda.shape[1]: - raise ValueError(("target ttda dimensions do not match. 
Topics must be {} but was"
-                                  "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1]))
-            self.ttda = np.append(self.ttda, ttda, axis=0)
+                raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0])))
 
             # new models were added, increase num_models
+            # if the user didn't provide a custom number to use
             if num_new_models is None:
                 self.num_models += detected_num_models
             else:
                 self.num_models += num_new_models
-        else:
-            # memory unfriendly ensembles have all the models
-            # stored in tms. Can be a list of ensembles or
-            # a list of gensim topic models.
-            if type(target[0]) == type(self):
-                detected_num_models = 0
+
+        else:  # memory unfriendly ensembles
+
+            ttda = []
+
+            # 1. ttda array
+            if isinstance(target.dtype.type(), (np.number, float)):
+                raise ValueError('ttda arrays cannot be added to ensembles that are not memory friendly, '
+                                 'you can call convert_to_memory_friendly to discard the stored gensim models '
+                                 'and only keep the ttdas')
+
+            # 2. list of ensembles
+            elif isinstance(target[0], type(self)):
                 for ensemble in target:
                     self.tms += ensemble.tms
-                    detected_num_models += len(ensemble.tms)
-            else:
-                # else it is a list of gensim models
-                # (np.ndarray of len 1 or more, since it was wrapped in the beginning)
+                ttda = np.concatenate([ensemble.ttda for ensemble in target], axis=0)
+
+            # 3. list of gensim models
+            elif isinstance(target[0], basemodel.BaseTopicModel):
                 self.tms += target.tolist()
-                detected_num_models = len(target)
+                ttda = np.concatenate([model.get_topics() for model in target], axis=0)
+
+            # unknown
+            else:
+                raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0])))
 
             # in this case, len(self.tms) should
             # always match self.num_models
             self.num_models = len(self.tms)
 
-        if generate:
-            self.generate_asymmetric_distance_matrix(workers=None)  # multiprocessing enabled
-            self.generate_topic_clusters()
-            self.generate_stable_topics()
-            self.generate_gensim_representation()
+        logger.info("Ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))
+
+        if self.ttda.shape[1] != ttda.shape[1]:
+            raise ValueError(("target ttda dimensions do not match. Topics must be {} but was "
+                              "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1]))
+        self.ttda = np.append(self.ttda, ttda, axis=0)
+
+        # tell recluster that the distance matrix needs to be regenerated
+        self.asymmetric_distance_matrix_outdated = True
 
     def save(self, fname):
         """Save the ensemble to a file.
@@ -607,7 +595,7 @@ def load(fname):
 
         return eLDA
 
-    def generate_ttda_multiprocessed(self, num_models, ensemble_workers):
+    def generate_topic_models_multiproc(self, num_models, ensemble_workers):
         """Will make the ensemble multiprocess, which results in a
         speedup on multicore machines. Results from the processes will
         be piped to the parent and concatenated.
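To make the add_model / recluster flow from the hunk above concrete, a rough usage sketch (assuming the common_corpus and common_dictionary fixtures from gensim.test.utils and the constructor shown in the module docstring; the parameter values here are made up):

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.models import EnsembleLda
    >>> elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=4, num_models=2, passes=2)
    >>> other = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=4, num_models=2, passes=2)
    >>> elda.add_model(other)  # appends other.ttda (and other.tms, if not memory friendly)
    >>> elda.recluster()  # regenerates the now outdated distance matrix, then reclusters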
@@ -648,7 +636,9 @@ def worker(id, num_models, random_states, pipe):
             # send the ttda that is in the child/workers version of the memory into the pipe.
             # it is available after generate_topic_models has been called in the worker
             if self.memory_friendly_ttda:
-                pipe.send((id, self.ttda))  # remember that this code is inside the workers memory
+                # remember that this code is inside the worker processes memory,
+                # so self.ttda is the ttda of only a chunk of models
+                pipe.send((id, self.ttda))
             else:
                 pipe.send((id, self.tms))
 
@@ -707,25 +697,14 @@ def worker(id, num_models, random_states, pipe):
 
         # collect results from the pipes
         # will also block until workers are finished
-        self.ttda = None
             for p in pipes:
                 answer = p[0].recv()  # [0], because that is the parentConn
                 p[0].close()
                 # this does basically the same as the generate_topic_models function (concatenate all the ttdas):
-                if self.ttda is None:
-                    self.ttda = answer[1]  # [1] is the ttda ([0] is the worker id)
-                else:
-                    self.ttda = np.concatenate([self.ttda, answer[1]])
+                self.ttda = np.concatenate([self.ttda, answer[1]])
                 # print(answer[0], "received and ttda concatenated")
                 # in [0] the id of the worker that sent the data is stored
 
-            # end all processes
-            for p in processes:
-                p.terminate()
-
-            # again, the same thing generate_topic_models does at the end
-            self.ttda = self.ttda / self.ttda.sum(axis=1)[:, None]
-
         else:  # now do the same thing for a memory unfriendly ensemble
             self.tms = []
@@ -733,13 +712,15 @@ def worker(id, num_models, random_states, pipe):
                 answer = p[0].recv()  # [0], because that is the parentConn
                 p[0].close()
                 self.tms += answer[1]
+                new_ttda = np.concatenate([model.get_topics() for model in answer[1]])
+                self.ttda = np.concatenate([self.ttda, new_ttda])
 
-            # end all processes
-            for p in processes:
-                p.terminate()
+        # end all processes
+        for p in processes:
+            p.terminate()
 
     def generate_topic_models(self, num_models, random_states=None):
-        """Will generate the topic models, that form the ensemble.
+        """Will train the topic models that form the ensemble. 
Parameters ---------- @@ -768,38 +749,20 @@ def generate_topic_models(self, num_models, random_states=None): kwArgs["random_state"] = random_states[i] tm = GENSIM_MODEL[self.topic_model_kind](**kwArgs) - # memory friendly ttda = generate ttda on the run and delete model - if self.memory_friendly_ttda: - if self.ttda is None: - self.ttda = tm.state.get_lambda() - else: - # adds the lambda (that is the unnormalized get_topics) to ttda, which is - # a list of all those lambdas - self.ttda = np.concatenate([self.ttda, tm.state.get_lambda()]) - else: - # only saves the model if it is not "memory friendly" + # adds the lambda (that is the unnormalized get_topics) to ttda, which is + # a list of all those lambdas + self.ttda = np.concatenate([self.ttda, tm.get_topics()]) + + # only saves the model if it is not "memory friendly" + if not self.memory_friendly_ttda: self.tms += [tm] # use one of the tms to get some info that will be needed later self.sstats_sum = tm.state.sstats.sum() self.eta = tm.eta - if self.memory_friendly_ttda: - # normalize all the ttda - # this results in the same that is being displayed on .get_topics() of a gensim LDA model - self.ttda = self.ttda / self.ttda.sum(axis=1)[:, None] - - def generate_ttda(self): - """Generate the topic_term_distribution for all the topics - P(W|T)""" - - logger.info("Generating topic term distributions over all topics over all topic models in the ensemble...") - - lambda_all = np.concatenate([self.tms[x].state.get_lambda() for x in range(self.num_models)], axis=0) - - self.ttda = lambda_all / lambda_all.sum(axis=1)[:, None] - def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): - """Makes the pairwise similarity matrix for all the ttdas + """Makes the pairwise distance matrix for all the ttdas from the ensemble. Returns the asymmetric pairwise distance matrix that @@ -823,12 +786,18 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= """ + # matrix is up to date afterwards + self.asymmetric_distance_matrix_outdated = False + + if threshold is None: + threshold = {"mass": 0.95, "rank": 0.11}[method] + # singlecore: if workers is not None and workers <= 1: - logger.info("Generating a {} x {} asymmetric similarity matrix...".format(len(self.ttda), len(self.ttda))) - self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, - ttda2=self.ttda, + logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) + self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, + start_index=0, method=method) return self.asymmetric_distance_matrix @@ -840,10 +809,10 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # worker, that computes the distance to # all other nodes from a chunk of nodes. 
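        # (Sketch of the chunking scheme with made-up numbers: for len(self.ttda) == 100
        # and 4 workers, each worker receives n_ttdas = 25 rows and computes a 25 x 100
        # slice of the matrix against the full ttda2 = self.ttda; start_index tells the
        # chunk where its rows sit in the complete matrix, and the parent concatenates
        # the received slices in worker order.)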
- # https://stackoverflow.com/questions/1743293/why-does-my-python-program-average-only-33-cpu-per-process-how-can-i-make-pyth#1743350 + # https://stackoverflow.com/a/1743350 def worker(worker_id, ttdas_sent, n_ttdas, pipe): - logger.info("Spawned worker to generate {} rows of the asymmetric similarity matrix...".format(n_ttdas)) - # print(id,"calculating") + logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix...".format(n_ttdas)) + # print(id, "calculating") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, @@ -910,7 +879,7 @@ def worker(worker_id, ttdas_sent, n_ttdas, pipe): return self.asymmetric_distance_matrix - def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold=None, start_index=0, method="mass"): + def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): """Iterates over ttda1 and calculates the distance to every ttda in tttda2. @@ -933,14 +902,14 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold=Non masking method that selects by rank of largest elements in the topic word distribution, to determine which tokens are relevant for the topic. + Returns + ------- + 2D Numpy.numpy.ndarray of floats """ if method not in ["mass", "rank"]: raise ValueError("method {} unknown".format(method)) - if threshold is None: - threshold = {"mass": 0.95, "rank": 0.11}[method] - # select masking method: def mass_masking(a): """original masking method. returns a binary mask""" @@ -1216,6 +1185,11 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): default: min(3, max(1, int(self.num_models /4 +1))) """ + # if new models were added to the ensemble, the distance + # matrix needs to be generated again + if self.asymmetric_distance_matrix_outdated: + self.generate_asymmetric_distance_matrix() + self.generate_topic_clusters(eps, min_samples) self.generate_stable_topics(min_cores) self.generate_gensim_representation() @@ -1224,7 +1198,6 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # to make using the ensemble in place of a gensim model as easy as possible def get_topics(self): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" return self.stable_topics.values def __getitem__(self, i): @@ -1275,11 +1248,13 @@ def fit(self, amatrix): # The dtype should be object for everything to fix this. 
self.results = self.results.astype(dtype="object") - tmp_amatrix = copy.deepcopy(amatrix) + tmp_amatrix = amatrix.copy() + # to avoid problem about comparing the topic with itself np.fill_diagonal(tmp_amatrix, 1) min_distance_per_topic = pd.Series(tmp_amatrix.min(axis=1)) + min_distance_per_topic_sorted = min_distance_per_topic.sort_values() ordered_min_similarity = list(min_distance_per_topic_sorted.index.values) From e27be0af1e8d03375aa0cab07ea14d8013ccf6fa Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 6 Apr 2019 01:02:20 +0200 Subject: [PATCH 003/166] pandas -> numpy: group by label and mean --- gensim/models/ensemblelda.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 5e442eb62c..fa6a2a2152 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1145,16 +1145,22 @@ def validate_core(core): # only cores with the valid_parents as its own label can be a valid core self.cluster_model.results["is_valid_core"] = self.cluster_model.results.apply(validate_core, axis=1) - # trainsforming the ttda into pandas and adding labels - ttda_pd = pd.DataFrame(self.ttda) - ttda_pd["labels"] = self.cluster_model.labels_ # same as self.cluster_model.results["labels"] + # numpy solution # keeping only VALID cores - ttda_pd = ttda_pd.loc[self.cluster_model.results.is_valid_core] + valid_core_mask = self.cluster_model.results.is_valid_core.values + valid_topics = self.ttda[valid_core_mask] + topic_labels = self.cluster_model.results.labels.values[valid_core_mask] + unique_labels = np.unique(topic_labels) - # averaging the ttd of the cores in the same cluster - # "labels" marks the cluster-id from DBSCAN - stable_topics = ttda_pd.groupby("labels").mean() + num_topics = len(unique_labels) + stable_topics = np.empty((num_topics, len(self.id2word)), dtype=np.float32) + + # for each cluster + for l, label in enumerate(unique_labels): + # mean of all the topics that are of that cluster + topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label]) + stable_topics[l] = topics_of_cluster.mean(axis=0) self.stable_topics = stable_topics @@ -1198,7 +1204,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # to make using the ensemble in place of a gensim model as easy as possible def get_topics(self): - return self.stable_topics.values + return self.stable_topics def __getitem__(self, i): """see https://radimrehurek.com/gensim/models/ldamodel.html""" From 83de2dd627bb2853dabc79c51f0ea0b0f2517b38 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 7 Apr 2019 00:58:55 +0200 Subject: [PATCH 004/166] pandas -> numpy: generate_stable_topics --- gensim/models/ensemblelda.py | 279 ++++++++++++++++++++--------------- 1 file changed, 163 insertions(+), 116 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index fa6a2a2152..a12d880483 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1014,143 +1014,109 @@ def generate_stable_topics(self, min_cores=None): # min_cores is a number between 1 and 3, depending on the number of models min_cores = min(3, max(1, int(self.num_models / 4 + 1))) - # helper functions - def remove_label_from_colum(label, valid_parent_labels): - """ - Example - ------- - - label = 0 - - valid_parent_labels = {0, 1} - - result: {1} - will overwrite valid_parent_labels - - """ - try: - valid_parent_labels.remove(label) - except AttributeError: - # 
parent_labes is NaN - pass - except KeyError: - # Key does not exist - pass - - def remove_label_from_sorted_labels(label, grouped_parent_labels): - """Despite the name, note that no sorting of anything is needed - for this to work. - - Example - ------- - - label = 2 - - grouped_parent_labels = [{1, 2}, {1, 2, 3}] - - result: [{1}, {1, 3}] - will overwrite grouped_parent_labels - - """ - for i in grouped_parent_labels: - remove_label_from_colum(label, i) - - def check_content_from_parent_labels(parent_label): - """ - """ - if not isinstance(parent_label, set): - return 0 - else: - return len(parent_label) - - def keep_only_valid_labels(parent_labels): - if not isinstance(parent_labels, set): - return set() - else: - return {label for label in parent_labels if label in valid_labels} - - def validate_core(core): - """Core is an entry in the self.cluster_model.results dataframe - - Will overwrite "is_valid_core" on that core with: - - False, if it is actually not marked as core (core.is_core) - - True, if valid_parents equals the cores label. E.g.: {1} == {1} - - """ - if not core.is_core: - return False - else: - return core.valid_parents == {core["labels"]} - # counts how many different labels a core has as parents - self.cluster_model.results.loc[self.cluster_model.results.is_core, "amount_parent_labels"] = \ - self.cluster_model.results.loc[self.cluster_model.results.is_core, "parent_labels"].apply( - check_content_from_parent_labels) - - # sorting the cores by the amount of labels in the "parent_labels" - # groups by labels, and aggregates each entry of the group. example: [{1, 2}, {1, 2, 3}] - sorted_labels = self.cluster_model.results.loc[ - self.cluster_model.results.is_core, ["labels", "parent_labels", "amount_parent_labels"]].groupby( - "labels").agg({"amount_parent_labels": max, "parent_labels": lambda x: list(x)}).sort_values( - "amount_parent_labels") - - # removing NaN - sorted_labels.parent_labels = sorted_labels.parent_labels.apply(lambda L: [x for x in L if x is not np.nan]) - - sorted_labels["is_valid"] = np.nan - # sorted_labels["is_not_valid"] = np.nan + results = self.cluster_model.results.to_dict('index') + for topic in results.values(): + if topic["is_core"]: + topic["amount_parent_labels"] = StableTopicHelpers.len_of_parent_labels(topic["parent_labels"]) + + # TODO write this back to self.cluster_model.results + # it just adds another column, so that's safe + # but atm cluster_model.results is a dataframe so first I need to + # remove pandas from the cluster_model + + # first, group all the learned cores based on the label, + # which was assigned in the cluster_model + grouped_by_labels = {} + for topic in results.values(): + if topic["is_core"]: + label = topic["labels"] # "labels", but it actually means just a single number + if not label in grouped_by_labels: + grouped_by_labels[label] = [] + grouped_by_labels[label].append(topic) + + # then aggregate their amount_parent_labels by maxing and parent_labels by concatenating + # the result is sorted by amount_parent_labels and stored in sorted_clusters + sorted_clusters = [] + for label, group in grouped_by_labels.items(): + amount_parent_labels = 0 + parent_labels = [] # will be a list of sets + for topic in group: + amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels) + parent_labels.append(topic["parent_labels"]) + # - here, nan means "not yet evaluated" + # - removing nan from parent_labels + sorted_clusters.append({ + "amount_parent_labels": amount_parent_labels, + "parent_labels": [x 
for x in parent_labels if isinstance(x, set)],
+                "is_valid": np.nan,
+                "label": label
+            })
+        # start with the most significant core
+        sorted_clusters = sorted(sorted_clusters, key=lambda cluster: cluster["amount_parent_labels"], reverse=True)
 
         # APPLYING THE RULES 1 to 3
         # iterate over the cluster labels, see which clusters/labels
         # are valid to cause the creation of a stable topic
-        for label in sorted_labels.index:
-            # sorted_labels.index example: Int64Index([0, 1, 2, 3, 4, 5], dtype='int64', name='labels')
+        for i, cluster in enumerate(sorted_clusters):
+            label = cluster["label"]
             # label is iterating over 0, 1, 3, ...
 
             # 1. rule - remove if the cores in the cluster have no parents
-            if sorted_labels.loc[label].amount_parent_labels == 0:
+            if cluster["amount_parent_labels"] == 0:
                 # label is NOT VALID
-                sorted_labels.loc[label, "is_valid"] = False
-                sorted_labels.parent_labels.apply(
-                    lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels))
+                cluster["is_valid"] = False
+                # removes a label from every set in parent_labels for each core
+                for a in sorted_clusters:
+                    if label in a["parent_labels"]:
+                        a["parent_labels"].remove(label)
 
             # 2. rule - remove if it has less than min_cores as parents
             # (parents are always also cores)
-            if len(sorted_labels.loc[label, "parent_labels"]) < min_cores:
-                sorted_labels.loc[label, "is_valid"] = False
-                sorted_labels.parent_labels.apply(
-                    lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels))
-
+            if len(cluster["parent_labels"]) < min_cores:
+                cluster["is_valid"] = False
+                # removes a label from every set in parent_labels for each core
+                for a in sorted_clusters:
+                    if label in a["parent_labels"]:
+                        a["parent_labels"].remove(label)
+
             # 3. checking for "easy_valid"s
             # checks if the core has at least min_cores of cores with the only label as itself
-            if sorted_labels.loc[label].amount_parent_labels >= 1:
-                if sum(map(lambda x: x == {label}, sorted_labels.loc[label, "parent_labels"])) >= min_cores:
-                    sorted_labels.loc[label, "is_valid"] = True
+            if cluster["amount_parent_labels"] >= 1:
+                if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores:
+                    cluster["is_valid"] = True
 
         # Reapplying the rule 3
-        # this is NOT DETERMINISTIC, the order will influence the result
         # this happens when we have a close relationship among 2 or more clusters
-        for label in sorted_labels[sorted_labels.is_valid.isnull()].index:
-            # 3. checking for "easy_valid"s
-            # checks if the core has at least min_cores of cores with the only label as itself
-            if sum(map(lambda x: x == {label}, sorted_labels.loc[label, "parent_labels"])) >= min_cores:
-                sorted_labels.loc[label, "is_valid"] = True
+        for cluster in [cluster for cluster in sorted_clusters if np.isnan(cluster["is_valid"])]:
+
+            # this checks for parent_labels, which are also modified in this function
+            # hence order will influence the result. 
it starts with the most significant + # label (hence "sorted_clusters") + if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores: + cluster["is_valid"] = True else: - sorted_labels.loc[label, "is_valid"] = False - sorted_labels.parent_labels.apply( - lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels)) - - self.sorted_labels = sorted_labels - - valid_labels = self.sorted_labels[self.sorted_labels.is_valid].index.values - - self.cluster_model.results["valid_parents"] = \ - self.cluster_model.results.parent_labels.apply(keep_only_valid_labels) - - # VALIDATING CORES - # only cores with the valid_parents as its own label can be a valid core - self.cluster_model.results["is_valid_core"] = self.cluster_model.results.apply(validate_core, axis=1) - - # numpy solution + cluster["is_valid"] = False + # removes a label from every set in parent_labels for each core + for a in sorted_clusters: + if label in a["parent_labels"]: + a["parent_labels"].remove(label) + + # list of all the label numbers that are valid + valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]]) + + for i in results: + # TODO don't even have nan values, have {}/empty-sets instead + if not isinstance(results[i]["parent_labels"], set): + results[i]["parent_labels"] = set() + results[i]["valid_parents"] = set() + else: + results[i]["valid_parents"] = {label for label in results[i]["parent_labels"] if label in valid_labels} # keeping only VALID cores - valid_core_mask = self.cluster_model.results.is_valid_core.values + valid_core_mask = np.vectorize(StableTopicHelpers.validate_core)([results[i] for i in results]) valid_topics = self.ttda[valid_core_mask] - topic_labels = self.cluster_model.results.labels.values[valid_core_mask] + topic_labels = np.array([results[i]['labels'] for i in results])[valid_core_mask] unique_labels = np.unique(topic_labels) num_topics = len(unique_labels) @@ -1162,6 +1128,7 @@ def validate_core(core): topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label]) stable_topics[l] = topics_of_cluster.mean(axis=0) + self.sorted_clusters = sorted_clusters self.stable_topics = stable_topics def recluster(self, eps=0.1, min_samples=None, min_cores=None): @@ -1234,6 +1201,86 @@ def print_topics(self, *posargs, **kwArgs): def id2word(self): return self.gensim_kw_args["id2word"] +class StableTopicHelpers(): + """since there are so many helper functions involved, + a class is to be prefered over functions in functions + to be able to run unit tests on them individually""" + + @staticmethod + def remove_label_from_colum(label, valid_parent_labels): + """ + Parameters + ------- + valid_parent_labels : set + label : number + + Example + ------- + - label = 0 + - valid_parent_labels = {0, 1} + - result: {1} + will overwrite valid_parent_labels + + """ + try: + valid_parent_labels.remove(label) + except AttributeError: + # parent_labes is NaN + pass + except KeyError: + # Key does not exist + pass + + @staticmethod + def remove_from_all_sets(label, grouped_parent_labels): + """ + Example + ------- + - label = 2 + - grouped_parent_labels = [{1, 2}, {1, 2, 3}] + - result: [{1}, {1, 3}] + will overwrite grouped_parent_labels + + """ + for valid_parent_labels in grouped_parent_labels: + StableTopicHelpers.remove_label_from_colum(label, valid_parent_labels) + # valid_parent_labels.remove(label) + + @staticmethod + def len_of_parent_labels(parent_label): + """ + + """ + if not 
isinstance(parent_label, set): + return 0 + else: + return len(parent_label) + + @staticmethod + def keep_only_valid_labels(parent_labels, valid_labels): + if not isinstance(parent_labels, set): + return set() + else: + return {label for label in parent_labels if label in valid_labels} + + @staticmethod + def validate_core(core): + """ + + Core is a dict of {is_core, valid_parents, labels} among others + + If not a core returns False + + Only cores with the valid_parents as its own label can be a valid core. Returns + True if that is the case, False otherwise + + """ + # print('validate_core', core) + if not core["is_core"]: + return False + else: + ret = core["valid_parents"] == {core["labels"]} + return ret class CBDBSCAN(): From 2af165863519dd58d543fa5d9e76299c7b59efa9 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 7 Apr 2019 14:17:03 +0200 Subject: [PATCH 005/166] pandas -> numpy: distance matrix creation --- gensim/models/ensemblelda.py | 82 ++++++++++++------------------------ 1 file changed, 26 insertions(+), 56 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a12d880483..e6b70804b6 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -628,7 +628,6 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): # worker, that trains a bunch of models: def worker(id, num_models, random_states, pipe): - # print(id,"training") # since models cannot be piped to the parent because messages need to be pickled for that: # memory_friendly_ttda needs to be true, which will not store ensemble submodels logger.info("Spawned worker to generate {} topic models...".format(num_models)) @@ -702,7 +701,6 @@ def worker(id, num_models, random_states, pipe): p[0].close() # this does basically the same as the generate_topic_models function (concatenate all the ttdas): self.ttda = np.concatenate([self.ttda, answer[1]]) - # print(answer[0], "received and ttda concatenated") # in [0] the id of the worker that sent the data is stored else: @@ -812,7 +810,6 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # https://stackoverflow.com/a/1743350 def worker(worker_id, ttdas_sent, n_ttdas, pipe): logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix...".format(n_ttdas)) - # print(id, "calculating") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, @@ -912,12 +909,15 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, st # select masking method: def mass_masking(a): - """original masking method. returns a binary mask""" - return (a.sort_values(by=0, ascending=False).cumsum() < threshold).sort_index()[0].values - - def rank_masking(a): # faster - """faster masking method. returns a mask that contains indices to keep""" - return a.sort_values(by=0, ascending=False)[0:int(len(a) * threshold)].index + """original masking method. returns a new binary mask""" + sorted_a = np.sort(a)[::-1] + largest_mass = sorted_a.cumsum() < threshold + smallest_valid = sorted_a[largest_mass][-1] + return a >= smallest_valid + + def rank_masking(a): + """faster masking method. 
returns a new binary mask""" + return a > np.sort(a)[::-1][int(len(a) * threshold)] create_mask = {"mass": mass_masking, "rank": rank_masking}[method] @@ -927,11 +927,10 @@ def rank_masking(a): # faster # now iterate over each topic for i in range(len(ttda1)): - # prepare that topic and create the mask from it - a = pd.DataFrame(ttda1[i]) # create mask from a, that removes noise from a and keeps the largest terms + a = ttda1[i] mask = create_mask(a) - a_masked = a[0][mask].values + a_masked = a[mask] # now look at every possible pair for topic a: for j in range(len(ttda2)): @@ -944,10 +943,16 @@ def rank_masking(a): # faster # now mask b based on a, which will force the shape of a onto b b_masked = ttda2[j][mask] + # TODO instaed of checking <= 0.05 and cosine, rather make create_mask(b), + # then check how much of mask_a is covered by mask_b. By doing so, the check + # for <= 0.05 falls away which 1. seems rather like a magic number guess + # and 2. jumps the distance to 1 suddenly when this threshold is not reached + # Do experiments for this. + distance = 0 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 # (not 2 because that correspondsto negative values. The maximum distance is 1 here) - # don't normalize b_masked, otherwise the following threshold will never: + # don't normalize b_masked, otherwise the following threshold will never work: if b_masked.sum() <= 0.05: distance = 1 else: @@ -1108,6 +1113,8 @@ def generate_stable_topics(self, min_cores=None): for i in results: # TODO don't even have nan values, have {}/empty-sets instead if not isinstance(results[i]["parent_labels"], set): + # TODO remove that: + print('TODO: nan found in parent_labels:', results[i]) results[i]["parent_labels"] = set() results[i]["valid_parents"] = set() else: @@ -1202,49 +1209,10 @@ def id2word(self): return self.gensim_kw_args["id2word"] class StableTopicHelpers(): - """since there are so many helper functions involved, + """since there are so many helper functions involved + (EDIT: there were more of them before I improved things), a class is to be prefered over functions in functions - to be able to run unit tests on them individually""" - - @staticmethod - def remove_label_from_colum(label, valid_parent_labels): - """ - Parameters - ------- - valid_parent_labels : set - label : number - - Example - ------- - - label = 0 - - valid_parent_labels = {0, 1} - - result: {1} - will overwrite valid_parent_labels - - """ - try: - valid_parent_labels.remove(label) - except AttributeError: - # parent_labes is NaN - pass - except KeyError: - # Key does not exist - pass - - @staticmethod - def remove_from_all_sets(label, grouped_parent_labels): - """ - Example - ------- - - label = 2 - - grouped_parent_labels = [{1, 2}, {1, 2, 3}] - - result: [{1}, {1, 3}] - will overwrite grouped_parent_labels - - """ - for valid_parent_labels in grouped_parent_labels: - StableTopicHelpers.remove_label_from_colum(label, valid_parent_labels) - # valid_parent_labels.remove(label) + to be able to run unit tests on them individually (TODO)""" @staticmethod def len_of_parent_labels(parent_label): @@ -1252,6 +1220,9 @@ def len_of_parent_labels(parent_label): """ if not isinstance(parent_label, set): + # TODO make sure this holds always true. either an empty set + # or a populated set, but no weird stuff pls + print('TODO: parent_label should be a set! 
but is', parent_label) return 0 else: return len(parent_label) @@ -1275,7 +1246,6 @@ def validate_core(core): True if that is the case, False otherwise """ - # print('validate_core', core) if not core["is_core"]: return False else: From 100bbf0cbef105e22b1851a3591e26ca9ffed117 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 7 Apr 2019 17:00:08 +0200 Subject: [PATCH 006/166] pandas -> numpy: CBDBSCAN --- gensim/models/ensemblelda.py | 176 +++++++++++++---------------------- 1 file changed, 64 insertions(+), 112 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index e6b70804b6..116d9d8070 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -83,13 +83,11 @@ import logging import os from multiprocessing import Process, Pipe, ProcessError -import copy try: import cPickle as _pickle except ImportError: import pickle as _pickle import numpy as np -import pandas as pd from scipy.spatial.distance import cosine from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel @@ -360,7 +358,7 @@ def add_model(self, target, num_new_models=None): In order to generate new stable topics afterwards, use self.generate_asymmetric_distance_matrix() self.recluster() - + The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter of the ensemble, that means the number @@ -453,8 +451,8 @@ def add_model(self, target, num_new_models=None): else: self.num_models += num_new_models - else: # memory unfriendly ensembles - + else: # memory unfriendly ensembles + ttda = [] # 1. ttda array @@ -483,7 +481,7 @@ def add_model(self, target, num_new_models=None): self.num_models = len(self.tms) logger.info("Ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) - + if self.ttda.shape[1] != ttda.shape[1]: raise ValueError(("target ttda dimensions do not match. Topics must be {} but was" "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1])) @@ -750,7 +748,7 @@ def generate_topic_models(self, num_models, random_states=None): # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas self.ttda = np.concatenate([self.ttda, tm.get_topics()]) - + # only saves the model if it is not "memory friendly" if not self.memory_friendly_ttda: self.tms += [tm] @@ -793,7 +791,8 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # singlecore: if workers is not None and workers <= 1: logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) - self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, ttda2=self.ttda, + self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, + ttda2=self.ttda, threshold=threshold, start_index=0, method=method) @@ -914,7 +913,7 @@ def mass_masking(a): largest_mass = sorted_a.cumsum() < threshold smallest_valid = sorted_a[largest_mass][-1] return a >= smallest_valid - + def rank_masking(a): """faster masking method. 
returns a new binary mask""" return a > np.sort(a)[::-1][int(len(a) * threshold)] @@ -1020,23 +1019,18 @@ def generate_stable_topics(self, min_cores=None): min_cores = min(3, max(1, int(self.num_models / 4 + 1))) # counts how many different labels a core has as parents - results = self.cluster_model.results.to_dict('index') + results = self.cluster_model.results for topic in results.values(): if topic["is_core"]: - topic["amount_parent_labels"] = StableTopicHelpers.len_of_parent_labels(topic["parent_labels"]) - - # TODO write this back to self.cluster_model.results - # it just adds another column, so that's safe - # but atm cluster_model.results is a dataframe so first I need to - # remove pandas from the cluster_model + topic["amount_parent_labels"] = len(topic["parent_labels"]) # first, group all the learned cores based on the label, # which was assigned in the cluster_model grouped_by_labels = {} for topic in results.values(): if topic["is_core"]: - label = topic["labels"] # "labels", but it actually means just a single number - if not label in grouped_by_labels: + label = topic["label"] + if label not in grouped_by_labels: grouped_by_labels[label] = [] grouped_by_labels[label].append(topic) @@ -1045,7 +1039,7 @@ def generate_stable_topics(self, min_cores=None): sorted_clusters = [] for label, group in grouped_by_labels.items(): amount_parent_labels = 0 - parent_labels = [] # will be a list of sets + parent_labels = [] # will be a list of sets for topic in group: amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels) parent_labels.append(topic["parent_labels"]) @@ -1053,13 +1047,19 @@ def generate_stable_topics(self, min_cores=None): # - removing nan from parent_labels sorted_clusters.append({ "amount_parent_labels": amount_parent_labels, - "parent_labels": [x for x in parent_labels if isinstance(x, set)], + "parent_labels": [x for x in parent_labels if len(x) > 0], "is_valid": np.nan, "label": label }) # start with the most significant core sorted_clusters = sorted(sorted_clusters, key=lambda cluster: cluster["amount_parent_labels"], reverse=True) + def remove_from_all_sets(label): + # removes a label from every set in parent_labels for each core + for a in sorted_clusters: + if label in a["parent_labels"]: + a["parent_labels"].remove(label) + # APPLYING THE RULES 1 to 3 # iterate over the cluster labels, see which clusters/labels # are valid to cause the creation of a stable topic @@ -1071,20 +1071,14 @@ def generate_stable_topics(self, min_cores=None): if cluster["amount_parent_labels"] == 0: # label is NOT VALID cluster["is_valid"] = False - # removes a label from every set in parent_labels for each core - for a in sorted_clusters: - if label in a["parent_labels"]: - a["parent_labels"].remove(label) + remove_from_all_sets(label) # 2. rule - remove if it has less than min_cores as parents # (parents are always also cores) if len(cluster["parent_labels"]) < min_cores: cluster["is_valid"] = False - # removes a label from every set in parent_labels for each core - for a in sorted_clusters: - if label in a["parent_labels"]: - a["parent_labels"].remove(label) - + remove_from_all_sets(label) + # 3. 
checking for "easy_valid"s # checks if the core has at least min_cores of cores with the only label as itself if cluster["amount_parent_labels"] >= 1: @@ -1094,7 +1088,7 @@ def generate_stable_topics(self, min_cores=None): # Reaplying the rule 3 # this happens when we have a close relationship among 2 or more clusters for cluster in [cluster for cluster in sorted_clusters if np.isnan(cluster["is_valid"])]: - + # this checks for parent_labels, which are also modified in this function # hence order will influence the result. it starts with the most significant # label (hence "sorted_clusters") @@ -1111,23 +1105,27 @@ def generate_stable_topics(self, min_cores=None): valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]]) for i in results: - # TODO don't even have nan values, have {}/empty-sets instead - if not isinstance(results[i]["parent_labels"], set): - # TODO remove that: - print('TODO: nan found in parent_labels:', results[i]) - results[i]["parent_labels"] = set() - results[i]["valid_parents"] = set() + results[i]["valid_parents"] = {label for label in results[i]["parent_labels"] if label in valid_labels} + + def validate_core(core): + """Core is a dict of {is_core, valid_parents, labels} among others. + If not a core returns False. Only cores with the valid_parents as + its own label can be a valid core. Returns True if that is the case, + False otherwise""" + if not core["is_core"]: + return False else: - results[i]["valid_parents"] = {label for label in results[i]["parent_labels"] if label in valid_labels} + ret = core["valid_parents"] == {core["label"]} + return ret # keeping only VALID cores - valid_core_mask = np.vectorize(StableTopicHelpers.validate_core)([results[i] for i in results]) + valid_core_mask = np.vectorize(validate_core)(list(results.values())) valid_topics = self.ttda[valid_core_mask] - topic_labels = np.array([results[i]['labels'] for i in results])[valid_core_mask] + topic_labels = np.array([results[i]["label"] for i in results])[valid_core_mask] unique_labels = np.unique(topic_labels) - num_topics = len(unique_labels) - stable_topics = np.empty((num_topics, len(self.id2word)), dtype=np.float32) + num_stable_topics = len(unique_labels) + stable_topics = np.empty((num_stable_topics, len(self.id2word)), dtype=np.float32) # for each cluster for l, label in enumerate(unique_labels): @@ -1208,49 +1206,6 @@ def print_topics(self, *posargs, **kwArgs): def id2word(self): return self.gensim_kw_args["id2word"] -class StableTopicHelpers(): - """since there are so many helper functions involved - (EDIT: there were more of them before I improved things), - a class is to be prefered over functions in functions - to be able to run unit tests on them individually (TODO)""" - - @staticmethod - def len_of_parent_labels(parent_label): - """ - - """ - if not isinstance(parent_label, set): - # TODO make sure this holds always true. either an empty set - # or a populated set, but no weird stuff pls - print('TODO: parent_label should be a set! but is', parent_label) - return 0 - else: - return len(parent_label) - - @staticmethod - def keep_only_valid_labels(parent_labels, valid_labels): - if not isinstance(parent_labels, set): - return set() - else: - return {label for label in parent_labels if label in valid_labels} - - @staticmethod - def validate_core(core): - """ - - Core is a dict of {is_core, valid_parents, labels} among others - - If not a core returns False - - Only cores with the valid_parents as its own label can be a valid core. 
Returns - True if that is the case, False otherwise - - """ - if not core["is_core"]: - return False - else: - ret = core["valid_parents"] == {core["labels"]} - return ret class CBDBSCAN(): @@ -1262,39 +1217,40 @@ def __init__(self, eps=0.1, min_samples=4): def fit(self, amatrix): self.next_label = 0 - self.results = pd.DataFrame(index=range(len(amatrix)), - columns=["labels", "is_core", - "parent_ids", "parent_labels", "num_samples"]) - self.results.is_core = False - # otherwise sets cannot be written into the dataframe (will be converted to integers). - # The dtype should be object for everything to fix this. - self.results = self.results.astype(dtype="object") + results = {index: { + "is_core": False, + "parent_labels": set(), + "parent_ids": set(), + "num_samples": 0, + "label": None + } for index in range(len(amatrix))} tmp_amatrix = amatrix.copy() # to avoid problem about comparing the topic with itself np.fill_diagonal(tmp_amatrix, 1) - min_distance_per_topic = pd.Series(tmp_amatrix.min(axis=1)) - - min_distance_per_topic_sorted = min_distance_per_topic.sort_values() + min_distance_per_topic = [(index, distance) for index, distance in enumerate(tmp_amatrix.min(axis=1))] + min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x[1]) + ordered_min_similarity = [index for index, distance in min_distance_per_topic_sorted] - ordered_min_similarity = list(min_distance_per_topic_sorted.index.values) + num_topics = len(amatrix) def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): # count how many neighbors - neighbors = pd.Series(tmp_amatrix[topic_index] < self.eps) - num_samples = sum(neighbors) + # check which indices (arange 0 to num_topics) is closer than eps + neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps] + num_samples = len(neighbors) # If the number of neighbors of a topic is large enough, # it is considered a core. # This also takes neighbors that already are identified as core in count. if num_samples >= self.min_samples: # This topic is a core! 
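                # (A tiny made-up example of the neighborhood test above: with
                # eps = 0.1 and tmp_amatrix[topic_index] = [1.0, 0.05, 0.3, 0.08],
                # the comparison against eps yields neighbors = [1, 3], so
                # num_samples = 2 and this branch is taken only if min_samples <= 2.)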
- self.results.loc[topic_index, "is_core"] = True - self.results.loc[topic_index, "num_samples"] = num_samples + results[topic_index]["is_core"] = True + results[topic_index]["num_samples"] = num_samples # if current_label is none, then this is the first core # of a new cluster (hence next_label) @@ -1313,7 +1269,7 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors # parent neighbors is the list of neighbors of parent_id # (the topic_index that called this function recursively) - all_members_of_current_cluster = list(parent_neighbors[parent_neighbors].index.values) + all_members_of_current_cluster = list(parent_neighbors) all_members_of_current_cluster.append(parent_id) # look if 25% of the members of the current cluster are also @@ -1332,11 +1288,11 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors current_label = self.next_label self.next_label += 1 - self.results.loc[topic_index, "labels"] = current_label + results[topic_index]["label"] = current_label - for neighbor in neighbors[neighbors].index: + for neighbor in neighbors: - if self.results.loc[neighbor, "labels"] is np.nan: + if results[neighbor]["label"] is None: ordered_min_similarity.remove(neighbor) # try to extend the cluster into the direction # of the neighbor @@ -1344,23 +1300,19 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors current_label=current_label, parent_neighbors=neighbors) - if isinstance(self.results.loc[neighbor, "parent_ids"], set): - self.results.loc[neighbor, "parent_ids"].add(topic_index) - self.results.loc[neighbor, "parent_labels"].add(current_label) - else: - self.results.loc[neighbor, "parent_ids"] = set([topic_index]) - self.results.loc[neighbor, "parent_labels"] = set([current_label]) + results[neighbor]["parent_ids"].add(topic_index) + results[neighbor]["parent_labels"].add(current_label) else: # this topic is not a core! 
if current_label is None: - self.results.loc[topic_index, "labels"] = -1 + results[topic_index]["label"] = -1 else: - self.results.loc[topic_index, "labels"] = current_label + results[topic_index]["label"] = current_label # elements are going to be removed from that array in scan_topic, do until it is empty while len(ordered_min_similarity) != 0: next_topic_index = ordered_min_similarity.pop(0) scan_topic(next_topic_index) - self.labels_ = self.results["labels"] + self.results = results From aff3287c786ee29947d31754d207250e4d9f59a2 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 7 Apr 2019 17:32:23 +0200 Subject: [PATCH 007/166] fixes for automated checks --- gensim/corpora/opinosiscorpus.py | 11 +++++------ gensim/models/ensemblelda.py | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index eee2229849..446c5e7804 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -4,11 +4,10 @@ # Author: Tobias B import os -from pathlib import Path import re from gensim.corpora import Dictionary -from nltk.stem import PorterStemmer -from nltk.corpus import stopwords +from gensim.parsing.porter import PorterStemmer +from gensim.parsing.preprocessing import STOPWORDS class OpinosisCorpus(): @@ -49,7 +48,7 @@ def __init__(self, path): print("year:\t\t2010") print("organization:\tAssociation for Computational Linguistics") - path = Path(path).joinpath("summaries-gold") + path = os.path.join(path, "summaries-gold") dictionary = Dictionary() corpus = [] stemmer = PorterStemmer() @@ -76,8 +75,8 @@ def __init__(self, path): # overwrite dictionary, add corpus to test or train afterwards # the following function takes an array of documents, so wrap it in square braces processed = [ - stemmer.stem(token) for token in re.findall('\w+', doc.lower()) - if token not in stopwords.words('english') + stemmer.stem(token) for token in re.findall(r'\w+', doc.lower()) + if token not in STOPWORDS ] dictionary.add_documents([processed]) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 116d9d8070..e2a439f077 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -17,7 +17,7 @@ >>> from gensim.test.utils import common_texts >>> from gensim.corpora.dictionary import Dictionary - >>> from gensim.models import EnsembleLda, LdaModel + >>> from gensim.models import EnsembleLda >>> >>> # Create a corpus from a list of texts >>> common_dictionary = Dictionary(common_texts) @@ -59,6 +59,7 @@ .. sourcecode:: pycon + >>> from gensim.models import LdaModel >>> elda.add_model(LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)) >>> elda.recluster() >>> vector = elda[unseen_doc] From a545ddf1a12aca42e346e55a8ac8c0b13a17ddd6 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 8 Apr 2019 21:24:23 +0200 Subject: [PATCH 008/166] improvements on logs, comments and variable naming. Changed save function to simply pickling the whole thing. 
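A minimal round-trip sketch of the pickle-based persistence introduced here (the file name is made up):

.. sourcecode:: pycon

    >>> elda.save("elda.pickle")
    >>> loaded = EnsembleLda.load("elda.pickle")
    >>> assert (loaded.get_topics() == elda.get_topics()).all()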
--- gensim/models/ensemblelda.py | 198 ++++++++++++++--------------------- 1 file changed, 78 insertions(+), 120 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index e2a439f077..b8cc0b9e9c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -229,6 +229,7 @@ def __init__(self, topic_model_kind="lda", num_models=3, if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return + logger.info("Generating {} topic models...".format(num_models)) # training if ensemble_workers > 1: # multiprocessed @@ -275,29 +276,29 @@ def generate_gensim_representation(self): logger.info("Generating classic gensim model representation based on results from the ensemble") - numberOfWords = self.sstats_sum - # if numberOfWords should be wrong for some fantastic funny reason + number_of_words = self.sstats_sum + # if number_of_words should be wrong for some fantastic funny reason # that makes you want to peel your skin off, recreate it (takes a while): - if numberOfWords == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None: + if number_of_words == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None: for document in self.gensim_kw_args["corpus"]: for token in document: - numberOfWords += token[1] - self.sstats_sum = numberOfWords - assert numberOfWords != 0 + number_of_words += token[1] + self.sstats_sum = number_of_words + assert number_of_words != 0 - stableTopics = self.get_topics() + stable_topics = self.get_topics() - nStableTopics = len(stableTopics) + num_stable_topics = len(stable_topics) - if nStableTopics == 0: - logger.warning("The model did not detect any stable topic. You can try to adjust epsilon: " - "generate_topic_clusters(eps); generate_stable_topics()") + if num_stable_topics == 0: + logger.error("The model did not detect any stable topic! You can try to adjust epsilon: " + "recluster(eps=...)") return # create a new gensim model params = self.gensim_kw_args.copy() params["eta"] = self.eta - params["num_topics"] = nStableTopics + params["num_topics"] = num_stable_topics # adjust params in a way that no training happens params["iterations"] = 0 # no training params["passes"] = 0 # no training @@ -309,13 +310,13 @@ def generate_gensim_representation(self): # the following is important for the denormalization # to generate the proper sstats for the new gensim model: - # transform to dimensionality of stableTopics. axis=1 is summed + # transform to dimensionality of stable_topics. axis=1 is summed eta_sum = 0 if int == type(eta) or float == type(eta): - eta_sum = [eta * len(stableTopics[0])] * nStableTopics + eta_sum = [eta * len(stable_topics[0])] * num_stable_topics else: if len(eta.shape) == 1: # [e1, e2, e3] - eta_sum = [[eta.sum()]] * nStableTopics + eta_sum = [[eta.sum()]] * num_stable_topics if len(eta.shape) > 1: # [[e11, e12, ...], [e21, e22, ...], ...] eta_sum = np.array(eta.sum(axis=1)[:, None]) @@ -328,18 +329,11 @@ def generate_gensim_representation(self): # right sstats, so that get_topics() will return the stable topics no # matter eta. - normalization_factor = np.array([[numberOfWords / nStableTopics]] * nStableTopics) + eta_sum - - if stableTopics.shape[1] != len(eta): - raise ValueError("If load() was used: was the correct dictionary used when loading the model? 
" - "because stableTopics.shape[0] {} != len(eta) {}".format(stableTopics.shape[1], len(eta))) + normalization_factor = np.array([[number_of_words / num_stable_topics]] * num_stable_topics) + eta_sum - sstats = stableTopics * normalization_factor + sstats = stable_topics * normalization_factor sstats -= eta - # Note that this will cause some stuff to fail if nStableTopics is 1 - # if nStableTopics is 1, the result will be (close to) uniform anyway - # inserting the new sstats classic_model_representation.state.sstats = sstats.astype("float32") # fix expElogbeta. classic_model_representation.sync_state() @@ -458,9 +452,9 @@ def add_model(self, target, num_new_models=None): # 1. ttda array if isinstance(target.dtype.type(), (np.number, float)): - raise ValueError('ttda arrays cannot be added to ensembles that are not memory friendly, ' - 'you can call convert_to_memory_friendly to discard the stored gensim models' - 'and only keep the ttdas') + raise ValueError('ttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, ' + 'you can call convert_to_memory_friendly, but it will discard the stored gensim' + 'models and only keep the relevant topic term distributions from them.') # 2. list of ensembles elif isinstance(target[0], type(self)): @@ -479,6 +473,9 @@ def add_model(self, target, num_new_models=None): # in this case, len(self.tms) should # always match self.num_models + if num_new_models is not None and num_new_models + self.num_models != len(self.tms): + logger.info('num_new_models will be ignored. num_models should match the number of ' + 'stored models for a memory unfriendly ensemble') self.num_models = len(self.tms) logger.info("Ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) @@ -501,37 +498,10 @@ def save(self, fname): """ - # parameter descriptions are similar to [1] in - # order to make the api less confusing - - gensim_kw_args_persist = self.gensim_kw_args.copy() - - payload = { - "ensemble_kw_args": { - "topic_model_kind": self.topic_model_kind, - "memory_friendly_ttda": self.memory_friendly_ttda, - "num_models": self.num_models - }, - "gensim_kw_args": gensim_kw_args_persist, - "ttda": self.ttda, - "amatrix": self.asymmetric_distance_matrix, - "stable_topics": self.stable_topics, - "cluster_model.results": self.cluster_model.results, - "sstats_sum": self.sstats_sum, - "id2word": self.id2word - } - - if not self.memory_friendly_ttda: - payload["topicModels"] = self.tms - - # Version of the save/load api, to eventually make load - # backwards compatible if the save-function changes. - payload["version"] = "1.0" + logger.info("saving %s object to %d", self.__class__.__name__, fname) with open(fname, 'wb') as f: - _pickle.dump(payload, f) - - logger.info("saved %s object", self.__class__.__name__) + _pickle.dump(self, f) @staticmethod def load(fname): @@ -552,45 +522,7 @@ def load(fname): logger.info("loading %s object from %s", EnsembleLda.__name__, fname) with open(fname, 'rb') as f: - data = _pickle.load(f) - - # update() on the dict can be safely used, because they don't have - # keywords in common that overwrite each other - kwArgs = data["ensemble_kw_args"] - kwArgs.update(data["gensim_kw_args"]) - - kwArgs["id2word"] = data["id2word"] - # prevent training - kwArgs["iterations"] = 0 - - # now reconstruct an ensemble LDA object based on the data: - eLDA = EnsembleLda(**kwArgs) - # since no training will take place, num_models will be set to 0. 
restore: - eLDA.num_models = kwArgs["num_models"] - - # some stuff that was not created automatically, because the "empty" eLDA object did not train - eLDA.ttda = data["ttda"] - eLDA.asymmetric_distance_matrix = data["amatrix"] - eLDA.sstats_sum = data["sstats_sum"] - - # depends on memory_friendly_ttda in the save function: - if "topicModels" in data: - eLDA.tms = [] - for i in range(len(data["topicModels"])): - eLDA.tms += [data["topicModels"][i]] - - # As long as fit is not called on cluster_model, no results will be generated, - # so the CBDBSCAN constructor is safe to call without consuming cpu time. - # No need to provide the original params from the constructor, the result are - # going to be written into the model from the pickle. - eLDA.cluster_model = CBDBSCAN() - eLDA.cluster_model.results = data["cluster_model.results"] - eLDA.stable_topics = data["stable_topics"] - - eLDA.generate_gensim_representation() - - # message copied from [1] - logger.info("loaded %s", fname) + eLDA = _pickle.load(f) return eLDA @@ -629,7 +561,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): def worker(id, num_models, random_states, pipe): # since models cannot be piped to the parent because messages need to be pickled for that: # memory_friendly_ttda needs to be true, which will not store ensemble submodels - logger.info("Spawned worker to generate {} topic models...".format(num_models)) + logger.info("Spawned worker to generate {} topic models".format(num_models)) self.generate_topic_models(num_models, random_states=random_states) # send the ttda that is in the child/workers version of the memory into the pipe # available, after generate_topic_models has been called in the worker @@ -734,8 +666,6 @@ def generate_topic_models(self, num_models, random_states=None): assert len(random_states) == num_models - logger.info("Generating {} topic models...".format(num_models)) - kwArgs = self.gensim_kw_args.copy() tm = None # remember one of the topic models from the following @@ -744,6 +674,7 @@ def generate_topic_models(self, num_models, random_states=None): for i in range(num_models): kwArgs["random_state"] = random_states[i] + tm = GENSIM_MODEL[self.topic_model_kind](**kwArgs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is @@ -765,6 +696,9 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering. + Afterwards, the model needs to be reclustered for + this generated matrix to take effect. + Parameters ---------- threshold : float, optional @@ -789,9 +723,10 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= if threshold is None: threshold = {"mass": 0.95, "rank": 0.11}[method] + logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) + # singlecore: if workers is not None and workers <= 1: - logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, @@ -809,7 +744,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # all other nodes from a chunk of nodes. 
# https://stackoverflow.com/a/1743350 def worker(worker_id, ttdas_sent, n_ttdas, pipe): - logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix...".format(n_ttdas)) + logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, @@ -921,6 +856,9 @@ def rank_masking(a): create_mask = {"mass": mass_masking, "rank": rank_masking}[method] + # some help to find a better threshold by useful log messages + avg_mask_size = 0 + # initialize the distance matrix. ndarray is faster than zeros distances = np.ndarray((len(ttda1), len(ttda2))) @@ -932,6 +870,8 @@ def rank_masking(a): mask = create_mask(a) a_masked = a[mask] + avg_mask_size += mask.sum() + # now look at every possible pair for topic a: for j in range(len(ttda2)): @@ -947,7 +887,7 @@ def rank_masking(a): # then check how much of mask_a is covered by mask_b. By doing so, the check # for <= 0.05 falls away which 1. seems rather like a magic number guess # and 2. jumps the distance to 1 suddenly when this threshold is not reached - # Do experiments for this. + # TODO Do experiments for this to see if it makes a lot of difference distance = 0 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 @@ -963,6 +903,17 @@ def rank_masking(a): distances[i][j] = distance + avg_mask_size = avg_mask_size / ttda1.shape[0] / ttda1.shape[1] + percent = round(100 * avg_mask_size, 1) + if avg_mask_size > 0.75: + # if the masks covered more than 75% of tokens on average, + # print a warning. info otherwise. + # The default mass setting of 0.95 makes uniform true masks on the opinosis corpus. 
+ logger.warning('The given threshold of {} covered on average ' + '{}% of tokens, which might be too much!'.format(threshold, percent)) + else: + logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) + return distances def generate_topic_clusters(self, eps=0.1, min_samples=None): @@ -1019,17 +970,18 @@ def generate_stable_topics(self, min_cores=None): # min_cores is a number between 1 and 3, depending on the number of models min_cores = min(3, max(1, int(self.num_models / 4 + 1))) - # counts how many different labels a core has as parents results = self.cluster_model.results - for topic in results.values(): - if topic["is_core"]: - topic["amount_parent_labels"] = len(topic["parent_labels"]) # first, group all the learned cores based on the label, # which was assigned in the cluster_model grouped_by_labels = {} for topic in results.values(): if topic["is_core"]: + topic = topic.copy() + + # counts how many different labels a core has as parents + topic["amount_parent_labels"] = len(topic["parent_labels"]) + label = topic["label"] if label not in grouped_by_labels: grouped_by_labels[label] = [] @@ -1045,7 +997,7 @@ def generate_stable_topics(self, min_cores=None): amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels) parent_labels.append(topic["parent_labels"]) # - here, nan means "not yet evaluated" - # - removing nan from parent_labels + # - removing empty sets from parent_labels sorted_clusters.append({ "amount_parent_labels": amount_parent_labels, "parent_labels": [x for x in parent_labels if len(x) > 0], @@ -1056,7 +1008,8 @@ def generate_stable_topics(self, min_cores=None): sorted_clusters = sorted(sorted_clusters, key=lambda cluster: cluster["amount_parent_labels"], reverse=True) def remove_from_all_sets(label): - # removes a label from every set in parent_labels for each core + """removes a label from every set in "parent_labels" for + each core in sorted_clusters.""" for a in sorted_clusters: if label in a["parent_labels"]: a["parent_labels"].remove(label) @@ -1167,6 +1120,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # if new models were added to the ensemble, the distance # matrix needs to be generated again if self.asymmetric_distance_matrix_outdated: + logger.info("asymmetric distance matrix is outdated due to add_model") self.generate_asymmetric_distance_matrix() self.generate_topic_clusters(eps, min_samples) @@ -1179,28 +1133,33 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): def get_topics(self): return self.stable_topics + def has_gensim_representation(self): + """checks if stable topics area vailable and if the internal + gensim representation exists. 
If not, raises errors""" + if self.classic_model_representation is None: + if len(self.stable_topics) == 0: + raise ValueError("no stable topic was detected!") + else: + raise ValueError("use generate_gensim_representation() first") + def __getitem__(self, i): """see https://radimrehurek.com/gensim/models/ldamodel.html""" - if self.classic_model_representation is None: - raise ValueError("use generate_gensim_representation() first") + self.has_gensim_representation() return self.classic_model_representation[i] def inference(self, *posargs, **kwArgs): """see https://radimrehurek.com/gensim/models/ldamodel.html""" - if self.classic_model_representation is None: - raise ValueError("use generate_gensim_representation() first") + self.has_gensim_representation() return self.classic_model_representation.inference(*posargs, **kwArgs) def log_perplexity(self, *posargs, **kwArgs): """see https://radimrehurek.com/gensim/models/ldamodel.html""" - if self.classic_model_representation is None: - raise ValueError("use generate_gensim_representation() first") + self.has_gensim_representation() return self.classic_model_representation.log_perplexity(*posargs, **kwArgs) def print_topics(self, *posargs, **kwArgs): """see https://radimrehurek.com/gensim/models/ldamodel.html""" - if self.classic_model_representation is None: - raise ValueError("use generate_gensim_representation() first") + self.has_gensim_representation() return self.classic_model_representation.print_topics(*posargs, **kwArgs) @property @@ -1239,7 +1198,7 @@ def fit(self, amatrix): num_topics = len(amatrix) def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): - + # count how many neighbors # check which indices (arange 0 to num_topics) is closer than eps neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps] @@ -1254,7 +1213,7 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors results[topic_index]["num_samples"] = num_samples # if current_label is none, then this is the first core - # of a new cluster (hence next_label) + # of a new cluster (hence next_label is used) if current_label is None: # next_label is initialized with 0 in # fit() for the first cluster @@ -1262,9 +1221,8 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors self.next_label += 1 else: - # In case the core has a parent, - # that means it has a label, namely the - # label from the parent. + # In case the core has a parent, that means + # it has a label (= the parents label). 
                # Check the distance between the
                # new core and the parent

From d1a6854a413a92e50a805998cf43e6be03415345 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 9 Apr 2019 00:17:09 +0200
Subject: [PATCH 009/166] minor fix in log message format

---
 gensim/models/ensemblelda.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index b8cc0b9e9c..7f3f071e77 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -498,7 +498,7 @@ def save(self, fname):
 
         """
 
-        logger.info("saving %s object to %d", self.__class__.__name__, fname)
+        logger.info("saving %s object to %s", self.__class__.__name__, fname)
 
         with open(fname, 'wb') as f:
             _pickle.dump(self, f)
@@ -883,12 +883,6 @@ def rank_masking(a):
                 # now mask b based on a, which will force the shape of a onto b
                 b_masked = ttda2[j][mask]
 
-                # TODO instaed of checking <= 0.05 and cosine, rather make create_mask(b),
-                # then check how much of mask_a is covered by mask_b. By doing so, the check
-                # for <= 0.05 falls away which 1. seems rather like a magic number guess
-                # and 2. jumps the distance to 1 suddenly when this threshold is not reached
-                # TODO Do experiments for this to see if it makes a lot of difference
-
                 distance = 0
                 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1
                 # (not 2 because that correspondsto negative values. The maximum distance is 1 here)
@@ -978,7 +972,7 @@ def generate_stable_topics(self, min_cores=None):
         for topic in results.values():
             if topic["is_core"]:
                 topic = topic.copy()
-
+
                 # counts how many different labels a core has as parents
                 topic["amount_parent_labels"] = len(topic["parent_labels"])
@@ -1198,7 +1192,7 @@ def fit(self, amatrix):
         num_topics = len(amatrix)
 
         def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None):
-
+
             # count how many neighbors
             # check which indices (arange 0 to num_topics) is closer than eps
             neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps]

From 3650895d825765d67aa8a20c0ba3cabe56901fdd Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 9 Apr 2019 21:13:43 +0200
Subject: [PATCH 010/166] added tests

---
 gensim/test/test_data/ensemblelda |  Bin 0 -> 9923 bytes
 gensim/test/test_ensemblelda.py   | 318 ++++++++++++++++++++++++++++++
 2 files changed, 318 insertions(+)
 create mode 100644 gensim/test/test_data/ensemblelda
 create mode 100644 gensim/test/test_ensemblelda.py

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
new file mode 100644
index 0000000000000000000000000000000000000000..905aa8bf8a9ab4e3f8c2fc54758af605b910d30f
GIT binary patch
literal 9923
[base85-encoded binary data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
new file mode 100644
index 0000000000..97e3b31bf0
+
+
+import logging
+
+# TODO remove this before pushing tests:
+import sys
+sys.path.insert(0, '/run/media/mango/Data/Code/gensim/')
+
+import pandas as pd
+import numpy as np
+from gensim.models import EnsembleLda
+from pathlib import Path
+import unittest
+from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary
+ +num_topics = 2 +num_models = 4 + + +class TestModel(unittest.TestCase): + def setUp(self): + # same configuration for each model to make sure + # the topics are equal + passes = 50 + random_state = 0 + + self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state) + + self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + memory_friendly_ttda=False) + + def check_ttda(self, ensemble): + """tests the integrity of the ttda of any ensemble""" + self.assertGreater(len(ensemble.ttda), 0) + a = ensemble.ttda.sum(axis=1) + b = np.ones(len(ensemble.ttda)).astype(np.float32) + np.testing.assert_allclose(a, b, rtol=1e-04) + + def test_eLDA(self): + # given that the random_state doesn't change, it should + # always be 4 detected topics in this setup. + self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) + self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) + self.check_ttda(self.eLDA) + + # reclustering shouldn't change anything without + # added models or different parameters + self.eLDA.recluster() + + self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) + self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) + self.check_ttda(self.eLDA) + + reference = EnsembleLda.load(datapath('ensemblelda')) + + self.assertEqual(self.eLDA.cluster_model.results, reference.cluster_model.results) + # sorting will not be the same in case the sorting key is equal + # across multiple clusters. hence use assertCountEqual + self.assertCountEqual(self.eLDA.sorted_clusters, reference.sorted_clusters) + np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, + reference.asymmetric_distance_matrix) + + def test_memory_unfriendly(self): + # at this point, self.eLDA_mu and self.eLDA are already trained + # in the setUpClass function + # both should be 100% similar (but floats cannot + # be compared that easily, so check for threshold) + self.assertEqual(len(self.eLDA_mu.tms), num_models) + np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) + self.check_ttda(self.eLDA_mu) + + def test_generate_gensim_rep(self): + gensimModel = self.eLDA.generate_gensim_representation() + pd.DataFrame(gensimModel.get_topics()) # should return the stable topics from the ensemble + topics = gensimModel.get_topics() + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + + def assert_cluster_results_equal(self, a, b): + self.assertTrue(isinstance(a, dict)) + self.assertTrue(isinstance(b, dict)) + np.testing.assert_array_equal([row["label"] for row in a.values()], + [row["label"] for row in b.values()]) + np.testing.assert_array_equal([row["is_core"] for row in a.values()], + [row["is_core"] for row in b.values()]) + np.testing.assert_array_equal([row["num_samples"] for row in a.values()], + [row["num_samples"] for row in b.values()]) + + def test_persisting(self): + fname = get_tmpfile('gensim_models_ensemblelda') + self.eLDA.save(fname) + loaded_eLDA = EnsembleLda.load(fname) + # using Path objects instead of strings + self.eLDA.save(Path(fname)) + loaded_eLDA = EnsembleLda.load(Path(fname)) + # storing the ensemble without 
memory_friendy_ttda + self.eLDA_mu.save(fname) + loaded_eLDA_mu = EnsembleLda.load(fname) + + # was it stored and loaded correctly? + # memory friendly + loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() + topics = loaded_eLDA_representation.get_topics() + ttda = loaded_eLDA.ttda + amatrix = loaded_eLDA.asymmetric_distance_matrix + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) + + a = self.eLDA.cluster_model.results + b = loaded_eLDA.cluster_model.results + + self.assert_cluster_results_equal(a, b) + + # memory unfriendly + loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() + topics = loaded_eLDA_mu_representation.get_topics() + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + + def test_multiprocessing(self): + # same configuration + param_ref = self.eLDA_mu.tms[0] + passes = param_ref.passes + iterations = param_ref.iterations + num_topics = param_ref.num_topics + num_models = self.eLDA.num_models + random_state = 0 + + # use 3 processes for the ensemble and the distance, + # so that the 4 models and 16 topics cannot be distributed + # to each worker evenly + workers = 3 + + # memory friendly. contains List of topic word distributions + eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_topics, passes=passes, num_models=num_models, iterations=iterations, + random_state=random_state, ensemble_workers=workers, distance_workers=workers) + + # memory unfriendly. contains List of models + eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_topics, passes=passes, num_models=num_models, iterations=iterations, + random_state=random_state, ensemble_workers=workers, distance_workers=workers, + memory_friendly_ttda=False) + + np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) + + def test_add_models(self): + # same configuration + num_models = self.eLDA.num_models + + # make sure countings and sizes after adding are correct + # create new models and add other models to them. + + # there are a ton of configurations for the first parameter possible, + # try them all + + # quickly train something that can be used for counting results + num_new_models = 3 + num_new_topics = 3 + + # 1. 
memory friendly + eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_kind='ldamulticore', + workers=3, ensemble_workers=2) + + # 1.1 ttda + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(self.eLDA.ttda) + self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix + + # 1.2 an ensemble + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(self.eLDA, 5) + self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 5) + + # 1.3 a list of ensembles + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + # it should be totally legit to add a memory unfriendly object to a memory friendly one + eLDA_base.add_model([self.eLDA, self.eLDA_mu]) + self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 2 * num_models) + + # 1.4 a single gensim model + model = self.eLDA.classic_model_representation + + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(model) + self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) + self.assertEqual(eLDA_base.num_models, b + 1) + + # 1.5 a list gensim models + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model([model, model]) + self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) + self.assertEqual(eLDA_base.num_models, b + 2) + + self.check_ttda(eLDA_base) + + # 2. memory unfriendly + eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_kind='ldamulticore', + workers=3, ensemble_workers=2, memory_friendly_ttda=False) + + # 2.1 a single ensemble + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu) + self.assertEqual(len(eLDA_base_mu.tms), a + num_models) + self.assertEqual(eLDA_base_mu.num_models, b + num_models) + + # 2.2 a list of ensembles + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) + self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) + self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) + + # 2.3 a single gensim model + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) + self.assertEqual(len(eLDA_base_mu.tms), a + 1) + self.assertEqual(eLDA_base_mu.num_models, b + 1) + + # 2.4 a list of gensim models + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu.tms) + self.assertEqual(len(eLDA_base_mu.tms), a + num_models) + self.assertEqual(eLDA_base_mu.num_models, b + num_models) + + # 2.5 topic term distributions should throw errors, because the + # actual models are needed for the memory unfriendly ensemble + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) + # remains unchanged + self.assertEqual(len(eLDA_base_mu.tms), a) + self.assertEqual(eLDA_base_mu.num_models, b) + + self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) + self.check_ttda(eLDA_base_mu) + + def test_recluster(self): + # 6.2: see if after adding a model, the model still makes sense + 
num_new_models = 3 + num_new_topics = 3 + + # train + new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_kind='ldamulticore', + distance_workers=4) + new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_kind='ldamulticore', + distance_workers=4, memory_friendly_ttda=False) + # both should be similar + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + # and every next step applied to both should result in similar results + + # 1. adding to ttda and tms + new_eLDA.add_model(self.eLDA) + new_eLDA_mu.add_model(self.eLDA_mu) + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) + self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) + self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) + self.check_ttda(new_eLDA) + self.check_ttda(new_eLDA_mu) + + # 2. distance matrix + new_eLDA.generate_asymmetric_distance_matrix() + new_eLDA_mu.generate_asymmetric_distance_matrix() + np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, + new_eLDA_mu.asymmetric_distance_matrix) + + # 3. CBDBSCAN results + new_eLDA.generate_topic_clusters() + new_eLDA_mu.generate_topic_clusters() + a = new_eLDA.cluster_model.results + b = new_eLDA_mu.cluster_model.results + self.assert_cluster_results_equal(a, b) + + # 4. finally, the stable topics + new_eLDA.generate_stable_topics() + new_eLDA_mu.generate_stable_topics() + np.testing.assert_allclose(new_eLDA.get_topics(), + new_eLDA_mu.get_topics()) + + new_eLDA.generate_gensim_representation() + new_eLDA_mu.generate_gensim_representation() + + # same random state, hence topics should be still similar + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) + unittest.main() From 00a06e96b9711a4ba8915a9338b7050b3304c87d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 9 Apr 2019 21:24:31 +0200 Subject: [PATCH 011/166] fixed test --- gensim/test/test_ensemblelda.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 97e3b31bf0..c6e12dbad0 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -5,11 +5,6 @@ import logging - -# TODO remove this before pushing tests: -import sys -sys.path.insert(0, '/run/media/mango/Data/Code/gensim/') - import pandas as pd import numpy as np from gensim.models import EnsembleLda @@ -19,13 +14,13 @@ num_topics = 2 num_models = 4 +passes = 50 class TestModel(unittest.TestCase): def setUp(self): # same configuration for each model to make sure # the topics are equal - passes = 50 random_state = 0 self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, @@ -67,6 +62,9 @@ def test_eLDA(self): np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, reference.asymmetric_distance_matrix) + 
np.testing.assert_allclose(self.eLDA.classic_model_representation.get_topics(), + self.eLDA.get_topics(), rtol=1e-05) + def test_memory_unfriendly(self): # at this point, self.eLDA_mu and self.eLDA are already trained # in the setUpClass function @@ -126,11 +124,6 @@ def test_persisting(self): def test_multiprocessing(self): # same configuration - param_ref = self.eLDA_mu.tms[0] - passes = param_ref.passes - iterations = param_ref.iterations - num_topics = param_ref.num_topics - num_models = self.eLDA.num_models random_state = 0 # use 3 processes for the ensemble and the distance, @@ -140,12 +133,12 @@ def test_multiprocessing(self): # memory friendly. contains List of topic word distributions eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_topics, passes=passes, num_models=num_models, iterations=iterations, + num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state, ensemble_workers=workers, distance_workers=workers) # memory unfriendly. contains List of models eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_topics, passes=passes, num_models=num_models, iterations=iterations, + num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state, ensemble_workers=workers, distance_workers=workers, memory_friendly_ttda=False) @@ -314,5 +307,5 @@ def test_recluster(self): if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From f5f1c9c15a1272d03d6160598d033ebf33b8e500 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 9 Apr 2019 21:34:43 +0200 Subject: [PATCH 012/166] removed some dead leftover pandas code from test --- gensim/test/test_ensemblelda.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index c6e12dbad0..b251fdd8d6 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -5,7 +5,6 @@ import logging -import pandas as pd import numpy as np from gensim.models import EnsembleLda from pathlib import Path @@ -77,7 +76,6 @@ def test_memory_unfriendly(self): def test_generate_gensim_rep(self): gensimModel = self.eLDA.generate_gensim_representation() - pd.DataFrame(gensimModel.get_topics()) # should return the stable topics from the ensemble topics = gensimModel.get_topics() np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) From c32ddad75e9d4dca604de92f94953a73f2f3af21 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 9 Apr 2019 22:01:21 +0200 Subject: [PATCH 013/166] removed pathlib from test --- gensim/test/test_ensemblelda.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index b251fdd8d6..c4ac16b454 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -7,7 +7,6 @@ import logging import numpy as np from gensim.models import EnsembleLda -from pathlib import Path import unittest from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary @@ -38,7 +37,7 @@ def check_ttda(self, ensemble): def test_eLDA(self): # given that the random_state doesn't change, it should - # always be 4 detected topics in this setup. + # always be 2 detected topics in this setup. 
self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) self.check_ttda(self.eLDA) @@ -93,9 +92,6 @@ def test_persisting(self): fname = get_tmpfile('gensim_models_ensemblelda') self.eLDA.save(fname) loaded_eLDA = EnsembleLda.load(fname) - # using Path objects instead of strings - self.eLDA.save(Path(fname)) - loaded_eLDA = EnsembleLda.load(Path(fname)) # storing the ensemble without memory_friendy_ttda self.eLDA_mu.save(fname) loaded_eLDA_mu = EnsembleLda.load(fname) @@ -125,7 +121,7 @@ def test_multiprocessing(self): random_state = 0 # use 3 processes for the ensemble and the distance, - # so that the 4 models and 16 topics cannot be distributed + # so that the 4 models and 8 topics cannot be distributed # to each worker evenly workers = 3 From dab067f3a2f52e4f294d05b8409133d707b7cb1b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 13 Apr 2019 00:49:42 +0200 Subject: [PATCH 014/166] tests work in python2 locally now --- gensim/models/ensemblelda.py | 30 +++++++++++++++--------------- gensim/test/test_data/ensemblelda | Bin 9923 -> 9332 bytes gensim/test/test_ensemblelda.py | 19 ++++++++++++++----- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 7f3f071e77..79b399fc40 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -84,10 +84,6 @@ import logging import os from multiprocessing import Process, Pipe, ProcessError -try: - import cPickle as _pickle -except ImportError: - import pickle as _pickle import numpy as np from scipy.spatial.distance import cosine from gensim import utils @@ -105,9 +101,9 @@ class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. - They are clustered using DBSCAN, for which nearby topic mixtures - which are mixtures of two or more - true topics and therefore undesired - are being used to support topics in becoming cores, - but not vice versa. + They are clustered using a variation of DBSCAN, for which nearby topic mixtures - which are mixtures + of two or more true topics and therefore undesired - are being used to support topics in becoming + cores, but not vice versa. """ def __init__(self, topic_model_kind="lda", num_models=3, @@ -497,11 +493,10 @@ def save(self, fname): Path to the system file where the model will be persisted. """ - + logger.info("saving %s object to %s", self.__class__.__name__, fname) - with open(fname, 'wb') as f: - _pickle.dump(self, f) + utils.pickle(self, fname) @staticmethod def load(fname): @@ -521,8 +516,7 @@ def load(fname): # message copied from [1] logger.info("loading %s object from %s", EnsembleLda.__name__, fname) - with open(fname, 'rb') as f: - eLDA = _pickle.load(f) + eLDA = utils.unpickle(fname) return eLDA @@ -967,7 +961,8 @@ def generate_stable_topics(self, min_cores=None): results = self.cluster_model.results # first, group all the learned cores based on the label, - # which was assigned in the cluster_model + # which was assigned in the cluster_model. 
The result is a
+        # dict of {group: [topic, ...]}
         grouped_by_labels = {}
         for topic in results.values():
             if topic["is_core"]:
@@ -998,8 +993,13 @@ def generate_stable_topics(self, min_cores=None):
                 "is_valid": np.nan,
                 "label": label
             })
-        # start with the most significant core
-        sorted_clusters = sorted(sorted_clusters, key=lambda cluster: cluster["amount_parent_labels"], reverse=True)
+
+        # start with the most significant core.
+        sorted_clusters = sorted(sorted_clusters,
+                                 key=lambda cluster: (
+                                     cluster["amount_parent_labels"],
+                                     cluster["label"]  # makes sorting deterministic
+                                 ), reverse=True)
 
         def remove_from_all_sets(label):
             """removes a label from every set in "parent_labels" for
diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 905aa8bf8a9ab4e3f8c2fc54758af605b910d30f..249196da763e8cfe508117c34fa4db5870cd7747 100644
GIT binary patch
delta 6162
[base85-encoded binary delta data omitted]

delta 6764
[base85-encoded binary delta data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index c4ac16b454..057ac64bd5 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -3,6 +3,9 @@
 #
 # Author: Tobias B
 
+# TODO remove this before pushing tests:
+import sys
+sys.path.insert(0, '/home/mango/Data/Code/gensim/')
 
 import logging
 import numpy as np
 from gensim.models import EnsembleLda
@@ -53,12 +56,18 @@ def test_eLDA(self):
         reference = EnsembleLda.load(datapath('ensemblelda'))
 
         self.assertEqual(self.eLDA.cluster_model.results, reference.cluster_model.results)
-        # sorting will not be the same in case the sorting key is equal
-        # across multiple clusters. hence use assertCountEqual
-        self.assertCountEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
+        self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
+
+        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
         np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05)
+
+        # as small values in the distance matrix are subject to rounding differences of
+        # around 1-2% between python 2 and 3, use atol and select a 100000th of the
+        # largest value instead of rtol. Large values of the distance matrix are not prone
+        # to those rounding problems.
From eb9ea2761467406a67bbe0da85576bef37662026 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 01:14:45 +0200
Subject: [PATCH 015/166] updated ensemble test reference model

---
 gensim/test/test_data/ensemblelda | Bin 9332 -> 9332 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 249196da763e8cfe508117c34fa4db5870cd7747..cbca32ca7dc6285620cb7c7b53796fd352ed8ec7 100644
GIT binary patch
[base85-encoded binary delta data omitted]
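The file gensim/test/test_data/ensemblelda that this patch replaces is a pickled reference
model, so every behavioural change to training means regenerating it. A sketch of how such
a fixture could be rebuilt; the training parameters here are assumptions and would have to
match whatever the test module pins down:

    from gensim.models import EnsembleLda
    from gensim.test.utils import common_corpus, common_dictionary, datapath

    elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
                       num_topics=2, num_models=4, passes=10,  # assumed values
                       random_state=0)  # fixed seed keeps the fixture reproducible
    elda.save(datapath('ensemblelda'))  # overwrite the pickled reference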
From 6b0dc77c091c3c3a8a644e4d73d6e2b23ef923b1 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 01:15:00 +0200
Subject: [PATCH 016/166] passing tox8

---
 gensim/models/ensemblelda.py    | 10 +++++-----
 gensim/test/test_ensemblelda.py |  8 ++------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 79b399fc40..6b96655238 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -287,7 +287,7 @@ def generate_gensim_representation(self):
         num_stable_topics = len(stable_topics)

         if num_stable_topics == 0:
-            logger.error("The model did not detect any stable topic! You can try to adjust epsilon: "
+            logger.error("The model did not detect any stable topic. You can try to adjust epsilon: "
                          "recluster(eps=...)")
             return
@@ -493,7 +493,7 @@ def save(self, fname):
             Path to the system file where the model will be persisted.

         """
-
+        logger.info("saving %s object to %s", self.__class__.__name__, fname)
         utils.pickle(self, fname)
@@ -898,7 +898,7 @@ def rank_masking(a):
                 # print a warning. info otherwise.
                 # The default mass setting of 0.95 makes uniform true masks on the opinosis corpus.
                 logger.warning('The given threshold of {} covered on average '
-                               '{}% of tokens, which might be too much!'.format(threshold, percent))
+                               '{}% of tokens, which might be too much'.format(threshold, percent))
             else:
                 logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent))
@@ -993,7 +993,7 @@ def generate_stable_topics(self, min_cores=None):
                 "is_valid": np.nan,
                 "label": label
             })
-
+
         # start with the most significant core.
         sorted_clusters = sorted(sorted_clusters, key=lambda cluster: (
@@ -1132,7 +1132,7 @@ def has_gensim_representation(self):
         gensim representation exists. If not, raises errors"""
         if self.classic_model_representation is None:
             if len(self.stable_topics) == 0:
-                raise ValueError("no stable topic was detected!")
+                raise ValueError("no stable topic was detected")
             else:
                 raise ValueError("use generate_gensim_representation() first")

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 057ac64bd5..fba8ed5acb 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -3,10 +3,6 @@
 #
 # Author: Tobias B

-# TODO remove this before pushing tests:
-import sys
-sys.path.insert(0, '/home/mango/Data/Code/gensim/')
-
 import logging
 import numpy as np
 from gensim.models import EnsembleLda
@@ -57,7 +53,7 @@ def test_eLDA(self):
         self.assertEqual(self.eLDA.cluster_model.results, reference.cluster_model.results)

         self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
-
+
         np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)

         np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05)
@@ -310,5 +306,5 @@ def test_recluster(self):

 if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()
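The reworded warning above belongs to the masking step of the distance computation: for the
default 'mass' method, only the most probable tokens of a topic, up to a cumulative
probability threshold (0.95 by default), take part in the comparison, and the warning fires
when that mask covers suspiciously many tokens. A rough sketch of the masking idea in
isolation, not the exact implementation:

    import numpy as np

    def mass_mask(topic, threshold=0.95):
        # keep the most probable tokens that together reach `threshold` mass
        sorted_ids = np.argsort(topic)[::-1]          # most probable tokens first
        cumulative = np.cumsum(topic[sorted_ids])
        cutoff = np.searchsorted(cumulative, threshold) + 1
        mask = np.zeros(len(topic), dtype=bool)
        mask[sorted_ids[:cutoff]] = True
        return mask

    topic = np.array([0.5, 0.3, 0.15, 0.05])
    print(mass_mask(topic))  # [ True  True  True False] -> 75% of the tokens covered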
From 6dc60014b96dadc2109a805f4d1f1c5c836675c1 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 10:47:41 +0200
Subject: [PATCH 017/166] improved determinism of methods

---
 gensim/models/ensemblelda.py      |  20 ++++++++++----------
 gensim/test/test_data/ensemblelda | Bin 9332 -> 9316 bytes
 gensim/test/test_ensemblelda.py   |  20 ++++++++++----------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 6b96655238..ded79117a5 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -964,7 +964,7 @@ def generate_stable_topics(self, min_cores=None):
         # which was assigned in the cluster_model. The result is a
         # dict of {group: [topic, ...]}
         grouped_by_labels = {}
-        for topic in results.values():
+        for topic in results:
             if topic["is_core"]:
                 topic = topic.copy()
@@ -1052,8 +1052,8 @@ def remove_from_all_sets(label):
         # list of all the label numbers that are valid
         valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]])

-        for i in results:
-            results[i]["valid_parents"] = {label for label in results[i]["parent_labels"] if label in valid_labels}
+        for topic in results:
+            topic["valid_parents"] = {label for label in topic["parent_labels"] if label in valid_labels}

         def validate_core(core):
             """Core is a dict of {is_core, valid_parents, labels} among others.
@@ -1067,9 +1067,9 @@ def validate_core(core):
             return ret

         # keeping only VALID cores
-        valid_core_mask = np.vectorize(validate_core)(list(results.values()))
+        valid_core_mask = np.vectorize(validate_core)(results)
         valid_topics = self.ttda[valid_core_mask]
-        topic_labels = np.array([results[i]["label"] for i in results])[valid_core_mask]
+        topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask]
         unique_labels = np.unique(topic_labels)

         num_stable_topics = len(unique_labels)
@@ -1172,22 +1172,22 @@ def fit(self, amatrix):

         self.next_label = 0

-        results = {index: {
+        results = [{
            "is_core": False,
            "parent_labels": set(),
            "parent_ids": set(),
            "num_samples": 0,
            "label": None
-        } for index in range(len(amatrix))}
+        } for _ in range(len(amatrix))]

         tmp_amatrix = amatrix.copy()

         # to avoid problem about comparing the topic with itself
         np.fill_diagonal(tmp_amatrix, 1)

-        min_distance_per_topic = [(index, distance) for index, distance in enumerate(tmp_amatrix.min(axis=1))]
-        min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x[1])
-        ordered_min_similarity = [index for index, distance in min_distance_per_topic_sorted]
+        min_distance_per_topic = [(distance, index) for index, distance in enumerate(tmp_amatrix.min(axis=1))]
+        min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x)
+        ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted]

         num_topics = len(amatrix)

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index cbca32ca7dc6285620cb7c7b53796fd352ed8ec7..0ccd9d4c0d96da2ce49e3cd6e1af0d22623e8e88 100644
GIT binary patch
[base85-encoded binary delta data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index fba8ed5acb..aa419f24f6 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -51,7 +51,10 @@ def test_eLDA(self):

         reference = EnsembleLda.load(datapath('ensemblelda'))

-        self.assertEqual(self.eLDA.cluster_model.results, reference.cluster_model.results)
+        self.assert_cluster_results_equal(self.eLDA.cluster_model.results, reference.cluster_model.results)
+
+        # the following test is quite specific to the current implementation and not part of any api,
+        # but it makes improving those sections of the code easier.
         self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)

         np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
@@ -84,14 +87,11 @@ def test_generate_gensim_rep(self):
         np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05)

     def assert_cluster_results_equal(self, a, b):
-        self.assertTrue(isinstance(a, dict))
-        self.assertTrue(isinstance(b, dict))
-        np.testing.assert_array_equal([row["label"] for row in a.values()],
-                                      [row["label"] for row in b.values()])
-        np.testing.assert_array_equal([row["is_core"] for row in a.values()],
-                                      [row["is_core"] for row in b.values()])
-        np.testing.assert_array_equal([row["num_samples"] for row in a.values()],
-                                      [row["num_samples"] for row in b.values()])
+        """compares important attributes of the cluster results"""
+        np.testing.assert_array_equal([row["label"] for row in a],
+                                      [row["label"] for row in b])
+        np.testing.assert_array_equal([row["is_core"] for row in a],
+                                      [row["is_core"] for row in b])

     def test_persisting(self):
         fname = get_tmpfile('gensim_models_ensemblelda')
@@ -306,5 +306,5 @@ def test_recluster(self):

 if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
     unittest.main()
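The sorting change in fit() above is a determinism fix: sorting (distance, index) tuples
makes the tie-breaking rule explicit in the data itself, so equal distances are always
ordered by topic index instead of relying on sort stability and input order. A tiny
illustration:

    distances = [0.3, 0.1, 0.3, 0.2]
    pairs = sorted((d, i) for i, d in enumerate(distances))
    print(pairs)                   # [(0.1, 1), (0.2, 3), (0.3, 0), (0.3, 2)]
    print([i for d, i in pairs])   # deterministic order: [1, 3, 0, 2]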
From 3ec31e7b5a57addca9945d3ed787a42801167f54 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 11:13:37 +0200
Subject: [PATCH 018/166] improved order of assertions

---
 gensim/test/test_ensemblelda.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index aa419f24f6..285fa293c7 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -50,16 +50,7 @@ def test_eLDA(self):
         self.check_ttda(self.eLDA)

         reference = EnsembleLda.load(datapath('ensemblelda'))
-
-        self.assert_cluster_results_equal(self.eLDA.cluster_model.results, reference.cluster_model.results)
-
-        # the following test is quite specific to the current implementation and not part of any api,
-        # but it makes improving those sections of the code easier.
-        self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
-
-        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
-        np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05)
+        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
         # as small values in the distance matrix are subject to rounding differences of
         # around 1-2% between python 2 and 3, use atol and select a 100000th of the
         # largest value instead of rtol. Large values of the distance matrix are not prone
         # to those rounding problems.
         atol = reference.asymmetric_distance_matrix.max() * 1e-05
         np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix,
                                    reference.asymmetric_distance_matrix, atol=atol)
+        self.assert_cluster_results_equal(self.eLDA.cluster_model.results, reference.cluster_model.results)
+        # the following test is quite specific to the current implementation and not part of any api,
+        # but it makes improving those sections of the code easier as long as sorted_clusters is supposed
+        # to stay the same
+        self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
+        np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05)

         np.testing.assert_allclose(self.eLDA.classic_model_representation.get_topics(),
                                    self.eLDA.get_topics(), rtol=1e-05)
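The reordering above checks the raw ttda and the distance matrix before anything derived
from them, so a failure points at the earliest stage of the pipeline that diverged from
the reference. The same upstream-first idea, reduced to a standalone sketch (the helper
name is invented; the attribute names match the model in this PR):

    import numpy as np

    def compare_to_reference(model, reference):
        # hypothetical helper: verify pipeline artifacts from upstream to downstream
        np.testing.assert_allclose(model.ttda, reference.ttda, rtol=1e-05)
        atol = reference.asymmetric_distance_matrix.max() * 1e-05
        np.testing.assert_allclose(model.asymmetric_distance_matrix,
                                   reference.asymmetric_distance_matrix, atol=atol)
        # only after the inputs match, compare the final stable topics:
        np.testing.assert_allclose(model.get_topics(), reference.get_topics(), rtol=1e-05)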
From 7afd192e6c8a0817ca5c5e884776edb81074bb7b Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 13:16:39 +0200
Subject: [PATCH 019/166] trying to achieve higher precision with float64 to
 avoid some sorting differences across architectures

---
 gensim/models/ensemblelda.py      |   4 ++--
 gensim/test/test_data/ensemblelda | Bin 9316 -> 9316 bytes
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index ded79117a5..01b27c8da4 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -330,7 +330,7 @@ def generate_gensim_representation(self):
         sstats = stable_topics * normalization_factor
         sstats -= eta

-        classic_model_representation.state.sstats = sstats.astype("float32")
+        classic_model_representation.state.sstats = sstats.astype(np.float32)

         # fix expElogbeta.
         classic_model_representation.sync_state()
@@ -854,7 +854,7 @@ def rank_masking(a):
         avg_mask_size = 0

         # initialize the distance matrix. ndarray is faster than zeros
-        distances = np.ndarray((len(ttda1), len(ttda2)))
+        distances = np.ndarray((len(ttda1), len(ttda2)), dtype=np.float64)

         # now iterate over each topic
         for i in range(len(ttda1)):

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 0ccd9d4c0d96da2ce49e3cd6e1af0d22623e8e88..b161f268193a28a80ca1950db4081d379df29885 100644
GIT binary patch
[base85-encoded binary delta data omitted]
From 16d035707968f8ccf4ea3ba05f4402f9999e6af2 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 14 Apr 2019 15:33:57 +0200
Subject: [PATCH 020/166] better approach for comparing with pretrained model

---
 gensim/models/ensemblelda.py    |  6 ++--
 gensim/test/test_ensemblelda.py | 58 ++++++++++++++++++++++++++-------
 2 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 01b27c8da4..d1d5c87254 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -209,7 +209,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         self.eta = None
         self.tms = []
         # initialize empty topic term distribution array
-        self.ttda = np.empty((0, len(gensim_kw_args["id2word"])), np.float32)
+        self.ttda = np.empty((0, len(gensim_kw_args["id2word"])))
         self.asymmetric_distance_matrix_outdated = True

         # in case the model will not train due to some
@@ -854,7 +854,7 @@ def rank_masking(a):
         avg_mask_size = 0

         # initialize the distance matrix. ndarray is faster than zeros
-        distances = np.ndarray((len(ttda1), len(ttda2)), dtype=np.float64)
+        distances = np.ndarray((len(ttda1), len(ttda2)))

         # now iterate over each topic
         for i in range(len(ttda1)):
@@ -1073,7 +1073,7 @@ def validate_core(core):
         unique_labels = np.unique(topic_labels)
         num_stable_topics = len(unique_labels)

-        stable_topics = np.empty((num_stable_topics, len(self.id2word)), dtype=np.float32)
+        stable_topics = np.empty((num_stable_topics, len(self.id2word)))

         # for each cluster
         for l, label in enumerate(unique_labels):

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 285fa293c7..b426a736af 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -8,6 +8,7 @@
 from gensim.models import EnsembleLda
 import unittest
 from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary
+from copy import deepcopy

 num_topics = 2
 num_models = 4
@@ -49,24 +50,57 @@ def test_eLDA(self):
         self.assertEqual(len(self.eLDA.ttda), num_models * num_topics)
         self.check_ttda(self.eLDA)

+        # compare with a pre-trained reference model
         reference = EnsembleLda.load(datapath('ensemblelda'))
         np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
-        # as small values in the distance matrix are subject to rounding differences of
-        # around 1-2% between python 2 and 3, use atol and select a 100000th of the
-        # largest value instead of rtol. Large values of the distance matrix are not prone
-        # to those rounding problems.
+        # small values in the distance matrix tend to vary quite a bit around 2%,
+        # so use some absolute tolerance measurement to check if the matrix is at least
+        # close to the target.
         atol = reference.asymmetric_distance_matrix.max() * 1e-05
         np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix,
                                    reference.asymmetric_distance_matrix, atol=atol)
+
+    def test_clustering(self):
+        # the following test is quite specific to the current implementation and not part of any api,
+        # but it makes improving those sections of the code easier as long as sorted_clusters and the
+        # cluster_model results are supposed to stay the same. Potentially this test will deprecate.
+
+        reference = EnsembleLda.load(datapath('ensemblelda'))
+        cluster_model_results = deepcopy(reference.cluster_model.results)
+        sorted_clusters = deepcopy(reference.sorted_clusters)
+        stable_topics = deepcopy(reference.get_topics())
+
+        # continue training with the distance matrix of the pretrained reference and see if
+        # the generated clusters match.
+        reference.asymmetric_distance_matrix_outdated = True
+        reference.recluster()
+
+        self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results)
+        self.assertEqual(reference.sorted_clusters, sorted_clusters)
+        np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05)
+
+    def test_not_trained(self):
+        # should not throw errors and no training should happen
+
+        # 0 passes
+        eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+                           passes=0, num_models=num_models, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+        # no corpus
+        eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics,
+                           passes=passes, num_models=num_models, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+        # 0 iterations
+        eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+                           iterations=0, num_models=num_models, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+        # 0 models
+        eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+                           passes=passes, num_models=0, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)

     def test_memory_unfriendly(self):
         # at this point, self.eLDA_mu and self.eLDA are already trained
@@ -247,7 +281,7 @@ def test_add_models(self):
         self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms))
         self.check_ttda(eLDA_base_mu)

-    def test_recluster(self):
+    def test_add_and_recluster(self):
         # 6.2: see if after adding a model, the model still makes sense
         num_new_models = 3
         num_new_topics = 3
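The new test_clustering above relies on the fact that a loaded EnsembleLda still carries
everything needed for the clustering stage, so clustering and stable-topic extraction can
be repeated without retraining any LDA models. As a usage sketch (the path and eps value
are placeholders):

    from gensim.models import EnsembleLda

    elda = EnsembleLda.load('ensemblelda')  # some previously saved ensemble
    elda.recluster(eps=0.2)                 # redo clustering with a different epsilon
    topics = elda.get_topics()              # stable topics under the new clustering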
From 01b68e4480596a8185f91a328cf95eee8a92fcc4 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 14 Apr 2019 16:27:43 +0200
Subject: [PATCH 021/166] potentially fixing the tests on windows

---
 gensim/models/ensemblelda.py      |   2 +-
 gensim/test/test_data/ensemblelda | Bin 9316 -> 9820 bytes
 gensim/test/test_ensemblelda.py   |  13 ++++++++++---
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index d1d5c87254..e49f946c21 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -96,7 +96,7 @@
     ldamulticore=ldamulticore.LdaMulticore
 )

-MAX_RANDOM_STATE = 2**32 - 1
+MAX_RANDOM_STATE = np.iinfo(np.int32).max


 class EnsembleLda():

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index b161f268193a28a80ca1950db4081d379df29885..9b1df8521e47cfc0c68e80269b5aa0bcf5849454 100644
GIT binary patch
[base85-encoded binary delta data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index b426a736af..33f9901af5 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -3,12 +3,19 @@
 #
 # Author: Tobias B

+"""
+Automated tests for checking the EnsembleLda Class
+"""
+
+
 import logging
+import unittest
+
 import numpy as np
+from copy import deepcopy

-from gensim.models import EnsembleLda
-import unittest
+from gensim.models import EnsembleLda
 from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary
-from copy import deepcopy

 num_topics = 2
 num_models = 4
@@ -337,5 +344,5 @@ def test_add_and_recluster(self):

 if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()
From 9314cb4c81304bc2a40f080b55a6810dc255daf2 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 14 Apr 2019 17:52:20 +0200
Subject: [PATCH 022/166] potentially fixing the tests on windows

---
 gensim/models/ensemblelda.py      | 133 +++++++++++++++---------------
 gensim/test/test_data/ensemblelda | Bin 9820 -> 9815 bytes
 gensim/test/test_ensemblelda.py   |   8 +-
 3 files changed, 69 insertions(+), 72 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index e49f946c21..1075b63041 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -91,11 +91,7 @@
 logger = logging.getLogger(__name__)

-GENSIM_MODEL = dict(
-    lda=ldamodel.LdaModel,
-    ldamulticore=ldamulticore.LdaMulticore
-)
-
+# nps max random state of 2**32 - 1 is too large for windows:
 MAX_RANDOM_STATE = np.iinfo(np.int32).max


 class EnsembleLda():
@@ -115,9 +111,10 @@ def __init__(self, topic_model_kind="lda", num_models=3,

         Parameters
         ----------
-        topic_model_kind : str, optional
+        topic_model_kind : str, topic model, optional
             Examples:
-                'ldamulticore' (recommended) or 'lda' (default)
+                'ldamulticore' (recommended), 'lda' (default),
+                gensim.models.ldamodel, gensim.models.ldamulticore
         num_models : number, optional
             How many LDA models to train in this ensemble.
             Default: 3
@@ -188,11 +185,15 @@ def __init__(self, topic_model_kind="lda", num_models=3,
                             "input space dimensionality. Corpus should be provided using the "
                             "keyword argument corpus.")

-        # Store some of the parameters:
-        # - store topic_model_kind for generate_gensim_representation,
-        # so that it is made sure that the gensim_kw_args fit into
-        # the constructor of the gensim model.
-        self.topic_model_kind = topic_model_kind
+        if type(topic_model_kind) == str:
+            self.topic_model_kind = {
+                "lda": ldamodel.LdaModel,
+                "ldamulticore": ldamulticore.LdaMulticore
+            }[topic_model_kind]
+        else:
+            self.topic_model_kind = topic_model_kind
+
+        # Store some of the parameters
         self.num_models = num_models
         self.gensim_kw_args = gensim_kw_args
@@ -299,7 +300,7 @@ def generate_gensim_representation(self):
             params["iterations"] = 0  # no training
             params["passes"] = 0  # no training

-        classic_model_representation = GENSIM_MODEL[self.topic_model_kind](**params)
+        classic_model_representation = self.topic_model_kind(**params)

         # when eta was None, use what gensim generates as default eta for the following tasks:
         eta = classic_model_representation.eta
@@ -551,23 +552,6 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers):
         # multiprocessing is started.
         random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)]

-        # worker, that trains a bunch of models:
-        def worker(id, num_models, random_states, pipe):
-            # since models cannot be piped to the parent because messages need to be pickled for that:
-            # memory_friendly_ttda needs to be true, which will not store ensemble submodels
-            logger.info("Spawned worker to generate {} topic models".format(num_models))
-            self.generate_topic_models(num_models, random_states=random_states)
-            # send the ttda that is in the child/workers version of the memory into the pipe
-            # available, after generate_topic_models has been called in the worker
-            if self.memory_friendly_ttda:
-                # remember that this code is inside the worker processes memory,
-                # so self.ttda is the ttda of only a chunk of models
-                pipe.send((id, self.ttda))
-            else:
-                pipe.send((id, self.tms))
-
-            pipe.close()
-
         # each worker has to work on at least one model.
# Don't spawn idle workers: workers = min(ensemble_workers, num_models) @@ -594,7 +578,8 @@ def worker(id, num_models, random_states, pipe): # get the chunk from the random states that is meant to be for those models random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] - p = Process(target=worker, args=(i, num_subprocess_models, random_states_for_worker, childConn)) + p = Process(target=self.generate_topic_models, + args=(num_subprocess_models, random_states_for_worker, childConn)) processes += [p] pipes += [(parentConn, childConn)] @@ -617,32 +602,23 @@ def worker(id, num_models, random_states, pipe): raise # aggregate results - if self.memory_friendly_ttda: - - # collect results from the pipes - # will also block until workers are finished - for p in pipes: - answer = p[0].recv() # [0], because that is the parentConn - p[0].close() - # this does basically the same as the generate_topic_models function (concatenate all the ttdas): - self.ttda = np.concatenate([self.ttda, answer[1]]) - # in [0] the id of the worker that sent the data is stored - - else: - # now do the same thing for a memory unfriendly ensemble - self.tms = [] - for p in pipes: - answer = p[0].recv() # [0], because that is the parentConn - p[0].close() - self.tms += answer[1] - new_ttda = np.concatenate([model.get_topics() for model in answer[1]]) - self.ttda = np.concatenate([self.ttda, new_ttda]) + # will also block until workers are finished + for p in pipes: + answer = p[0].recv() # [0], because that is the parentConn + p[0].close() + # this does basically the same as the generate_topic_models function (concatenate all the ttdas): + if not self.memory_friendly_ttda: + self.tms += answer + ttda = np.concatenate([model.get_topics() for model in answer]) + else: + ttda = answer + self.ttda = np.concatenate([self.ttda, ttda]) # end all processes for p in processes: p.terminate() - def generate_topic_models(self, num_models, random_states=None): + def generate_topic_models(self, num_models, random_states=None, pipe=None): """Will train the topic models, that form the ensemble. Parameters @@ -653,8 +629,15 @@ def generate_topic_models(self, num_models, random_states=None): list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensembles RandomState if None (default). + pipe : multiprocessing.pipe + Default None. If provided, will send the trained models over + this pipe. If memory friendly, it will only send the ttda. 
+ """ + if pipe is not None: + logger.info("Spawned worker to generate {} topic models".format(num_models)) + if random_states is None: random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] @@ -669,7 +652,7 @@ def generate_topic_models(self, num_models, random_states=None): kwArgs["random_state"] = random_states[i] - tm = GENSIM_MODEL[self.topic_model_kind](**kwArgs) + tm = self.topic_model_kind(**kwArgs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas @@ -683,6 +666,33 @@ def generate_topic_models(self, num_models, random_states=None): self.sstats_sum = tm.state.sstats.sum() self.eta = tm.eta + if pipe is not None: + # send the ttda that is in the child/workers version of the memory into the pipe + # available, after generate_topic_models has been called in the worker + if self.memory_friendly_ttda: + # remember that this code is inside the worker processes memory, + # so self.ttda is the ttda of only a chunk of models + pipe.send(self.ttda) + else: + pipe.send(self.tms) + + pipe.close() + + def asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): + """ worker, that computes the distance to all other nodes + from a chunk of nodes. https://stackoverflow.com/a/1743350 + """ + + logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) + # the chunk of ttda that's going to be calculated: + ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] + distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, + threshold=threshold, + start_index=ttdas_sent, + method=method) + pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory + pipe.close() + def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): """Makes the pairwise distance matrix for all the ttdas from the ensemble. @@ -734,20 +744,6 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= if workers is None: workers = os.cpu_count() - # worker, that computes the distance to - # all other nodes from a chunk of nodes. 
# https://stackoverflow.com/a/1743350
-        def worker(worker_id, ttdas_sent, n_ttdas, pipe):
-            logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas))
-            # the chunk of ttda that's going to be calculated:
-            ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas]
-            distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda,
-                                                                             threshold=threshold,
-                                                                             start_index=ttdas_sent,
-                                                                             method=method)
-            pipe.send((worker_id, distance_chunk))  # remember that this code is inside the workers memory
-            pipe.close()
-
         # create worker processes:
         processes = []
         pipes = []
@@ -768,7 +764,8 @@ def worker(worker_id, ttdas_sent, n_ttdas, pipe):
             else:
                 n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i))

-            p = Process(target=worker, args=(i, ttdas_sent, n_ttdas, childConn))
+            p = Process(target=self.asymmetric_distance_matrix_worker,
+                        args=(i, ttdas_sent, n_ttdas, childConn, threshold, method))

             ttdas_sent += n_ttdas

             processes += [p]

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 9b1df8521e47cfc0c68e80269b5aa0bcf5849454..3d77142cd210d18ed65595fed46a0f1a5a081e32 100644
GIT binary patch
[base85-encoded binary delta data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 33f9901af5..3f1ea8b0b1 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -14,7 +14,7 @@
 import numpy as np
 from copy import deepcopy

-from gensim.models import EnsembleLda
+from gensim.models import EnsembleLda, LdaMulticore
 from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary

 num_topics = 2
@@ -199,7 +199,7 @@ def test_add_models(self):
         # 1. memory friendly
         eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
                                 num_topics=num_new_topics, passes=1, num_models=num_new_models,
-                                iterations=1, random_state=0, topic_model_kind='ldamulticore',
+                                iterations=1, random_state=0, topic_model_kind=LdaMulticore,
                                 workers=3, ensemble_workers=2)

         # 1.1 ttda
@@ -245,7 +245,7 @@ def test_add_models(self):
         # 2. memory unfriendly
         eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
                                    num_topics=num_new_topics, passes=1, num_models=num_new_models,
-                                   iterations=1, random_state=0, topic_model_kind='ldamulticore',
+                                   iterations=1, random_state=0, topic_model_kind=LdaMulticore,
                                    workers=3, ensemble_workers=2, memory_friendly_ttda=False)

         # 2.1 a single ensemble
@@ -306,5 +306,5 @@ def test_add_and_recluster(self):

 if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
     unittest.main()
From 0b7febc8ceb86ffb7e8fe6a7a945198a67a80b8b Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 21 Apr 2019 23:26:07 +0200
Subject: [PATCH 023/166] changed citation of opinosis

---
 docs/notebooks/Opinosis.ipynb    | 77 +++++++++++++++++---------------
 gensim/corpora/opinosiscorpus.py | 20 ++++-----
 2 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/Opinosis.ipynb
index bbcc2f6525..3fc845040c 100644
--- a/docs/notebooks/Opinosis.ipynb
+++ b/docs/notebooks/Opinosis.ipynb
@@ -47,7 +47,10 @@
    "source": [
     "# Experiments on the Opinosis Dataset\n",
     "\n",
-    "Opinosis is an extremely small corpus that contains 289 product reviews for 51 products, which is why it is hard to extract topics from it. https://github.com/kavgan/opinosis"
+    "Opinosis [1] is a small (but redundant) corpus that contains 289 product reviews for 51 products. Since it's so small, the results are rather unstable.\n",
+    "\n",
+    "[1] Kavita Ganesan, ChengXiang Zhai, and Jiawei Han, _Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions,_ Proceedings of the 23rd International Conference on Computational Linguistics, Association for Computational Linguistics, 2010, pp.
340–348.\n", + "http://kavita-ganesan.com/opinosis-opinion-dataset/ https://github.com/kavgan/opinosis" ] }, { @@ -61,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2018-11-29T20:19:17.891961Z", @@ -77,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2018-12-01T17:57:39.575650Z", @@ -101,28 +104,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2018-12-01T17:57:40.842440Z", "start_time": "2018-12-01T17:57:39.905273Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data source:\n", - "title:\t\tOpinosis: a graph-based approach to abstractive summarization of highly redundant opinions\n", - "authors:\tGanesan, Kavita and Zhai, ChengXiang and Han, Jiawei\n", - "booktitle:\tProceedings of the 23rd International Conference on Computational Linguistics\n", - "pages:\t\t340-348\n", - "year:\t\t2010\n", - "organization:\tAssociation for Computational Linguistics\n" - ] - } - ], + "outputs": [], "source": [ "opinosis = OpinosisCorpus(path)" ] @@ -149,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2018-12-01T18:10:13.690983Z", @@ -161,16 +150,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "Spawned worker to generate 42 topic models...\n", - "Spawned worker to generate 43 topic models...\n", - "Generating 42 topic models...\n", - "Generating 43 topic models...\n", - "Spawned worker to generate 43 topic models...\n", - "Generating 43 topic models...\n", - "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", - "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", - "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", - "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Generating 128 topic models...\n", + "Spawned worker to generate 42 topic models\n", + "Spawned worker to generate 43 topic models\n", + "Spawned worker to generate 43 topic models\n", + "Generating a 2560 x 2560 asymmetric distance matrix...\n", + "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", + "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", + "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", + "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", + "The given threshold of 0.11 covered on average 9.9% of tokens\n", + "The given threshold of 0.11 covered on average 9.8% of tokens\n", + "The given threshold of 0.11 covered on average 9.9% of tokens\n", + "The given threshold of 0.11 covered on average 9.8% of tokens\n", "Fitting the clustering model\n", "Generating stable topics\n", "Generating classic gensim model representation based on results from the ensemble\n" @@ -185,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2018-12-01T18:10:13.712818Z", @@ -197,15 +189,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "- 0.145 free, 0.043 park, 0.033 coffe, 0.030 wine, 0.027 even, 0.025 morn, 0.022 internet \n", + "- 0.132 staff, 0.082 servic, 0.081 friendli, 0.075 help, 0.021 good, 0.015 quick, 0.010 expens \n", + "\n", + "- 0.166 screen, 0.063 bright, 0.040 clear, 0.022 
easi, 0.020 touch, 0.015 read, 0.014 new \n", + "\n", + "- 0.146 free, 0.043 park, 0.034 coffe, 0.031 wine, 0.025 even, 0.025 morn, 0.022 internet \n", "\n", - "- 0.161 screen, 0.062 bright, 0.043 clear, 0.027 easi, 0.021 read, 0.019 touch, 0.011 size \n", + "- 0.142 batteri, 0.099 life, 0.026 short, 0.020 charg, 0.018 kindl, 0.018 good, 0.017 perform \n", "\n", - "- 0.127 staff, 0.075 friendli, 0.074 help, 0.070 servic, 0.016 quick, 0.012 profession, 0.012 good \n", + "- 0.123 room, 0.111 clean, 0.050 small, 0.035 comfort, 0.033 bathroom, 0.017 nice, 0.016 size \n", "\n", - "- 0.123 seat, 0.067 comfort, 0.050 uncomfort, 0.039 front, 0.037 back, 0.036 firm, 0.032 drive \n", + "- 0.147 seat, 0.086 comfort, 0.058 uncomfort, 0.045 firm, 0.031 long, 0.030 drive, 0.014 time \n", "\n", - "- 0.113 room, 0.104 clean, 0.041 small, 0.037 bathroom, 0.036 comfort, 0.023 size, 0.016 well \n", + "- 0.161 mileag, 0.106 ga, 0.056 good, 0.028 expect, 0.019 hard, 0.018 road, 0.018 high \n", "\n" ] } @@ -216,6 +212,13 @@ " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -240,7 +243,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.7.2" } }, "nbformat": 4, diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index 446c5e7804..eee1363045 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -3,17 +3,25 @@ # # Author: Tobias B +# Opinosis Corpus Source: +# title: Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions +# authors: Ganesan, Kavita and Zhai, ChengXiang and Han, Jiawei +# booktitle: Proceedings of the 23rd International Conference on Computational Linguistics +# pages: 340-348 +# year: 2010 +# organization: Association for Computational Linguistics +# http://kavita-ganesan.com/opinosis-opinion-dataset/ + import os import re from gensim.corpora import Dictionary from gensim.parsing.porter import PorterStemmer from gensim.parsing.preprocessing import STOPWORDS - class OpinosisCorpus(): """Creates a corpus and dictionary from the opinosis dataset: - http://kavita-ganesan.com/opinosis-opinion-dataset/#.WyF_JNWxW00 + http://kavita-ganesan.com/opinosis-opinion-dataset/ This data is organized in folders, each folder containing a few short docs. 
@@ -40,14 +48,6 @@ def __init__(self, path): """ # citation - print("data source:") - print("title:\t\tOpinosis: a graph-based approach to abstractive summarization of highly redundant opinions") - print("authors:\tGanesan, Kavita and Zhai, ChengXiang and Han, Jiawei") - print("booktitle:\tProceedings of the 23rd International Conference on Computational Linguistics") - print("pages:\t\t340-348") - print("year:\t\t2010") - print("organization:\tAssociation for Computational Linguistics") - path = os.path.join(path, "summaries-gold") dictionary = Dictionary() corpus = [] From 60a717df0ac82c5fb1f60dd5903960f1b7afaae0 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 22 Apr 2019 08:50:47 +0200 Subject: [PATCH 024/166] tox8 test passing after small change on opinosis comments/citation --- gensim/corpora/opinosiscorpus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index eee1363045..f7b1abb134 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -18,6 +18,7 @@ from gensim.parsing.porter import PorterStemmer from gensim.parsing.preprocessing import STOPWORDS + class OpinosisCorpus(): """Creates a corpus and dictionary from the opinosis dataset: From 2ff60cae79ef62d4e323237ef50df1ef08338173 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Tue, 25 Jun 2019 20:58:52 +0200 Subject: [PATCH 025/166] Moving max_random_state inside the model as a private variable. --- gensim/models/ensemblelda.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1075b63041..4880b22674 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -91,9 +91,6 @@ logger = logging.getLogger(__name__) -# nps max random state of 2**32 - 1 is too large for windows: -MAX_RANDOM_STATE = np.iinfo(np.int32).max - class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. @@ -168,6 +165,10 @@ def __init__(self, topic_model_kind="lda", num_models=3, https://radimrehurek.com/gensim/models/ldamodel.html """ + # Set random state + # nps max random state of 2**32 - 1 is too large for windows: + self._MAX_RANDOM_STATE = np.iinfo(np.int32).max + if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None @@ -550,7 +551,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): # the same results in every lda children. # so it is solved by generating a list of state seeds before # multiprocessing is started. - random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] + random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] # each worker has to work on at least one model. 
# Don't spawn idle workers: @@ -639,7 +640,7 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None): logger.info("Spawned worker to generate {} topic models".format(num_models)) if random_states is None: - random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] + random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] assert len(random_states) == num_models From d36fe43ba8f4e4824351cfbaa082bea852838a9c Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Tue, 25 Jun 2019 21:07:20 +0200 Subject: [PATCH 026/166] removed whitespace --- gensim/models/ensemblelda.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 4880b22674..1bc96756f7 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -169,7 +169,6 @@ def __init__(self, topic_model_kind="lda", num_models=3, # nps max random state of 2**32 - 1 is too large for windows: self._MAX_RANDOM_STATE = np.iinfo(np.int32).max - if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None if "corpus" not in gensim_kw_args: From 1507adfa3ce0c1e718bfae0c5feacffe9d644769 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 25 Jun 2019 21:16:14 +0200 Subject: [PATCH 027/166] docstring width --- gensim/models/ensemblelda.py | 221 +++++++++++++---------------------- 1 file changed, 78 insertions(+), 143 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1bc96756f7..f4453e4b1b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -116,53 +116,34 @@ def __init__(self, topic_model_kind="lda", num_models=3, How many LDA models to train in this ensemble. Default: 3 min_cores : number, optional - Minimum cores a cluster of topics has to contain - so that it is recognized as stable topic. + Minimum cores a cluster of topics has to contain so that it is recognized as stable topic. epsilon : float, optional - Defaults to 0.1. Epsilon for the cbdbscan clustering - that generates the stable topics. + Defaults to 0.1. Epsilon for the cbdbscan clustering that generates the stable topics. ensemble_workers : number, optional - Spawns that many processes and distributes the - models from the ensemble to those as evenly as - possible. num_models should be a multiple of - ensemble_workers. + Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. + num_models should be a multiple of ensemble_workers. - Setting it to 0 or 1 will both use the non- - multiprocessing version. Default:1 + Setting it to 0 or 1 will both use the nonmultiprocessing version. Default:1 memory_friendly_ttda : boolean, optional - If True, the models in the ensemble are - deleted after training and only the - total topic word distribution is kept - to save memory. - - Defaults to True. When False, trained - models are stored in a list in self.tms, - and no models that are not of a gensim - model type can be added to this ensemble - using the add_model function. - - If False, any topic term matrix can be - supllied to add_model. + If True, the models in the ensemble are deleted after training and only the total topic word distribution + is kept to save memory. + + Defaults to True. When False, trained models are stored in a list in self.tms, and no models that are not + of a gensim model type can be added to this ensemble using the add_model function. 
+ + If False, any topic term matrix can be supllied to add_model. min_samples : number, optional - Required number of nearby topics for - a topic to be considered as 'core' in - the CBDBSCAN clustering. + Required number of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. masking_method : {'mass', 'rank}, optional One of "mass" (default) or "rank" (faster). masking_threshold : float, optional - Default: None, which uses 0.11 for - masking_method "rank", and 0.95 - for "mass". + Default: None, which uses 0.11 for masking_method "rank", and 0.95 for "mass". distance_workers : number, optional - When distance_workers is None, it defaults to - os.cpu_count() for maximum performance. - Default is 1, which is not multiprocessed. - Set to > 1 to enable multiprocessing. + When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which + is not multiprocessed. Set to > 1 to enable multiprocessing. **gensim_kw_args - All the parameters for the gensim model, - that is used for each model in the ensemble, - can be provided as additional keyword arguments. - https://radimrehurek.com/gensim/models/ldamodel.html + All the parameters for the gensim model, that is used for each model in the ensemble, can be provided as + additional keyword arguments. https://radimrehurek.com/gensim/models/ldamodel.html """ # Set random state @@ -252,16 +233,12 @@ def convert_to_memory_friendly(self): def generate_gensim_representation(self): """creates a gensim-model from the stable topics - The prior for the gensim model, eta, Can be anything, - (a parameter in object constructor) and won't influence - get_topics(). Note that different eta means different - log_perplexity (the default is the same as the gensim + The prior for the gensim model, eta, Can be anything, (a parameter in object constructor) and won't influence + get_topics(). Note that different eta means different log_perplexity (the default is the same as the gensim default eta, which is for example [0.5, 0.5, 0.5]). - Note, that when the clustering of the topics - was changed using add_model, etc., and when no - topics were detected because of that, the old - classic gensim model will stay in place. + Note, that when the clustering of the topics was changed using add_model, etc., and when no topics were + detected because of that, the old classic gensim model will stay in place. Returns ------- @@ -340,27 +317,20 @@ def generate_gensim_representation(self): return classic_model_representation def add_model(self, target, num_new_models=None): - """Adds the ttda of another model to the ensemble - this way, multiple topic models can be connected - to an ensemble. + """Adds the ttda of another model to the ensemble this way, multiple topic models can be connected to an + ensemble. - Make sure that all the models use the exact same - dictionary/idword mapping. + Make sure that all the models use the exact same dictionary/idword mapping. In order to generate new stable topics afterwards, use self.generate_asymmetric_distance_matrix() self.recluster() - The ttda of another ensemble can also be used, - in that case set num_new_models to the num_models - parameter of the ensemble, that means the number - of classic models in the ensemble that generated - the ttda. This is important, because that information - is used to estimate "min_samples" for - generate_topic_clusters. 
+ The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter + of the ensemble, that means the number of classic models in the ensemble that generated the ttda. This is + important, because that information is used to estimate "min_samples" for generate_topic_clusters. - If you trained this ensemble in the past with a - certain Dictionary that you want to reuse for other + If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other models, you can get it from: self.id2word. Parameters @@ -380,23 +350,16 @@ def add_model(self, target, num_new_models=None): with topic being an array of probabilities: [token1, token2, ...] - token probabilities in a single topic sum to one, - therefore, all the words sum to len(ttda) + token probabilities in a single topic sum to one, therefore, all the words sum to len(ttda) num_new_models : integer, optional - the model keeps track of how many models - were used in this ensemble. Set higher - if ttda contained topics from more than - one model. Default: None, which takes - care of it automatically. + the model keeps track of how many models were used in this ensemble. Set higher if ttda contained topics + from more than one model. Default: None, which takes care of it automatically. - If target is a 2D-array of float values, - it assumes 1. + If target is a 2D-array of float values, it assumes 1. - If the ensemble has memory_friendly_ttda - set to False, then it will always use - the number of models in the target - parameter. + If the ensemble has memory_friendly_ttda set to False, then it will always use the number of models in + the target parameter. """ # If the model has never seen a ttda before, initialize. @@ -522,8 +485,7 @@ def load(fname): return eLDA def generate_topic_models_multiproc(self, num_models, ensemble_workers): - """Will make the ensemble multiprocess, which results - in a speedup on multicore machines. Results from the + """Will make the ensemble multiprocess, which results in a speedup on multicore machines. Results from the processes will be piped to the parent and concatenated. Parameters @@ -531,24 +493,19 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): num_models : number how many models to train in the ensemble ensemble_workers : number - into how many processes to split the models - will be set to max(workers, num_models), to avoid - workers that are supposed to train 0 models. + into how many processes to split the models will be set to max(workers, num_models), to avoid workers that + are supposed to train 0 models. - to get maximum performance, set to the number - of your cores, if non-parallelized models - are being used in the ensemble (LdaModel). + to get maximum performance, set to the number of your cores, if non-parallelized models are being used in + the ensemble (LdaModel). - For LdaMulticore, the performance gain is small - and gets larger for a significantly smaller corpus. + For LdaMulticore, the performance gain is small and gets larger for a significantly smaller corpus. In that case, ensemble_workers=2 can be used. """ - # the way random_states is handled needs to prevent getting - # different results when multiprocessing is on, or getting - # the same results in every lda children. 
- # so it is solved by generating a list of state seeds before + # the way random_states is handled needs to prevent getting different results when multiprocessing is on, + # or getting the same results in every lda children. so it is solved by generating a list of state seeds before # multiprocessing is started. random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] @@ -558,8 +515,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): # create worker processes: # from what I know this is basically forking with a jump to a target function in each child - # so modifying the ensemble object will not modify the one in the parent - # because of no shared memory + # so modifying the ensemble object will not modify the one in the parent because of no shared memory processes = [] pipes = [] num_models_unhandled = num_models # how many more models need to be trained by workers? @@ -626,12 +582,11 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None): num_models : number number of models to be generated random_states : list - list of numbers or np.random.RandomState objects. - Will be autogenerated based on the ensembles RandomState - if None (default). + list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensembles + RandomState if None (default). pipe : multiprocessing.pipe - Default None. If provided, will send the trained models over - this pipe. If memory friendly, it will only send the ttda. + Default None. If provided, will send the trained models over this pipe. If memory friendly, it will only + send the ttda. """ @@ -694,14 +649,11 @@ def asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe pipe.close() def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): - """Makes the pairwise distance matrix for all the ttdas - from the ensemble. + """Makes the pairwise distance matrix for all the ttdas from the ensemble. - Returns the asymmetric pairwise distance matrix that - is used in the DBSCAN clustering. + Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering. - Afterwards, the model needs to be reclustered for - this generated matrix to take effect. + Afterwards, the model needs to be reclustered for this generated matrix to take effect. Parameters ---------- @@ -711,12 +663,10 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= if threshold is None and method == "mass": threshold = 0.95 - threshold keeps by default 95% of the largest terms by - mass. Except the "fast" parameter is "rank", then it - just selects that many of the largest terms. + threshold keeps by default 95% of the largest terms by mass. Except the "fast" parameter is "rank", then + it just selects that many of the largest terms. workers : number, optional - when workers is None, it defaults to os.cpu_count() - for maximum performance. Default is 1, which is not + when workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which is not multiprocessed. Set to > 1 to enable multiprocessing. """ @@ -809,21 +759,18 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, st Parameters ---------- ttda1 and ttda2: 2D arrays of floats - Two ttda matrices that are going to be used for distance calculation. - Each row in ttda corresponds to one topic. Each cell in the resulting - matrix corresponds to the distance between a topic pair. 
+            Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+            topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
         threshold : float, optional
-            threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected
-            method
+            threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected method
         start_index : number
-            this function might be used in multiprocessing, so start_index has to be set as
-            ttda1 is a chunk of the complete ttda in that case. start_index would be 0
-            if ttda1 == self.ttda. When self.ttda is split into two pieces, each 100 ttdas
-            long, then start_index should be be 100. default is 0
+            this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+            complete ttda in that case. start_index would be 0 if ttda1 == self.ttda. When self.ttda is split into two
+            pieces, each 100 ttdas long, then start_index should be 100. default is 0
         method : {'mass', 'rank}, optional
-            method can be "mass" for the original masking method or "rank" for a faster
-            masking method that selects by rank of largest elements in the topic word
-            distribution, to determine which tokens are relevant for the topic.
+            method can be "mass" for the original masking method or "rank" for a faster masking method that selects
+            by rank of largest elements in the topic word distribution, to determine which tokens are relevant for the
+            topic.

         Returns
         -------
         2D Numpy.numpy.ndarray of floats

         """
@@ -902,11 +849,10 @@ def rank_masking(a):
         return distances

     def generate_topic_clusters(self, eps=0.1, min_samples=None):
-        """Runs the DBSCAN algorithm on all the detected topics from
-        the models in the ensemble and labels them with label-indices.
+        """Runs the DBSCAN algorithm on all the detected topics from the models in the ensemble and labels them with
+        label-indices.

-        The final approval and generation of stable topics is done in
-        generate_stable_topics().
+        The final approval and generation of stable topics is done in generate_stable_topics().

         Parameters
         ----------
@@ -925,20 +871,16 @@ def generate_topic_clusters(self, eps=0.1, min_samples=None):
         self.cluster_model.fit(self.asymmetric_distance_matrix)

     def generate_stable_topics(self, min_cores=None):
-        """generates stable topics out of the clusters.
-        This function is the last step that has to be done
-        in the ensemble.
+        """generates stable topics out of the clusters. This function is the last step that has to be done in the
+        ensemble.

-        Stable topics can be retreived afterwards using
-        get_topics().
+        Stable topics can be retrieved afterwards using get_topics().

         Parameters
         ----------
         min_cores : number
-            how many cores a cluster has to have,
-            to be treated as stable topic. That means, how many topics
-            that look similar have to be present, so that the average
-            topic in those is used as stable topic.
+            how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
+            that look similar have to be present, so that the average topic in those is used as stable topic.

             defaults to min_cores = min(3, max(1, int(self.num_models /4 +1)))
@@ -1082,34 +1024,27 @@ def validate_core(core):
         self.stable_topics = stable_topics

     def recluster(self, eps=0.1, min_samples=None, min_cores=None):
-        """Runs the CBDBSCAN algorithm on all the detected topics from
-        the children of the ensemble.
+        """Runs the CBDBSCAN algorithm on all the detected topics from the children of the ensemble.
 Generates stable topics out of the clusters afterwards.

-        Finally, stable topics can be retreived using
-        get_topics().
+        Finally, stable topics can be retrieved using get_topics().

         Parameters
         ----------
         eps : float
-            epsilon for the CBDBSCAN algorithm, having the same
-            meaning as in classic DBSCAN clustering.
+            epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
             default: 0.1
         min_samples : number
-            The minimum number of sampels in the neighborhood
-            of a topic to be considered a core in CBDBSCAN.
+            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
             default: int(self.num_models / 2)
         min_cores : number
-            how many cores a cluster has to have,
-            to be treated as stable topic. That means, how many topics
-            that look similar have to be present, so that the average
-            topic in those is used as stable topic.
+            how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
+            that look similar have to be present, so that the average topic in those is used as stable topic.
             default: min(3, max(1, int(self.num_models /4 +1)))
         """
-        # if new models were added to the ensemble, the distance
-        # matrix needs to be generated again
+        # if new models were added to the ensemble, the distance matrix needs to be generated again
         if self.asymmetric_distance_matrix_outdated:
             logger.info("asymmetric distance matrix is outdated due to add_model")
             self.generate_asymmetric_distance_matrix()

From 7577aca98534435751bd01926ba75e19f66ec50f Mon Sep 17 00:00:00 2001
From: Alex Loosley
Date: Tue, 25 Jun 2019 21:26:38 +0200
Subject: [PATCH 028/166] sphinx update

---
 gensim/models/ensemblelda.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index f4453e4b1b..bbac0817b2 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -142,8 +142,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which
         is not multiprocessed. Set to > 1 to enable multiprocessing.
     **gensim_kw_args
-        All the parameters for the gensim model, that is used for each model in the ensemble, can be provided as
-        additional keyword arguments. https://radimrehurek.com/gensim/models/ldamodel.html
+        Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble.
     """

From 301feacacae294691e9acafc22481d450c4c64c8 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 21:31:09 +0200
Subject: [PATCH 029/166] fixed urls to sphinx notation

---
 gensim/models/ensemblelda.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index f4453e4b1b..11f72e55bc 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -111,6 +111,11 @@ def __init__(self, topic_model_kind="lda", num_models=3,
     topic_model_kind : str, topic model, optional
         Examples: 'ldamulticore' (recommended), 'lda' (default),
+    ensemble_workers : number, optional
+        Spawns that many processes and distributes the models from the ensemble to those as evenly as possible.
+        num_models should be a multiple of ensemble_workers.
+
+        Setting it to 0 or 1 will both use the nonmultiprocessing version.
Default:1 gensim.models.ldamodel, gensim.models.ldamulticore num_models : number, optional How many LDA models to train in this ensemble. @@ -1069,22 +1074,22 @@ def has_gensim_representation(self): raise ValueError("use generate_gensim_representation() first") def __getitem__(self, i): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" + """see :py:class:`gensim.models.LdaModel`""" self.has_gensim_representation() return self.classic_model_representation[i] def inference(self, *posargs, **kwArgs): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" + """see :py:class:`gensim.models.LdaModel`""" self.has_gensim_representation() return self.classic_model_representation.inference(*posargs, **kwArgs) def log_perplexity(self, *posargs, **kwArgs): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" + """see :py:class:`gensim.models.LdaModel`""" self.has_gensim_representation() return self.classic_model_representation.log_perplexity(*posargs, **kwArgs) def print_topics(self, *posargs, **kwArgs): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" + """see :py:class:`gensim.models.LdaModel`""" self.has_gensim_representation() return self.classic_model_representation.print_topics(*posargs, **kwArgs) From 0f4a6b856c9a780a25e3910e95c8a2f95dc57687 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Tue, 25 Jun 2019 21:31:59 +0200 Subject: [PATCH 030/166] changed doc strings, number --> int + some sphinx --- gensim/models/ensemblelda.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index bbac0817b2..2dbc29fdc5 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -112,14 +112,14 @@ def __init__(self, topic_model_kind="lda", num_models=3, Examples: 'ldamulticore' (recommended), 'lda' (default), gensim.models.ldamodel, gensim.models.ldamulticore - num_models : number, optional + num_models : int, optional How many LDA models to train in this ensemble. Default: 3 - min_cores : number, optional + min_cores : int, optional Minimum cores a cluster of topics has to contain so that it is recognized as stable topic. epsilon : float, optional Defaults to 0.1. Epsilon for the cbdbscan clustering that generates the stable topics. - ensemble_workers : number, optional + ensemble_workers : int, optional Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. @@ -132,13 +132,13 @@ def __init__(self, topic_model_kind="lda", num_models=3, of a gensim model type can be added to this ensemble using the add_model function. If False, any topic term matrix can be supllied to add_model. - min_samples : number, optional - Required number of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. + min_samples : int, optional + Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. masking_method : {'mass', 'rank}, optional One of "mass" (default) or "rank" (faster). masking_threshold : float, optional Default: None, which uses 0.11 for masking_method "rank", and 0.95 for "mass". - distance_workers : number, optional + distance_workers : int, optional When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which is not multiprocessed. Set to > 1 to enable multiprocessing. 
**gensim_kw_args
@@ -337,7 +337,7 @@ def add_model(self, target, num_new_models=None):
         target : {see description}
             1. A single EnsembleLda object
             2. List of EnsembleLda objects
-            3. A single Gensim topic model
+            3. A single Gensim topic model (e.g. :py:class:`gensim.models.LdaModel`)
             4. List of Gensim topic models

         if memory_friendly_ttda is True, target can also be:
@@ -489,9 +489,9 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers):

         Parameters
         ----------
-        num_models : number
+        num_models : int
             how many models to train in the ensemble
-        ensemble_workers : number
+        ensemble_workers : int
             into how many processes to split the models will be set to max(workers, num_models), to avoid workers that
             are supposed to train 0 models.
@@ -578,7 +578,7 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None):

         Parameters
         ----------
-        num_models : number
+        num_models : int
             number of models to be generated
         random_states : list
             list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensembles
             RandomState if None (default).
@@ -762,7 +762,7 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, st
             topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
         threshold : float, optional
             threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected method
-        start_index : number
+        start_index : int
             this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
             complete ttda in that case. start_index would be 0 if ttda1 == self.ttda. When self.ttda is split into two
             pieces, each 100 ttdas long, then start_index should be 100. default is 0
@@ -857,7 +857,7 @@ def generate_topic_clusters(self, eps=0.1, min_samples=None):
         ----------
         eps : float
             eps is 0.1 by default.
-        min_samples : number
+        min_samples : int
             min_samples is int(self.num_models / 2)
@@ -877,7 +877,7 @@ def generate_stable_topics(self, min_cores=None):

         Parameters
         ----------
-        min_cores : number
+        min_cores : int
             how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
             that look similar have to be present, so that the average topic in those is used as stable topic.
@@ -1034,10 +1034,10 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None):
         eps : float
             epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
             default: 0.1
-        min_samples : number
+        min_samples : int
             The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
             default: int(self.num_models / 2)
-        min_cores : number
+        min_cores : int
             how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
             that look similar have to be present, so that the average topic in those is used as stable topic.
             default: min(3, max(1, int(self.num_models /4 +1)))

From f915f50edc1074da35f9191e4adf3fc73278f763 Mon Sep 17 00:00:00 2001
From: Alex Loosley
Date: Tue, 25 Jun 2019 21:39:29 +0200
Subject: [PATCH 031/166] Removed hanging indents.
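As a side note on the "mass" masking and the threshold that the docstrings above refer to: the idea behind the asymmetric distance can be sketched standalone. This is an illustration of the concept, not the module's exact implementation; one topic-term distribution masks another before taking the cosine distance, so the result measures whether the first topic is contained in the second:

    import numpy as np
    from scipy.spatial.distance import cosine

    def mass_mask(ttd, threshold=0.95):
        # keep the top-ranked terms until their cumulative mass reaches `threshold`
        order = np.argsort(ttd)[::-1]
        cutoff = np.searchsorted(np.cumsum(ttd[order]), threshold) + 1
        mask = np.zeros(len(ttd), dtype=bool)
        mask[order[:cutoff]] = True
        return mask

    a = np.array([0.49, 0.30, 0.10, 0.06, 0.05])  # toy topic-term distributions
    b = np.array([0.05, 0.06, 0.10, 0.30, 0.49])
    mask = mass_mask(a)

    # force a's shape onto b: compare only the terms that matter for topic a;
    # a nearly empty masked b means no similarity at all (distance 1)
    distance = 1.0 if b[mask].sum() <= 0.05 else cosine(a[mask], b[mask])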
--- gensim/models/ensemblelda.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 2dbc29fdc5..f75665c191 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -161,9 +161,10 @@ def __init__(self, topic_model_kind="lda", num_models=3, logger.warning("no word id mapping provided; initializing from corpus, assuming identity") gensim_kw_args["id2word"] = utils.dict_from_corpus(gensim_kw_args["corpus"]) if gensim_kw_args["id2word"] is None and gensim_kw_args["corpus"] is None: - raise ValueError("at least one of corpus/id2word must be specified, to establish " - "input space dimensionality. Corpus should be provided using the " - "keyword argument corpus.") + raise ValueError( + "at least one of corpus/id2word must be specified, to establish " + "input space dimensionality. Corpus should be provided using the " + "keyword argument corpus.") if type(topic_model_kind) == str: self.topic_model_kind = { @@ -215,9 +216,10 @@ def __init__(self, topic_model_kind="lda", num_models=3, # singlecore self.generate_topic_models(num_models) - self.generate_asymmetric_distance_matrix(workers=self.distance_workers, - threshold=masking_threshold, - method=masking_method) + self.generate_asymmetric_distance_matrix( + workers=self.distance_workers, + threshold=masking_threshold, + method=masking_method) self.generate_topic_clusters(epsilon, min_samples) self.generate_stable_topics(min_cores) @@ -680,11 +682,8 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # singlecore: if workers is not None and workers <= 1: - self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, - ttda2=self.ttda, - threshold=threshold, - start_index=0, - method=method) + self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk( + ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method) return self.asymmetric_distance_matrix # multicore: From a3161cd088c287ca56fb7271f5fcaab88a31d239 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 25 Jun 2019 21:45:54 +0200 Subject: [PATCH 032/166] improved topic_model_kind type checking --- gensim/models/ensemblelda.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 2a307737ac..95d169a9ba 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -168,15 +168,19 @@ def __init__(self, topic_model_kind="lda", num_models=3, if gensim_kw_args["id2word"] is None and gensim_kw_args["corpus"] is None: raise ValueError("at least one of corpus/id2word must be specified, to establish " "input space dimensionality. 
Corpus should be provided using the "
-                "keyword argument corpus.")
+                "`corpus` keyword argument.")

-        if type(topic_model_kind) == str:
-            self.topic_model_kind = {
+        if isinstance(topic_model_kind, ldamodel.LdaModel):
+            self.topic_model_kind = topic_model_kind
+        else:
+            kinds = {
                 "lda": ldamodel.LdaModel,
                 "ldamulticore": ldamulticore.LdaMulticore
-            }[topic_model_kind]
-        else:
-            self.topic_model_kind = topic_model_kind
+            }
+            if topic_model_kind not in kinds:
+                raise ValueError(
+                    "topic_model_kind should be one of 'lda', 'ldamulticore' or a model inheriting from LdaModel")
+            self.topic_model_kind = kinds[topic_model_kind]

From 52c239bace43fbe246f9c307739c1dd8771b8598 Mon Sep 17 00:00:00 2001
From: Alex Loosley
Date: Tue, 25 Jun 2019 22:06:04 +0200
Subject: [PATCH 033/166] Sphinx and docstring updates.

---
 gensim/models/ensemblelda.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index f75665c191..a46542a305 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -232,21 +232,19 @@ def convert_to_memory_friendly(self):
         self.memory_friendly_ttda = True

     def generate_gensim_representation(self):
-        """creates a gensim-model from the stable topics
+        """Creates a gensim model from the stable topics.

-        The prior for the gensim model, eta, Can be anything, (a parameter in object constructor) and won't influence
-        get_topics(). Note that different eta means different log_perplexity (the default is the same as the gensim
-        default eta, which is for example [0.5, 0.5, 0.5]).
+        The returned representation is a Gensim LdaModel (:py:class:`gensim.models.LdaModel`) that has been
+        instantiated with an A-priori belief on word probability, eta, that represents the topic-term distributions of
+        any stable topics that were found by clustering over the ensemble of topic distributions.

-        Note, that when the clustering of the topics was changed using add_model, etc., and when no topics were
-        detected because of that, the old classic gensim model will stay in place.
+        When no stable topics have been detected, None is returned.
 Returns
         -------
-        LdaModel
+        :py:class:`gensim.models.LdaModel`
             A Gensim LDA Model classic_model_representation for which:
             classic_model_representation.get_topics() == self.get_topics()
-
         """
         logger.info("Generating classic gensim model representation based on results from the ensemble")
@@ -1179,9 +1177,10 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors
                     ordered_min_similarity.remove(neighbor)
                     # try to extend the cluster into the direction
                     # of the neighbor
-                    scan_topic(neighbor, parent_id=topic_index,
-                               current_label=current_label,
-                               parent_neighbors=neighbors)
+                    scan_topic(
+                        neighbor, parent_id=topic_index,
+                        current_label=current_label,
+                        parent_neighbors=neighbors)
                     results[neighbor]["parent_ids"].add(topic_index)
                     results[neighbor]["parent_labels"].add(current_label)

From ffc8e10fe8059fbec8c54fed7f0e6323acbc0e68 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:06:25 +0200
Subject: [PATCH 034/166] review stuff

---
 gensim/models/ensemblelda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 7c22739e18..eb1b0eba6c 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -183,7 +183,6 @@ def __init__(self, topic_model_kind="lda", num_models=3,
                 "topic_model_kind should be one of 'lda', 'ldamulticore' or a model inheriting from LdaModel")
             self.topic_model_kind = kinds[topic_model_kind]

-        # Store some of the parameters
         self.num_models = num_models
         self.gensim_kw_args = gensim_kw_args
@@ -277,6 +276,7 @@ def generate_gensim_representation(self):
         if num_stable_topics == 0:
             logger.error("The model did not detect any stable topic. You can try to adjust epsilon: "
                          "recluster(eps=...)")
+            self.classic_model_representation = None
             return

         # create a new gensim model

From 2d269a5f55c65bff69c3e38089d699ef32ce34f7 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:08:43 +0200
Subject: [PATCH 035/166] removed unnecessary comments

---
 gensim/models/ensemblelda.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index a10555855c..aae9897aab 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -136,7 +136,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         Defaults to True. When False, trained models are stored in a list in self.tms, and no models that are not
         of a gensim model type can be added to this ensemble using the add_model function.

-        If False, any topic term matrix can be supllied to add_model.
+        If False, any topic term matrix can be supplied to add_model.
     min_samples : int, optional
         Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering.
masking_method : {'mass', 'rank}, optional
@@ -186,7 +186,6 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         self.num_models = num_models
         self.gensim_kw_args = gensim_kw_args

-        # some settings affecting performance
         self.memory_friendly_ttda = memory_friendly_ttda
         self.distance_workers = distance_workers

From 0612c4a5b4002b67df8392d4dcb09bf7be4666a5 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:10:30 +0200
Subject: [PATCH 036/166] Update gensim/models/ensemblelda.py

Co-Authored-By: Michael Penkov
---
 gensim/models/ensemblelda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 1075b63041..3215ed876c 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -217,7 +217,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         # parameters, stop here and don't train.
         if num_models <= 0:
             return
-        if ("corpus" not in gensim_kw_args):
+        if "corpus" not in gensim_kw_args:
             return
         if gensim_kw_args["corpus"] is None:
             return

From 24b34b1116baa5d7272072ee80521a113958659c Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:12:39 +0200
Subject: [PATCH 037/166] removed parentheses

---
 gensim/models/ensemblelda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index aae9897aab..b660d6601f 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -205,7 +205,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         # parameters, stop here and don't train.
         if num_models <= 0:
             return
-        if ("corpus" not in gensim_kw_args):
+        if "corpus" not in gensim_kw_args:
             return
         if gensim_kw_args["corpus"] is None:
             return

From d96e1a139d762d6d19ef30581e248294bd137e24 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:20:01 +0200
Subject: [PATCH 038/166] review

---
 gensim/models/ensemblelda.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index b660d6601f..9d65e80d73 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -205,19 +205,16 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         # parameters, stop here and don't train.
if num_models <= 0: return - if "corpus" not in gensim_kw_args: - return - if gensim_kw_args["corpus"] is None: + if gensim_kw_args.get("corpus") is None: return - if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0: + if gensim_kw_args.get("iterations", 0) <= 0: return - if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: + if gensim_kw_args.get("passes", 0) <= 0: return logger.info("Generating {} topic models...".format(num_models)) - # training + if ensemble_workers > 1: - # multiprocessed self.generate_topic_models_multiproc(num_models, ensemble_workers) else: # singlecore From a1e3d951ac8b6bad71cc93a0cf2261eb2cd51d47 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 25 Jun 2019 22:27:51 +0200 Subject: [PATCH 039/166] refactor private, hanging indent --- gensim/models/ensemblelda.py | 66 ++++++++++++++++----------------- gensim/test/test_ensemblelda.py | 12 +++--- 2 files changed, 38 insertions(+), 40 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 9d65e80d73..3bbc73dd5a 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -100,7 +100,7 @@ class EnsembleLda(): """ def __init__(self, topic_model_kind="lda", num_models=3, - min_cores=None, # default value from generate_stable_topics() + min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, min_samples=None, masking_method="mass", masking_threshold=None, distance_workers=1, random_state=None, **gensim_kw_args): @@ -215,17 +215,17 @@ def __init__(self, topic_model_kind="lda", num_models=3, logger.info("Generating {} topic models...".format(num_models)) if ensemble_workers > 1: - self.generate_topic_models_multiproc(num_models, ensemble_workers) + self._generate_topic_models_multiproc(num_models, ensemble_workers) else: # singlecore - self.generate_topic_models(num_models) + self._generate_topic_models(num_models) - self.generate_asymmetric_distance_matrix( + self._generate_asymmetric_distance_matrix( workers=self.distance_workers, threshold=masking_threshold, method=masking_method) - self.generate_topic_clusters(epsilon, min_samples) - self.generate_stable_topics(min_cores) + self._generate_topic_clusters(epsilon, min_samples) + self._generate_stable_topics(min_cores) # create model that can provide the usual gensim api to the stable topics from the ensemble self.generate_gensim_representation() @@ -327,12 +327,12 @@ def add_model(self, target, num_new_models=None): Make sure that all the models use the exact same dictionary/idword mapping. In order to generate new stable topics afterwards, use - self.generate_asymmetric_distance_matrix() + self._generate_asymmetric_distance_matrix() self.recluster() The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter of the ensemble, that means the number of classic models in the ensemble that generated the ttda. This is - important, because that information is used to estimate "min_samples" for generate_topic_clusters. + important, because that information is used to estimate "min_samples" for _generate_topic_clusters. If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other models, you can get it from: self.id2word. 
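As a usage sketch for the add_model/recluster workflow documented above (toy data; assumes the EnsembleLda version from this branch is installed):

    from gensim.corpora import Dictionary
    from gensim.models import EnsembleLda

    docs = [['cat', 'dog', 'bird'], ['dog', 'wolf', 'cat'],
            ['stock', 'market', 'price'], ['market', 'trade', 'price']] * 25
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    elda = EnsembleLda(corpus=corpus, id2word=dictionary,
                       num_topics=2, num_models=4, passes=10)
    other = EnsembleLda(corpus=corpus, id2word=dictionary,
                        num_topics=2, num_models=4, passes=10)

    # pool both ensembles' topic-term distributions, then redo the distance
    # matrix, the CBDBSCAN clustering and the stable-topic extraction
    elda.add_model(other)
    elda.recluster()
    stable_topics = elda.get_topics()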
@@ -488,7 +488,7 @@ def load(fname): return eLDA - def generate_topic_models_multiproc(self, num_models, ensemble_workers): + def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Will make the ensemble multiprocess, which results in a speedup on multicore machines. Results from the processes will be piped to the parent and concatenated. @@ -538,7 +538,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): # get the chunk from the random states that is meant to be for those models random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] - p = Process(target=self.generate_topic_models, + p = Process(target=self._generate_topic_models, args=(num_subprocess_models, random_states_for_worker, childConn)) processes += [p] @@ -566,7 +566,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): for p in pipes: answer = p[0].recv() # [0], because that is the parentConn p[0].close() - # this does basically the same as the generate_topic_models function (concatenate all the ttdas): + # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): if not self.memory_friendly_ttda: self.tms += answer ttda = np.concatenate([model.get_topics() for model in answer]) @@ -578,7 +578,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): for p in processes: p.terminate() - def generate_topic_models(self, num_models, random_states=None, pipe=None): + def _generate_topic_models(self, num_models, random_states=None, pipe=None): """Will train the topic models, that form the ensemble. Parameters @@ -627,7 +627,7 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None): if pipe is not None: # send the ttda that is in the child/workers version of the memory into the pipe - # available, after generate_topic_models has been called in the worker + # available, after _generate_topic_models has been called in the worker if self.memory_friendly_ttda: # remember that this code is inside the worker processes memory, # so self.ttda is the ttda of only a chunk of models @@ -637,7 +637,7 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None): pipe.close() - def asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): + def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): """ worker, that computes the distance to all other nodes from a chunk of nodes. 
https://stackoverflow.com/a/1743350 """ @@ -645,14 +645,12 @@ def asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] - distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, - threshold=threshold, - start_index=ttdas_sent, - method=method) + distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( + ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method) pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory pipe.close() - def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): + def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): """Makes the pairwise distance matrix for all the ttdas from the ensemble. Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering. @@ -685,7 +683,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # singlecore: if workers is not None and workers <= 1: - self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk( + self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method) return self.asymmetric_distance_matrix @@ -715,7 +713,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= else: n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i)) - p = Process(target=self.asymmetric_distance_matrix_worker, + p = Process(target=self._asymmetric_distance_matrix_worker, args=(i, ttdas_sent, n_ttdas, childConn, threshold, method)) ttdas_sent += n_ttdas @@ -742,7 +740,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= for p in pipes: answer = p[0].recv() # [0], because that is the parentConn p[0].close() # child conn will be closed from inside the worker - # this does basically the same as the generate_topic_models function (concatenate all the ttdas): + # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): distances += [answer[1]] # end all processes @@ -753,7 +751,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= return self.asymmetric_distance_matrix - def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): + def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): """Iterates over ttda1 and calculates the distance to every ttda in tttda2. @@ -849,11 +847,11 @@ def rank_masking(a): return distances - def generate_topic_clusters(self, eps=0.1, min_samples=None): + def _generate_topic_clusters(self, eps=0.1, min_samples=None): """Runs the DBSCAN algorithm on all the detected topics from the models in the ensemble and labels them with label-indices. - The final approval and generation of stable topics is done in generate_stable_topics(). + The final approval and generation of stable topics is done in _generate_stable_topics(). 
Parameters ---------- @@ -871,7 +869,7 @@ def generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) - def generate_stable_topics(self, min_cores=None): + def _generate_stable_topics(self, min_cores=None): """generates stable topics out of the clusters. This function is the last step that has to be done in the ensemble. @@ -1048,10 +1046,10 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # if new models were added to the ensemble, the distance matrix needs to be generated again if self.asymmetric_distance_matrix_outdated: logger.info("asymmetric distance matrix is outdated due to add_model") - self.generate_asymmetric_distance_matrix() + self._generate_asymmetric_distance_matrix() - self.generate_topic_clusters(eps, min_samples) - self.generate_stable_topics(min_cores) + self._generate_topic_clusters(eps, min_samples) + self._generate_stable_topics(min_cores) self.generate_gensim_representation() # GENSIM API @@ -1060,7 +1058,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): def get_topics(self): return self.stable_topics - def has_gensim_representation(self): + def _has_gensim_representation(self): """checks if stable topics area vailable and if the internal gensim representation exists. If not, raises errors""" if self.classic_model_representation is None: @@ -1071,22 +1069,22 @@ def has_gensim_representation(self): def __getitem__(self, i): """see :py:class:`gensim.models.LdaModel`""" - self.has_gensim_representation() + self._has_gensim_representation() return self.classic_model_representation[i] def inference(self, *posargs, **kwArgs): """see :py:class:`gensim.models.LdaModel`""" - self.has_gensim_representation() + self._has_gensim_representation() return self.classic_model_representation.inference(*posargs, **kwArgs) def log_perplexity(self, *posargs, **kwArgs): """see :py:class:`gensim.models.LdaModel`""" - self.has_gensim_representation() + self._has_gensim_representation() return self.classic_model_representation.log_perplexity(*posargs, **kwArgs) def print_topics(self, *posargs, **kwArgs): """see :py:class:`gensim.models.LdaModel`""" - self.has_gensim_representation() + self._has_gensim_representation() return self.classic_model_representation.print_topics(*posargs, **kwArgs) @property diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 3f1ea8b0b1..5de7b324c7 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -318,21 +318,21 @@ def test_add_and_recluster(self): self.check_ttda(new_eLDA_mu) # 2. distance matrix - new_eLDA.generate_asymmetric_distance_matrix() - new_eLDA_mu.generate_asymmetric_distance_matrix() + new_eLDA._generate_asymmetric_distance_matrix() + new_eLDA_mu._generate_asymmetric_distance_matrix() np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, new_eLDA_mu.asymmetric_distance_matrix) # 3. CBDBSCAN results - new_eLDA.generate_topic_clusters() - new_eLDA_mu.generate_topic_clusters() + new_eLDA._generate_topic_clusters() + new_eLDA_mu._generate_topic_clusters() a = new_eLDA.cluster_model.results b = new_eLDA_mu.cluster_model.results self.assert_cluster_results_equal(a, b) # 4. 
finally, the stable topics - new_eLDA.generate_stable_topics() - new_eLDA_mu.generate_stable_topics() + new_eLDA._generate_stable_topics() + new_eLDA_mu._generate_stable_topics() np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics()) From 5d48c8d9593e84e2e4f966cb434726c7dd678117 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 25 Jun 2019 22:40:32 +0200 Subject: [PATCH 040/166] typo --- gensim/models/ensemblelda.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 3bbc73dd5a..cb78f28153 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -681,14 +681,13 @@ def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) - # singlecore: + # singlecore if workers is not None and workers <= 1: self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method) return self.asymmetric_distance_matrix - # multicore: - + # else, if workers > 1 use multiprocessing # best performance on 2-core machine: 2 workers if workers is None: workers = os.cpu_count() @@ -822,7 +821,7 @@ def rank_masking(a): distance = 0 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 - # (not 2 because that correspondsto negative values. The maximum distance is 1 here) + # (not 2 because that corresponds to negative values. The maximum distance is 1 here) # don't normalize b_masked, otherwise the following threshold will never work: if b_masked.sum() <= 0.05: distance = 1 From e556e8cac535de9b3f7a407bed681ef60c354daa Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Tue, 25 Jun 2019 22:42:18 +0200 Subject: [PATCH 041/166] Clarifications to ttda in docstrings and in method docstrings. --- gensim/models/ensemblelda.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a10555855c..2f42d992ed 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -130,8 +130,8 @@ def __init__(self, topic_model_kind="lda", num_models=3, Setting it to 0 or 1 will both use the nonmultiprocessing version. Default:1 memory_friendly_ttda : boolean, optional - If True, the models in the ensemble are deleted after training and only the total topic word distribution - is kept to save memory. + If True, the models in the ensemble are deleted after training and only a concatenation of each model's + topic term distribution (called ttda) is kept to save memory. Defaults to True. When False, trained models are stored in a list in self.tms, and no models that are not of a gensim model type can be added to this ensemble using the add_model function. @@ -758,8 +758,10 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= return self.asymmetric_distance_matrix def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): - """Iterates over ttda1 and calculates the - distance to every ttda in tttda2. + """Internal method calculating an (asymmetric) distance from each topic term distribution in ttda1 to each topic + term distribution in ttda2 and returns a matrix of distances, size len(ttda1) by len(ttda2). 
+ + This is an internal method and should not be used for two general ttda arrays. Parameters ---------- @@ -774,12 +776,13 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, st pieces, each 100 ttdas long, then start_index should be be 100. default is 0 method : {'mass', 'rank}, optional method can be "mass" for the original masking method or "rank" for a faster masking method that selects - by rank of largest elements in the topic word distribution, to determine which tokens are relevant for the + by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the topic. Returns ------- 2D Numpy.numpy.ndarray of floats + Asymmetric distance matrix of size len(ttda1) by len(ttda2). """ if method not in ["mass", "rank"]: @@ -806,39 +809,38 @@ def rank_masking(a): distances = np.ndarray((len(ttda1), len(ttda2))) # now iterate over each topic - for i in range(len(ttda1)): + for ttd1_idx, ttd1 in enumerate(ttda1): - # create mask from a, that removes noise from a and keeps the largest terms - a = ttda1[i] - mask = create_mask(a) - a_masked = a[mask] + # create mask from ttd1 that removes noise from a and keeps the largest terms + mask = create_mask(ttd1) + ttd1_masked = ttd1[mask] avg_mask_size += mask.sum() # now look at every possible pair for topic a: - for j in range(len(ttda2)): + for ttd2_idx, ttd2 in enumerate(ttda2): # distance to itself is 0 - if i + start_index == j: - distances[i][j] = 0 + if ttd1_idx + start_index == ttd2_idx: + distances[ttd1_idx][ttd2_idx] = 0 continue # now mask b based on a, which will force the shape of a onto b - b_masked = ttda2[j][mask] + ttd2_masked = ttd2[mask] distance = 0 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 # (not 2 because that correspondsto negative values. The maximum distance is 1 here) # don't normalize b_masked, otherwise the following threshold will never work: - if b_masked.sum() <= 0.05: + if ttd2_masked.sum() <= 0.05: distance = 1 else: # if there is indeed some non-noise stuff in the masked topic b, look at # how similar the two topics are. note that normalizing is not needed for cosine distance, # as it only looks at the angle between vectors - distance = cosine(a_masked, b_masked) + distance = cosine(ttd1_masked, ttd2_masked) - distances[i][j] = distance + distances[ttd1_idx][ttd2_idx] = distance avg_mask_size = avg_mask_size / ttda1.shape[0] / ttda1.shape[1] percent = round(100 * avg_mask_size, 1) From dc566f3a68c40d9057468ae6efa2816a5c36ea76 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 29 Jul 2019 21:24:08 +0200 Subject: [PATCH 042/166] docstrings, masks explained and mask warning removed --- gensim/models/ensemblelda.py | 48 +++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1719ebc4a1..133da0b9dc 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -4,9 +4,10 @@ # Authors: Tobias B , Alex Loosley , # Stephan Sahm , Alex Salles , Data Reply Munich -"""Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. They are clustered -using DBSCAN, for which nearby topic mixtures - which are mixtures of two or more true topics and therefore undesired - -are being used to support topics in becoming cores, but not vice versa. 
+"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting +reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user +does not need to know the exact number of topics the topic model should extract ahead of time. For more details +read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). Usage examples -------------- @@ -93,10 +94,10 @@ class EnsembleLda(): - """Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. - They are clustered using a variation of DBSCAN, for which nearby topic mixtures - which are mixtures - of two or more true topics and therefore undesired - are being used to support topics in becoming - cores, but not vice versa. + """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting + reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user + does not need to know the exact number of topics the topic model should extract ahead of time. For more details + read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). """ def __init__(self, topic_model_kind="lda", num_models=3, @@ -123,7 +124,7 @@ def __init__(self, topic_model_kind="lda", num_models=3, min_cores : int, optional Minimum cores a cluster of topics has to contain so that it is recognized as stable topic. epsilon : float, optional - Defaults to 0.1. Epsilon for the cbdbscan clustering that generates the stable topics. + Defaults to 0.1. Epsilon for the CBDBSCAN clustering that generates the stable topics. ensemble_workers : int, optional Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. @@ -139,10 +140,21 @@ def __init__(self, topic_model_kind="lda", num_models=3, If False, any topic term matrix can be suplied to add_model. min_samples : int, optional Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. - masking_method : {'mass', 'rank}, optional - One of "mass" (default) or "rank" (faster). + masking_method : {'mass', 'rank'}, optional + Choose one of "mass" (default) or "rank" (percentile, faster). + + For clustering, distances between topic-term distributions are asymmetric. In particular, the distance + (technically a divergence) from distribution A to B is more of a measure of if A is contained in B. At a + high level, this involves using distribution A to mask distribution B and then calculating the cosine + distance between the two. The masking can be done in two ways: + mass: forms mask by taking the top ranked terms until their cumulative mass reaches the + `masking_threshold` + rank: forms mask by taking the top ranked terms (by mass) until the `masking_threshold` is reached. + For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. masking_threshold : float, optional - Default: None, which uses 0.11 for masking_method "rank", and 0.95 for "mass". + Default: None, which uses 0.95 for "mass", and 0.11 for masking_method "rank". In general, too small a mask + threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy distance + calculations. Defaults are often a good sweet spot for this hyperparameter. 
distance_workers : int, optional When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which is not multiprocessed. Set to > 1 to enable multiprocessing. @@ -639,7 +651,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): pipe.close() def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): - """ worker, that computes the distance to all other nodes + """ Worker that computes the distance to all other nodes from a chunk of nodes. https://stackoverflow.com/a/1743350 """ @@ -836,16 +848,8 @@ def rank_masking(a): distances[ttd1_idx][ttd2_idx] = distance - avg_mask_size = avg_mask_size / ttda1.shape[0] / ttda1.shape[1] - percent = round(100 * avg_mask_size, 1) - if avg_mask_size > 0.75: - # if the masks covered more than 75% of tokens on average, - # print a warning. info otherwise. - # The default mass setting of 0.95 makes uniform true masks on the opinosis corpus. - logger.warning('The given threshold of {} covered on average ' - '{}% of tokens, which might be too much'.format(threshold, percent)) - else: - logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) + percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) + logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) return distances From 9d57533d0210fc2b23fc1c8254ed942b7a4d186d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 29 Jul 2019 21:45:35 +0200 Subject: [PATCH 043/166] created internal variable for cosine distance calculations --- gensim/models/ensemblelda.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 133da0b9dc..d48ceac61e 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -92,7 +92,6 @@ logger = logging.getLogger(__name__) - class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user @@ -162,10 +161,15 @@ def __init__(self, topic_model_kind="lda", num_models=3, Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble. """ + # INTERNAL PARAMETERS # Set random state # nps max random state of 2**32 - 1 is too large for windows: self._MAX_RANDOM_STATE = np.iinfo(np.int32).max + # _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by skipping + # distance calculations for highly masked topic-term distributions + self._COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 + if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None if "corpus" not in gensim_kw_args: @@ -834,16 +838,11 @@ def rank_masking(a): # now mask b based on a, which will force the shape of a onto b ttd2_masked = ttd2[mask] - distance = 0 - # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 - # (not 2 because that corresponds to negative values. The maximum distance is 1 here) - # don't normalize b_masked, otherwise the following threshold will never work: - if ttd2_masked.sum() <= 0.05: + # Smart distance calculation avoids calculating cosine distance for highly masked topic-term + # distributions that will have distance values near 1. 
+ if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD: distance = 1 else: - # if there is indeed some non-noise stuff in the masked topic b, look at - # how similar the two topics are. note that normalizing is not needed for cosine distance, - # as it only looks at the angle between vectors distance = cosine(ttd1_masked, ttd2_masked) distances[ttd1_idx][ttd2_idx] = distance From d27cd593c18631f37e89a3786ad08264a56c67c5 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 20:41:25 +0200 Subject: [PATCH 044/166] cbdbscan docstring --- gensim/models/ensemblelda.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d48ceac61e..41f51e3dbb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -74,11 +74,13 @@ References ---------- -.. [1] REHUREK, Radim and Sojka, PETR, 2010, Software framework for topic - modelling with large corpora. In : THE LREC 2010 WORKSHOP ON NEW - CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. - 2010. p. 45-50. Available from: - http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 +.. [1] REHUREK, Radim and Sojka, PETR, 2010, Software framework for topic modelling with large corpora. In : THE LREC + 2010 WORKSHOP ON NEW CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. 2010. p. 45-50. + Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 + +.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation: bachelor thesis. + Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ + TODO country? thi? loosley, salles, stephan? """ @@ -95,8 +97,7 @@ class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user - does not need to know the exact number of topics the topic model should extract ahead of time. For more details - read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). + does not need to know the exact number of topics the topic model should extract ahead of time [2]. """ def __init__(self, topic_model_kind="lda", num_models=3, @@ -1098,14 +1099,15 @@ def id2word(self): class CBDBSCAN(): + """A Variation of the DBSCAN algorithm which limits how much a cluster is allowed to spread out and which works + on an asymmetric distance matrix. + """ def __init__(self, eps=0.1, min_samples=4): - self.eps = eps self.min_samples = min_samples def fit(self, amatrix): - self.next_label = 0 results = [{ From 42aa7ad6695258874bb3e051dad598ef9dd48bbb Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 20:49:05 +0200 Subject: [PATCH 045/166] moved validate_core outside --- gensim/models/ensemblelda.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 41f51e3dbb..c65f94bbdb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -875,6 +875,13 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) + def _validate_core(core): + """Core is a dict of {is_core, valid_parents, labels} among others. + If not a core returns False. 
Only cores with the valid_parents as + its own label can be a valid core. Returns True if that is the case, + False otherwise""" + return core["is_core"] and (core["valid_parents"] == {core["label"]}) + def _generate_stable_topics(self, min_cores=None): """generates stable topics out of the clusters. This function is the last step that has to be done in the ensemble. @@ -999,19 +1006,8 @@ def remove_from_all_sets(label): for topic in results: topic["valid_parents"] = {label for label in topic["parent_labels"] if label in valid_labels} - def validate_core(core): - """Core is a dict of {is_core, valid_parents, labels} among others. - If not a core returns False. Only cores with the valid_parents as - its own label can be a valid core. Returns True if that is the case, - False otherwise""" - if not core["is_core"]: - return False - else: - ret = core["valid_parents"] == {core["label"]} - return ret - # keeping only VALID cores - valid_core_mask = np.vectorize(validate_core)(results) + valid_core_mask = np.vectorize(self._validate_core)(results) valid_topics = self.ttda[valid_core_mask] topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask] unique_labels = np.unique(topic_labels) From eaf62d6ced40bae39a570ff32f797b48a3ee267e Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Wed, 28 Aug 2019 21:15:27 +0200 Subject: [PATCH 046/166] added citation note --- gensim/models/ensemblelda.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index c65f94bbdb..3a0f31e814 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -82,6 +82,13 @@ Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ TODO country? thi? loosley, salles, stephan? +Citation +-------- +At the moment, there is no paper associated to ensemble LDA but we are working on publicly releasing Tobi Brigl's +bachelor thesis on the topic. In the meantime, please include a mention of us (Brigl, T.; Salles, A.; Loosley, A) and +a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if you use +this work in a published or open-source project. + """ import logging From 2e2eb1667347a8fc9b14ceec632e9b91a7332158 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 22:15:17 +0200 Subject: [PATCH 047/166] moved more stuff outside of _generate_stable_topics --- gensim/models/ensemblelda.py | 141 ++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 51 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index c65f94bbdb..1a39ebe01f 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Authors: Tobias B , Alex Loosley , +# Authors: Tobias Brigl , Alex Loosley , # Stephan Sahm , Alex Salles , Data Reply Munich """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting @@ -875,45 +875,34 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) - def _validate_core(core): - """Core is a dict of {is_core, valid_parents, labels} among others. - If not a core returns False. Only cores with the valid_parents as - its own label can be a valid core. 
Returns True if that is the case, - False otherwise""" - return core["is_core"] and (core["valid_parents"] == {core["label"]}) - - def _generate_stable_topics(self, min_cores=None): - """generates stable topics out of the clusters. This function is the last step that has to be done in the - ensemble. - - Stable topics can be retreived afterwards using get_topics(). + def _validate_core(self, core): + """If core is not a core returns False. Only cores with the valid_parents as its own label can be a valid core. + Returns True if that is the case, False otherwise. Parameters ---------- - min_cores : int - how many cores a cluster has to have, to be treated as stable topic. That means, how many topics - that look similar have to be present, so that the average topic in those is used as stable topic. - - defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) - + core : {'is_core', 'valid_parents', 'labels'} + eps is 0.1 by default. """ + return core["is_core"] and (core["valid_parents"] == {core["label"]}) - logger.info("Generating stable topics") - - # min_cores being 0 makes no sense. there has to be a core for a cluster - # or there is no cluster - if min_cores == 0: - min_cores = 1 - - if min_cores is None: - # min_cores is a number between 1 and 3, depending on the number of models - min_cores = min(3, max(1, int(self.num_models / 4 + 1))) + def _group_by_labels(self, results): + """Groups all the learned cores by their label, which was assigned in the cluster_model - results = self.cluster_model.results + Parameters + ---------- + results : {list of {'is_core', 'parent_labels', 'num_samples', 'label'}} + After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results + member, which can be used as the argument to this function. It's a list of infos gathered during + the clustering step and each element in the list corresponds to a single topic. - # first, group all the learned cores based on the label, - # which was assigned in the cluster_model. The result is a - # dict of {group: [topic, ...]} + Returns + ------- + dict of (int, list of {'is_core', 'amount_parent_labels', 'parent_labels'}) + A mapping of the label to a list of topics that belong to that particular label. Also adds + a new member to each topic called amount_parent_labels, which is the number of parent_labels of that + topic. + """ grouped_by_labels = {} for topic in results: if topic["is_core"]: @@ -926,22 +915,38 @@ def _generate_stable_topics(self, min_cores=None): if label not in grouped_by_labels: grouped_by_labels[label] = [] grouped_by_labels[label].append(topic) + return grouped_by_labels + + def _aggregate_topics(self, grouped_by_labels): + """Aggregates the topics to a list of clusters - # then aggregate their amount_parent_labels by maxing and parent_labels by concatenating - # the result is sorted by amount_parent_labels and stored in sorted_clusters + Parameters + ---------- + grouped_by_labels : dict of (int, list of {'is_core', 'amount_parent_labels', 'parent_labels'}) + The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the + label. + + Returns + ------- + list of {'amount_parent_labels', 'parent_labels', 'label'} + amount_parent_labels is the max number of parent labels among each topic of a given cluster. label refers + to the label identifier of the cluster. parent_labels is a concatenated list of the parent_labels sets of + each topic. Its sorted by amount_parent_labels in descending order. 
There is one single element for each + cluster. + """ sorted_clusters = [] + for label, group in grouped_by_labels.items(): amount_parent_labels = 0 parent_labels = [] # will be a list of sets + for topic in group: amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels) parent_labels.append(topic["parent_labels"]) - # - here, nan means "not yet evaluated" - # - removing empty sets from parent_labels + sorted_clusters.append({ "amount_parent_labels": amount_parent_labels, "parent_labels": [x for x in parent_labels if len(x) > 0], - "is_valid": np.nan, "label": label }) @@ -952,31 +957,67 @@ def _generate_stable_topics(self, min_cores=None): cluster["label"] # makes sorting deterministic ), reverse=True) - def remove_from_all_sets(label): - """removes a label from every set in "parent_labels" for - each core in sorted_clusters.""" - for a in sorted_clusters: - if label in a["parent_labels"]: - a["parent_labels"].remove(label) + return sorted_clusters + + def _remove_from_all_sets(self, label, clusters): + """removes a label from every set in "parent_labels" for + each core in clusters""" + for a in clusters: + if label in a["parent_labels"]: + a["parent_labels"].remove(label) + + def _generate_stable_topics(self, min_cores=None): + """generates stable topics out of the clusters. This function is the last step that has to be done in the + ensemble. + + Stable topics can be retreived afterwards using get_topics(). + + Parameters + ---------- + min_cores : int + how many cores a cluster has to have, to be treated as stable topic. That means, how many topics + that look similar have to be present, so that the average topic in those is used as stable topic. + + defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) + + """ + logger.info("Generating stable topics") + + # min_cores being 0 makes no sense. there has to be a core for a cluster + # or there is no cluster + if min_cores == 0: + min_cores = 1 + + if min_cores is None: + # min_cores is a number between 1 and 3, depending on the number of models + min_cores = min(3, max(1, int(self.num_models / 4 + 1))) + + results = self.cluster_model.results + + grouped_by_labels = self._group_by_labels(results) + + sorted_clusters = self._aggregate_topics(grouped_by_labels) # APPLYING THE RULES 1 to 3 # iterate over the cluster labels, see which clusters/labels # are valid to cause the creation of a stable topic for i, cluster in enumerate(sorted_clusters): + # here, nan means "not yet evaluated" + cluster["is_valid"] = np.nan + label = cluster["label"] # label is iterating over 0, 1, 3, ... # 1. rule - remove if the cores in the cluster have no parents if cluster["amount_parent_labels"] == 0: - # label is NOT VALID cluster["is_valid"] = False - remove_from_all_sets(label) + self._remove_from_all_sets(label, sorted_clusters) # 2. rule - remove if it has less than min_cores as parents # (parents are always also cores) if len(cluster["parent_labels"]) < min_cores: cluster["is_valid"] = False - remove_from_all_sets(label) + self._remove_from_all_sets(label, sorted_clusters) # 3. 
checking for "easy_valid"s # checks if the core has at least min_cores of cores with the only label as itself @@ -984,7 +1025,7 @@ def remove_from_all_sets(label): if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores: cluster["is_valid"] = True - # Reaplying the rule 3 + # Reapplying the rule 3 # this happens when we have a close relationship among 2 or more clusters for cluster in [cluster for cluster in sorted_clusters if np.isnan(cluster["is_valid"])]: @@ -1097,8 +1138,8 @@ def id2word(self): class CBDBSCAN(): """A Variation of the DBSCAN algorithm which limits how much a cluster is allowed to spread out and which works on an asymmetric distance matrix. - """ + """ def __init__(self, eps=0.1, min_samples=4): self.eps = eps self.min_samples = min_samples @@ -1126,7 +1167,6 @@ def fit(self, amatrix): num_topics = len(amatrix) def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): - # count how many neighbors # check which indices (arange 0 to num_topics) is closer than eps neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps] @@ -1178,7 +1218,6 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors results[topic_index]["label"] = current_label for neighbor in neighbors: - if results[neighbor]["label"] is None: ordered_min_similarity.remove(neighbor) # try to extend the cluster into the direction From 0a6c1f6162d42ac3dfb84c69ea2b59a8547cca33 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 22:16:36 +0200 Subject: [PATCH 048/166] typos --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 55ef8d3be0..20d0c22f12 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -977,7 +977,7 @@ def _generate_stable_topics(self, min_cores=None): """generates stable topics out of the clusters. This function is the last step that has to be done in the ensemble. - Stable topics can be retreived afterwards using get_topics(). + Stable topics can be retrieved afterwards using get_topics(). Parameters ---------- @@ -1077,7 +1077,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): Generates stable topics out of the clusters afterwards. - Finally, stable topics can be retreived using get_topics(). + Finally, stable topics can be retrieved using get_topics(). Parameters ---------- From 000298291139e9a549877e8018db411a80b1705b Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Wed, 28 Aug 2019 22:34:20 +0200 Subject: [PATCH 049/166] explained CBDBSCAN --- gensim/models/ensemblelda.py | 90 +++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 3a0f31e814..4727f3b2b0 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1031,6 +1031,9 @@ def remove_from_all_sets(label): self.sorted_clusters = sorted_clusters self.stable_topics = stable_topics + def _get_total_parent_labels(self, cluster): + return sum(map(lambda x: x == {label}, cluster["parent_labels"])) + def recluster(self, eps=0.1, min_samples=None, min_cores=None): """Runs the CBDBSCAN algorithm on all the detected topics from the children of the ensemble. @@ -1044,7 +1047,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering. 
             default: 0.1
         min_samples : int
-            The minimum number of sampels in the neighborhood of a topic to be considered a core in CBDBSCAN.
+            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
             default: int(self.num_models / 2)
         min_cores : int
             how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
@@ -1102,11 +1105,39 @@ def id2word(self):
 
 
 class CBDBSCAN():
-    """A Variation of the DBSCAN algorithm which limits how much a cluster is allowed to spread out and which works
-    on an asymmetric distance matrix.
+    """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). The algorithm works based on
+    DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a "nearby" point is, and the
+    minimum number of nearby points needed to label a candidate datapoint a core of a cluster.
+    (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).
+
+    The algorithm works as follows:
+    1. (A)symmetric distance matrix provided at fit-time.
+    For the sake of the example below, assume there are only three topics (amatrix contains distances with dim 3x3),
+    T_1, T_2, and T_3:
+    2. Start by scanning a candidate topic
+    (e.g. T_1)
+    3. Check which topics are nearby and call them neighbours
+    (e.g. assume T_2, and T_3 are nearby and become neighbours)
+    4. If there are more neighbours than `min_samples`, the candidate topic becomes a core candidate for a cluster
+    (e.g. if `min_samples`<=2 then T_1 becomes the first core of a cluster)
+    5. CheckBack (CB) Step: Check if at least 25% of other cluster cores are nearby to core candidate
+    (e.g. at this stage in the example, the cluster is empty, so checkback passes)
+    6. Recursively scan the next nearby topic (e.g. scan T_2) - repeat steps 2.-6.
+
+    The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters
+    for unreliable topics that are compositions of multiple reliable topics (something that often occurs in LDA
+    models and is one cause of unreliable topics).
+
+    Parameters
+    ----------
+    eps : float
+        epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
+        default: 0.1
+    min_samples : int
+        The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
""" - def __init__(self, eps=0.1, min_samples=4): + def __init__(self, eps, min_samples): self.eps = eps self.min_samples = min_samples @@ -1121,31 +1152,35 @@ def fit(self, amatrix): "label": None } for _ in range(len(amatrix))] - tmp_amatrix = amatrix.copy() + amatrix_copy = amatrix.copy() # to avoid problem about comparing the topic with itself - np.fill_diagonal(tmp_amatrix, 1) + np.fill_diagonal(amatrix_copy, 1) - min_distance_per_topic = [(distance, index) for index, distance in enumerate(tmp_amatrix.min(axis=1))] + min_distance_per_topic = [(distance, index) for index, distance in enumerate(amatrix_copy.min(axis=1))] min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] - num_topics = len(amatrix) - def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): - # count how many neighbors - # check which indices (arange 0 to num_topics) is closer than eps - neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps] - num_samples = len(neighbors) - - # If the number of neighbors of a topic is large enough, + # count how many neighbor_indices + # check which indices is closer than eps + neighbouring_topic_indices = np.array( + [ + candidate_topic_index + for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps) + if is_neighbour + ] + ) + num_neighbouring_topics = len(neighbouring_topic_indices) + + # If the number of neighbor_indices of a topic is large enough, # it is considered a core. - # This also takes neighbors that already are identified as core in count. - if num_samples >= self.min_samples: + # This also takes neighbor_indices that already are identified as core in count. + if num_neighbouring_topics >= self.min_samples: # This topic is a core! results[topic_index]["is_core"] = True - results[topic_index]["num_samples"] = num_samples + results[topic_index]["num_samples"] = num_neighbouring_topics # if current_label is none, then this is the first core # of a new cluster (hence next_label is used) @@ -1161,7 +1196,7 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors # Check the distance between the # new core and the parent - # parent neighbors is the list of neighbors of parent_id + # parent neighbor_indices is the list of neighbor_indices of parent_id # (the topic_index that called this function recursively) all_members_of_current_cluster = list(parent_neighbors) all_members_of_current_cluster.append(parent_id) @@ -1173,7 +1208,7 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors all_members_of_current_cluster_ix = np.ix_([topic_index], all_members_of_current_cluster) # use the result of the previous step to index the matrix and see if those distances # are smaller then epsilon. 
relations_to_the_cluster is a boolean array, True for close elements - relations_to_the_cluster = tmp_amatrix[all_members_of_current_cluster_ix] < self.eps + relations_to_the_cluster = amatrix_copy[all_members_of_current_cluster_ix] < self.eps # if less than 25% of the elements are close, then the topic index in question is not a # core of the current_label, but rather the core of a new cluster @@ -1184,19 +1219,20 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors results[topic_index]["label"] = current_label - for neighbor in neighbors: + for neighbouring_topic_index in neighbouring_topic_indices: - if results[neighbor]["label"] is None: - ordered_min_similarity.remove(neighbor) + if results[neighbouring_topic_index]["label"] is None: + ordered_min_similarity.remove(neighbouring_topic_index) # try to extend the cluster into the direction # of the neighbor scan_topic( - neighbor, parent_id=topic_index, + neighbouring_topic_index, parent_id=topic_index, current_label=current_label, - parent_neighbors=neighbors) + parent_neighbors=neighbouring_topic_indices + ) - results[neighbor]["parent_ids"].add(topic_index) - results[neighbor]["parent_labels"].add(current_label) + results[neighbouring_topic_index]["parent_ids"].add(topic_index) + results[neighbouring_topic_index]["parent_labels"].add(current_label) else: # this topic is not a core! From 9675016e19a706a69071cc093ae6e12ce010fb6a Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Wed, 28 Aug 2019 22:39:42 +0200 Subject: [PATCH 050/166] added extra explanation: --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 22b1acfbbb..f387918665 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1154,9 +1154,9 @@ class CBDBSCAN(): T_1, T_2, and T_3: 2. Start by scanning a candidate topic (e.g. T_1) - 3. Check which topics are nearby and call them neighbours + 3. Check which topics are nearby using `self.eps` as a threshold and call them neighbours (e.g. assume T_2, and T_3 are nearby and become neighbours) - 4. If there are more neighbours than `min_samples`, the candidate topic becomes a core candidate for a cluster + 4. If there are more neighbours than `self.min_samples`, the candidate topic becomes a core candidate for a cluster (e.g. if `min_samples`<=2 then T_1 becomes the first core of a cluster) 5. CheckBack (CB) Step: Check if at least 25% of other cluster cores are nearby to core candidate (e.g. at this stage in the example, the cluster is empty, so checkback passes) From b53704a8a4305d8fb47fbce5e568f04161c07ded Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 22:44:11 +0200 Subject: [PATCH 051/166] using none instead of nan for unchecked core --- gensim/models/ensemblelda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 20d0c22f12..d38c615c7e 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1009,8 +1009,8 @@ def _generate_stable_topics(self, min_cores=None): # iterate over the cluster labels, see which clusters/labels # are valid to cause the creation of a stable topic for i, cluster in enumerate(sorted_clusters): - # here, nan means "not yet evaluated" - cluster["is_valid"] = np.nan + # here, None means "not yet evaluated" + cluster["is_valid"] = None label = cluster["label"] # label is iterating over 0, 1, 3, ... 
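Reduced to its essence, the CheckBack test that distinguishes CBDBSCAN from plain DBSCAN behaves as in the following sketch (illustrative names; the real scan_topic interleaves this check with the recursive neighbour scan):

    import numpy as np

    def passes_checkback(amatrix, candidate, cluster_members, eps):
        # a candidate core only extends the current cluster if it is close
        # (distance < eps) to at least 25% of the topics already in it,
        # otherwise it becomes the seed of a new cluster
        if len(cluster_members) == 0:
            return True  # first core of a fresh cluster
        close = amatrix[candidate, cluster_members] < eps
        return close.mean() >= 0.25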
@@ -1034,7 +1034,7 @@ def _generate_stable_topics(self, min_cores=None): # Reapplying the rule 3 # this happens when we have a close relationship among 2 or more clusters - for cluster in [cluster for cluster in sorted_clusters if np.isnan(cluster["is_valid"])]: + for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: # this checks for parent_labels, which are also modified in this function # hence order will influence the result. it starts with the most significant From 4f3de9670adebad4c0166c392ab1c3b7e2cd0e64 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Wed, 28 Aug 2019 22:54:18 +0200 Subject: [PATCH 052/166] updated docs --- gensim/models/ensemblelda.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index f387918665..a6c94b58e8 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -89,6 +89,11 @@ a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if you use this work in a published or open-source project. +Other Notes +----------- +.. The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the doc strings and +comments. + """ import logging @@ -1005,9 +1010,8 @@ def _generate_stable_topics(self, min_cores=None): sorted_clusters = self._aggregate_topics(grouped_by_labels) - # APPLYING THE RULES 1 to 3 - # iterate over the cluster labels, see which clusters/labels - # are valid to cause the creation of a stable topic + # Iterate over the cluster labels, see which clusters/labels + # are valid to trigger the creation of a stable topic for i, cluster in enumerate(sorted_clusters): # here, nan means "not yet evaluated" cluster["is_valid"] = np.nan @@ -1073,11 +1077,10 @@ def _generate_stable_topics(self, min_cores=None): self.stable_topics = stable_topics def recluster(self, eps=0.1, min_samples=None, min_cores=None): - """Runs the CBDBSCAN algorithm on all the detected topics from the children of the ensemble. - - Generates stable topics out of the clusters afterwards. + """Runs the CBDBSCAN algorithm on all the topics in the ensemble and uses resulting clusters to identify + reliable topics. - Finally, stable topics can be retrieved using get_topics(). + Stable topics can be retrieved using get_topics(). 
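For reference, the three rules that decide whether a cluster yields a stable topic can be summarized like this (a simplified sketch; the actual pass also removes the labels of invalidated clusters from the remaining parent_labels sets as it goes, which makes the real result order-dependent):

    def is_valid_cluster(cluster, min_cores):
        # rule 1: invalid if the cores in the cluster have no parents at all
        if cluster["amount_parent_labels"] == 0:
            return False
        # rule 2: invalid if fewer than min_cores cores act as parents
        if len(cluster["parent_labels"]) < min_cores:
            return False
        # rule 3: valid if at least min_cores cores have this cluster's own
        # label as their only parent label
        label = cluster["label"]
        return sum(parents == {label} for parents in cluster["parent_labels"]) >= min_cores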
Parameters ---------- @@ -1098,8 +1101,13 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): logger.info("asymmetric distance matrix is outdated due to add_model") self._generate_asymmetric_distance_matrix() + # Run CBDBSCAN to get topic clusters: self._generate_topic_clusters(eps, min_samples) + + # Interpret the results of CBDBSCAN to identify reliable topics: self._generate_stable_topics(min_cores) + + # Create gensim LdaModel representation of topic model with reliable topics (can be used for inference): self.generate_gensim_representation() # GENSIM API From 2ed2cfe08f94eabb5f0718a76ff0c6e326bb32be Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 23:00:00 +0200 Subject: [PATCH 053/166] refactored kind to class, fixed check how to proceed with topic_model_class --- gensim/models/ensemblelda.py | 18 +++++++++--------- gensim/test/test_ensemblelda.py | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index bd844adc22..ae121453f9 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -107,7 +107,7 @@ class EnsembleLda(): does not need to know the exact number of topics the topic model should extract ahead of time [2]. """ - def __init__(self, topic_model_kind="lda", num_models=3, + def __init__(self, topic_model_class="lda", num_models=3, min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, min_samples=None, masking_method="mass", masking_threshold=None, @@ -116,7 +116,7 @@ def __init__(self, topic_model_kind="lda", num_models=3, Parameters ---------- - topic_model_kind : str, topic model, optional + topic_model_class : str, topic model, optional Examples: 'ldamulticore' (recommended), 'lda' (default), ensemble_workers : number, optional @@ -195,17 +195,17 @@ def __init__(self, topic_model_kind="lda", num_models=3, "input space dimensionality. 
Corpus should be provided using the " "`corpus` keyword argument.") - if isinstance(topic_model_kind, ldamodel.LdaModel): - self.topic_model_kind = topic_model_kind + if type(topic_model_class) == type and issubclass(topic_model_class, ldamodel.LdaModel): + self.topic_model_class = topic_model_class else: kinds = { "lda": ldamodel.LdaModel, "ldamulticore": ldamulticore.LdaMulticore } - if topic_model_kind not in kinds: + if topic_model_class not in kinds: raise ValueError( - "topic_model_kind should be one of 'lda', 'ldamulticode' or a model inheriting from LdaModel") - self.topic_model_kind = kinds[topic_model_kind] + "topic_model_class should be one of 'lda', 'ldamulticode' or a model inheriting from LdaModel") + self.topic_model_class = kinds[topic_model_class] self.num_models = num_models self.gensim_kw_args = gensim_kw_args @@ -306,7 +306,7 @@ def generate_gensim_representation(self): params["iterations"] = 0 # no training params["passes"] = 0 # no training - classic_model_representation = self.topic_model_kind(**params) + classic_model_representation = self.topic_model_class(**params) # when eta was None, use what gensim generates as default eta for the following tasks: eta = classic_model_representation.eta @@ -636,7 +636,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): kwArgs["random_state"] = random_states[i] - tm = self.topic_model_kind(**kwArgs) + tm = self.topic_model_class(**kwArgs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 5de7b324c7..84bc8340d5 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -199,7 +199,7 @@ def test_add_models(self): # 1. memory friendly eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_kind=LdaMulticore, + iterations=1, random_state=0, topic_model_class=LdaMulticore, workers=3, ensemble_workers=2) # 1.1 ttda @@ -245,7 +245,7 @@ def test_add_models(self): # 2. 
memory unfriendly eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_kind=LdaMulticore, + iterations=1, random_state=0, topic_model_class=LdaMulticore, workers=3, ensemble_workers=2, memory_friendly_ttda=False) # 2.1 a single ensemble @@ -296,11 +296,11 @@ def test_add_and_recluster(self): # train new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_kind='ldamulticore', + iterations=30, random_state=1, topic_model_class='ldamulticore', distance_workers=4) new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_kind='ldamulticore', + iterations=30, random_state=1, topic_model_class='ldamulticore', distance_workers=4, memory_friendly_ttda=False) # both should be similar np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) From 2abce75d391917e09123a320dc31fbaa75903630 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 23:18:05 +0200 Subject: [PATCH 054/166] reverted change that accidentally broke things --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 65f06dc21c..360b757a30 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -237,9 +237,9 @@ def __init__(self, topic_model_class="lda", num_models=3, if gensim_kw_args.get("corpus") is None: return - if gensim_kw_args.get("iterations", 0) <= 0: + if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0: return - if gensim_kw_args.get("passes", 0) <= 0: + if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return logger.info("Generating {} topic models...".format(num_models)) From 511eaa5360dedb7693237734d1a5f9ecc77bd30e Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 8 Sep 2019 19:32:48 +0200 Subject: [PATCH 055/166] fixed tests locally --- gensim/test/test_data/ensemblelda | Bin 9815 -> 9893 bytes gensim/test/test_ensemblelda.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda index 3d77142cd210d18ed65595fed46a0f1a5a081e32..a83e87b8c7965b1871ef711a66f449a1e618b1b4 100644 GIT binary patch delta 4596 zcmZ`-c|26@+c#q$`x2o&YAh9!lxdK?V~L!HvW!^{XJVE!i;QUUM3Rb{$5T{F;;|(W ziWZTSEnAVwQnrdpk+gZwjOX|M-u`&cA9J1i`d;^S?rS;oxxS~zBiA#{%0YwV=@3Bj zb?|cX_9X4{bMSLUWOmHY&kI3m9)pX>27^nekz_8HP9-r|G$@k9f!G{~3o&_Q9-YNR zU1LEjBh$C^$MbrQ(P%WIyV30{X7=0+riliaxu68>qo}E(# zKR`J)dN7EKXrOjVsBaRVO(XMAg(j+yr9o7(0MT-_8wZL_`_lx@E8d{9kOj7{da}hhdduFf=H}jhLO| zPCS3ood|U}6S1NK85(AVNL5x>xYE@wA!f+6M0-DogQCchBq$2v2#{3(4pgMVd}aif z9s?n(J=^6$4F;V_XQ08eVLAyld!IVR5*~WEpP(bggECw6x6b?cF zqylJSzg#{i3KAgO0GR+8bjjzUAQ6z4_<{KhG83@_FcOGIXF!NOAQzyB+R>S)moPFF zLL30B#5LN=Y!;IXA=?3IJ}6BK<06i6h?A&To)GQ~ECChJ*d=ixE^&w}9*}|w00y=L zu&@Il4Q~fz;B9~`yakYh?E!h12q?g|fFisVz`-_v61*Tax5@d&WuHqI|*EK%CPak_4(~9HY zyxCKl4sRFrrXSMSSf8Y-5OwAB&t!V@kK&BXFNZyOT8MIe-Lk%R3Ql?NX4$V_!k2Ho zCH$-Mc3#fM+fgB{L~nFzm&4wY-}8D~&eT zyWiW+4Oi*@^)j%bX5IU|`914{I02#T$1;>1a?0ACGA4qapg2vlqH?TGe6Q4@iGqBZ za=wF-r){~N5ELxHl$TB?&Xawq!kU&ehDuBH-6x7! 
z7Y-tBsoze&zoW2vkzQQC@~XM2gfO4SsRT2I|Cc$^A2ini>*>}?%S) z`kR+mZ{4tBfM*G@TbD1f(e&`?T1#Eht36E@QDV#K&f@n z8T3aw))7x@hcLaB6;IC#vkus7CYrUx5QvW{6d+n%xOlDUH; zPApI2t?)eBcy`8dnwQlr%&&3X#~Dpy-X7x`yvlh~^nS-(=;Fn!JINi(2z-j&B`m%=GM4 zT6FVjUa!-$8{?T7NAl`%A^qkr+p0fuvtNyL(Js2Ja5|UKcXuOwv)QPSU*AAj?FyKapzQMt)^d}^<% zPcPdcp)pn4Vn$zX?-^~(Wo4;fhuLN4b3)Vj!jxKetG*S%kJ z=M!A*Tnz#}7#~d7K)N?fcp3fly=wqyojVb`c0@fU?CDRprx(hm@K{T5c^e^O>P4=f znkLovMwC>rQp1pHZZ-!iG+KNlTzU1W!b&YWW^Z>(vRvOkpB)IsN2`y%ZvMG=Sm1^D z%KLQYL=AjVDE$5$e!7fM(}cP0KDx~9f>cMdLyDPnvfW_|54OF(iSD_BV^z5HH%x^; z_Pf2DD)bo3(L=_P`>qCgynugPTlQwNu<%+nYXzYPL9{~^zN`y^`9FFw3Rk-$+&T{*~{mg=xRv6mO1%P zow+++<45Dnr;E9Xfj@4h0u@T{C8_{eiqq zDmjAn*B&9dn$2XPe(VmR8uM(t=)%$yR#4~Vf$N*qOLw367*^aid1HsM|L|QG?`sxj zTWq!@`YTCYE6f?+?$Q&)B7E=A#<^WKcGFkK_< zexLV)KLYv9)I-PVrw{7BzP0}`!E28>U(tB(o>}r4V{5~< z++$yoo3me*r#5s;nQ0|H3!n&(9>v|icicH~xom1s_&~XKX;8iTcN`d`k@~KmLK`l5 zx<+PTxoPtH+wr$ce;QaoCzSIU8+z^!25q$T^UStOcI5h1t1H%2y-rx1Z%_5MTeb@NQK4Hx*&32n3W;OFk`Me=j=b>8LX?cszJ zx!M(Eh_6ghoT09;_z0-FKu1Xt`$%L?I2XARhm_!9bJ!Br04zZ{DvQJBbCIjTNGU!T zxrVX>0CN~AL*qh4IZ7{4aUBXqD*i<(0co+S3Xl+yvyrq8gAFiyGAY7GAtn zco@08U~y*wx{GEDwWvic?um1U-q)c9*uzNu0&*Wk8qgYwkp~N28yB`dM3F~;q8Ml* z2P4h+h5Hr&CswthXxqQ4#|x@$nAMY+REWeNqw8qo>2Gbi}V6Apct)g7^SuoOvk3*iL^q0;RvwW2BzBmG- pX?-bDT4S`fAHeYAkO4frpd7@*TsRI+fiobOhSEnd4Dl)J{|7)gzwrP7 delta 4376 zcmY*cc|26@7dOT-Gqw@gw`3WSEo)+A$#U(QqDW)rnmaOP?ktL8@@5ICG)l52LefG= zN>aR~l6phE6e?0FT96dKJG{T&`)mF=_nzdG379x5BsI=ctunXtf4POu$^B zEO>~LC5lsGc??!OE|yAWv&r!|SOO49C7~kIgJ4MlfC4Z8ChZ>zl^f3jaWIA?8f6EF z04Ril6%-+H6!i?m-I%awlz^z`h!$CF6!7&N#Y*q?ObXic){B3eF2D>37WLFO#Le`p zE8-JAO=MGKKWezlO(|+b2uOeRz!GGNF5`1EGn6cI94JKaVkh5$HPj3K$NnS zGO{&?i(s&dfP>PtRYgvzmI$j8AUOz6l0?>0naBw>04xa&a>=kJBo6)C2`ynKq*Ech zU=iiCR{J+hM~I06(rFMrgS0a&!PDvV884^e94{yEvAYvm5G_hYtr9#ERW{W1Af_aY zcwA82OTvM16F3@644r-#u+3WmCj+2 zxfBRw!zO+t+)~)(HmwCjNmAe*mU|477AY)->2IOT{!Lggf>uCP2&U0HWzCV+Nz$p7 z!hXXR0*aU+-x8_ux2kJg6#p^63B_?a6tyV9uQvwxoi+=u>*h&K_gU<#ep{fm=Fx|OkpxuJft|V zBOnq&6~^W8*l}Py>;#B~h#{{$4gwMZ$)z|Dk3o)wodMJm#HBMpcq1SYf<<`rSR_gm znF7Ktfap?aSrTO#Z(V2E#^Z2~k%7$oIDptW-O5Ea$gtT2(z;x^9jG>*E&P^i)lVD>aI;lA!qFH%j;Py;b7iFeMy7FwIRhr zb+qK>o*3)wr>zU6?N&GDnB_rJgZGc%($+Ss@PciX#AdCroa{RW3SCvY+nAT(&jtpcRfdBp;lZ8~onTJ%X!hDfLwDJYLMgd!g&tPU>2`&XH)n3E717;|KX* z)<>H=*Sc40JSuh|`n3eAZT$B5q&XpbWc2;+oT!MCr#p#-nqTKL?++yLtjmlFVr%1` ziC*mw2~d9i!80{w^eX+NLW$V%+gWd=(vw)p&FVfDoeUZJr2r3=fc@{ox0e_y=l1{% z=+wPgE04}RvzmAPm3)m`v!Avlnnqc9^3m6#Su;i}_q`nQcKehIJrNrG7P>{nO!lA~Gf=~t&~O*WNeT*nCB?=rBvHatSuhAD>qI=W&0)WL10 zXH8YZScjVW9t}q~9G}UNVRe|~alZr&{p6>BH*mzM8QjiMfQ?@K%Puy+_Ksb~mD3^P|2p z&{TIaMs5Mho^<|-KXP3phJWwT3-O`#){bABn+%GpOCFcGWLaJ5p1IqG31$8ZXol`q#)A$+jgP;1` zyDF~_$c&LoC-u;MXKFCMA{%>F-L~MzPG@QONJOhE=Ei0E`?u1ZY3H7+dA=yn?{+4s zn$7+3<;!K3@1<{CJBi%Z{Hu3t!cQ7OekNl({O4wqR>`k=mi@L7f6>xZu!^M3zo9I8 zD@+So^EL2Gf3LFQfU&o#?nrjvt~++tn+^=FWOy-WM1w`!0vYl}Mk;uDuxBb|)#DZF zPb)MUxoY-%ca&brS}5-~Up-IUZ*`TZCH2cKdt4J3u{oql$#snDIGVI|J_Yv3y%kx? 
z3~XOxwX?;Tb3oHXKlWXlEc*7#hy0KyJ-i$Hod&a_T%M24AG3)`=Ej<~pXR(7t^CIKkHRiAUp{D;O#|Bx7a(@k-Md~=7#KbFMn`^jwia2tE3;eA5Rh{=>voy__vv$qwc4mU?W zZO~SqY_gTn)BGNo*pENe_W?h3ev7W7!(N`nG0infk9X=cZ4Mr)-Rhw*abRbGYX(*> zS0M(_xBB$zM6T;Y4{DW2H11bT5Z`f+Y`|0``OvG$RXeKlD;?*O(vKzIPf%(-v%gCL zeAVv%~OSRRh!gab@qTp04?0^!{v_VA!fX z%V4o!bMKt{(ZQEu29jmwpuzP_47RizSrwc^wy=Y>YcHus-_?7w6U z4T30}9BW+Fe@=|s?mjyqeo6DIo+`hvn*1IOhF%QgH8lM_J$bg`Ip)_WN=p6Kg1q%b z(^Sd7>UWFd{l9dq@F|bFu&}+|FRvXob@X#PpgFvDP?et3eC9#b{S-LEam}jdSjqQ3HdS)}R4o7GhLGqH**Viw6veM{ ziF|)(BWBu2(nP{EJF~map(7(Y1vmwg*0A`gw)Q4vl~4Uihgbm ze#w!H3-Ec_ce2;{agk1q&RDK=K>50+X1dR~+^WiIPr~=FMTu(r*!z1=wp_8-;iGyC z#?r2({jxVcRlPU!BqK-(Z!wwEo9R+cOYeARU0IeHU*gz%KmQJT427=Xza>AYa&g#q zzG3v}LiX$)Txr-x`3%1|lrvuyfQ+P6w@1Uy z&0b|D5?40%)NVc|mN# zKDJ%wqD*A9q}JSSuIb5Dt>~~G!LNFN;E9 z>T^~c7ghd$-Bcmm03*~!*m}p6`KT4c2k6Ti4EZlv5fj0M$ zB+1giSPq>Lv1@k(nN8!s=M&*t0%QTLgA@Tvk_3gxX7M=ig>d*HAsoJh%f=u0u>c7!F_g2WbG%LRTXoCNxcexX@e$_!2_f3`h#iHH6zoX+?^G z5N;xbHeqRy=UWItG!1TFMs6d>9m4V+9l%o1&gG?lBFJ3;D_nAq91eF8mY?qfIH8OG z0D&L=HKz!I+3$^jIp~3rQ_Y z!X+OE?gPX~avUa`3sNH}F+4<$A!4mR9DYVfgr6h&4o~MqAc_`|bblX&kmC&qLJjl{ i2LTi>5q?2{mYpvN5C=+xvY>nrLT)T1gh5`U>3;wk=tWBa diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 84bc8340d5..8479847160 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -14,7 +14,7 @@ import numpy as np from copy import deepcopy -from gensim.models import EnsembleLda, LdaMulticore +from gensim.models import EnsembleLda, LdaMulticore, LdaModel from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary num_topics = 2 From 96d6fbdf06285ff0532d5827fed7d12a854a0559 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 8 Sep 2019 19:55:14 +0200 Subject: [PATCH 056/166] fix code style --- gensim/models/ensemblelda.py | 1 + gensim/test/test_ensemblelda.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 360b757a30..a371357ca4 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -106,6 +106,7 @@ logger = logging.getLogger(__name__) + class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting reliable topics that are consistently learned accross the ensemble. 
eLDA has the added benefit that the user diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 8479847160..84bc8340d5 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -14,7 +14,7 @@ import numpy as np from copy import deepcopy -from gensim.models import EnsembleLda, LdaMulticore, LdaModel +from gensim.models import EnsembleLda, LdaMulticore from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary num_topics = 2 From 819a05fa1ea1bdf1829b00a1ecac02b493bc53e9 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 12 Sep 2019 00:33:55 +0200 Subject: [PATCH 057/166] added _is_easy_valid_cluster --- gensim/models/ensemblelda.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a371357ca4..14941e6fff 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -979,6 +979,11 @@ def _remove_from_all_sets(self, label, clusters): if label in a["parent_labels"]: a["parent_labels"].remove(label) + def _is_easy_valid_cluster(self, label, cluster, min_cores): + """checks if the cluster has at least min_cores of cores with the only parent-label as itself, + meaning no other cluster influenced those cores, hence the cluster in question is easy valid.""" + return sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores + def _generate_stable_topics(self, min_cores=None): """generates stable topics out of the clusters. This function is the last step that has to be done in the ensemble. @@ -1032,23 +1037,22 @@ def _generate_stable_topics(self, min_cores=None): self._remove_from_all_sets(label, sorted_clusters) # 3. checking for "easy_valid"s - # checks if the core has at least min_cores of cores with the only label as itself + # checks if the cluster has at least min_cores of cores with the only label as itself if cluster["amount_parent_labels"] >= 1: - if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores: + if self._is_easy_valid_cluster(label, cluster, min_cores): cluster["is_valid"] = True # Reapplying the rule 3 # this happens when we have a close relationship among 2 or more clusters for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: - - # this checks for parent_labels, which are also modified in this function - # hence order will influence the result. it starts with the most significant - # label (hence "sorted_clusters") - if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores: + # This modifies parent labels + # which is important in _is_easy_valid_cluster, so the result depends on where to start. + # That's why sorted_clusters is sorted, so it starts with the most significant cluster. + if self._is_easy_valid_cluster(label, cluster, min_cores): cluster["is_valid"] = True else: cluster["is_valid"] = False - # removes a label from every set in parent_labels for each core + # removes a label from every set in parent_labels for each core. 
for a in sorted_clusters: if label in a["parent_labels"]: a["parent_labels"].remove(label) From e3025f60113f706f9eb1e0ec214ecb2f6570dfe8 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 12 Sep 2019 00:39:00 +0200 Subject: [PATCH 058/166] updated thesis reference --- gensim/models/ensemblelda.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 14941e6fff..dc36dc73a3 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -78,9 +78,8 @@ 2010 WORKSHOP ON NEW CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. 2010. p. 45-50. Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 -.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation: bachelor thesis. - Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ - TODO country? thi? loosley, salles, stephan? +.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. + Ingolstadt: Tecnische Hochschule. Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ Citation -------- From 95e1b7937b6dc610715a209b46c7d0612fbf108c Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 12 Sep 2019 09:37:00 +0200 Subject: [PATCH 059/166] updated notebook example, typo --- docs/notebooks/Opinosis.ipynb | 126 +++++----------------------------- gensim/models/ensemblelda.py | 2 +- 2 files changed, 18 insertions(+), 110 deletions(-) diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/Opinosis.ipynb index 3fc845040c..8227f99932 100644 --- a/docs/notebooks/Opinosis.ipynb +++ b/docs/notebooks/Opinosis.ipynb @@ -2,12 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T17:57:35.858751Z", - "start_time": "2018-12-01T17:57:34.507686Z" - }, "scrolled": false }, "outputs": [], @@ -27,13 +23,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T17:57:36.539523Z", - "start_time": "2018-12-01T17:57:36.534986Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "elda_logger = logging.getLogger(EnsembleLda.__module__)\n", @@ -64,13 +55,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2018-11-29T20:19:17.891961Z", - "start_time": "2018-11-29T20:19:16.731678Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!mkdir ~/opinosis\n", @@ -80,13 +66,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T17:57:39.575650Z", - "start_time": "2018-12-01T17:57:39.571588Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "path = os.path.expanduser('~/opinosis/')" @@ -104,13 +85,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T17:57:40.842440Z", - "start_time": "2018-12-01T17:57:39.905273Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "opinosis = OpinosisCorpus(path)" @@ -138,93 +114,25 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T18:10:13.690983Z", - "start_time": "2018-12-01T17:57:41.089661Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", 
- "text": [ - "Generating 128 topic models...\n", - "Spawned worker to generate 42 topic models\n", - "Spawned worker to generate 43 topic models\n", - "Spawned worker to generate 43 topic models\n", - "Generating a 2560 x 2560 asymmetric distance matrix...\n", - "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", - "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", - "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", - "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", - "The given threshold of 0.11 covered on average 9.9% of tokens\n", - "The given threshold of 0.11 covered on average 9.8% of tokens\n", - "The given threshold of 0.11 covered on average 9.9% of tokens\n", - "The given threshold of 0.11 covered on average 9.8% of tokens\n", - "Fitting the clustering model\n", - "Generating stable topics\n", - "Generating classic gensim model representation based on results from the ensemble\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", - " topic_model_kind='ldamulticore', masking_method='rank', min_samples=32)" + " topic_model_class='ldamulticore', masking_method='rank', min_samples=32)" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T18:10:13.712818Z", - "start_time": "2018-12-01T18:10:13.695844Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "- 0.132 staff, 0.082 servic, 0.081 friendli, 0.075 help, 0.021 good, 0.015 quick, 0.010 expens \n", - "\n", - "- 0.166 screen, 0.063 bright, 0.040 clear, 0.022 easi, 0.020 touch, 0.015 read, 0.014 new \n", - "\n", - "- 0.146 free, 0.043 park, 0.034 coffe, 0.031 wine, 0.025 even, 0.025 morn, 0.022 internet \n", - "\n", - "- 0.142 batteri, 0.099 life, 0.026 short, 0.020 charg, 0.018 kindl, 0.018 good, 0.017 perform \n", - "\n", - "- 0.123 room, 0.111 clean, 0.050 small, 0.035 comfort, 0.033 bathroom, 0.017 nice, 0.016 size \n", - "\n", - "- 0.147 seat, 0.086 comfort, 0.058 uncomfort, 0.045 firm, 0.031 long, 0.030 drive, 0.014 time \n", - "\n", - "- 0.161 mileag, 0.106 ga, 0.056 good, 0.028 expect, 0.019 hard, 0.018 road, 0.018 high \n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# pretty print, note that the words are stemmed so they appear chopped off\n", "for t in elda.print_topics(num_words=7):\n", " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -243,7 +151,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.2" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index dc36dc73a3..84916b25cb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -79,7 +79,7 @@ Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 .. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. 
-    Ingolstadt: Tecnische Hochschule. Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/
+    Ingolstadt: Technische Hochschule. Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/
 
 Citation
 --------

From 3d11649f05daa72d2107ff1c167b61a6944df46f Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 14 Sep 2019 23:59:44 +0200
Subject: [PATCH 060/166] docstring styles, renaming, cleanup, stuff I need to discuss first

---
 gensim/models/ensemblelda.py      | 412 ++++++++++++++----------------
 gensim/test/test_data/ensemblelda | Bin 9893 -> 13705 bytes
 2 files changed, 191 insertions(+), 221 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 84916b25cb..2bf40afdb5 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -4,10 +4,12 @@
 # Authors: Tobias Brigl , Alex Loosley ,
 # Stephan Sahm , Alex Salles , Data Reply Munich
 
-"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting
-reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user
-does not need to know the exact number of topics the topic model should extract ahead of time. For more details
-read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/).
+"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
+
+Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
+the user does not need to know the exact number of topics the topic model should extract ahead of time.
+
+For more details read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/).
 
 Usage examples
 --------------
@@ -107,17 +109,21 @@
 
 class EnsembleLda():
-    """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting
-    reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user
-    does not need to know the exact number of topics the topic model should extract ahead of time [2].
+    """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
+
+    Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
+    the user does not need to know the exact number of topics the topic model should extract ahead of time [2].
 
     """
+
     def __init__(self, topic_model_class="lda", num_models=3,
                  min_cores=None,  # default value from _generate_stable_topics()
                  epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True,
                  min_samples=None, masking_method="mass", masking_threshold=None,
                  distance_workers=1, random_state=None, **gensim_kw_args):
-        """
+        """Create and train a new EnsembleLda model.
+
+        Will start training immediately, except if iterations, passes or num_models is 0 or if the corpus is missing.
 
         Parameters
         ----------
@@ -188,9 +194,6 @@ def __init__(self, topic_model_class="lda", num_models=3,
         if "corpus" not in gensim_kw_args:
             gensim_kw_args["corpus"] = None
 
-        # dictionary. modified version of from gensim/models/ldamodel.py
-        # error messages are copied from the original gensim module [1]:
-        # will create a fake dict if no dict is provided.
if gensim_kw_args["id2word"] is None and not gensim_kw_args["corpus"] is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") gensim_kw_args["id2word"] = utils.dict_from_corpus(gensim_kw_args["corpus"]) @@ -216,7 +219,10 @@ def __init__(self, topic_model_class="lda", num_models=3, self.gensim_kw_args = gensim_kw_args self.memory_friendly_ttda = memory_friendly_ttda + self.distance_workers = distance_workers + self.masking_threshold = masking_threshold + self.masking_method = masking_method # this will provide the gensim api to the ensemble basically self.classic_model_representation = None @@ -234,7 +240,6 @@ def __init__(self, topic_model_class="lda", num_models=3, # parameters, stop here and don't train. if num_models <= 0: return - if gensim_kw_args.get("corpus") is None: return if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0: @@ -242,7 +247,7 @@ def __init__(self, topic_model_class="lda", num_models=3, if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return - logger.info("Generating {} topic models...".format(num_models)) + logger.info("generating {} topic models...".format(num_models)) if ensemble_workers > 1: self._generate_topic_models_multiproc(num_models, ensemble_workers) @@ -250,10 +255,7 @@ def __init__(self, topic_model_class="lda", num_models=3, # singlecore self._generate_topic_models(num_models) - self._generate_asymmetric_distance_matrix( - workers=self.distance_workers, - threshold=masking_threshold, - method=masking_method) + self._generate_asymmetric_distance_matrix() self._generate_topic_clusters(epsilon, min_samples) self._generate_stable_topics(min_cores) @@ -261,12 +263,12 @@ def __init__(self, topic_model_class="lda", num_models=3, self.generate_gensim_representation() def convert_to_memory_friendly(self): - """removes the stored gensim models and only keeps their ttdas""" + """Remove the stored gensim models and only keep their ttdas.""" self.tms = [] self.memory_friendly_ttda = True def generate_gensim_representation(self): - """Creates a gensim model from the stable topics. + """Create a gensim model from the stable topics. 
         The returned representation is a Gensim LdaModel (:py:class:`gensim.models.LdaModel`) that has been
         instantiated with an A-priori belief on word probability, eta, that represents the topic-term distributions of
@@ -279,26 +281,25 @@ def generate_gensim_representation(self):
         :py:class:`gensim.models.LdaModel`
             A Gensim LDA Model classic_model_representation for which:
             classic_model_representation.get_topics() == self.get_topics()
-        """
 
-        logger.info("Generating classic gensim model representation based on results from the ensemble")
+        """
+        logger.info("generating classic gensim model representation based on results from the ensemble")
 
-        number_of_words = self.sstats_sum
-        # if number_of_words should be wrong for some fantastic funny reason
+        sstats_sum = self.sstats_sum
+        # if sstats_sum (which is the number of words actually) should be wrong for some fantastic funny reason
         # that makes you want to peel your skin off, recreate it (takes a while):
-        if number_of_words == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None:
+        if sstats_sum == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None:
             for document in self.gensim_kw_args["corpus"]:
                 for token in document:
-                    number_of_words += token[1]
-            self.sstats_sum = number_of_words
-        assert number_of_words != 0
+                    sstats_sum += token[1]
+            self.sstats_sum = sstats_sum
 
         stable_topics = self.get_topics()
 
         num_stable_topics = len(stable_topics)
 
         if num_stable_topics == 0:
-            logger.error("The model did not detect any stable topic. You can try to adjust epsilon: "
+            logger.error("the model did not detect any stable topic. You can try to adjust epsilon: "
                          "recluster(eps=...)")
             self.classic_model_representation = None
             return
@@ -315,7 +316,10 @@ def generate_gensim_representation(self):
 
         # when eta was None, use what gensim generates as default eta for the following tasks:
         eta = classic_model_representation.eta
-
+        if sstats_sum == 0:
+            sstats_sum = classic_model_representation.state.sstats.sum()
+            self.sstats_sum = sstats_sum
+
         # the following is important for the denormalization
         # to generate the proper sstats for the new gensim model:
         # transform to dimensionality of stable_topics. axis=1 is summed
@@ -337,7 +341,7 @@ def generate_gensim_representation(self):
         # right sstats, so that get_topics() will return the stable topics no
         # matter eta.
 
-        normalization_factor = np.array([[number_of_words / num_stable_topics]] * num_stable_topics) + eta_sum
+        normalization_factor = np.array([[sstats_sum / num_stable_topics]] * num_stable_topics) + eta_sum
 
         sstats = stable_topics * normalization_factor
         sstats -= eta
@@ -351,10 +355,10 @@ def generate_gensim_representation(self):
         return classic_model_representation
 
     def add_model(self, target, num_new_models=None):
-        """Adds the ttda of another model to the ensemble this way, multiple topic models can be connected to an
-        ensemble.
-
-        Make sure that all the models use the exact same dictionary/idword mapping.
+        """Add the ttda of another model to the ensemble.
+
+        This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use
+        the exact same dictionary/id2word mapping.
 
         In order to generate new stable topics afterwards, use
         self._generate_asymmetric_distance_matrix()
@@ -472,7 +476,7 @@ def add_model(self, target, num_new_models=None):
                                  'stored models for a memory unfriendly ensemble')
 
         self.num_models = len(self.tms)
-        logger.info("Ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))
+        logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))
 
         if self.ttda.shape[1] != ttda.shape[1]:
             raise ValueError(("target ttda dimensions do not match. Topics must be {} but was"
@@ -491,7 +495,6 @@ def save(self, fname):
             Path to the system file where the model will be persisted.
 
         """
-
         logger.info("saving %s object to %s", self.__class__.__name__, fname)
         utils.pickle(self, fname)
 
@@ -510,8 +513,6 @@ def load(fname):
             A previously saved ensembleLda object
 
         """
-
-        # message copied from [1]
         logger.info("loading %s object from %s", EnsembleLda.__name__, fname)
 
         eLDA = utils.unpickle(fname)
@@ -519,8 +520,9 @@ def load(fname):
         return eLDA
 
     def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
-        """Will make the ensemble multiprocess, which results in a speedup on multicore machines. Results from the
-        processes will be piped to the parent and concatenated.
+        """Generate the topic models to form the ensemble in a multiprocessed way.
+
+        Depending on the used topic model this can result in a speedup.
 
         Parameters
         ----------
@@ -537,7 +539,6 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
             In that case, ensemble_workers=2 can be used.
 
         """
-
         # the way random_states is handled needs to prevent getting different results when multiprocessing is on,
         # or getting the same results in every lda child. so it is solved by generating a list of state seeds before
         # multiprocessing is started.
@@ -609,7 +610,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
                 p.terminate()
 
     def _generate_topic_models(self, num_models, random_states=None, pipe=None):
-        """Will train the topic models, that form the ensemble.
+        """Train the topic models that form the ensemble.
 
         Parameters
         ----------
@@ -623,9 +624,8 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None):
             send the ttda.
 
         """
-
         if pipe is not None:
-            logger.info("Spawned worker to generate {} topic models".format(num_models))
+            logger.info("spawned worker to generate {} topic models".format(num_models))
 
         if random_states is None:
             random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)]
@@ -668,11 +668,8 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None):
             pipe.close()
 
     def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method):
-        """ Worker that computes the distance to all other nodes
-        from a chunk of nodes.
https://stackoverflow.com/a/1743350 - """ - - logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) + """Worker that computes the distance to all other nodes from a chunk of nodes.""" + logger.info("spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( @@ -680,28 +677,17 @@ def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pip pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory pipe.close() - def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): - """Makes the pairwise distance matrix for all the ttdas from the ensemble. + def _generate_asymmetric_distance_matrix(self): + """Calculate the pairwise distance matrix for all the ttdas from the ensemble. Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering. Afterwards, the model needs to be reclustered for this generated matrix to take effect. - Parameters - ---------- - threshold : float, optional - if threshold is None and method == "rank": - threshold = 0.11 - if threshold is None and method == "mass": - threshold = 0.95 - - threshold keeps by default 95% of the largest terms by mass. Except the "fast" parameter is "rank", then - it just selects that many of the largest terms. - workers : number, optional - when workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which is not - multiprocessed. Set to > 1 to enable multiprocessing. - """ + workers = self.distance_workers + threshold = self.masking_threshold + method = self.masking_method # matrix is up to date afterwards self.asymmetric_distance_matrix_outdated = False @@ -709,7 +695,7 @@ def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method if threshold is None: threshold = {"mass": 0.95, "rank": 0.11}[method] - logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) + logger.info("generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) # singlecore if workers is not None and workers <= 1: @@ -728,7 +714,6 @@ def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method ttdas_sent = 0 for i in range(workers): - try: parentConn, childConn = Pipe() @@ -781,10 +766,7 @@ def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method return self.asymmetric_distance_matrix def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): - """Internal method calculating an (asymmetric) distance from each topic term distribution in ttda1 to each topic - term distribution in ttda2 and returns a matrix of distances, size len(ttda1) by len(ttda2). - - This is an internal method and should not be used for two general ttda arrays. + """Calculate an (asymmetric) distance from each topic in ttda1 to each topic in ttda2. Parameters ---------- @@ -806,21 +788,21 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s ------- 2D Numpy.numpy.ndarray of floats Asymmetric distance matrix of size len(ttda1) by len(ttda2). 
-        """
+
+        """
         if method not in ["mass", "rank"]:
             raise ValueError("method {} unknown".format(method))
 
         # select masking method:
         def mass_masking(a):
-            """original masking method. returns a new binary mask"""
+            """Original masking method. Returns a new binary mask."""
            sorted_a = np.sort(a)[::-1]
            largest_mass = sorted_a.cumsum() < threshold
            smallest_valid = sorted_a[largest_mass][-1]
            return a >= smallest_valid
 
         def rank_masking(a):
-            """faster masking method. returns a new binary mask"""
+            """Faster masking method. Returns a new binary mask."""
             return a > np.sort(a)[::-1][int(len(a) * threshold)]
 
         create_mask = {"mass": mass_masking, "rank": rank_masking}[method]
@@ -833,7 +815,6 @@ def rank_masking(a):
 
         # now iterate over each topic
         for ttd1_idx, ttd1 in enumerate(ttda1):
-
             # create mask from ttd1 that removes noise from a and keeps the largest terms
             mask = create_mask(ttd1)
             ttd1_masked = ttd1[mask]
@@ -842,7 +823,6 @@ def rank_masking(a):
 
             # now look at every possible pair for topic a:
             for ttd2_idx, ttd2 in enumerate(ttda2):
-
                 # distance to itself is 0
                 if ttd1_idx + start_index == ttd2_idx:
                     distances[ttd1_idx][ttd2_idx] = 0
@@ -861,13 +841,12 @@ def rank_masking(a):
                 distances[ttd1_idx][ttd2_idx] = distance
 
         percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
-        logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent))
+        logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent))
 
         return distances
 
     def _generate_topic_clusters(self, eps=0.1, min_samples=None):
-        """Runs the DBSCAN algorithm on all the detected topics from the models in the ensemble and labels them with
-        label-indices.
+        """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices.
 
         The final approval and generation of stable topics is done in _generate_stable_topics().
 
@@ -877,43 +856,46 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None):
             eps is 0.1 by default.
         min_samples : int
             min_samples is int(self.num_models / 2)
-        """
+
+        """
         if min_samples is None:
             min_samples = int(self.num_models / 2)
 
-        logger.info("Fitting the clustering model")
+        logger.info("fitting the clustering model")
 
         self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples)
         self.cluster_model.fit(self.asymmetric_distance_matrix)
 
     def _validate_core(self, core):
-        """If core is not a core returns False. Only cores with the valid_parents as its own label can be a valid core.
-        Returns True if that is the case, False otherwise.
+        """Check if the core has only a single valid parent, which is also the label assigned to the core.
+
+        If the presumed core is not even a core returns False.
 
         Parameters
         ----------
         core : {'is_core', 'valid_parents', 'labels'}
             The topic to check, as produced by the CBDBSCAN clustering step.
+
         """
         return core["is_core"] and (core["valid_parents"] == {core["label"]})
 
     def _group_by_labels(self, results):
-        """Groups all the learned cores by their label, which was assigned in the cluster_model
+        """Group all the learned cores by their label, which was assigned in the cluster_model.
 
         Parameters
         ----------
-        results : {list of {'is_core', 'parent_labels', 'num_samples', 'label'}}
+        results : {list of {'is_core', 'neighboring_labels', 'num_samples', 'label'}}
             After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results
             member, which can be used as the argument to this function. It's a list of infos gathered during the
             clustering step and each element in the list corresponds to a single topic.
 
         Returns
         -------
-        dict of (int, list of {'is_core', 'amount_parent_labels', 'parent_labels'})
+        dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
             A mapping of the label to a list of topics that belong to that particular label. Also adds
-            a new member to each topic called amount_parent_labels, which is the number of parent_labels of that
-            topic.
+            a new member to each topic called num_neighboring_labels, which is the number of
+            neighboring_labels of that topic.
+
         """
         grouped_by_labels = {}
         for topic in results:
             if topic["is_core"]:
                 topic = topic.copy()
 
                 # counts how many different labels a core has as parents
-                topic["amount_parent_labels"] = len(topic["parent_labels"])
+                topic["num_neighboring_labels"] = len(topic["neighboring_labels"])
 
                 label = topic["label"]
                 if label not in grouped_by_labels:
@@ -930,64 +912,70 @@ def _group_by_labels(self, results):
         return grouped_by_labels
 
     def _aggregate_topics(self, grouped_by_labels):
-        """Aggregates the topics to a list of clusters
+        """Aggregate the labeled topics to a list of clusters.
 
         Parameters
         ----------
-        grouped_by_labels : dict of (int, list of {'is_core', 'amount_parent_labels', 'parent_labels'})
+        grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
             The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the
             label.
 
         Returns
         -------
-        list of {'amount_parent_labels', 'parent_labels', 'label'}
-            amount_parent_labels is the max number of parent labels among each topic of a given cluster. label refers
-            to the label identifier of the cluster. parent_labels is a concatenated list of the parent_labels sets of
-            each topic. Its sorted by amount_parent_labels in descending order. There is one single element for each
-            cluster.
+        list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'}
+            max_num_neighboring_labels is the max number of neighboring labels among the topics of a given cluster.
+            label refers to the label identifier of the cluster. neighboring_labels is a concatenated list of the
+            neighboring_labels sets of each topic. It's sorted by max_num_neighboring_labels in descending
+            order. There is one single element for each cluster.
+
         """
         sorted_clusters = []
 
         for label, group in grouped_by_labels.items():
-            amount_parent_labels = 0
-            parent_labels = []  # will be a list of sets
+            max_num_neighboring_labels = 0
+            neighboring_labels = []  # will be a list of sets
 
             for topic in group:
-                amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels)
-                parent_labels.append(topic["parent_labels"])
+                max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels)
+                neighboring_labels.append(topic["neighboring_labels"])
+
+            neighboring_labels = [x for x in neighboring_labels if len(x) > 0]
+
+            # TODO why is len(neighboring_labels) equal to num_cores?
 
             sorted_clusters.append({
-                "amount_parent_labels": amount_parent_labels,
-                "parent_labels": [x for x in parent_labels if len(x) > 0],
-                "label": label
+                "max_num_neighboring_labels": max_num_neighboring_labels,
+                "neighboring_labels": neighboring_labels,
+                "label": label,
+                "num_cores": len([topic for topic in group if topic["is_core"]])
             })
 
-        # start with the most significant core.
         sorted_clusters = sorted(sorted_clusters, key=lambda cluster: (
-            cluster["amount_parent_labels"],
-            cluster["label"]  # makes sorting deterministic
+            cluster["max_num_neighboring_labels"],
+            cluster["label"]
         ), reverse=True)
 
         return sorted_clusters
 
     def _remove_from_all_sets(self, label, clusters):
-        """removes a label from every set in "parent_labels" for
-        each core in clusters"""
+        """Remove a label from every set in "neighboring_labels" for each core in clusters."""
         for a in clusters:
-            if label in a["parent_labels"]:
-                a["parent_labels"].remove(label)
+            if label in a["neighboring_labels"]:
+                a["neighboring_labels"].remove(label)
 
-    def _is_easy_valid_cluster(self, label, cluster, min_cores):
-        """checks if the cluster has at least min_cores of cores with the only parent-label as itself,
-        meaning no other cluster influenced those cores, hence the cluster in question is easy valid."""
-        return sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores
+    def _contains_isolated_cores(self, label, cluster, min_cores):
+        """Check if the cluster has at least min_cores of cores that belong to no other cluster."""
+        # TODO is supposed to check for cores, but checks neighboring_labels.
+        # Funnily, it actually seems to be cores (see output of next line) but it doesn't read that way
+        # print(len(cluster["neighboring_labels"]), cluster["num_cores"])
+        return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores
 
     def _generate_stable_topics(self, min_cores=None):
-        """generates stable topics out of the clusters. This function is the last step that has to be done in the
-        ensemble.
-
-        Stable topics can be retrieved afterwards using get_topics().
+        """Generate stable topics out of the clusters.
+
+        This function is the last step that has to be done in the ensemble. Stable topics can be retrieved afterwards
+        using get_topics().
 
         Parameters
         ----------
@@ -998,8 +986,6 @@ def _generate_stable_topics(self, min_cores=None):
             defaults to min_cores = min(3, max(1, int(self.num_models /4 +1)))
 
         """
-        logger.info("Generating stable topics")
-
         # min_cores being 0 makes no sense. there has to be a core for a cluster
         # or there is no cluster
         if min_cores == 0:
@@ -1008,6 +994,9 @@ def _generate_stable_topics(self, min_cores=None):
         if min_cores is None:
             # min_cores is a number between 1 and 3, depending on the number of models
             min_cores = min(3, max(1, int(self.num_models / 4 + 1)))
+            logger.info(f"generating stable topics, each cluster needs at least {min_cores} cores")
+        else:
+            logger.info("generating stable topics")
 
         results = self.cluster_model.results
 
@@ -1015,52 +1004,37 @@ def _generate_stable_topics(self, min_cores=None):
 
         sorted_clusters = self._aggregate_topics(grouped_by_labels)
 
-        # Iterate over the cluster labels, see which clusters/labels
-        # are valid to trigger the creation of a stable topic
         for i, cluster in enumerate(sorted_clusters):
-            # here, None means "not yet evaluated"
             cluster["is_valid"] = None
-            label = cluster["label"]
-            # label is iterating over 0, 1, 3, ...
-
-            # 1. rule - remove if the cores in the cluster have no parents
-            if cluster["amount_parent_labels"] == 0:
-                cluster["is_valid"] = False
-                self._remove_from_all_sets(label, sorted_clusters)
+            # TODO removed first rule because when amount_parent_labels is 0, then there is no
+            # topic in the cluster that has a parent, hence the size is either 0 or 1,
+            # which can be covered by the next rule.
 
-            # 2. rule - remove if it has less than min_cores as parents
-            # (parents are always also cores)
-            if len(cluster["parent_labels"]) < min_cores:
+            if cluster["num_cores"] < min_cores:
                 cluster["is_valid"] = False
-                self._remove_from_all_sets(label, sorted_clusters)
+                self._remove_from_all_sets(cluster["label"], sorted_clusters)
 
-            # 3. checking for "easy_valid"s
-            # checks if the cluster has at least min_cores of cores with the only label as itself
-            if cluster["amount_parent_labels"] >= 1:
-                if self._is_easy_valid_cluster(label, cluster, min_cores):
-                    cluster["is_valid"] = True
+            # TODO I don't think it makes a difference if _contains_isolated_cores (formerly easy valid) is done (here
+            # and afterwards) or (only afterwards)
 
-        # Reapplying the rule 3
-        # this happens when we have a close relationship among 2 or more clusters
+        # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any
+        # other cluster.
         for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]:
-            # This modifies parent labels
-            # which is important in _is_easy_valid_cluster, so the result depends on where to start.
-            # That's why sorted_clusters is sorted, so it starts with the most significant cluster.
-            if self._is_easy_valid_cluster(label, cluster, min_cores):
+            label = cluster["label"]
+            if self._contains_isolated_cores(label, cluster, min_cores):
                 cluster["is_valid"] = True
             else:
                 cluster["is_valid"] = False
 
-            # removes a label from every set in parent_labels for each core.
-            for a in sorted_clusters:
-                if label in a["parent_labels"]:
-                    a["parent_labels"].remove(label)
+            # This modifies parent labels which is important in _contains_isolated_cores, so the result depends on
+            # where to start. That's why sorted_clusters is sorted, so it starts with the most cluttered cluster.
+            self._remove_from_all_sets(label, sorted_clusters)
 
         # list of all the label numbers that are valid
         valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]])
 
         for topic in results:
-            topic["valid_parents"] = {label for label in topic["parent_labels"] if label in valid_labels}
+            topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels}
 
         # keeping only VALID cores
         valid_core_mask = np.vectorize(self._validate_core)(results)
@@ -1081,8 +1055,7 @@ def _generate_stable_topics(self, min_cores=None):
         self.stable_topics = stable_topics
 
     def recluster(self, eps=0.1, min_samples=None, min_cores=None):
-        """Runs the CBDBSCAN algorithm on all the topics in the ensemble and uses resulting clusters to identify
-        reliable topics.
+        """Quickly change the results of the ensemble by reapplying clustering and stable topic generation.
 
         Stable topics can be retrieved using get_topics().
 
@@ -1118,11 +1091,18 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None):
 
     # to make using the ensemble in place of a gensim model as easy as possible
 
     def get_topics(self):
+        """Return only the stable topics from the ensemble.
+
+        Returns
+        -------
+        2D Numpy.numpy.ndarray of floats
+            List of stable topic term distributions
+
+        """
         return self.stable_topics
 
     def _has_gensim_representation(self):
-        """checks if stable topics are available and if the internal
-        gensim representation exists. If not, raises errors"""
+        """Check if stable topics and the internal gensim representation exist. Raise an error if not."""
         if self.classic_model_representation is None:
             if len(self.stable_topics) == 0:
                 raise ValueError("no stable topic was detected")
@@ -1130,32 +1110,35 @@ def _has_gensim_representation(self):
                 raise ValueError("use generate_gensim_representation() first")
 
     def __getitem__(self, i):
-        """see :py:class:`gensim.models.LdaModel`"""
+        """See :py:class:`gensim.models.LdaModel`."""
         self._has_gensim_representation()
         return self.classic_model_representation[i]
 
     def inference(self, *posargs, **kwargs):
-        """see :py:class:`gensim.models.LdaModel`"""
+        """See :py:class:`gensim.models.LdaModel`."""
         self._has_gensim_representation()
         return self.classic_model_representation.inference(*posargs, **kwargs)
 
     def log_perplexity(self, *posargs, **kwargs):
-        """see :py:class:`gensim.models.LdaModel`"""
+        """See :py:class:`gensim.models.LdaModel`."""
         self._has_gensim_representation()
         return self.classic_model_representation.log_perplexity(*posargs, **kwargs)
 
     def print_topics(self, *posargs, **kwargs):
-        """see :py:class:`gensim.models.LdaModel`"""
+        """See :py:class:`gensim.models.LdaModel`."""
         self._has_gensim_representation()
         return self.classic_model_representation.print_topics(*posargs, **kwargs)
 
     @property
     def id2word(self):
+        """Return the :py:class:`gensim.corpora.dictionary.Dictionary` object used in the model."""
         return self.gensim_kw_args["id2word"]
 
 
 class CBDBSCAN():
-    """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). The algorithm works based on
+    """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN).
+
+    The algorithm works based on
     DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a "nearby" point is, and the
     minimum number of nearby points needed to label a candidate datapoint a core of a cluster. (See
     https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).

@@ -1178,58 +1161,59 @@ class CBDBSCAN():
     unreliable topics made of a composition of multiple reliable topics (something that often occurs in LDA models
     and that is one cause of unreliable topics).
 
-    Parameters
-    ----------
-    eps : float
-        epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
-        default: 0.1
-    min_samples : int
-        The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
     """
 
     def __init__(self, eps, min_samples):
+        """Create a new CBDBSCAN object. Call fit in order to train it on an asymmetric distance matrix.
+
+        Parameters
+        ----------
+        eps : float
+            epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
+            default: 0.1
+        min_samples : int
+            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
+
+        """
         self.eps = eps
         self.min_samples = min_samples
 
     def fit(self, amatrix):
+        """Apply the algorithm to an asymmetric distance matrix."""
         self.next_label = 0
 
         results = [{
             "is_core": False,
-            "parent_labels": set(),
-            "parent_ids": set(),
+            "neighboring_labels": set(),
+            "neighboring_topic_indices": set(),
             "num_samples": 0,
             "label": None
         } for _ in range(len(amatrix))]
 
         amatrix_copy = amatrix.copy()
 
-        # to avoid problem about comparing the topic with itself
+        # to avoid the problem of comparing the topic with itself
         np.fill_diagonal(amatrix_copy, 1)
 
         min_distance_per_topic = [(distance, index) for index, distance in enumerate(amatrix_copy.min(axis=1))]
         min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x)
         ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted]
 
-        def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None):
-            # count how many neighbor_indices
-            # check which indices is closer than eps
-            neighbouring_topic_indices = np.array(
-                [
-                    candidate_topic_index
-                    for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps)
-                    if is_neighbour
-                ]
-            )
-            num_neighbouring_topics = len(neighbouring_topic_indices)
-
-            # If the number of neighbor_indices of a topic is large enough,
-            # it is considered a core.
-            # This also takes neighbor_indices that already are identified as core in count.
-            if num_neighbouring_topics >= self.min_samples:
+        def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]):
+            neighboring_topic_indices = [
+                candidate_topic_index
+                for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps)
+                if is_neighbour
+            ]
+            num_neighboring_topics = len(neighboring_topic_indices)
+
+            # If the number of neighbor indices of a topic is large enough, it is considered a core.
+            # This also takes neighbor indices that already are identified as core in count.
+            if num_neighboring_topics >= self.min_samples:
                 # This topic is a core!
                 results[topic_index]["is_core"] = True
-                results[topic_index]["num_samples"] = num_neighbouring_topics
+                results[topic_index]["num_samples"] = num_neighboring_topics
+                cluster_topic_indices = cluster_topic_indices.copy()
 
                 # if current_label is none, then this is the first core
                 # of a new cluster (hence next_label is used)
                 if current_label is None:
-                    # next_label is initialized with 0 in
-                    # fit() for the first cluster
+                    # next_label is initialized with 0 in fit() for the first cluster
                     current_label = self.next_label
                     self.next_label += 1
 
                 else:
-                    # In case the core has a parent, that means
-                    # it has a label (= the parents label).
-                    # Check the distance between the
-                    # new core and the parent
-
-                    # parent neighbor_indices is the list of neighbor_indices of parent_id
-                    # (the topic_index that called this function recursively)
-                    all_members_of_current_cluster = list(parent_neighbors)
-                    all_members_of_current_cluster.append(parent_id)
-
-                    # look if 25% of the members of the current cluster are also
-                    # close to the current/new topic...
-
-                    # example: (topic_index, 0), (topic_index, 2), (topic_index, ...)
-                    all_members_of_current_cluster_ix = np.ix_([topic_index], all_members_of_current_cluster)
-                    # use the result of the previous step to index the matrix and see if those distances
-                    # are smaller then epsilon. relations_to_the_cluster is a boolean array, True for close elements
-                    relations_to_the_cluster = amatrix_copy[all_members_of_current_cluster_ix] < self.eps
-
-                    # if less than 25% of the elements are close, then the topic index in question is not a
-                    # core of the current_label, but rather the core of a new cluster
-                    if relations_to_the_cluster[0].mean() < 0.25:
+                    # In case the core has a parent, check the distance to the existing cluster (since the matrix is
+                    # asymmetric, it takes return distances into account here)
+                    # If less than 25% of the elements are close enough, then create a new cluster rather than further
+                    # growing the current cluster in that direction.
+                    close_members_mask = amatrix_copy[topic_index][cluster_topic_indices] < self.eps
+
+                    # TODO changed because it is supposed to check the distance to the existing cluster, not
+                    # to the neighbors of the parent
+
+                    if close_members_mask.mean() < 0.25:
                         # start new cluster by changing current_label
                         current_label = self.next_label
                         self.next_label += 1
+                        cluster_topic_indices = []
 
+            # TODO changed in order to make the parent_id parameter obsolete because (i think) it (the appending)
+            # can easily be done before calling scan_topic as well.
+            cluster_topic_indices.append(topic_index)
             results[topic_index]["label"] = current_label
 
-            for neighbouring_topic_index in neighbouring_topic_indices:
-
-                if results[neighbouring_topic_index]["label"] is None:
-                    ordered_min_similarity.remove(neighbouring_topic_index)
-                    # try to extend the cluster into the direction
-                    # of the neighbor
-                    scan_topic(
-                        neighbouring_topic_index, parent_id=topic_index,
-                        current_label=current_label,
-                        parent_neighbors=neighbouring_topic_indices
-                    )
+            for neighboring_topic_index in neighboring_topic_indices:
+                if results[neighboring_topic_index]["label"] is None:
+                    ordered_min_similarity.remove(neighboring_topic_index)
+                    # try to extend the cluster into the direction of the neighbor
+                    scan_topic(neighboring_topic_index, current_label, cluster_topic_indices)
 
-                results[neighbouring_topic_index]["parent_ids"].add(topic_index)
-                results[neighbouring_topic_index]["parent_labels"].add(current_label)
+                results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index)
+                results[neighboring_topic_index]["neighboring_labels"].add(current_label)
 
             else:
                 # this topic is not a core!
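To make the reworked 25% checkback rule above concrete, here is a minimal standalone sketch; the eps value and the distances are made up, only the decision criterion mirrors the scan_topic code in this patch:

import numpy as np

eps = 0.1
# hypothetical asymmetric distances from a candidate core to the four topics
# that were already collected into the current cluster
distances_to_cluster = np.array([0.05, 0.30, 0.40, 0.09])
close_members_mask = distances_to_cluster < eps  # [True, False, False, True]

# mirrors `if close_members_mask.mean() < 0.25` above: when fewer than a
# quarter of the cluster members are within eps, the candidate topic starts
# a new cluster instead of growing the current one
if close_members_mask.mean() < 0.25:
    print("start a new cluster")
else:
    print("grow the current cluster")  # 2 of 4 are close here, so it grows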
diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index a83e87b8c7965b1871ef711a66f449a1e618b1b4..3b9563f08da01a1f9900d7060694ec235af53c2f 100644
GIT binary patch
[base85 payload omitted: literal 13705 (new blob), literal 9893 (old blob); binary test fixture update]

From 71b3825d5429c6a3ce3e5e6f948509dfda1e2d60 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 15 Sep 2019 00:03:56 +0200
Subject: [PATCH 061/166] tox

---
 gensim/models/ensemblelda.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 2bf40afdb5..ac8c316aab 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -110,12 +110,12 @@

 class EnsembleLda():
     """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
-    
+
     Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
     the user does not need to know the exact number of topics the topic model should extract ahead of time [2].
 
""" - + def __init__(self, topic_model_class="lda", num_models=3, min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, @@ -319,7 +319,7 @@ def generate_gensim_representation(self): if sstats_sum == 0: sstats_sum = classic_model_representation.state.sstats.sum() self.sstats_sum = sstats_sum - + # the following is important for the denormalization # to generate the proper sstats for the new gensim model: # transform to dimensionality of stable_topics. axis=1 is summed @@ -356,7 +356,7 @@ def generate_gensim_representation(self): def add_model(self, target, num_new_models=None): """Add the ttda of another model to the ensemble. - + This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use the exact same dictionary/idword mapping. @@ -521,7 +521,7 @@ def load(fname): def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. - + Depending on the used topic model this can result in a speedup. Parameters @@ -868,7 +868,7 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): def _validate_core(self, core): """Check if the core has only a single valid parent, which is also the label assigned to the core. - + If the presumed core is not even a core returns False. Parameters @@ -973,7 +973,7 @@ def _contains_isolated_cores(self, label, cluster, min_cores): def _generate_stable_topics(self, min_cores=None): """Generate stable topics out of the clusters. - + This function is the last step that has to be done in the ensemble.Stable topics can be retrieved afterwards using get_topics(). @@ -1092,7 +1092,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): def get_topics(self): """Return only the stable topics from the ensemble. - + Returns ------- 2D Numpy.numpy.ndarray of floats @@ -1137,7 +1137,7 @@ def id2word(self): class CBDBSCAN(): """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). - + The algorithm works based on DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a "nearby" point is, and the minimum number of nearby points needed to label a candidate datapoint a core of a cluster. From 2d184a4507ee16dea7701839f320383502f811e2 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 15 Sep 2019 10:50:38 +0200 Subject: [PATCH 062/166] fixed stuff in CBDBSCAN --- docs/notebooks/Opinosis.ipynb | 31 +++++++++++++++++++++------- gensim/models/ensemblelda.py | 33 +++++++++++++++++++++++------- gensim/test/test_data/ensemblelda | Bin 13705 -> 13683 bytes gensim/test/test_ensemblelda.py | 1 + 4 files changed, 51 insertions(+), 14 deletions(-) diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/Opinosis.ipynb index 8227f99932..a828b2adf4 100644 --- a/docs/notebooks/Opinosis.ipynb +++ b/docs/notebooks/Opinosis.ipynb @@ -32,6 +32,18 @@ "elda_logger.addHandler(logging.StreamHandler())" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def pretty_print_topics():\n", + " # note that the words are stemmed so they appear chopped off\n", + " for t in elda.print_topics(num_words=7):\n", + " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -107,9 +119,7 @@ "\n", "**topic_model_kind** ldamulticore is highly recommended for EnsembleLda. 
ensemble_workers and **distance_workers** are used to improve the time needed to train the models, as well as the **masking_method** 'rank'. ldamulticore is not able to fully utilize all cores on this small corpus, so **ensemble_workers** can be set to 3 to get 95 - 100% cpu usage on my i5 3470.\n", "\n", - "Since the corpus is so small, a high number of **num_models** is needed to extract stable topics. The Opinosis corpus contains 51 categories, however, some of them are quite similar. For example there are 3 categories about the batteries of portable products. There are also multiple categories about cars. So I chose 20 for num_topics, which is smaller than the number of categories.\n", - "\n", - "The default for **min_samples** would be 64, half of the number of models. But since this does not return any topics, or at most 2, I set this to 32." + "Since the corpus is so small, a high number of **num_models** is needed to extract stable topics. The Opinosis corpus contains 51 categories, however, some of them are quite similar. For example there are 3 categories about the batteries of portable products. There are also multiple categories about cars. So I chose 20 for num_topics, which is smaller than the number of categories." ] }, { @@ -120,7 +130,15 @@ "source": [ "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", - " topic_model_class='ldamulticore', masking_method='rank', min_samples=32)" + " topic_model_class='ldamulticore', masking_method='rank')\n", + "pretty_print_topics()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default for **min_samples** would be 64, half of the number of models and **eps** would be 0.1. You basically play around with them until you find a sweetspot that fits for your needs." ] }, { @@ -129,9 +147,8 @@ "metadata": {}, "outputs": [], "source": [ - "# pretty print, note that the words are stemmed so they appear chopped off\n", - "for t in elda.print_topics(num_words=7):\n", - " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" + "elda.recluster(min_samples=55, eps=0.14)\n", + "pretty_print_topics()" ] } ], diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index ac8c316aab..2f04fa640b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -994,7 +994,7 @@ def _generate_stable_topics(self, min_cores=None): if min_cores is None: # min_cores is a number between 1 and 3, depending on the number of models min_cores = min(3, max(1, int(self.num_models / 4 + 1))) - logger.info(f"generating stable topics, each cluster needs at least {min_cores} cores") + logger.info("generating stable topics, each cluster needs at least {} cores".format(min_cores)) else: logger.info("generating stable topics") @@ -1199,7 +1199,20 @@ def fit(self, amatrix): min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] - def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]): + def scan_topic(topic_index, current_label=None): + """Extend the cluster in one direction. + + Results are accumulated in result, a variable outside of this function. + + Parameters + ---------- + topic_index : int + The topic that might be added to the existing cluster, or which might create a new cluster if + neccessary. 
+ current_label : int + The label of the cluster that might be suitable for topic_index + + """ neighboring_topic_indices = [ candidate_topic_index for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps) @@ -1207,21 +1220,27 @@ def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]): ] num_neighboring_topics = len(neighboring_topic_indices) + # derive a list of all topic_indices that belong to the cluster with the current_label + cluster_topic_indices = [ + i + for i, topic in enumerate(results) + if topic["label"] == current_label and topic["label"] is not None + ] + # If the number of neighbor indices of a topic is large enough, it is considered a core. # This also takes neighbor indices that already are identified as core in count. if num_neighboring_topics >= self.min_samples: # This topic is a core! results[topic_index]["is_core"] = True results[topic_index]["num_samples"] = num_neighboring_topics - cluster_topic_indices = cluster_topic_indices.copy() # if current_label is none, then this is the first core # of a new cluster (hence next_label is used) if current_label is None: - # next_label is initialized with 0 in - # fit() for the first cluster + # next_label is initialized with 0 in fit() for the first cluster current_label = self.next_label self.next_label += 1 + cluster_topic_indices = [] else: # In case the core has a parent, check the distance to the existing cluster (since the matrix is @@ -1240,7 +1259,7 @@ def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]): cluster_topic_indices = [] # TODO changed in order to make the parent_id parameter obsoloete because (i think) it (the appending) - # can easily done before calling scan_topic as well + # can easily done before calling scan_topic as well. 
cluster_topic_indices.append(topic_index) results[topic_index]["label"] = current_label @@ -1248,7 +1267,7 @@ def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]): if results[neighboring_topic_index]["label"] is None: ordered_min_similarity.remove(neighboring_topic_index) # try to extend the cluster into the direction of the neighbor - scan_topic(neighboring_topic_index, current_label, cluster_topic_indices) + scan_topic(neighboring_topic_index, current_label) results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) results[neighboring_topic_index]["neighboring_labels"].add(current_label) diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda index 3b9563f08da01a1f9900d7060694ec235af53c2f..717c2916143305036b37fb3a7746d07b59c971b3 100644 GIT binary patch delta 3761 zcmW+(2{)Bp8`hzcN`o@xH79XOdNYLxg~(hgO~!=OK}zJ|IC0Fg9Auv7;h4!<*Yy{^ zfAal-@AfTg8MggA``*`m4fkm*{BObPe@-ik~8c(&k0#CF-eFG3UPAh z?(HX!oY{Ta+oTZnXk#s#o}uP`Sjqz%dDmjR!e-21{~iXo^4gSL*HME#Zk!DWfPsT{ z;ZbpAFmjPwIZ5xhh$S@0`@5xxou`;86T^g%(?Ab@`O*i#gP?AG3<5tQpbf9CJCfq?o@skN(de*@}U<{HSEP z@Lz78Nq+7QgDY3r{h5t6mR8@m%lUKF@t@zSi=Wv(bKvmzvlj|ad89r=mzo{#u_Z{1 zS*M}4$lXFgDMxHzu^`Q!!}t}(5wwJIQ$XCokdFoWF1^Wk#c3FCqYJM1nmQZe(5ONPYr(qzj|w=))lEF^P@@bgJJLXdJnp^B^&>G=d`;{LAI{ z6>Lw(Kq03J&{mNoyZ=hpC$XRkh?KUEb4GFI+=o!;D>b>dxXB5(+d`Qy>BcAPJ>q0Waj!Hx<>(VJq2b@W_4p*j0C*!sKlZuO!%`d)aF zDH?8FN=W5IgmqAKICnz1coX|L!T`okp!p6OoF22uibJTwG8giNoNJUw96l=tFQ>k~j5#&*(dW4yjy*FM z9%YnP!EpW(=U+s#KH0Ct_pZ=poCh>~<*>UA4B1e|uwNpIH>%n5^;pmsr6|f7x3Smqa_O9O(#jVKK8kX3uaNrqsb6u6T9-CP8*J#46JaqN9}Ph`uWQ@2#aOa zF4!wdykXB)6R#w6-NmGhi3bjfpK{h0s|o(t_@bm$p&>)<8D!1Gt(e8wZCfm| z#ov|9=g_H1cNZA-f!_JHSQF_7_#~!4CwR{kaWD^U`PQjFpcBBlI8??B$4jikpdH6e zg)uwgCfyI8qQ>34472WDWw(7XfqsdNo^Ka;l;)veH6$t!+7!+d{@>dSUf6L*9mlV| z=kEvRthYv!X|PzOF_+DO+j}>+!#rf-$@;K>l%PB5!=t7y)*1}HCTe(UE z&>-fzdowswjDOxu3}x*K2Yu01m|b4XH4I_I$>AFtId2=p5e^3OAccJv(U?|AXy}tq zmnBM#wf1?J?~BoDpIdU`I_Ek0nhVv5SUo{;hn|Z{^%*&TI#x1ZNs){7U@?TcfBjR? 
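A note on the CheckBack step that this patch adjusts: patch 062 compares a new core candidate against the members of the existing cluster, while patch 070 further below reverts to comparing against the parent's neighbours; the acceptance rule is the same 25% threshold either way. A hedged sketch of just that rule, with made-up distances:

    import numpy as np

    eps = 0.1

    # distances from the new core candidate to the topics it is checked against
    distances = np.array([0.05, 0.30, 0.40, 0.50, 0.60])

    close_mask = distances < eps
    if close_mask.mean() < 0.25:
        # fewer than 25% are close enough: 1 of 5 here, so this branch is taken
        print("start a new cluster")
    else:
        print("keep growing the current cluster")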
Date: Sun, 15 Sep 2019 11:33:12 +0200
Subject: [PATCH 063/166] removed unused results column and only CB-Distance to
 other cores

---
 gensim/models/ensemblelda.py      |  11 ++++-------
 gensim/test/test_data/ensemblelda | Bin 13683 -> 13705 bytes
 gensim/test/test_ensemblelda.py   |   3 +--
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 2f04fa640b..dfc90148b7 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -884,7 +884,7 @@ def _group_by_labels(self, results):
 
         Parameters
         ----------
-        results : {list of {'is_core', 'neighboring_labels', 'num_samples', 'label'}}
+        results : {list of {'is_core', 'neighboring_labels', 'label'}}
             After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results
             member, which can be used as the argument to this function. It's a list of infos gathered during the
             clustering step and each element in the list corresponds to a single topic.
@@ -1186,7 +1186,6 @@ def fit(self, amatrix):
             "is_core": False,
             "neighboring_labels": set(),
             "neighboring_topic_indices": set(),
-            "num_samples": 0,
             "label": None
         } for _ in range(len(amatrix))]
 
@@ -1220,11 +1219,10 @@ def scan_topic(topic_index, current_label=None):
             ]
             num_neighboring_topics = len(neighboring_topic_indices)
 
-            # derive a list of all topic_indices that belong to the cluster with the current_label
+            # derive a list of all topic_indices that belong to the cluster with the current_label and that are cores
             cluster_topic_indices = [
-                i
-                for i, topic in enumerate(results)
-                if topic["label"] == current_label and topic["label"] is not None
+                index for index, topic in enumerate(results)
+                if topic["label"] == current_label and topic["label"] is not None and topic["is_core"]
             ]
 
             # If the number of neighbor indices of a topic is large enough, it is considered a core.
             # This also takes neighbor indices that already are identified as core in count.
             if num_neighboring_topics >= self.min_samples:
                 # This topic is a core!
                 results[topic_index]["is_core"] = True
-                results[topic_index]["num_samples"] = num_neighboring_topics
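With the hunk above, only topics that are already marked as cores count towards the cluster that the CheckBack fraction is computed against. A small illustration with a toy results list (values are not from the patch):

    results = [
        {"label": 0, "is_core": True},
        {"label": 0, "is_core": False},  # border topic, no longer counted
        {"label": 1, "is_core": True},
        {"label": None, "is_core": False},
    ]
    current_label = 0

    cluster_topic_indices = [
        index for index, topic in enumerate(results)
        if topic["label"] == current_label and topic["label"] is not None and topic["is_core"]
    ]

    print(cluster_topic_indices)  # [0]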
diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 717c2916143305036b37fb3a7746d07b59c971b3..286d8dda8bee36a5089ebce4476eed98998ff7e4 100644
GIT binary patch
delta 4758
[base85-encoded binary data omitted]
delta 4766
[base85-encoded binary data omitted]
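The binary file updated above is the pickled reference EnsembleLda that the test suite compares freshly trained ensembles against (see the test_eLDA test further below). Roughly, assuming the gensim test utilities, it is used like this:

    import numpy as np
    from gensim.models import EnsembleLda
    from gensim.test.utils import datapath

    reference = EnsembleLda.load(datapath('ensemblelda'))

    # ttda holds the stacked topic-term distributions of all ensemble members
    print(reference.ttda.shape)

    # a freshly trained eLDA with the same random_state should reproduce it,
    # e.g. np.testing.assert_allclose(elda.ttda, reference.ttda, rtol=1e-05)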
zqEn-*Q(dI0)abq{G1rB+*#{l*y~^)eoi;mOQ(=fy-}P07YkVdeqBwqarCp^Vdihq+ bUNg7rszh$3;f6^fhT|JUEXR!e48Q*cRCz9% delta 4766 zcmY*ccT-f~mTejY1O*i(iDVH0L6j&cC@5g7Ljw&abFCx6e8IoSoKQ`_P#8=iIg58Hq(^GTG|~)NjsH z+)YDP#Rd+v>-Fj?@9>UgTEMGEB3JS%9n(5*aXYngFSjabfmW!U8iS~bSNW(Sl(rs~ zzc=Gq?mNqqSQli^VuC{*TzTL7@k2-MHu99^Q9ZSB6*oPn$~$2R_pET&(i4%JX_gNC zXpmQ5>vPvts-y$nGz|=-0Uojmk5p)AiaTSdrH$qJj2H>H+Uf` z_B>DXc610$s}p#dM&j(LL(hrETr)l@r1?ueDhcq@r)t`vR;Le2Uz(>X?x#7?9cx?E z<`{h;Q7GVVyg>bh@41x*6{^U1IQ)aRPshJ-r2%JNXfNQ_%1QB}UH3BooPiQqPT;$uGGsmJE$jHy|;hP$;~ zC0-13hBvb*A764qCaskvTjs}PinMKBoyU0I0dGd^o-_RtF zBdOD(hI^r8;~@=m`(>J@HQwa$SQ?<(5dKEvRLK)=w8#BC`OKCUuVr54QPnf*K^Df0 zsgH)Zo~Eed8FhNS$yM9)VfMY)prQvnn#wo7CX&h?^{Um}+Q+7gd1gFpu*F9r+`Y162Kj2BKV*fWtO77tx|?a4~(O z_3&rZn!(e?G?o=8mir(cJ<&lX@A*&y&OYsO^HrXshOh5jc>OJ-@;qPhYG!H{)m^4m zI`;5HwvVBuu@v}UJSDRXC`yh+$})JFtMA>U4T}rBV(60sz|8R??WO3KDXwwVo5y#( zs6A8W;DGXNH^oQV8HiFX>Wi)W}Au^7yDM8I3U z-;*{A(QSf5Yg7|KN7l&pl$sBzjYi`ErfHr~KPdf*YH#B@3)oLu91NWZeHQMpvgD|HEqcmHCYjRrTp*u-58|1<@AL4Ry zxr^%ZXe$}$-S9)jt>SN;mbqQz!RNHE0HSu{(|I+i!1E!m^RgjrQzNf^hFE7{u2E!M zG|kc2ZQcR^#xlwup_WI3=`bBNv5%cjD;_-+RO)O?RZ=ecxz0D3mNXSkbdVo=&6Znb z9&yxli>9n(Zgf=el;?bD?O6bAeiW0+sUg+NJ=l^DZ&0ab1(xjBgU-pW%@_G8%y&-Pgtw2cnS8{yP%6G`2)m(I1koiQG4tAfokr%9kd6b8O_$Yzh@~|>20Qee6Dv+b|4a;ez_JL(dT1$>Yi|c6P|F3H z4CF%;<|a+Ou%nK zA~alHz+0jBBVx>{-TyH+=9Ffs5v18$_IJ_0`cV-*GTMmdwleyTra2uVr)?kFdCYTH z{N!aus`!BS7Ties`p`feEY&{}+=84uAER~OTy z1@)KliVgL~@s^w_-K;^U4UXLRGLSm_c@tP$D&^x~&}Ewi>U!@1PuX$@&pFVcRP(&& z6HKFEQ7vtSMMhFnET250E}B7Cnt`c}5Se^!z?&jo7cFR#YgBDyufcF8=s-+GW+6aot;_p270nK)R)(W%n?R&%%ct9O; z7xz&W5;zH+Gx)U=&kHopOAaPqjPqc8r)ZT}3IMddxK497sYeK>{ZN`!1>pRyw_QQ< ziz{52!ec2ph13#GjagjHHQXfVaZ@_?e7?aQV1NOX{G^gvXi<$w&tC&~Oa@0AfhV>1 zdDaLm=R1}LU`#EoRIt{@p4x8ln2eifNhGa%yh|!rwUIa7HC835v|YlLZ>*C%VQHN_^aIWJLZvn} z4fyupY9}>+E8-_Qd^q9{*zKfVRskRb_e5&QNg8P2ieX+ z!S13vpx1&_zdxkqa%zmHrn1laylUv6ho?^;8ub3@fHyALAvs^d(R*e*VPgwAZ#TGl z#gUs-Gy$mv;Nge5oKDVvKyLTIJ>Trmpv8?S#+%_5kO|We1}$v^&kIg+9+rXGr(8e) zjdYNJmtD8zO=`z>El=e0Ah&y5#Ny=BgnX{f$Mde1cl5tLrk)6HlE=_;5TB@NI{I@c z_gGVH5q%Fqh!551pmp3D8Xt`k8sHI;kDed{wJ((D=aoEaN;@*dVHt1ic^eqilMw`S zx)7LoM6V_lWwF4ArA2r<_u}b(fW|iDE-x$*$~>t{jx^1qM;-BCvA`2(taE-5p*-+_ zYTZ8}DpO_!3O|~TFnnQ1^^bWbi6_P4a}_kiT0DZr47kzmx%A}g3(=RZs?Ff_FBi~g z25@#2;~;JxiL{OLaG`FsT%h(;?)1dk9_G zz@E0wpr;K`qXzI;;yu*`YQMvS;%Q!uq`^{zb+`m3H`7vb=@ZbWulo+$-Az{v~sZ1^01vBrm5rpFwf1@H~%aWqwKyq(fu3Ob=QO z;X|jl)b=pvj`iKZ#H-Fg*1DO*0<=eB>uc&Oc||`o&%Xd_T^>=~6YTr+)9VTpRj-2; z$YkO)9zbIN=DK$&(lVIsPTLyBLg z1pj!TtJ6UaDCQf`HU*+wjJO6aN3?166!=i<;fn;;qtVZN^m@u#k!L;RZa{OJG6wJB zIi5)61|@pd_$|5PBk=FxLn}KCRozMB)pO@u!C=kQ=ZAjKYvV!()SXN1>JC+5y}C0& z-K9}?XDU>ivz8XNqNz+KD=7YuUX=0SW4^LT{Rf49$|R0Q|G!Z%z>0KMIHeOX#8~`QFv92mE*Fd?8CWlz))S>$Ofdhc z!&H(}$P}j(M#?(j3}y+!S!wsD4n3XKf*DqH=goBj=P*LYd3=89xFE?$c39|SE@J$t z;*ul=H{?=C1kqBrZ6$60#t&<(=+4{d1Z*)v$7Ote>9CVzBtNd`Wb85i)ZrkB!3}+# z8;-hNCu#RLZa8B_)SY+HNw{K!j;r|m(&2`g&W>w38F!37bzGOkbbc7X4-eh0r?mSU zH@vW-JAOka;EfR~eDL|HLIgyFo05>^hp$fP7RH}CZcB1HKTbi1pKjM*+Wn0m0a(!; z57Y?+VJub_XQr3~xJ8A6Mwy+Sm0w(>Q3^#uFgDYKI~d)+3L$#3?yB3)Vko8xJ=O2i zguBv)>Thx-MC3~G%$1B$0H=4xG4AMS*5PpiZmIS`O0)n;m7w`MZyEfs&Z9% z3R#^HA^lc22$A?E2oEv-Ou?CY;nAkj&sfhcN|Gq77ov3%F&H(%V~m-_ncn{cX&L|n diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index a87af416cf..f50b6fda0b 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -30,8 +30,7 @@ def setUp(self): self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state) - 
self.eLDA.save("asdf") - + self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state, memory_friendly_ttda=False) From 70a06602a7b51ff05802dd21f9658a196b55770d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 15 Sep 2019 13:31:19 +0200 Subject: [PATCH 064/166] tox whitespace --- gensim/test/test_ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index f50b6fda0b..84bc8340d5 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -30,7 +30,7 @@ def setUp(self): self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state) - + self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state, memory_friendly_ttda=False) From c8b23ab3f5d578e571ab948e501f6eb71af250f6 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 15 Sep 2019 13:46:04 +0200 Subject: [PATCH 065/166] cleaned obsolete stuff from cbdbscan --- gensim/models/ensemblelda.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index dfc90148b7..64b5a25c17 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1237,7 +1237,6 @@ def scan_topic(topic_index, current_label=None): # next_label is initialized with 0 in fit() for the first cluster current_label = self.next_label self.next_label += 1 - cluster_topic_indices = [] else: # In case the core has a parent, check the distance to the existing cluster (since the matrix is @@ -1253,11 +1252,12 @@ def scan_topic(topic_index, current_label=None): # start new cluster by changing current_label current_label = self.next_label self.next_label += 1 - cluster_topic_indices = [] - # TODO changed in order to make the parent_id parameter obsoloete because (i think) it (the appending) - # can easily done before calling scan_topic as well. - cluster_topic_indices.append(topic_index) + # TODO parent_neighbors and parent_id got obsolete. 1. the appending of parent_id can be done + # before calling scan_topic and 2. using parent_neighbors the way it was used was not conform + # with the algorithm description. scan_topic gets the list of topic_indices of the current cluster + # out of results now. 
+ results[topic_index]["label"] = current_label for neighboring_topic_index in neighboring_topic_indices: From 6f98624b598833676bd9afc7286bff0ef521ece4 Mon Sep 17 00:00:00 2001 From: minze Date: Mon, 28 Oct 2019 00:29:05 +0100 Subject: [PATCH 066/166] idk --- gensim/models/ensemblelda.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 64b5a25c17..20ba1753c3 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1195,7 +1195,7 @@ def fit(self, amatrix): np.fill_diagonal(amatrix_copy, 1) min_distance_per_topic = [(distance, index) for index, distance in enumerate(amatrix_copy.min(axis=1))] - min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x) + min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x[0]) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] def scan_topic(topic_index, current_label=None): @@ -1212,11 +1212,9 @@ def scan_topic(topic_index, current_label=None): The label of the cluster that might be suitable for topic_index """ - neighboring_topic_indices = [ - candidate_topic_index - for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps) - if is_neighbour - ] + neighbors_sorted = sorted([(distance, index) for index, distance in enumerate(amatrix_copy[topic_index])], key=lambda x: x[0]) + neighboring_topic_indices = [index for distance, index in neighbors_sorted if distance < self.eps] + num_neighboring_topics = len(neighboring_topic_indices) # derive a list of all topic_indices that belong to the cluster with the current_label and that are cores From 799c11284a75d170d4fc29f27a92cca3c1c7bb72 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Mon, 28 Oct 2019 22:42:41 +0100 Subject: [PATCH 067/166] updated doc-strings to be clearer and better reflect the truth --- gensim/models/ensemblelda.py | 84 ++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 37 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 20ba1753c3..a4ce3b4aa2 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -974,16 +974,18 @@ def _contains_isolated_cores(self, label, cluster, min_cores): def _generate_stable_topics(self, min_cores=None): """Generate stable topics out of the clusters. - This function is the last step that has to be done in the ensemble.Stable topics can be retrieved afterwards - using get_topics(). + The function finds clusters of topics using a variant of DBScan. If a cluster has enough core topics + (c.f. parameter min_cores), then this cluster represents a stable topic. The stable topic is specifically + calculated as the average over all topic-term distributions of the core topics in the cluster. + + This function is the last step that has to be done in the ensemble. After this step is complete, + Stable topics can be retrieved afterwards using the get_topics() method. Parameters ---------- min_cores : int - how many cores a cluster has to have, to be treated as stable topic. That means, how many topics - that look similar have to be present, so that the average topic in those is used as stable topic. - - defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) + Minimum number of core topics needed to form a cluster that represents a stable topic. 
+ Using None defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) """ # min_cores being 0 makes no sense. there has to be a core for a cluster @@ -1055,7 +1057,7 @@ def _generate_stable_topics(self, min_cores=None): self.stable_topics = stable_topics def recluster(self, eps=0.1, min_samples=None, min_cores=None): - """Quickly change the results of the ensemble by reapplying clustering and stable topic generation. + """Reapply CBDBSCAN clustering and stable topic generation. Stable topics can be retrieved using get_topics(). @@ -1138,24 +1140,32 @@ def id2word(self): class CBDBSCAN(): """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). - The algorithm works based on - DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a "nearby" point is, and the - minimum number of nearby points needed to label a candidate datapoint a core of a cluster. - (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html). + The algorithm works based on DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a + "nearby" point is, and the minimum number of nearby points needed to label a candidate datapoint a core of a + cluster. (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html). The algorithm works as follows: - 1. (A)symmetric distance matrix provided at fit-time. - For the sake of example below, assume the there are only three topics (amatrix contains distances with dim 3x3), - T_1, T_2, and T_3: - 2. Start by scanning a candidate topic - (e.g. T_1) - 3. Check which topics are nearby using `self.eps` as a threshold and call them neighbours - (e.g. assume T_2, and T_3 are nearby and become neighbours) + 1. (A)symmetric distance matrix provided at fit-time (called `amatrix`). + For the sake of example below, assume the there are only five topics (amatrix contains distances with dim 5x5), + T_1, T_2, T_3, T_4, T_5: + 2. Start by scanning a candidate topic with respect to a parent topic + (e.g. T_1 with respect to parent None) + 3. Check which topics are nearby the candidate topic using `self.eps` as a threshold and call them neighbours + (e.g. assume T_3, T_4, and T_5 are nearby and become neighbours) 4. If there are more neighbours than `self.min_samples`, the candidate topic becomes a core candidate for a cluster - (e.g. if `min_samples`<=2 then T_1 becomes the first core of a cluster) - 5. CheckBack (CB) Step: Check if at least 25% of other cluster cores are nearby to core candidate - (e.g. at this stage in the example, the cluster is empty, so checkback passes) - 6. Recursively scan the next nearby topic (e.g. scan T_2) - repeat step 2.-6. + (e.g. if `min_samples`=1, then T_1 becomes the first core of a cluster) + 5. If candidate is a core, CheckBack (CB) to find the fraction of neighbours that are either the parent or the + parent's neighbours. If this fraction is more than 75%, give the candidate the same label as its parent. + (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given) + 6. If candidate is a core, recursively scan the next nearby topic (e.g. scan T_3) labeling the previous topic as + the parent and the previous neighbours as the parent_neighbours - repeat steps + 2.-6. + 2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5) + 3. (e.g. T5 is the only neighbour) + 4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core) + 5. (e.g. 
CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3. + Therefore the candidate T_3 does NOT get the same label as its parent T_1) + 6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for unreliable topics made of a composition of multiple reliable topics (something that occurs often LDA models that is @@ -1182,7 +1192,7 @@ def fit(self, amatrix): """Apply the algorithm to an asymmetric distance matrix.""" self.next_label = 0 - results = [{ + topic_clustering_results = [{ "is_core": False, "neighboring_labels": set(), "neighboring_topic_indices": set(), @@ -1195,13 +1205,13 @@ def fit(self, amatrix): np.fill_diagonal(amatrix_copy, 1) min_distance_per_topic = [(distance, index) for index, distance in enumerate(amatrix_copy.min(axis=1))] - min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x[0]) + min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda distance: distance[0]) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] def scan_topic(topic_index, current_label=None): """Extend the cluster in one direction. - Results are accumulated in result, a variable outside of this function. + Results are accumulated in topic_clustering_results, a variable outside of this function. Parameters ---------- @@ -1219,7 +1229,7 @@ def scan_topic(topic_index, current_label=None): # derive a list of all topic_indices that belong to the cluster with the current_label and that are cores cluster_topic_indices = [ - index for index, topic in enumerate(results) + index for index, topic in enumerate(topic_clustering_results) if topic["label"] == current_label and topic["label"] is not None and topic["is_core"] ] @@ -1227,7 +1237,7 @@ def scan_topic(topic_index, current_label=None): # This also takes neighbor indices that already are identified as core in count. if num_neighboring_topics >= self.min_samples: # This topic is a core! - results[topic_index]["is_core"] = True + topic_clustering_results[topic_index]["is_core"] = True # if current_label is none, then this is the first core # of a new cluster (hence next_label is used) @@ -1252,31 +1262,31 @@ def scan_topic(topic_index, current_label=None): self.next_label += 1 # TODO parent_neighbors and parent_id got obsolete. 1. the appending of parent_id can be done - # before calling scan_topic and 2. using parent_neighbors the way it was used was not conform - # with the algorithm description. scan_topic gets the list of topic_indices of the current cluster - # out of results now. + # before calling scan_topic and 2. using parent_neighbors the way it was used did not conform + # with the algorithm description. scan_topic gets the list of topic_indices of the current cluster + # out of topic_clustering_results now. 
- results[topic_index]["label"] = current_label + topic_clustering_results[topic_index]["label"] = current_label for neighboring_topic_index in neighboring_topic_indices: - if results[neighboring_topic_index]["label"] is None: + if topic_clustering_results[neighboring_topic_index]["label"] is None: ordered_min_similarity.remove(neighboring_topic_index) # try to extend the cluster into the direction of the neighbor scan_topic(neighboring_topic_index, current_label) - results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) - results[neighboring_topic_index]["neighboring_labels"].add(current_label) + topic_clustering_results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) + topic_clustering_results[neighboring_topic_index]["neighboring_labels"].add(current_label) else: # this topic is not a core! if current_label is None: - results[topic_index]["label"] = -1 + topic_clustering_results[topic_index]["label"] = -1 else: - results[topic_index]["label"] = current_label + topic_clustering_results[topic_index]["label"] = current_label # elements are going to be removed from that array in scan_topic, do until it is empty while len(ordered_min_similarity) != 0: next_topic_index = ordered_min_similarity.pop(0) scan_topic(next_topic_index) - self.results = results + self.results = topic_clustering_results From 9866ef6a8f38b0941fde1baa33722f6134b91fc1 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 9 Nov 2019 21:29:04 +0900 Subject: [PATCH 068/166] make flake8 happy --- gensim/models/ensemblelda.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a4ce3b4aa2..d8903a2c62 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1222,7 +1222,13 @@ def scan_topic(topic_index, current_label=None): The label of the cluster that might be suitable for topic_index """ - neighbors_sorted = sorted([(distance, index) for index, distance in enumerate(amatrix_copy[topic_index])], key=lambda x: x[0]) + neighbors_sorted = sorted( + [ + (distance, index) + for index, distance in enumerate(amatrix_copy[topic_index]) + ], + key=lambda x: x[0], + ) neighboring_topic_indices = [index for distance, index in neighbors_sorted if distance < self.eps] num_neighboring_topics = len(neighboring_topic_indices) From ad97ead41fd999b280ea92ae008f398050d3a279 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 9 Nov 2019 21:58:42 +0900 Subject: [PATCH 069/166] fix trailing whitespace --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d8903a2c62..7d63f7fefb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1224,9 +1224,9 @@ def scan_topic(topic_index, current_label=None): """ neighbors_sorted = sorted( [ - (distance, index) + (distance, index) for index, distance in enumerate(amatrix_copy[topic_index]) - ], + ], key=lambda x: x[0], ) neighboring_topic_indices = [index for distance, index in neighbors_sorted if distance < self.eps] From 765b912352250e8539c4500b78811e003490be11 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 01:11:40 +0100 Subject: [PATCH 070/166] reverted some changes --- gensim/models/ensemblelda.py | 128 +++++++++++++++-------------------- 1 file changed, 53 insertions(+), 75 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 7d63f7fefb..ba926c9a5a 
100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -790,58 +790,61 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s Asymmetric distance matrix of size len(ttda1) by len(ttda2). """ - if method not in ["mass", "rank"]: - raise ValueError("method {} unknown".format(method)) + # initialize the distance matrix. ndarray is faster than zeros + distances = np.ndarray((len(ttda1), len(ttda2))) - # select masking method: - def mass_masking(a): - """Original masking method. Returns a new binary mask.""" - sorted_a = np.sort(a)[::-1] - largest_mass = sorted_a.cumsum() < threshold - smallest_valid = sorted_a[largest_mass][-1] - return a >= smallest_valid + if ttda1.shape[0] > 0 and ttda2.shape[0] > 0: + # the worker might not have received a ttda because it was chunked up too much + + if method not in ["mass", "rank"]: + raise ValueError("method {} unknown".format(method)) - def rank_masking(a): - """Faster masking method. Returns a new binary mask.""" - return a > np.sort(a)[::-1][int(len(a) * threshold)] + # select masking method: + def mass_masking(a): + """Original masking method. Returns a new binary mask.""" + sorted_a = np.sort(a)[::-1] + largest_mass = sorted_a.cumsum() < threshold + smallest_valid = sorted_a[largest_mass][-1] + return a >= smallest_valid - create_mask = {"mass": mass_masking, "rank": rank_masking}[method] + def rank_masking(a): + """Faster masking method. Returns a new binary mask.""" + return a > np.sort(a)[::-1][int(len(a) * threshold)] - # some help to find a better threshold by useful log messages - avg_mask_size = 0 + create_mask = {"mass": mass_masking, "rank": rank_masking}[method] - # initialize the distance matrix. ndarray is faster than zeros - distances = np.ndarray((len(ttda1), len(ttda2))) + # some help to find a better threshold by useful log messages + avg_mask_size = 0 - # now iterate over each topic - for ttd1_idx, ttd1 in enumerate(ttda1): - # create mask from ttd1 that removes noise from a and keeps the largest terms - mask = create_mask(ttd1) - ttd1_masked = ttd1[mask] + # now iterate over each topic + for ttd1_idx, ttd1 in enumerate(ttda1): + # create mask from ttd1 that removes noise from a and keeps the largest terms + mask = create_mask(ttd1) + ttd1_masked = ttd1[mask] - avg_mask_size += mask.sum() + avg_mask_size += mask.sum() - # now look at every possible pair for topic a: - for ttd2_idx, ttd2 in enumerate(ttda2): - # distance to itself is 0 - if ttd1_idx + start_index == ttd2_idx: - distances[ttd1_idx][ttd2_idx] = 0 - continue + # now look at every possible pair for topic a: + for ttd2_idx, ttd2 in enumerate(ttda2): + # distance to itself is 0 + if ttd1_idx + start_index == ttd2_idx: + distances[ttd1_idx][ttd2_idx] = 0 + continue - # now mask b based on a, which will force the shape of a onto b - ttd2_masked = ttd2[mask] + # now mask b based on a, which will force the shape of a onto b + ttd2_masked = ttd2[mask] - # Smart distance calculation avoids calculating cosine distance for highly masked topic-term - # distributions that will have distance values near 1. - if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD: - distance = 1 - else: - distance = cosine(ttd1_masked, ttd2_masked) + # Smart distance calculation avoids calculating cosine distance for highly masked topic-term + # distributions that will have distance values near 1. 
+ if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD: + distance = 1 + else: + distance = cosine(ttd1_masked, ttd2_masked) - distances[ttd1_idx][ttd2_idx] = distance + distances[ttd1_idx][ttd2_idx] = distance - percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) - logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) + percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) + logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) return distances @@ -941,8 +944,6 @@ def _aggregate_topics(self, grouped_by_labels): neighboring_labels = [x for x in neighboring_labels if len(x) > 0] - # TODO why is len(neighboring_labels) equal to num_cores? - sorted_clusters.append({ "max_num_neighboring_labels": max_num_neighboring_labels, "neighboring_labels": neighboring_labels, @@ -954,21 +955,19 @@ def _aggregate_topics(self, grouped_by_labels): key=lambda cluster: ( cluster["max_num_neighboring_labels"], cluster["label"] - ), reverse=True) + ), reverse=False) return sorted_clusters def _remove_from_all_sets(self, label, clusters): """Remove a label from every set in "neighboring_labels" for each core in clusters.""" - for a in clusters: - if label in a["neighboring_labels"]: - a["neighboring_labels"].remove(label) + for cluster in clusters: + for neighboring_labels_set in cluster["neighboring_labels"]: + if label in neighboring_labels_set: + neighboring_labels_set.remove(label) def _contains_isolated_cores(self, label, cluster, min_cores): """Check if the cluster has at least min_cores of cores that belong to no other cluster.""" - # TODO is supposed to check for cores, but checks neighboring_labels. - # Funnily, it actually seems to be cores (see output of next line) but it doesn't read that way - # print(len(cluster["neighboring_labels"]), cluster["num_cores"]) return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores def _generate_stable_topics(self, min_cores=None): @@ -1009,17 +1008,10 @@ def _generate_stable_topics(self, min_cores=None): for i, cluster in enumerate(sorted_clusters): cluster["is_valid"] = None - # TODO removed first rule because when amount_parent_labels is 0, then there is no - # topic in the cluster than has a parent, hence the size is either 0 or 1, - # which can be covered by the next rule. - if cluster["num_cores"] < min_cores: cluster["is_valid"] = False self._remove_from_all_sets(cluster["label"], sorted_clusters) - # TODO I don't think it makes a difference if _contains_isolated_cores (formerly easy valid) is done (here - # and afterwards) or (only afterwards) - # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any # other cluster. for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: @@ -1208,7 +1200,7 @@ def fit(self, amatrix): min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda distance: distance[0]) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] - def scan_topic(topic_index, current_label=None): + def scan_topic(topic_index, current_label=None, parent_neighbors=None): """Extend the cluster in one direction. Results are accumulated in topic_clustering_results, a variable outside of this function. 
@@ -1233,12 +1225,6 @@ def scan_topic(topic_index, current_label=None): num_neighboring_topics = len(neighboring_topic_indices) - # derive a list of all topic_indices that belong to the cluster with the current_label and that are cores - cluster_topic_indices = [ - index for index, topic in enumerate(topic_clustering_results) - if topic["label"] == current_label and topic["label"] is not None and topic["is_core"] - ] - # If the number of neighbor indices of a topic is large enough, it is considered a core. # This also takes neighbor indices that already are identified as core in count. if num_neighboring_topics >= self.min_samples: @@ -1253,32 +1239,24 @@ def scan_topic(topic_index, current_label=None): self.next_label += 1 else: - # In case the core has a parent, check the distance to the existing cluster (since the matrix is + # In case the core has a parent, check the distance to the parents neighbors (since the matrix is # asymmetric, it takes return distances into account here) # If less than 25% of the elements are close enough, then create a new cluster rather than further # growing the current cluster in that direction. - close_members_mask = amatrix_copy[topic_index][cluster_topic_indices] < self.eps + close_parent_neighbors_mask = amatrix_copy[topic_index][parent_neighbors] < self.eps - # TODO changed because it is supposed to check the distance to the existing cluster, not - # to the neighbors of the parent - - if close_members_mask.mean() < 0.25: + if close_parent_neighbors_mask.mean() < 0.25: # start new cluster by changing current_label current_label = self.next_label self.next_label += 1 - # TODO parent_neighbors and parent_id got obsolete. 1. the appending of parent_id can be done - # before calling scan_topic and 2. using parent_neighbors the way it was used did not conform - # with the algorithm description. scan_topic gets the list of topic_indices of the current cluster - # out of topic_clustering_results now. - topic_clustering_results[topic_index]["label"] = current_label for neighboring_topic_index in neighboring_topic_indices: if topic_clustering_results[neighboring_topic_index]["label"] is None: ordered_min_similarity.remove(neighboring_topic_index) # try to extend the cluster into the direction of the neighbor - scan_topic(neighboring_topic_index, current_label) + scan_topic(neighboring_topic_index, current_label, neighboring_topic_indices + [topic_index]) topic_clustering_results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) topic_clustering_results[neighboring_topic_index]["neighboring_labels"].add(current_label) From 32ff401c60321b31ca17e59d047b1243f3a7b965 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 01:23:34 +0100 Subject: [PATCH 071/166] comma, newline, comment --- gensim/models/ensemblelda.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index ba926c9a5a..ae223f25ad 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -102,6 +102,7 @@ from multiprocessing import Process, Pipe, ProcessError import numpy as np from scipy.spatial.distance import cosine + from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel @@ -610,7 +611,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): p.terminate() def _generate_topic_models(self, num_models, random_states=None, pipe=None): - """Train the topic models, that form the ensemble. 
+ """Train the topic models that form the ensemble. Parameters ---------- @@ -1005,9 +1006,8 @@ def _generate_stable_topics(self, min_cores=None): sorted_clusters = self._aggregate_topics(grouped_by_labels) - for i, cluster in enumerate(sorted_clusters): + for cluster in sorted_clusters: cluster["is_valid"] = None - if cluster["num_cores"] < min_cores: cluster["is_valid"] = False self._remove_from_all_sets(cluster["label"], sorted_clusters) @@ -1021,7 +1021,7 @@ def _generate_stable_topics(self, min_cores=None): else: cluster["is_valid"] = False # This modifies parent labels which is important in _contains_isolated_cores, so the result depends on - # where to start. That's why sorted_clusters is sorted, so it starts with the most cluttered cluster. + # where to start. self._remove_from_all_sets(label, sorted_clusters) # list of all the label numbers that are valid From b00e010939b6e8f119831b9f60157303a9777db9 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 01:49:08 +0100 Subject: [PATCH 072/166] whitespace --- gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index ae223f25ad..305a08437f 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -796,7 +796,7 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s if ttda1.shape[0] > 0 and ttda2.shape[0] > 0: # the worker might not have received a ttda because it was chunked up too much - + if method not in ["mass", "rank"]: raise ValueError("method {} unknown".format(method)) From c58c30e863289ea827aad2ffba15671b6a3519a9 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 19:57:01 +0100 Subject: [PATCH 073/166] citation, reference, authors --- gensim/models/ensemblelda.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 305a08437f..38c9d14916 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1,8 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Authors: Tobias Brigl , Alex Loosley , -# Stephan Sahm , Alex Salles , Data Reply Munich +# Authors: Tobias Brigl , Alex Salles , +# Alex Loosley , Data Reply Munich """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. @@ -81,14 +81,15 @@ Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 .. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. - Ingolstadt: Technische Hochschule. Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ + Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from: + https://www.hip70890b.de/machine_learning/ensemble_LDA/ Citation -------- At the moment, there is no paper associated to ensemble LDA but we are working on publicly releasing Tobi Brigl's bachelor thesis on the topic. In the meantime, please include a mention of us (Brigl, T.; Salles, A.; Loosley, A) and -a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if you use -this work in a published or open-source project. +a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if this work +is presented, further developed, or used as the basis for a published result. 
Other Notes ----------- From 637640c816c13748c7f32451ebe02a5eed86cb75 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 21:40:08 +0100 Subject: [PATCH 074/166] potential fix for utils saveload when a class is in __dict__ --- .../{Opinosis.ipynb => ensemble_lda_with_opinosis.ipynb} | 2 +- gensim/utils.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) rename docs/notebooks/{Opinosis.ipynb => ensemble_lda_with_opinosis.ipynb} (99%) diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb similarity index 99% rename from docs/notebooks/Opinosis.ipynb rename to docs/notebooks/ensemble_lda_with_opinosis.ipynb index a828b2adf4..72ad2a26ba 100644 --- a/docs/notebooks/Opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -168,7 +168,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/gensim/utils.py b/gensim/utils.py index cf7eca6c90..3831c9a8ca 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -18,7 +18,9 @@ from htmlentitydefs import name2codepoint as n2cp try: import cPickle as _pickle + print('import cpickle') except ImportError: + print('import pickle') import pickle as _pickle import re @@ -542,6 +544,7 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately) compress, subname = SaveLoad._adapt_by_suffix(fname) + print(compress, subname) restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) @@ -603,9 +606,10 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, recursive_saveloads = [] restores = [] for attrib, val in iteritems(self.__dict__): - if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading + if hasattr(val, '_save_specials') and type(val) != type: # better than 'isinstance(val, SaveLoad)' if IPython reloading recursive_saveloads.append(attrib) cfname = '.'.join((fname, attrib)) + print(val, compress, subname) restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname)) try: @@ -688,10 +692,12 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore= Load object from file. 
""" + print(fname_or_handle, type(fname_or_handle), separately, sep_limit, ignore, pickle_protocol) try: _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) logger.info("saved %s object", self.__class__.__name__) except TypeError: # `fname_or_handle` does not have write attribute + print("failed") self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) From d7efe3a12fab4e7a02eb6141ddc8ae74d97a4df8 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 22:01:28 +0100 Subject: [PATCH 075/166] commented out eLDA tests, tox8 --- gensim/models/ensemblelda.py | 38 +- gensim/test/test_ensemblelda.py | 696 ++++++++++++++++---------------- gensim/utils.py | 3 +- 3 files changed, 354 insertions(+), 383 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 38c9d14916..d9fa3c9a1b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -101,16 +101,18 @@ import logging import os from multiprocessing import Process, Pipe, ProcessError + import numpy as np from scipy.spatial.distance import cosine from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel +from gensim.utils import SaveLoad logger = logging.getLogger(__name__) -class EnsembleLda(): +class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. Extracts reliable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that @@ -182,6 +184,7 @@ def __init__(self, topic_model_class="lda", num_models=3, Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble. """ + # INTERNAL PARAMETERS # Set random state # nps max random state of 2**32 - 1 is too large for windows: @@ -488,39 +491,6 @@ def add_model(self, target, num_new_models=None): # tell recluster that the distance matrix needs to be regenerated self.asymmetric_distance_matrix_outdated = True - def save(self, fname): - """Save the ensemble to a file. - - Parameters - ---------- - fname : str - Path to the system file where the model will be persisted. - - """ - logger.info("saving %s object to %s", self.__class__.__name__, fname) - - utils.pickle(self, fname) - - @staticmethod - def load(fname): - """Load a previously stored ensemble from disk. - - Parameters - ---------- - fname : str - Path to file that contains the needed object. - - Returns - ------- - A previously saved ensembleLda object - - """ - logger.info("loading %s object from %s", EnsembleLda.__name__, fname) - - eLDA = utils.unpickle(fname) - - return eLDA - def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. 
diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 84bc8340d5..540410e5a0 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -1,348 +1,348 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Tobias B - -""" -Automated tests for checking the EnsembleLda Class -""" - - -import logging -import unittest - -import numpy as np -from copy import deepcopy - -from gensim.models import EnsembleLda, LdaMulticore -from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary - -num_topics = 2 -num_models = 4 -passes = 50 - - -class TestModel(unittest.TestCase): - def setUp(self): - # same configuration for each model to make sure - # the topics are equal - random_state = 0 - - self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state) - - self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, - memory_friendly_ttda=False) - - def check_ttda(self, ensemble): - """tests the integrity of the ttda of any ensemble""" - self.assertGreater(len(ensemble.ttda), 0) - a = ensemble.ttda.sum(axis=1) - b = np.ones(len(ensemble.ttda)).astype(np.float32) - np.testing.assert_allclose(a, b, rtol=1e-04) - - def test_eLDA(self): - # given that the random_state doesn't change, it should - # always be 2 detected topics in this setup. - self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) - self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) - self.check_ttda(self.eLDA) - - # reclustering shouldn't change anything without - # added models or different parameters - self.eLDA.recluster() - - self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) - self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) - self.check_ttda(self.eLDA) - - # compare with a pre-trained reference model - reference = EnsembleLda.load(datapath('ensemblelda')) - np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05) - # small values in the distance matrix tend to vary quite a bit around 2%, - # so use some absolute tolerance measurement to check if the matrix is at least - # close to the target. - atol = reference.asymmetric_distance_matrix.max() * 1e-05 - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, - reference.asymmetric_distance_matrix, atol=atol) - - def test_clustering(self): - # the following test is quite specific to the current implementation and not part of any api, - # but it makes improving those sections of the code easier as long as sorted_clusters and the - # cluster_model results are supposed to stay the same. Potentially this test will deprecate. - - reference = EnsembleLda.load(datapath('ensemblelda')) - cluster_model_results = deepcopy(reference.cluster_model.results) - sorted_clusters = deepcopy(reference.sorted_clusters) - stable_topics = deepcopy(reference.get_topics()) - - # continue training with the distance matrix of the pretrained reference and see if - # the generated clusters match. 
- reference.asymmetric_distance_matrix_outdated = True - reference.recluster() - - self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) - self.assertEqual(reference.sorted_clusters, sorted_clusters) - np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) - - def test_not_trained(self): - # should not throw errors and no training should happen - - # 0 passes - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=0, num_models=num_models, random_state=0) - self.assertEqual(len(eLDA.ttda), 0) - - # no corpus - eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=0) - self.assertEqual(len(eLDA.ttda), 0) - - # 0 iterations - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - iterations=0, num_models=num_models, random_state=0) - self.assertEqual(len(eLDA.ttda), 0) - - # 0 models - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=0, random_state=0) - self.assertEqual(len(eLDA.ttda), 0) - - def test_memory_unfriendly(self): - # at this point, self.eLDA_mu and self.eLDA are already trained - # in the setUpClass function - # both should be 100% similar (but floats cannot - # be compared that easily, so check for threshold) - self.assertEqual(len(self.eLDA_mu.tms), num_models) - np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) - self.check_ttda(self.eLDA_mu) - - def test_generate_gensim_rep(self): - gensimModel = self.eLDA.generate_gensim_representation() - topics = gensimModel.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) - - def assert_cluster_results_equal(self, a, b): - """compares important attributes of the cluster results""" - np.testing.assert_array_equal([row["label"] for row in a], - [row["label"] for row in b]) - np.testing.assert_array_equal([row["is_core"] for row in a], - [row["is_core"] for row in b]) - - def test_persisting(self): - fname = get_tmpfile('gensim_models_ensemblelda') - self.eLDA.save(fname) - loaded_eLDA = EnsembleLda.load(fname) - # storing the ensemble without memory_friendy_ttda - self.eLDA_mu.save(fname) - loaded_eLDA_mu = EnsembleLda.load(fname) - - # was it stored and loaded correctly? 
- # memory friendly - loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() - topics = loaded_eLDA_representation.get_topics() - ttda = loaded_eLDA.ttda - amatrix = loaded_eLDA.asymmetric_distance_matrix - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) - - a = self.eLDA.cluster_model.results - b = loaded_eLDA.cluster_model.results - - self.assert_cluster_results_equal(a, b) - - # memory unfriendly - loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() - topics = loaded_eLDA_mu_representation.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) - - def test_multiprocessing(self): - # same configuration - random_state = 0 - - # use 3 processes for the ensemble and the distance, - # so that the 4 models and 8 topics cannot be distributed - # to each worker evenly - workers = 3 - - # memory friendly. contains List of topic word distributions - eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_topics, passes=passes, num_models=num_models, - random_state=random_state, ensemble_workers=workers, distance_workers=workers) - - # memory unfriendly. contains List of models - eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_topics, passes=passes, num_models=num_models, - random_state=random_state, ensemble_workers=workers, distance_workers=workers, - memory_friendly_ttda=False) - - np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) - np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) - - def test_add_models(self): - # same configuration - num_models = self.eLDA.num_models - - # make sure countings and sizes after adding are correct - # create new models and add other models to them. - - # there are a ton of configurations for the first parameter possible, - # try them all - - # quickly train something that can be used for counting results - num_new_models = 3 - num_new_topics = 3 - - # 1. 
memory friendly - eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, - workers=3, ensemble_workers=2) - - # 1.1 ttda - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(self.eLDA.ttda) - self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix - - # 1.2 an ensemble - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(self.eLDA, 5) - self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 5) - - # 1.3 a list of ensembles - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - # it should be totally legit to add a memory unfriendly object to a memory friendly one - eLDA_base.add_model([self.eLDA, self.eLDA_mu]) - self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 2 * num_models) - - # 1.4 a single gensim model - model = self.eLDA.classic_model_representation - - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(model) - self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) - self.assertEqual(eLDA_base.num_models, b + 1) - - # 1.5 a list gensim models - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model([model, model]) - self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) - self.assertEqual(eLDA_base.num_models, b + 2) - - self.check_ttda(eLDA_base) - - # 2. memory unfriendly - eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, - workers=3, ensemble_workers=2, memory_friendly_ttda=False) - - # 2.1 a single ensemble - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu) - self.assertEqual(len(eLDA_base_mu.tms), a + num_models) - self.assertEqual(eLDA_base_mu.num_models, b + num_models) - - # 2.2 a list of ensembles - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) - self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) - self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) - - # 2.3 a single gensim model - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) - self.assertEqual(len(eLDA_base_mu.tms), a + 1) - self.assertEqual(eLDA_base_mu.num_models, b + 1) - - # 2.4 a list of gensim models - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu.tms) - self.assertEqual(len(eLDA_base_mu.tms), a + num_models) - self.assertEqual(eLDA_base_mu.num_models, b + num_models) - - # 2.5 topic term distributions should throw errors, because the - # actual models are needed for the memory unfriendly ensemble - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) - # remains unchanged - self.assertEqual(len(eLDA_base_mu.tms), a) - self.assertEqual(eLDA_base_mu.num_models, b) - - self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) - self.check_ttda(eLDA_base_mu) - - def test_add_and_recluster(self): - # 6.2: see if after adding a model, the model still makes sense - 
num_new_models = 3 - num_new_topics = 3 - - # train - new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='ldamulticore', - distance_workers=4) - new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='ldamulticore', - distance_workers=4, memory_friendly_ttda=False) - # both should be similar - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) - # and every next step applied to both should result in similar results - - # 1. adding to ttda and tms - new_eLDA.add_model(self.eLDA) - new_eLDA_mu.add_model(self.eLDA_mu) - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) - self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) - self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) - self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) - self.check_ttda(new_eLDA) - self.check_ttda(new_eLDA_mu) - - # 2. distance matrix - new_eLDA._generate_asymmetric_distance_matrix() - new_eLDA_mu._generate_asymmetric_distance_matrix() - np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, - new_eLDA_mu.asymmetric_distance_matrix) - - # 3. CBDBSCAN results - new_eLDA._generate_topic_clusters() - new_eLDA_mu._generate_topic_clusters() - a = new_eLDA.cluster_model.results - b = new_eLDA_mu.cluster_model.results - self.assert_cluster_results_equal(a, b) - - # 4. finally, the stable topics - new_eLDA._generate_stable_topics() - new_eLDA_mu._generate_stable_topics() - np.testing.assert_allclose(new_eLDA.get_topics(), - new_eLDA_mu.get_topics()) - - new_eLDA.generate_gensim_representation() - new_eLDA_mu.generate_gensim_representation() - - # same random state, hence topics should be still similar - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) - unittest.main() +# #!/usr/bin/env python +# # -*- coding: utf-8 -*- +# # +# # Author: Tobias B +# +# """ +# Automated tests for checking the EnsembleLda Class +# """ +# +# +# import logging +# import unittest +# +# import numpy as np +# from copy import deepcopy +# +# from gensim.models import EnsembleLda, LdaMulticore +# from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary +# +# num_topics = 2 +# num_models = 4 +# passes = 50 +# +# +# class TestModel(unittest.TestCase): +# def setUp(self): +# # same configuration for each model to make sure +# # the topics are equal +# random_state = 0 +# +# self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# passes=passes, num_models=num_models, random_state=random_state) +# +# self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# passes=passes, num_models=num_models, random_state=random_state, +# memory_friendly_ttda=False) +# +# def check_ttda(self, ensemble): +# """tests the integrity of the ttda of any ensemble""" +# self.assertGreater(len(ensemble.ttda), 0) +# a = ensemble.ttda.sum(axis=1) +# b = 
np.ones(len(ensemble.ttda)).astype(np.float32) +# np.testing.assert_allclose(a, b, rtol=1e-04) +# +# def test_eLDA(self): +# # given that the random_state doesn't change, it should +# # always be 2 detected topics in this setup. +# self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) +# self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) +# self.check_ttda(self.eLDA) +# +# # reclustering shouldn't change anything without +# # added models or different parameters +# self.eLDA.recluster() +# +# self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) +# self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) +# self.check_ttda(self.eLDA) +# +# # compare with a pre-trained reference model +# reference = EnsembleLda.load(datapath('ensemblelda')) +# np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05) +# # small values in the distance matrix tend to vary quite a bit around 2%, +# # so use some absolute tolerance measurement to check if the matrix is at least +# # close to the target. +# atol = reference.asymmetric_distance_matrix.max() * 1e-05 +# np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, +# reference.asymmetric_distance_matrix, atol=atol) +# +# def test_clustering(self): +# # the following test is quite specific to the current implementation and not part of any api, +# # but it makes improving those sections of the code easier as long as sorted_clusters and the +# # cluster_model results are supposed to stay the same. Potentially this test will deprecate. +# +# reference = EnsembleLda.load(datapath('ensemblelda')) +# cluster_model_results = deepcopy(reference.cluster_model.results) +# sorted_clusters = deepcopy(reference.sorted_clusters) +# stable_topics = deepcopy(reference.get_topics()) +# +# # continue training with the distance matrix of the pretrained reference and see if +# # the generated clusters match. 
+# reference.asymmetric_distance_matrix_outdated = True +# reference.recluster() +# +# self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) +# self.assertEqual(reference.sorted_clusters, sorted_clusters) +# np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) +# +# def test_not_trained(self): +# # should not throw errors and no training should happen +# +# # 0 passes +# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# passes=0, num_models=num_models, random_state=0) +# self.assertEqual(len(eLDA.ttda), 0) +# +# # no corpus +# eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, +# passes=passes, num_models=num_models, random_state=0) +# self.assertEqual(len(eLDA.ttda), 0) +# +# # 0 iterations +# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# iterations=0, num_models=num_models, random_state=0) +# self.assertEqual(len(eLDA.ttda), 0) +# +# # 0 models +# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# passes=passes, num_models=0, random_state=0) +# self.assertEqual(len(eLDA.ttda), 0) +# +# def test_memory_unfriendly(self): +# # at this point, self.eLDA_mu and self.eLDA are already trained +# # in the setUpClass function +# # both should be 100% similar (but floats cannot +# # be compared that easily, so check for threshold) +# self.assertEqual(len(self.eLDA_mu.tms), num_models) +# np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) +# np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) +# self.check_ttda(self.eLDA_mu) +# +# def test_generate_gensim_rep(self): +# gensimModel = self.eLDA.generate_gensim_representation() +# topics = gensimModel.get_topics() +# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) +# +# def assert_cluster_results_equal(self, a, b): +# """compares important attributes of the cluster results""" +# np.testing.assert_array_equal([row["label"] for row in a], +# [row["label"] for row in b]) +# np.testing.assert_array_equal([row["is_core"] for row in a], +# [row["is_core"] for row in b]) +# +# def test_persisting(self): +# fname = get_tmpfile('gensim_models_ensemblelda') +# self.eLDA.save(fname) +# loaded_eLDA = EnsembleLda.load(fname) +# # storing the ensemble without memory_friendy_ttda +# self.eLDA_mu.save(fname) +# loaded_eLDA_mu = EnsembleLda.load(fname) +# +# # was it stored and loaded correctly? 
+# # memory friendly +# loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() +# topics = loaded_eLDA_representation.get_topics() +# ttda = loaded_eLDA.ttda +# amatrix = loaded_eLDA.asymmetric_distance_matrix +# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) +# np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) +# np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) +# +# a = self.eLDA.cluster_model.results +# b = loaded_eLDA.cluster_model.results +# +# self.assert_cluster_results_equal(a, b) +# +# # memory unfriendly +# loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() +# topics = loaded_eLDA_mu_representation.get_topics() +# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) +# +# def test_multiprocessing(self): +# # same configuration +# random_state = 0 +# +# # use 3 processes for the ensemble and the distance, +# # so that the 4 models and 8 topics cannot be distributed +# # to each worker evenly +# workers = 3 +# +# # memory friendly. contains List of topic word distributions +# eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_topics, passes=passes, num_models=num_models, +# random_state=random_state, ensemble_workers=workers, distance_workers=workers) +# +# # memory unfriendly. contains List of models +# eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_topics, passes=passes, num_models=num_models, +# random_state=random_state, ensemble_workers=workers, distance_workers=workers, +# memory_friendly_ttda=False) +# +# np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) +# np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) +# +# def test_add_models(self): +# # same configuration +# num_models = self.eLDA.num_models +# +# # make sure countings and sizes after adding are correct +# # create new models and add other models to them. +# +# # there are a ton of configurations for the first parameter possible, +# # try them all +# +# # quickly train something that can be used for counting results +# num_new_models = 3 +# num_new_topics = 3 +# +# # 1. 
memory friendly +# eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_new_topics, passes=1, num_models=num_new_models, +# iterations=1, random_state=0, topic_model_class=LdaMulticore, +# workers=3, ensemble_workers=2) +# +# # 1.1 ttda +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# eLDA_base.add_model(self.eLDA.ttda) +# self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) +# self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix +# +# # 1.2 an ensemble +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# eLDA_base.add_model(self.eLDA, 5) +# self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) +# self.assertEqual(eLDA_base.num_models, b + 5) +# +# # 1.3 a list of ensembles +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# # it should be totally legit to add a memory unfriendly object to a memory friendly one +# eLDA_base.add_model([self.eLDA, self.eLDA_mu]) +# self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) +# self.assertEqual(eLDA_base.num_models, b + 2 * num_models) +# +# # 1.4 a single gensim model +# model = self.eLDA.classic_model_representation +# +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# eLDA_base.add_model(model) +# self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) +# self.assertEqual(eLDA_base.num_models, b + 1) +# +# # 1.5 a list gensim models +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# eLDA_base.add_model([model, model]) +# self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) +# self.assertEqual(eLDA_base.num_models, b + 2) +# +# self.check_ttda(eLDA_base) +# +# # 2. memory unfriendly +# eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_new_topics, passes=1, num_models=num_new_models, +# iterations=1, random_state=0, topic_model_class=LdaMulticore, +# workers=3, ensemble_workers=2, memory_friendly_ttda=False) +# +# # 2.1 a single ensemble +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# eLDA_base_mu.add_model(self.eLDA_mu) +# self.assertEqual(len(eLDA_base_mu.tms), a + num_models) +# self.assertEqual(eLDA_base_mu.num_models, b + num_models) +# +# # 2.2 a list of ensembles +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) +# self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) +# self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) +# +# # 2.3 a single gensim model +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) +# self.assertEqual(len(eLDA_base_mu.tms), a + 1) +# self.assertEqual(eLDA_base_mu.num_models, b + 1) +# +# # 2.4 a list of gensim models +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# eLDA_base_mu.add_model(self.eLDA_mu.tms) +# self.assertEqual(len(eLDA_base_mu.tms), a + num_models) +# self.assertEqual(eLDA_base_mu.num_models, b + num_models) +# +# # 2.5 topic term distributions should throw errors, because the +# # actual models are needed for the memory unfriendly ensemble +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) +# # remains unchanged +# self.assertEqual(len(eLDA_base_mu.tms), a) +# self.assertEqual(eLDA_base_mu.num_models, b) +# +# self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) +# self.check_ttda(eLDA_base_mu) +# +# def 
test_add_and_recluster(self): +# # 6.2: see if after adding a model, the model still makes sense +# num_new_models = 3 +# num_new_topics = 3 +# +# # train +# new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_new_topics, passes=10, num_models=num_new_models, +# iterations=30, random_state=1, topic_model_class='ldamulticore', +# distance_workers=4) +# new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_new_topics, passes=10, num_models=num_new_models, +# iterations=30, random_state=1, topic_model_class='ldamulticore', +# distance_workers=4, memory_friendly_ttda=False) +# # both should be similar +# np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) +# np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) +# # and every next step applied to both should result in similar results +# +# # 1. adding to ttda and tms +# new_eLDA.add_model(self.eLDA) +# new_eLDA_mu.add_model(self.eLDA_mu) +# np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) +# self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) +# self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) +# self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) +# self.check_ttda(new_eLDA) +# self.check_ttda(new_eLDA_mu) +# +# # 2. distance matrix +# new_eLDA._generate_asymmetric_distance_matrix() +# new_eLDA_mu._generate_asymmetric_distance_matrix() +# np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, +# new_eLDA_mu.asymmetric_distance_matrix) +# +# # 3. CBDBSCAN results +# new_eLDA._generate_topic_clusters() +# new_eLDA_mu._generate_topic_clusters() +# a = new_eLDA.cluster_model.results +# b = new_eLDA_mu.cluster_model.results +# self.assert_cluster_results_equal(a, b) +# +# # 4. 
finally, the stable topics +# new_eLDA._generate_stable_topics() +# new_eLDA_mu._generate_stable_topics() +# np.testing.assert_allclose(new_eLDA.get_topics(), +# new_eLDA_mu.get_topics()) +# +# new_eLDA.generate_gensim_representation() +# new_eLDA_mu.generate_gensim_representation() +# +# # same random state, hence topics should be still similar +# np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) +# +# +# if __name__ == '__main__': +# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) +# unittest.main() diff --git a/gensim/utils.py b/gensim/utils.py index 3831c9a8ca..e0e54dba3d 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -606,7 +606,8 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, recursive_saveloads = [] restores = [] for attrib, val in iteritems(self.__dict__): - if hasattr(val, '_save_specials') and type(val) != type: # better than 'isinstance(val, SaveLoad)' if IPython reloading + # better than 'isinstance(val, SaveLoad)' if IPython reloading + if hasattr(val, '_save_specials') and type(val) != type: recursive_saveloads.append(attrib) cfname = '.'.join((fname, attrib)) print(val, compress, subname) From 644f2e483a87a43538ca951ba9c1386e1f4c5e5b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 16:44:44 +0100 Subject: [PATCH 076/166] saving the topic_model_class using a string instead --- gensim/models/ensemblelda.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d9fa3c9a1b..80c2a88d68 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -101,6 +101,7 @@ import logging import os from multiprocessing import Process, Pipe, ProcessError +import importlib import numpy as np from scipy.spatial.distance import cosine @@ -267,6 +268,26 @@ def __init__(self, topic_model_class="lda", num_models=3, # create model that can provide the usual gensim api to the stable topics from the ensemble self.generate_gensim_representation() + @classmethod + def load(cls, *args, **kwargs): + eLDA = super().load(*args, **kwargs) + try: + eLDA.topic_model_class = importlib.import_module(eLDA.topic_model_class_string) + del eLDA.topic_model_class_string + except: + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module and set the ' + '"topic_model_class" attribute to it. 
Try setting this manually instead.') + return eLDA + + load.__doc__ = SaveLoad.load.__doc__ + + def save(self, *args, **kwargs): + self.topic_model_class_string = self.topic_model_class.__module__ + kwargs['ignore'] = frozenset(kwargs.get('ignore', ())).union(('topic_model_class', )) + super(EnsembleLda, self).save(*args, **kwargs) + + save.__doc__ = SaveLoad.save.__doc__ + def convert_to_memory_friendly(self): """Remove the stored gensim models and only keep their ttdas.""" self.tms = [] From a7bbfc0a21c308050fb139db49e2e3842b93e168 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 16:51:17 +0100 Subject: [PATCH 077/166] reverted utils --- gensim/utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/gensim/utils.py b/gensim/utils.py index e0e54dba3d..70b12d8a88 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -18,9 +18,7 @@ from htmlentitydefs import name2codepoint as n2cp try: import cPickle as _pickle - print('import cpickle') except ImportError: - print('import pickle') import pickle as _pickle import re @@ -57,6 +55,15 @@ PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) +NO_CYTHON = RuntimeError( + "Cython extensions are unavailable. " + "Without them, this gensim functionality is disabled. " + "If you've installed from a package, ask the package maintainer to include Cython extensions. " + "If you're building gensim from source yourself, run `python setup.py build_ext --inplace` " + "and retry. " +) +"""An exception that gensim code raises when Cython extensions are unavailable.""" + def get_random_state(seed): """Generate :class:`numpy.random.RandomState` based on input seed. @@ -544,7 +551,6 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately) compress, subname = SaveLoad._adapt_by_suffix(fname) - print(compress, subname) restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) @@ -606,11 +612,9 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, recursive_saveloads = [] restores = [] for attrib, val in iteritems(self.__dict__): - # better than 'isinstance(val, SaveLoad)' if IPython reloading - if hasattr(val, '_save_specials') and type(val) != type: + if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading recursive_saveloads.append(attrib) cfname = '.'.join((fname, attrib)) - print(val, compress, subname) restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname)) try: @@ -693,12 +697,10 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore= Load object from file. 
""" - print(fname_or_handle, type(fname_or_handle), separately, sep_limit, ignore, pickle_protocol) try: _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) logger.info("saved %s object", self.__class__.__name__) except TypeError: # `fname_or_handle` does not have write attribute - print("failed") self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) From 2abdbf72e2c80d19df1be63c9b38ecb5e075443b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 17:26:00 +0100 Subject: [PATCH 078/166] saving the topic_model_class using a string instead fixes --- gensim/models/ensemblelda.py | 7 +- gensim/test/test_data/ensemblelda | Bin 13705 -> 14251 bytes gensim/test/test_ensemblelda.py | 698 +++++++++++++++--------------- 3 files changed, 355 insertions(+), 350 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 80c2a88d68..514d87c305 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -272,7 +272,9 @@ def __init__(self, topic_model_class="lda", num_models=3, def load(cls, *args, **kwargs): eLDA = super().load(*args, **kwargs) try: - eLDA.topic_model_class = importlib.import_module(eLDA.topic_model_class_string) + module = importlib.import_module(eLDA.topic_model_module_string) + eLDA.topic_model_class = getattr(module, eLDA.topic_model_class_string) + del eLDA.topic_model_module_string del eLDA.topic_model_class_string except: logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module and set the ' @@ -282,7 +284,8 @@ def load(cls, *args, **kwargs): load.__doc__ = SaveLoad.load.__doc__ def save(self, *args, **kwargs): - self.topic_model_class_string = self.topic_model_class.__module__ + self.topic_model_module_string = self.topic_model_class.__module__ + self.topic_model_class_string = self.topic_model_class.__name__ kwargs['ignore'] = frozenset(kwargs.get('ignore', ())).union(('topic_model_class', )) super(EnsembleLda, self).save(*args, **kwargs) diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda index 286d8dda8bee36a5089ebce4476eed98998ff7e4..79525c3aca30b28384bb1089a558222777737cad 100644 GIT binary patch delta 8619 zcmZvC2V7LmmM0lO4EQ04ia9i3M8E(vDk5r&pSB4DwoTJ;$<*y`5rMA}B#9!}B8ZCQ zAQA*bk~(wF?Cj1?*xAW58{W*$rrA83cW2(4`QO{1Z@xGDepdCZI=4>xpHp>DS(h_Z z&{$*lb#ZaYx8)h~3ewHF7U8M$NLh}dOmn^^E66A_w}C781o6Yedl2+#{#p{+zM{O!|Q-(z(l!+6L1os$6y!(1bo4dxI1pbN*70eYDSi~t^ z$k*rm{CncGZoMjj!2pWFVv9JFDs&4Y7E_ioayC^Yrzp5J7_u@Uw>jM+&ZUa)Q&y{E zve{&lEm?QW28;2IIk&)w>GP@Lf-Ma%g*Z7|3ktJL7NLj16@wNUEf(13qQl!GB=}No z%1Vr-XIZSqe3RJ#SLc{voXd_lkKip=&Z+T{XSC#GTI!MP^d`pJE<=su>D0GAw4RkQ^61x17pKihpqL&yq$0O4PL# zZnna7qZM|_1|Tj1%YJy#Ic6~5G0Vjw2ar}AW4>d|T^P!BeDVx-&2#*r=Z1~>bxv`N zxw&aZQ;tOxoKuA{<`zmuQRw*Pok5OpujO{JV59tWqbwVXeTA%ZsS*~G!Gt7bviO?w zl^?#ss+uWH*i`9rjn=IE{o+owDAJ3&HKJHA?kPFLtxA+g#yZKWq6pH(EB{2P1j;hK z_`xddX{t{P3K2-K(td+epz=Cr3|Wohz9b}7hzbcSSL($BWo5rAQKhcz_zSOSd$qD) zjb7ARMO~U|o5ckUW-RfrP92kt`RN6D24Kr-7WGa=_c#al7y!zgT(hsi;2cni3?^%V z>?nL8aA;Ij za*#xxEvnM2sPbGdTA)gDA*|FYInii~6fc};Y}1REK%?}FIDD0+7<39-g*y-MLwd*d z>9rdA5=qn4Km#;FqeBn|n}dw#wJ z^K;ZfuR#8kE9rg+f2Pkgy^nfLdE!Pmmj8@5-X5dQ4C-0IlQwFkmRr<*itD)}f_u2# z&wf(BDeKG)YQw7Y^kVa2s^t-SvFw^p@yV_Hn5VJibFR6>Bg<%rXAbdu`*rR-Pfw|b zhPDR;t~T(9yqvqJlU{NSPx1W={Dr6KJ#~hDpL&TWa<5P|_w!r=-KU}RJhR*LJD=M$ z6r?Moj?AlU?+o7S5lW-`w#4q>XAygnxF?MrT*2eor9RxuP1Hq?6ScYVd_Tk0dWr|Z z_n|NK(YqsC6QcG~?NO@XaqhHiKfo_xkNLgf)O5#{=dK>4nsYqD^EA%mF5F2kph+`V 
z`9I3Em-vh1H2M8N+Q70{r7n@w zAL<+~Q&^VbQC+)Cr7p)6{8!-ih5t%9N6H_n&RK=4i~njl4dQ!B2U@~Z>KaM?EdE+N zsY}eYiV5?S0|DK~9qLZ&W93!qvrJgI_NBy}*C7N{x`zIZs-);7MJ&RV99n zEBHs^_J#d6IY)}WU7Zt!tMX@Y()`hKo+`gL`0r53F_QdQ{#ZPzYvWYnPUq{G(4cty z&a&*nRa0R}aQ;qAxFb=Wr~Dwxk|c+fKP`}6E|8`wkQ$IiO~DtgDe_tQk@DjL>D>ZpBFWONz{SL)Kn;0nb(u1-&T>GK)Qb17*c^X=z@dwCRhNe@xqe=>&;H -# -# """ -# Automated tests for checking the EnsembleLda Class -# """ -# -# -# import logging -# import unittest -# -# import numpy as np -# from copy import deepcopy -# -# from gensim.models import EnsembleLda, LdaMulticore -# from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary -# -# num_topics = 2 -# num_models = 4 -# passes = 50 -# -# -# class TestModel(unittest.TestCase): -# def setUp(self): -# # same configuration for each model to make sure -# # the topics are equal -# random_state = 0 -# -# self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# passes=passes, num_models=num_models, random_state=random_state) -# -# self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# passes=passes, num_models=num_models, random_state=random_state, -# memory_friendly_ttda=False) -# -# def check_ttda(self, ensemble): -# """tests the integrity of the ttda of any ensemble""" -# self.assertGreater(len(ensemble.ttda), 0) -# a = ensemble.ttda.sum(axis=1) -# b = np.ones(len(ensemble.ttda)).astype(np.float32) -# np.testing.assert_allclose(a, b, rtol=1e-04) -# -# def test_eLDA(self): -# # given that the random_state doesn't change, it should -# # always be 2 detected topics in this setup. -# self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) -# self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) -# self.check_ttda(self.eLDA) -# -# # reclustering shouldn't change anything without -# # added models or different parameters -# self.eLDA.recluster() -# -# self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) -# self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) -# self.check_ttda(self.eLDA) -# -# # compare with a pre-trained reference model -# reference = EnsembleLda.load(datapath('ensemblelda')) -# np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05) -# # small values in the distance matrix tend to vary quite a bit around 2%, -# # so use some absolute tolerance measurement to check if the matrix is at least -# # close to the target. -# atol = reference.asymmetric_distance_matrix.max() * 1e-05 -# np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, -# reference.asymmetric_distance_matrix, atol=atol) -# -# def test_clustering(self): -# # the following test is quite specific to the current implementation and not part of any api, -# # but it makes improving those sections of the code easier as long as sorted_clusters and the -# # cluster_model results are supposed to stay the same. Potentially this test will deprecate. -# -# reference = EnsembleLda.load(datapath('ensemblelda')) -# cluster_model_results = deepcopy(reference.cluster_model.results) -# sorted_clusters = deepcopy(reference.sorted_clusters) -# stable_topics = deepcopy(reference.get_topics()) -# -# # continue training with the distance matrix of the pretrained reference and see if -# # the generated clusters match. 
-# reference.asymmetric_distance_matrix_outdated = True -# reference.recluster() -# -# self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) -# self.assertEqual(reference.sorted_clusters, sorted_clusters) -# np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) -# -# def test_not_trained(self): -# # should not throw errors and no training should happen -# -# # 0 passes -# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# passes=0, num_models=num_models, random_state=0) -# self.assertEqual(len(eLDA.ttda), 0) -# -# # no corpus -# eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, -# passes=passes, num_models=num_models, random_state=0) -# self.assertEqual(len(eLDA.ttda), 0) -# -# # 0 iterations -# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# iterations=0, num_models=num_models, random_state=0) -# self.assertEqual(len(eLDA.ttda), 0) -# -# # 0 models -# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# passes=passes, num_models=0, random_state=0) -# self.assertEqual(len(eLDA.ttda), 0) -# -# def test_memory_unfriendly(self): -# # at this point, self.eLDA_mu and self.eLDA are already trained -# # in the setUpClass function -# # both should be 100% similar (but floats cannot -# # be compared that easily, so check for threshold) -# self.assertEqual(len(self.eLDA_mu.tms), num_models) -# np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) -# np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) -# self.check_ttda(self.eLDA_mu) -# -# def test_generate_gensim_rep(self): -# gensimModel = self.eLDA.generate_gensim_representation() -# topics = gensimModel.get_topics() -# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) -# -# def assert_cluster_results_equal(self, a, b): -# """compares important attributes of the cluster results""" -# np.testing.assert_array_equal([row["label"] for row in a], -# [row["label"] for row in b]) -# np.testing.assert_array_equal([row["is_core"] for row in a], -# [row["is_core"] for row in b]) -# -# def test_persisting(self): -# fname = get_tmpfile('gensim_models_ensemblelda') -# self.eLDA.save(fname) -# loaded_eLDA = EnsembleLda.load(fname) -# # storing the ensemble without memory_friendy_ttda -# self.eLDA_mu.save(fname) -# loaded_eLDA_mu = EnsembleLda.load(fname) -# -# # was it stored and loaded correctly? 
-# # memory friendly -# loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() -# topics = loaded_eLDA_representation.get_topics() -# ttda = loaded_eLDA.ttda -# amatrix = loaded_eLDA.asymmetric_distance_matrix -# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) -# np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) -# np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) -# -# a = self.eLDA.cluster_model.results -# b = loaded_eLDA.cluster_model.results -# -# self.assert_cluster_results_equal(a, b) -# -# # memory unfriendly -# loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() -# topics = loaded_eLDA_mu_representation.get_topics() -# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) -# -# def test_multiprocessing(self): -# # same configuration -# random_state = 0 -# -# # use 3 processes for the ensemble and the distance, -# # so that the 4 models and 8 topics cannot be distributed -# # to each worker evenly -# workers = 3 -# -# # memory friendly. contains List of topic word distributions -# eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_topics, passes=passes, num_models=num_models, -# random_state=random_state, ensemble_workers=workers, distance_workers=workers) -# -# # memory unfriendly. contains List of models -# eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_topics, passes=passes, num_models=num_models, -# random_state=random_state, ensemble_workers=workers, distance_workers=workers, -# memory_friendly_ttda=False) -# -# np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) -# np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) -# -# def test_add_models(self): -# # same configuration -# num_models = self.eLDA.num_models -# -# # make sure countings and sizes after adding are correct -# # create new models and add other models to them. -# -# # there are a ton of configurations for the first parameter possible, -# # try them all -# -# # quickly train something that can be used for counting results -# num_new_models = 3 -# num_new_topics = 3 -# -# # 1. 
memory friendly -# eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_new_topics, passes=1, num_models=num_new_models, -# iterations=1, random_state=0, topic_model_class=LdaMulticore, -# workers=3, ensemble_workers=2) -# -# # 1.1 ttda -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# eLDA_base.add_model(self.eLDA.ttda) -# self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) -# self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix -# -# # 1.2 an ensemble -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# eLDA_base.add_model(self.eLDA, 5) -# self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) -# self.assertEqual(eLDA_base.num_models, b + 5) -# -# # 1.3 a list of ensembles -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# # it should be totally legit to add a memory unfriendly object to a memory friendly one -# eLDA_base.add_model([self.eLDA, self.eLDA_mu]) -# self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) -# self.assertEqual(eLDA_base.num_models, b + 2 * num_models) -# -# # 1.4 a single gensim model -# model = self.eLDA.classic_model_representation -# -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# eLDA_base.add_model(model) -# self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) -# self.assertEqual(eLDA_base.num_models, b + 1) -# -# # 1.5 a list gensim models -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# eLDA_base.add_model([model, model]) -# self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) -# self.assertEqual(eLDA_base.num_models, b + 2) -# -# self.check_ttda(eLDA_base) -# -# # 2. memory unfriendly -# eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_new_topics, passes=1, num_models=num_new_models, -# iterations=1, random_state=0, topic_model_class=LdaMulticore, -# workers=3, ensemble_workers=2, memory_friendly_ttda=False) -# -# # 2.1 a single ensemble -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# eLDA_base_mu.add_model(self.eLDA_mu) -# self.assertEqual(len(eLDA_base_mu.tms), a + num_models) -# self.assertEqual(eLDA_base_mu.num_models, b + num_models) -# -# # 2.2 a list of ensembles -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) -# self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) -# self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) -# -# # 2.3 a single gensim model -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) -# self.assertEqual(len(eLDA_base_mu.tms), a + 1) -# self.assertEqual(eLDA_base_mu.num_models, b + 1) -# -# # 2.4 a list of gensim models -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# eLDA_base_mu.add_model(self.eLDA_mu.tms) -# self.assertEqual(len(eLDA_base_mu.tms), a + num_models) -# self.assertEqual(eLDA_base_mu.num_models, b + num_models) -# -# # 2.5 topic term distributions should throw errors, because the -# # actual models are needed for the memory unfriendly ensemble -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) -# # remains unchanged -# self.assertEqual(len(eLDA_base_mu.tms), a) -# self.assertEqual(eLDA_base_mu.num_models, b) -# -# self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) -# self.check_ttda(eLDA_base_mu) -# -# def 
test_add_and_recluster(self): -# # 6.2: see if after adding a model, the model still makes sense -# num_new_models = 3 -# num_new_topics = 3 -# -# # train -# new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_new_topics, passes=10, num_models=num_new_models, -# iterations=30, random_state=1, topic_model_class='ldamulticore', -# distance_workers=4) -# new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_new_topics, passes=10, num_models=num_new_models, -# iterations=30, random_state=1, topic_model_class='ldamulticore', -# distance_workers=4, memory_friendly_ttda=False) -# # both should be similar -# np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) -# np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) -# # and every next step applied to both should result in similar results -# -# # 1. adding to ttda and tms -# new_eLDA.add_model(self.eLDA) -# new_eLDA_mu.add_model(self.eLDA_mu) -# np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) -# self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) -# self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) -# self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) -# self.check_ttda(new_eLDA) -# self.check_ttda(new_eLDA_mu) -# -# # 2. distance matrix -# new_eLDA._generate_asymmetric_distance_matrix() -# new_eLDA_mu._generate_asymmetric_distance_matrix() -# np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, -# new_eLDA_mu.asymmetric_distance_matrix) -# -# # 3. CBDBSCAN results -# new_eLDA._generate_topic_clusters() -# new_eLDA_mu._generate_topic_clusters() -# a = new_eLDA.cluster_model.results -# b = new_eLDA_mu.cluster_model.results -# self.assert_cluster_results_equal(a, b) -# -# # 4. 
finally, the stable topics -# new_eLDA._generate_stable_topics() -# new_eLDA_mu._generate_stable_topics() -# np.testing.assert_allclose(new_eLDA.get_topics(), -# new_eLDA_mu.get_topics()) -# -# new_eLDA.generate_gensim_representation() -# new_eLDA_mu.generate_gensim_representation() -# -# # same random state, hence topics should be still similar -# np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) -# -# -# if __name__ == '__main__': -# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) -# unittest.main() +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Tobias B + +""" +Automated tests for checking the EnsembleLda Class +""" + +import sys +sys.path.append('/mnt/data/Code/ensemble_lda_repo/gensim') + +import logging +import unittest + +import numpy as np +from copy import deepcopy + +from gensim.models import EnsembleLda, LdaMulticore +from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary + +num_topics = 2 +num_models = 4 +passes = 50 + + +class TestModel(unittest.TestCase): + def setUp(self): + # same configuration for each model to make sure + # the topics are equal + random_state = 0 + + self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state) + + self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + memory_friendly_ttda=False) + + def check_ttda(self, ensemble): + """tests the integrity of the ttda of any ensemble""" + self.assertGreater(len(ensemble.ttda), 0) + a = ensemble.ttda.sum(axis=1) + b = np.ones(len(ensemble.ttda)).astype(np.float32) + np.testing.assert_allclose(a, b, rtol=1e-04) + + def test_eLDA(self): + # given that the random_state doesn't change, it should + # always be 2 detected topics in this setup. + self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) + self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) + self.check_ttda(self.eLDA) + + # reclustering shouldn't change anything without + # added models or different parameters + self.eLDA.recluster() + + self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) + self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) + self.check_ttda(self.eLDA) + + # compare with a pre-trained reference model + reference = EnsembleLda.load(datapath('ensemblelda')) + np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05) + # small values in the distance matrix tend to vary quite a bit around 2%, + # so use some absolute tolerance measurement to check if the matrix is at least + # close to the target. + atol = reference.asymmetric_distance_matrix.max() * 1e-05 + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, + reference.asymmetric_distance_matrix, atol=atol) + + def test_clustering(self): + # the following test is quite specific to the current implementation and not part of any api, + # but it makes improving those sections of the code easier as long as sorted_clusters and the + # cluster_model results are supposed to stay the same. Potentially this test will deprecate. 
+ + reference = EnsembleLda.load(datapath('ensemblelda')) + cluster_model_results = deepcopy(reference.cluster_model.results) + sorted_clusters = deepcopy(reference.sorted_clusters) + stable_topics = deepcopy(reference.get_topics()) + + # continue training with the distance matrix of the pretrained reference and see if + # the generated clusters match. + reference.asymmetric_distance_matrix_outdated = True + reference.recluster() + + self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) + self.assertEqual(reference.sorted_clusters, sorted_clusters) + np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) + + def test_not_trained(self): + # should not throw errors and no training should happen + + # 0 passes + eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=0, num_models=num_models, random_state=0) + self.assertEqual(len(eLDA.ttda), 0) + + # no corpus + eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=0) + self.assertEqual(len(eLDA.ttda), 0) + + # 0 iterations + eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + iterations=0, num_models=num_models, random_state=0) + self.assertEqual(len(eLDA.ttda), 0) + + # 0 models + eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=0, random_state=0) + self.assertEqual(len(eLDA.ttda), 0) + + def test_memory_unfriendly(self): + # at this point, self.eLDA_mu and self.eLDA are already trained + # in the setUpClass function + # both should be 100% similar (but floats cannot + # be compared that easily, so check for threshold) + self.assertEqual(len(self.eLDA_mu.tms), num_models) + np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) + self.check_ttda(self.eLDA_mu) + + def test_generate_gensim_rep(self): + gensimModel = self.eLDA.generate_gensim_representation() + topics = gensimModel.get_topics() + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + + def assert_cluster_results_equal(self, a, b): + """compares important attributes of the cluster results""" + np.testing.assert_array_equal([row["label"] for row in a], + [row["label"] for row in b]) + np.testing.assert_array_equal([row["is_core"] for row in a], + [row["is_core"] for row in b]) + + def test_persisting(self): + fname = get_tmpfile('gensim_models_ensemblelda') + self.eLDA.save(fname) + loaded_eLDA = EnsembleLda.load(fname) + # storing the ensemble without memory_friendy_ttda + self.eLDA_mu.save(fname) + loaded_eLDA_mu = EnsembleLda.load(fname) + + # was it stored and loaded correctly? 
+ # memory friendly + loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() + topics = loaded_eLDA_representation.get_topics() + ttda = loaded_eLDA.ttda + amatrix = loaded_eLDA.asymmetric_distance_matrix + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) + + a = self.eLDA.cluster_model.results + b = loaded_eLDA.cluster_model.results + + self.assert_cluster_results_equal(a, b) + + # memory unfriendly + loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() + topics = loaded_eLDA_mu_representation.get_topics() + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + + def test_multiprocessing(self): + # same configuration + random_state = 0 + + # use 3 processes for the ensemble and the distance, + # so that the 4 models and 8 topics cannot be distributed + # to each worker evenly + workers = 3 + + # memory friendly. contains List of topic word distributions + eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_topics, passes=passes, num_models=num_models, + random_state=random_state, ensemble_workers=workers, distance_workers=workers) + + # memory unfriendly. contains List of models + eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_topics, passes=passes, num_models=num_models, + random_state=random_state, ensemble_workers=workers, distance_workers=workers, + memory_friendly_ttda=False) + + np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) + + def test_add_models(self): + # same configuration + num_models = self.eLDA.num_models + + # make sure countings and sizes after adding are correct + # create new models and add other models to them. + + # there are a ton of configurations for the first parameter possible, + # try them all + + # quickly train something that can be used for counting results + num_new_models = 3 + num_new_topics = 3 + + # 1. 
memory friendly + eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_class=LdaMulticore, + workers=3, ensemble_workers=2) + + # 1.1 ttda + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(self.eLDA.ttda) + self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix + + # 1.2 an ensemble + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(self.eLDA, 5) + self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 5) + + # 1.3 a list of ensembles + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + # it should be totally legit to add a memory unfriendly object to a memory friendly one + eLDA_base.add_model([self.eLDA, self.eLDA_mu]) + self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 2 * num_models) + + # 1.4 a single gensim model + model = self.eLDA.classic_model_representation + + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(model) + self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) + self.assertEqual(eLDA_base.num_models, b + 1) + + # 1.5 a list gensim models + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model([model, model]) + self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) + self.assertEqual(eLDA_base.num_models, b + 2) + + self.check_ttda(eLDA_base) + + # 2. memory unfriendly + eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_class=LdaMulticore, + workers=3, ensemble_workers=2, memory_friendly_ttda=False) + + # 2.1 a single ensemble + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu) + self.assertEqual(len(eLDA_base_mu.tms), a + num_models) + self.assertEqual(eLDA_base_mu.num_models, b + num_models) + + # 2.2 a list of ensembles + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) + self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) + self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) + + # 2.3 a single gensim model + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) + self.assertEqual(len(eLDA_base_mu.tms), a + 1) + self.assertEqual(eLDA_base_mu.num_models, b + 1) + + # 2.4 a list of gensim models + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu.tms) + self.assertEqual(len(eLDA_base_mu.tms), a + num_models) + self.assertEqual(eLDA_base_mu.num_models, b + num_models) + + # 2.5 topic term distributions should throw errors, because the + # actual models are needed for the memory unfriendly ensemble + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) + # remains unchanged + self.assertEqual(len(eLDA_base_mu.tms), a) + self.assertEqual(eLDA_base_mu.num_models, b) + + self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) + self.check_ttda(eLDA_base_mu) + + def test_add_and_recluster(self): + # 6.2: see if after adding a model, the model still makes sense + 
num_new_models = 3 + num_new_topics = 3 + + # train + new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_class='ldamulticore', + distance_workers=4) + new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_class='ldamulticore', + distance_workers=4, memory_friendly_ttda=False) + # both should be similar + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + # and every next step applied to both should result in similar results + + # 1. adding to ttda and tms + new_eLDA.add_model(self.eLDA) + new_eLDA_mu.add_model(self.eLDA_mu) + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) + self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) + self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) + self.check_ttda(new_eLDA) + self.check_ttda(new_eLDA_mu) + + # 2. distance matrix + new_eLDA._generate_asymmetric_distance_matrix() + new_eLDA_mu._generate_asymmetric_distance_matrix() + np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, + new_eLDA_mu.asymmetric_distance_matrix) + + # 3. CBDBSCAN results + new_eLDA._generate_topic_clusters() + new_eLDA_mu._generate_topic_clusters() + a = new_eLDA.cluster_model.results + b = new_eLDA_mu.cluster_model.results + self.assert_cluster_results_equal(a, b) + + # 4. finally, the stable topics + new_eLDA._generate_stable_topics() + new_eLDA_mu._generate_stable_topics() + np.testing.assert_allclose(new_eLDA.get_topics(), + new_eLDA_mu.get_topics()) + + new_eLDA.generate_gensim_representation() + new_eLDA_mu.generate_gensim_representation() + + # same random state, hence topics should be still similar + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) + unittest.main() From 254ca60fa71d97c57d1f2682384b48efbe61f571 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 18:03:18 +0100 Subject: [PATCH 079/166] tox --- gensim/models/ensemblelda.py | 11 ++++++++--- gensim/test/test_ensemblelda.py | 3 --- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 514d87c305..66d7ce2489 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -276,9 +276,14 @@ def load(cls, *args, **kwargs): eLDA.topic_model_class = getattr(module, eLDA.topic_model_class_string) del eLDA.topic_model_module_string del eLDA.topic_model_class_string - except: - logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module and set the ' - '"topic_model_class" attribute to it. Try setting this manually instead.') + except ModuleNotFoundError: + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module in order to provide the ' + f'"{eLDA.topic_model_class_string}" class as "topic_model_class" attribute. 
' + 'Try setting this manually instead after loading.') + except AttributeError: + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the ' + f'{eLDA.topic_model_module_string} module in order to set the "topic_model_class" attribute. ' + 'Try setting this manually instead after loading.') return eLDA load.__doc__ = SaveLoad.load.__doc__ diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index e416de9644..f2b05bb72e 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -7,9 +7,6 @@ Automated tests for checking the EnsembleLda Class """ -import sys -sys.path.append('/mnt/data/Code/ensemble_lda_repo/gensim') - import logging import unittest From aec58c9173ccf64c5c6592c04c20dbe7fd898694 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 18:06:55 +0100 Subject: [PATCH 080/166] quotes in logger.error --- gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 66d7ce2489..da58e199cb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -282,7 +282,7 @@ def load(cls, *args, **kwargs): 'Try setting this manually instead after loading.') except AttributeError: logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the ' - f'{eLDA.topic_model_module_string} module in order to set the "topic_model_class" attribute. ' + f'"{eLDA.topic_model_module_string}" module in order to set the "topic_model_class" attribute. ' 'Try setting this manually instead after loading.') return eLDA From f7de190083f3175c727aa2e5037836fa7a86cb62 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 18:14:41 +0100 Subject: [PATCH 081/166] multiline string --- gensim/models/ensemblelda.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index da58e199cb..938eea5c5b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -277,13 +277,13 @@ def load(cls, *args, **kwargs): del eLDA.topic_model_module_string del eLDA.topic_model_class_string except ModuleNotFoundError: - logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module in order to provide the ' - f'"{eLDA.topic_model_class_string}" class as "topic_model_class" attribute. ' - 'Try setting this manually instead after loading.') + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module in order to provide the \ + "{eLDA.topic_model_class_string}" class as "topic_model_class" attribute. \ + Try setting this manually instead after loading.') except AttributeError: - logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the ' - f'"{eLDA.topic_model_module_string}" module in order to set the "topic_model_class" attribute. ' - 'Try setting this manually instead after loading.') + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the \ + "{eLDA.topic_model_module_string}" module in order to set the "topic_model_class" attribute. 
\
+                Try setting this manually instead after loading.')

         return eLDA

     load.__doc__ = SaveLoad.load.__doc__

From dfc97a08d29d5c07f1e50e98cb92d97dd48c36a5 Mon Sep 17 00:00:00 2001
From: sezanzeb <proxima@hip70890b.de>
Date: Sat, 23 Nov 2019 18:28:17 +0100
Subject: [PATCH 082/166] python 3.5 format strings

---
 gensim/models/ensemblelda.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 938eea5c5b..d975835351 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -277,13 +277,13 @@ def load(cls, *args, **kwargs):
             del eLDA.topic_model_module_string
             del eLDA.topic_model_class_string
         except ModuleNotFoundError:
-            logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module in order to provide the \
-                "{eLDA.topic_model_class_string}" class as "topic_model_class" attribute. \
-                Try setting this manually instead after loading.')
+            logger.error('Could not import the "{}" module in order to provide the "{}" class as "topic_model_class" '
+                         'attribute. Try setting this manually instead after loading.'
+                         .format(eLDA.topic_model_class_string, eLDA.topic_model_class_string))
         except AttributeError:
-            logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the \
-                "{eLDA.topic_model_module_string}" module in order to set the "topic_model_class" attribute. \
-                Try setting this manually instead after loading.')
+            logger.error('Could not import the "{}" class from the "{}" module in order to set the '
+                         '"topic_model_class" attribute. Try setting this manually instead after loading.'
+                         .format(eLDA.topic_model_class_string, eLDA.topic_model_module_string))

         return eLDA

     load.__doc__ = SaveLoad.load.__doc__

From 88b338b1dbb4cf0ba27cfb238d58021531012dfd Mon Sep 17 00:00:00 2001
From: sezanzeb <proxima@hip70890b.de>
Date: Sat, 23 Nov 2019 19:23:49 +0100
Subject: [PATCH 083/166] ModuleNotFoundError: No module named 'numpy.random._pickle'

---
 gensim/test/test_data/ensemblelda | Bin 14251 -> 14238 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 79525c3aca30b28384bb1089a558222777737cad..85cf7b33f1966b8fdac4027524a82863b2552592 100644
GIT binary patch
[base85-encoded binary delta omitted: this commit regenerates the pickled test
 fixture gensim/test/test_data/ensemblelda (14251 -> 14238 bytes). The follow-up
 commit by sezanzeb, "[PATCH 084/166] ModuleNotFoundError: No module named
 'numpy.random._pickle' x2" (Sat, 23 Nov 2019 20:00:46 +0100), replaces the same
 fixture once more (Bin 14238 -> 10082 bytes,
 index 85cf7b33f1966b8fdac4027524a82863b2552592..127d00888c3c5184ca18d49e7117e0d31352d7c9);
 its binary payload is omitted as well.]
zl1L-+b6KLfg6ofYu8Nf4$3n7^Ac^3bNboa$uA@yF=Kc;);1<@$u6$V5ts zA4|-0TOvph34X@U9kfYf+?7ZYaW$y$0#JO(s?%r@T@@zW&}oyQ$yq7C)r0 zTc9ive=DK*RX5h&0}xQ`V?UC?zNAk36rXZ!Dy~L_?W@&36T75z;rAgQ{Bb>fOxsDv_+z&5R&6ld@bF%| zNf(2D9QW+xsVJD>n3*w!d$-e%fZ1xwRB5Up*4S-bw3Dgw6q2f)RJEPon^M)3L~Kx{ zOf)UB3=iY(p0Y~Q<}=b2>(rb~!yL}BS?B4V&HK+NJVHVU(~8K6mVfjF}N#4q9U zO+PBpW{UlX(f`XsNWV|n89^mdh(GALZTkI*HVZ9y Date: Sat, 14 Dec 2019 11:38:39 +0100 Subject: [PATCH 085/166] fixed inference --- gensim/models/ensemblelda.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d975835351..f6dfb1b588 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -98,6 +98,8 @@ """ +print('asdf') + import logging import os from multiprocessing import Process, Pipe, ProcessError @@ -343,8 +345,8 @@ def generate_gensim_representation(self): params["eta"] = self.eta params["num_topics"] = num_stable_topics # adjust params in a way that no training happens - params["iterations"] = 0 # no training params["passes"] = 0 # no training + # iterations is needed for inference, pass it to the model classic_model_representation = self.topic_model_class(**params) From 766f562a065f08b3dfdc0754e138ea2a15e38090 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 14 Dec 2019 11:41:43 +0100 Subject: [PATCH 086/166] removed print asdf --- gensim/models/ensemblelda.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index f6dfb1b588..d3f9299b74 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -98,8 +98,6 @@ """ -print('asdf') - import logging import os from multiprocessing import Process, Pipe, ProcessError From 46b7cf6d4f2f098da4e16d86498942fe0df5d401 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 14 Dec 2019 11:51:13 +0100 Subject: [PATCH 087/166] added spec for inference --- gensim/test/test_ensemblelda.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index f2b05bb72e..4cefd7daeb 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -7,6 +7,9 @@ Automated tests for checking the EnsembleLda Class """ +import sys +sys.path = ['/mnt/data/Code/ensemble_lda_repo/gensim'] + sys.path + import logging import unittest @@ -341,6 +344,16 @@ def test_add_and_recluster(self): # same random state, hence topics should be still similar np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + def test_inference(self): + import numpy as np + # get the most likely token id from topic 0 + max_id = np.argmax(self.eLDA.get_topics()[0,:]) + self.assertGreater(self.eLDA.classic_model_representation.iterations, 0) + # topic 0 should be dominant in the inference. 
+        # the difference between the probabilities should be significant and larger than 0.3
+        inferred = self.eLDA[[(max_id, 1)]]
+        self.assertGreater(inferred[0][1] - 0.3, inferred[1][1])
+

 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
     unittest.main()

From 6689eaee892b6a0504e78972ae24c047f9ba3dd5 Mon Sep 17 00:00:00 2001
From: sezanzeb <proxima@hip70890b.de>
Date: Sat, 14 Dec 2019 12:18:04 +0100
Subject: [PATCH 088/166] tox

---
 gensim/test/test_ensemblelda.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 4cefd7daeb..825c0862b7 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -7,9 +7,6 @@
 Automated tests for checking the EnsembleLda Class
 """

-import sys
-sys.path = ['/mnt/data/Code/ensemble_lda_repo/gensim'] + sys.path
-
 import logging
 import unittest

@@ -347,7 +344,7 @@ def test_add_and_recluster(self):
     def test_inference(self):
         import numpy as np
         # get the most likely token id from topic 0
-        max_id = np.argmax(self.eLDA.get_topics()[0,:])
+        max_id = np.argmax(self.eLDA.get_topics()[0, :])
         self.assertGreater(self.eLDA.classic_model_representation.iterations, 0)
         # topic 0 should be dominant in the inference.
         # the difference between the probabilities should be significant and larger than 0.3

From 80f4f045f29bf1b0634c9d40a17cbb7d5c0e878c Mon Sep 17 00:00:00 2001
From: sezanzeb <proxima@hip70890b.de>
Date: Thu, 23 Jan 2020 21:32:32 +0100
Subject: [PATCH 089/166] lazy loading topic_model_class

---
 gensim/models/ensemblelda.py    | 46 ++++++++++++++++-----------------
 gensim/test/test_ensemblelda.py | 11 ++++++--
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index d3f9299b74..cda0b0f168 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -110,6 +110,8 @@
 from gensim.models import ldamodel, ldamulticore, basemodel
 from gensim.utils import SaveLoad

+print('#')
+
 logger = logging.getLogger(__name__)


@@ -268,29 +270,27 @@ def __init__(self, topic_model_class="lda", num_models=3,
         # create model that can provide the usual gensim api to the stable topics from the ensemble
         self.generate_gensim_representation()

-    @classmethod
-    def load(cls, *args, **kwargs):
-        eLDA = super().load(*args, **kwargs)
-        try:
-            module = importlib.import_module(eLDA.topic_model_module_string)
-            eLDA.topic_model_class = getattr(module, eLDA.topic_model_class_string)
-            del eLDA.topic_model_module_string
-            del eLDA.topic_model_class_string
-        except ModuleNotFoundError:
-            logger.error('Could not import the "{}" module in order to provide the "{}" class as "topic_model_class" '
-                         'attribute. Try setting this manually instead after loading.'
-                         .format(eLDA.topic_model_class_string, eLDA.topic_model_class_string))
-        except AttributeError:
-            logger.error('Could not import the "{}" class from the "{}" module in order to set the '
-                         '"topic_model_class" attribute. Try setting this manually instead after loading.'
- .format(eLDA.topic_model_class_string, eLDA.topic_model_module_string)) - return eLDA - - load.__doc__ = SaveLoad.load.__doc__ + def get_topic_model_class(self): + if self.topic_model_class is None: + try: + module = importlib.import_module(self.topic_model_module_string) + self.topic_model_class = getattr(module, self.topic_model_class_string) + del self.topic_model_module_string + del self.topic_model_class_string + except ModuleNotFoundError: + logger.error('Could not import the "{}" module in order to provide the "{}" class as "topic_model_class" ' + 'attribute. Try setting this manually instead after loading.' + .format(self.topic_model_class_string, self.topic_model_class_string)) + except AttributeError: + logger.error('Could not import the "{}" class from the "{}" module in order to set the ' + '"topic_model_class" attribute. Try setting this manually instead after loading.' + .format(self.topic_model_class_string, self.topic_model_module_string)) + return self.topic_model_class def save(self, *args, **kwargs): - self.topic_model_module_string = self.topic_model_class.__module__ - self.topic_model_class_string = self.topic_model_class.__name__ + if self.get_topic_model_class() is not None: + self.topic_model_module_string = self.topic_model_class.__module__ + self.topic_model_class_string = self.topic_model_class.__name__ kwargs['ignore'] = frozenset(kwargs.get('ignore', ())).union(('topic_model_class', )) super(EnsembleLda, self).save(*args, **kwargs) @@ -346,7 +346,7 @@ def generate_gensim_representation(self): params["passes"] = 0 # no training # iterations is needed for inference, pass it to the model - classic_model_representation = self.topic_model_class(**params) + classic_model_representation = self.get_topic_model_class()(**params) # when eta was None, use what gensim generates as default eta for the following tasks: eta = classic_model_representation.eta @@ -642,7 +642,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): kwArgs["random_state"] = random_states[i] - tm = self.topic_model_class(**kwArgs) + tm = self.get_topic_model_class()(**kwArgs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 825c0862b7..fbf084417f 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -13,7 +13,7 @@ import numpy as np from copy import deepcopy -from gensim.models import EnsembleLda, LdaMulticore +from gensim.models import EnsembleLda, LdaMulticore, LdaModel from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary num_topics = 2 @@ -138,9 +138,16 @@ def test_persisting(self): self.eLDA_mu.save(fname) loaded_eLDA_mu = EnsembleLda.load(fname) + # topic_model_class will be lazy loaded and should be None first + assert loaded_eLDA.topic_model_class is None + # was it stored and loaded correctly? - # memory friendly + # memory friendly. 
loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() + + # generating the representation also lazily loads the topic_model_class + assert loaded_eLDA.topic_model_class == LdaModel + topics = loaded_eLDA_representation.get_topics() ttda = loaded_eLDA.ttda amatrix = loaded_eLDA.asymmetric_distance_matrix From b05bf11861176d0d1eb62d5f80dab8bb152cbccb Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 23 Jan 2020 21:38:02 +0100 Subject: [PATCH 090/166] tox --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index cda0b0f168..fa8c3b4791 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -278,8 +278,8 @@ def get_topic_model_class(self): del self.topic_model_module_string del self.topic_model_class_string except ModuleNotFoundError: - logger.error('Could not import the "{}" module in order to provide the "{}" class as "topic_model_class" ' - 'attribute. Try setting this manually instead after loading.' + logger.error('Could not import the "{}" module in order to provide the "{}" class as ' + '"topic_model_class" attribute. Try setting this manually instead after loading.' .format(self.topic_model_class_string, self.topic_model_class_string)) except AttributeError: logger.error('Could not import the "{}" class from the "{}" module in order to set the ' From 39a9b62f88f97fb9f3fb93f1f5ef3484b876a553 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 23 Jan 2020 21:41:08 +0100 Subject: [PATCH 091/166] removed debug thing --- gensim/models/ensemblelda.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index fa8c3b4791..17703b6bbb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -110,8 +110,6 @@ from gensim.models import ldamodel, ldamulticore, basemodel from gensim.utils import SaveLoad -print('#') - logger = logging.getLogger(__name__) From 09ded139d6a60230436ddd61611b1c5d3e21897b Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 23 Jan 2020 21:49:06 +0100 Subject: [PATCH 092/166] Documents now compile --- docs/src/apiref.rst | 1 + docs/src/auto_examples/index.rst | 82 ++++++++++++++++---------------- gensim/models/ensemblelda.py | 54 +++++++++++---------- 3 files changed, 71 insertions(+), 66 deletions(-) diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 6218336b06..4158dcc6a7 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -29,6 +29,7 @@ Modules: corpora/wikicorpus models/ldamodel models/ldamulticore + models/ensemblelda models/nmf models/lsimodel models/ldaseqmodel diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index 5566611a8b..62dba3d50a 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -13,7 +13,7 @@ If you're thinking about contributing documentation, please see :ref:`sphx_glr_a .. raw:: html -

+
@@ -33,9 +33,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png - :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` + :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` .. raw:: html @@ -53,9 +53,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png - :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` + :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` .. raw:: html @@ -73,9 +73,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png - :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` + :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` .. raw:: html @@ -93,9 +93,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png - :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` + :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` .. raw:: html @@ -108,7 +108,7 @@ Understanding this functionality is vital for using gensim effectively. /auto_examples/core/run_similarity_queries .. raw:: html -
+
@@ -127,9 +127,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` + :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` .. raw:: html @@ -147,9 +147,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` + :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` .. raw:: html @@ -167,9 +167,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -187,9 +187,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -207,9 +207,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -227,9 +227,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_distance_metrics_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_distance_metrics_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py` + :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py` .. raw:: html @@ -247,9 +247,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -267,9 +267,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_summarization_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_summarization_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py` + :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py` .. raw:: html @@ -287,9 +287,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_pivoted_doc_norm_thumb.png + .. 
figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_pivoted_doc_norm_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py` + :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py` .. raw:: html @@ -302,7 +302,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod /auto_examples/tutorials/run_pivoted_doc_norm .. raw:: html -
+
@@ -321,9 +321,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` + :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` .. raw:: html @@ -341,9 +341,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc.py` .. raw:: html @@ -361,9 +361,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` .. raw:: html @@ -381,9 +381,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` + :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` .. raw:: html @@ -396,7 +396,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u /auto_examples/howtos/run_compare_lda .. raw:: html -
+
@@ -440,7 +440,7 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. raw:: html -
+
@@ -452,13 +452,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. container:: sphx-glr-download - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` .. container:: sphx-glr-download - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d3f9299b74..6fb15abde8 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -3,6 +3,8 @@ # # Authors: Tobias Brigl , Alex Salles , # Alex Loosley , Data Reply Munich +# + """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. @@ -93,11 +95,10 @@ Other Notes ----------- -.. The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the doc strings and +The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the doc strings and comments. """ - import logging import os from multiprocessing import Process, Pipe, ProcessError @@ -170,10 +171,13 @@ def __init__(self, topic_model_class="lda", num_models=3, (technically a divergence) from distribution A to B is more of a measure of if A is contained in B. At a high level, this involves using distribution A to mask distribution B and then calculating the cosine distance between the two. The masking can be done in two ways: - mass: forms mask by taking the top ranked terms until their cumulative mass reaches the - `masking_threshold` - rank: forms mask by taking the top ranked terms (by mass) until the `masking_threshold` is reached. - For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. + + 1. mass: forms mask by taking the top ranked terms until their cumulative mass reaches the + 'masking_threshold' + + 2. rank: forms mask by taking the top ranked terms (by mass) until the 'masking_threshold' is reached. + For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. + masking_threshold : float, optional Default: None, which uses 0.95 for "mass", and 0.11 for masking_method "rank". In general, too small a mask threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy distance @@ -1132,32 +1136,32 @@ def id2word(self): class CBDBSCAN(): """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). - The algorithm works based on DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a + The algorithm works based on DBSCAN-like parameters 'eps' and 'min_samples' that respectively define how far a "nearby" point is, and the minimum number of nearby points needed to label a candidate datapoint a core of a cluster. (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html). The algorithm works as follows: - 1. (A)symmetric distance matrix provided at fit-time (called `amatrix`). - For the sake of example below, assume the there are only five topics (amatrix contains distances with dim 5x5), - T_1, T_2, T_3, T_4, T_5: + + 1. (A)symmetric distance matrix provided at fit-time (called 'amatrix'). + For the sake of example below, assume the there are only five topics (amatrix contains distances with dim 5x5), + T_1, T_2, T_3, T_4, T_5: 2. 
Start by scanning a candidate topic with respect to a parent topic - (e.g. T_1 with respect to parent None) - 3. Check which topics are nearby the candidate topic using `self.eps` as a threshold and call them neighbours - (e.g. assume T_3, T_4, and T_5 are nearby and become neighbours) - 4. If there are more neighbours than `self.min_samples`, the candidate topic becomes a core candidate for a cluster - (e.g. if `min_samples`=1, then T_1 becomes the first core of a cluster) + (e.g. T_1 with respect to parent None) + 3. Check which topics are nearby the candidate topic using 'self.eps' as a threshold and call them neighbours + (e.g. assume T_3, T_4, and T_5 are nearby and become neighbours) + 4. If there are more neighbours than 'self.min_samples', the candidate topic becomes a core candidate for a cluster + (e.g. if 'min_samples'=1, then T_1 becomes the first core of a cluster) 5. If candidate is a core, CheckBack (CB) to find the fraction of neighbours that are either the parent or the - parent's neighbours. If this fraction is more than 75%, give the candidate the same label as its parent. - (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given) + parent's neighbours. If this fraction is more than 75%, give the candidate the same label as its parent. + (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given) 6. If candidate is a core, recursively scan the next nearby topic (e.g. scan T_3) labeling the previous topic as - the parent and the previous neighbours as the parent_neighbours - repeat steps - 2.-6. - 2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5) - 3. (e.g. T5 is the only neighbour) - 4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core) - 5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3. - Therefore the candidate T_3 does NOT get the same label as its parent T_1) - 6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) + the parent and the previous neighbours as the parent_neighbours - repeat steps 2-6: + 2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5) + 3. (e.g. T5 is the only neighbour) + 4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core) + 5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3. + \ \ \ Therefore the candidate T_3 does NOT get the same label as its parent T_1) + 6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for unreliable topics made of a composition of multiple reliable topics (something that occurs often LDA models that is From 4bd7cf7516b397ae18cf8d35cfc6a1626d4c0d5d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 23 Jan 2020 21:59:02 +0100 Subject: [PATCH 093/166] escape sequence thing indent fix --- gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index bad7a16fe2..7f043de437 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1158,7 +1158,7 @@ class CBDBSCAN(): 3. (e.g. T5 is the only neighbour) 4. (e.g. 
number of neighbours is 1, therefore candidate T_3 becomes a core)
       5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3.
-           \ \ \ Therefore the candidate T_3 does NOT get the same label as its parent T_1)
+           Therefore the candidate T_3 does NOT get the same label as its parent T_1)
       6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5)

    The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for

From 49253df75c4d15627657dd7af8dfd3e912511dab Mon Sep 17 00:00:00 2001
From: Alex Loosley 
Date: Thu, 23 Jan 2020 22:06:07 +0100
Subject: [PATCH 094/166] Better document rendering and added opinosiscorpus to apirefs

---
 docs/src/apiref.rst             | 1 +
 docs/src/models/ensemblelda.rst | 9 +++++++++
 gensim/models/ensemblelda.py    | 3 ++-
 3 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/models/ensemblelda.rst

diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
index 4158dcc6a7..996a6079d9 100644
--- a/docs/src/apiref.rst
+++ b/docs/src/apiref.rst
@@ -22,6 +22,7 @@ Modules:
    corpora/malletcorpus
    corpora/mmcorpus
    corpora/_mmreader
+   corpora/opinosiscorpus
    corpora/sharded_corpus
    corpora/svmlightcorpus
    corpora/textcorpus
diff --git a/docs/src/models/ensemblelda.rst b/docs/src/models/ensemblelda.rst
new file mode 100644
index 0000000000..42e63c94be
--- /dev/null
+++ b/docs/src/models/ensemblelda.rst
@@ -0,0 +1,9 @@
+:mod:`models.ensemblelda` -- Ensemble Latent Dirichlet Allocation
+=================================================================
+
+.. automodule:: gensim.models.ensemblelda
+   :synopsis: Ensemble Latent Dirichlet Allocation
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index bad7a16fe2..411ba6e18b 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -1154,11 +1154,12 @@ class CBDBSCAN():
       (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given)
    6. If candidate is a core, recursively scan the next nearby topic (e.g. scan T_3) labeling the previous topic as
       the parent and the previous neighbours as the parent_neighbours - repeat steps 2-6:
+
       2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5)
       3. (e.g. T5 is the only neighbour)
       4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core)
       5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3.
-           \ \ \ Therefore the candidate T_3 does NOT get the same label as its parent T_1)
+           Therefore the candidate T_3 does NOT get the same label as its parent T_1)
       6. (e.g. 
Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for From 5669f5866e5b2c6308885379334e78688f5a6bf0 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 23 Jan 2020 22:56:56 +0100 Subject: [PATCH 095/166] citation opinosis --- docs/notebooks/ensemble_lda_with_opinosis.ipynb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb index 72ad2a26ba..c79f2a1416 100644 --- a/docs/notebooks/ensemble_lda_with_opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -52,8 +52,7 @@ "\n", "Opinosis [1] is a small (but redundant) corpus that contains 289 product reviews for 51 products. Since it's so small, the results are rather unstable.\n", "\n", - "[1] Kavita Ganesan, ChengXiang Zhai, and Jiawei Han, _Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions,_ Proceedings of the 23rd International Conference on Computational Linguistics, Association for Computational Linguistics, 2010, pp. 340–348.\n", - "http://kavita-ganesan.com/opinosis-opinion-dataset/ https://github.com/kavgan/opinosis" + "[1] Kavita Ganesan, ChengXiang Zhai, and Jiawei Han, _Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions [online],_ Proceedings of the 23rd International Conference on Computational Linguistics, Association for Computational Linguistics, 2010, pp. 340–348. Available from: https://kavita-ganesan.com/opinosis/" ] }, { @@ -168,7 +167,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.8.1" } }, "nbformat": 4, From 2aabe8f1f929f41aee979115658e820942bc0866 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Fri, 24 Jan 2020 15:03:04 +0100 Subject: [PATCH 096/166] missing opinosiscorpus.rst file committed --- docs/src/corpora/opinosiscorpus.rst | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docs/src/corpora/opinosiscorpus.rst diff --git a/docs/src/corpora/opinosiscorpus.rst b/docs/src/corpora/opinosiscorpus.rst new file mode 100644 index 0000000000..3f62454677 --- /dev/null +++ b/docs/src/corpora/opinosiscorpus.rst @@ -0,0 +1,9 @@ +:mod:`corpora.opinosiscorpus` -- Topic related review sentences +=============================================================== + +.. 
automodule:: gensim.corpora.opinosiscorpus + :synopsis: Topic related review sentences + :members: + :inherited-members: + :undoc-members: + :show-inheritance: From 9c25cf5abd666a57d0106bb115f60391d6d54177 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 20:35:07 +0100 Subject: [PATCH 097/166] p names refactored to be descriptive, now using append for appending singletons to list --- gensim/models/ensemblelda.py | 66 ++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1132141a32..5833c5a41c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -573,34 +573,34 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): # get the chunk from the random states that is meant to be for those models random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] - p = Process(target=self._generate_topic_models, + process = Process(target=self._generate_topic_models, args=(num_subprocess_models, random_states_for_worker, childConn)) - processes += [p] - pipes += [(parentConn, childConn)] - p.start() + processes.append(process) + pipes.append((parentConn, childConn)) + process.start() num_models_unhandled -= num_subprocess_models except ProcessError: logger.error("could not start process {}".format(i)) # close all pipes - for p in pipes: - p[1].close() - p[0].close() + for pipe in pipes: + pipe[1].close() + pipe[0].close() # end all processes - for p in processes: - if p.is_alive(): - p.terminate() - del p + for process in processes: + if process.is_alive(): + process.terminate() + del process # stop raise # aggregate results # will also block until workers are finished - for p in pipes: - answer = p[0].recv() # [0], because that is the parentConn - p[0].close() + for pipe in pipes: + answer = pipe[0].recv() # [0], because that is the parentConn + pipe[0].close() # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): if not self.memory_friendly_ttda: self.tms += answer @@ -610,8 +610,8 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): self.ttda = np.concatenate([self.ttda, ttda]) # end all processes - for p in processes: - p.terminate() + for process in processes: + process.terminate() def _generate_topic_models(self, num_models, random_states=None, pipe=None): """Train the topic models that form the ensemble. @@ -731,39 +731,39 @@ def _generate_asymmetric_distance_matrix(self): else: n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i)) - p = Process(target=self._asymmetric_distance_matrix_worker, + process = Process(target=self._asymmetric_distance_matrix_worker, args=(i, ttdas_sent, n_ttdas, childConn, threshold, method)) ttdas_sent += n_ttdas - processes += [p] - pipes += [(parentConn, childConn)] - p.start() + processes.append(process) + pipes.append((parentConn, childConn)) + process.start() except ProcessError: logger.error("could not start process {}".format(i)) # close all pipes - for p in pipes: - p[1].close() - p[0].close() + for pipe in pipes: + pipe[1].close() + pipe[0].close() # end all processes - for p in processes: - if p.is_alive(): - p.terminate() - del p + for process in processes: + if process.is_alive(): + process.terminate() + del process raise distances = [] # note, that the following loop maintains order in how the ttda will be concatenated # which is very important. 
Ordering in ttda has to be the same as when using only one process - for p in pipes: - answer = p[0].recv() # [0], because that is the parentConn - p[0].close() # child conn will be closed from inside the worker + for pipe in pipes: + answer = pipe[0].recv() # [0], because that is the parentConn + pipe[0].close() # child conn will be closed from inside the worker # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): - distances += [answer[1]] + distances.append(answer[1]) # end all processes - for p in processes: - p.terminate() + for process in processes: + process.terminate() self.asymmetric_distance_matrix = np.concatenate(distances) From 4079c27dbb75ee3f9fff8791c1e83d2398aabea8 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 20:48:30 +0100 Subject: [PATCH 098/166] Changing to hanging indents where they were not used before --- gensim/models/ensemblelda.py | 37 ++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 5833c5a41c..b0e40a6f6c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -280,11 +280,13 @@ def get_topic_model_class(self): del self.topic_model_module_string del self.topic_model_class_string except ModuleNotFoundError: - logger.error('Could not import the "{}" module in order to provide the "{}" class as ' + logger.error( + 'Could not import the "{}" module in order to provide the "{}" class as ' '"topic_model_class" attribute. Try setting this manually instead after loading.' .format(self.topic_model_class_string, self.topic_model_class_string)) except AttributeError: - logger.error('Could not import the "{}" class from the "{}" module in order to set the ' + logger.error( + 'Could not import the "{}" class from the "{}" module in order to set the ' '"topic_model_class" attribute. Try setting this manually instead after loading.' .format(self.topic_model_class_string, self.topic_model_module_string)) return self.topic_model_class @@ -487,9 +489,10 @@ def add_model(self, target, num_new_models=None): # 1. ttda array if isinstance(target.dtype.type(), (np.number, float)): - raise ValueError('ttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, ' - 'you can call convert_to_memory_friendly, but it will discard the stored gensim' - 'models and only keep the relevant topic term distributions from them.') + raise ValueError( + 'ttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, ' + 'you can call convert_to_memory_friendly, but it will discard the stored gensim ' + 'models and only keep the relevant topic term distributions from them.') # 2. list of ensembles elif isinstance(target[0], type(self)): @@ -509,15 +512,17 @@ def add_model(self, target, num_new_models=None): # in this case, len(self.tms) should # always match self.num_models if num_new_models is not None and num_new_models + self.num_models != len(self.tms): - logger.info('num_new_models will be ignored. num_models should match the number of ' - 'stored models for a memory unfriendly ensemble') + logger.info( + 'num_new_models will be ignored. num_models should match the number of ' + 'stored models for a memory unfriendly ensemble') self.num_models = len(self.tms) logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) if self.ttda.shape[1] != ttda.shape[1]: - raise ValueError(("target ttda dimensions do not match. 
Topics must be {} but was" - "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1])) + raise ValueError( + "target ttda dimensions do not match. Topics must be {} but was {} elements large" + .format(self.ttda.shape[-1], ttda.shape[-1])) self.ttda = np.append(self.ttda, ttda, axis=0) # tell recluster that the distance matrix needs to be regenerated @@ -636,16 +641,16 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): assert len(random_states) == num_models - kwArgs = self.gensim_kw_args.copy() + kwargs = self.gensim_kw_args.copy() tm = None # remember one of the topic models from the following # loop, in order to collect some properties from it afterwards. for i in range(num_models): - kwArgs["random_state"] = random_states[i] + kwargs["random_state"] = random_states[i] - tm = self.get_topic_model_class()(**kwArgs) + tm = self.get_topic_model_class()(**kwargs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas @@ -721,8 +726,7 @@ def _generate_asymmetric_distance_matrix(self): try: parentConn, childConn = Pipe() - # figure out how many ttdas to send to the worker - # 9 ttdas to 4 workers: 2 2 2 3 + # Load Balancing, for example if there are 9 ttdas and 4 workers, the load will be balanced 2, 2, 2, 3. n_ttdas = 0 if i == workers - 1: # i is a index, hence -1 # is this the last worker that needs to be created? @@ -741,15 +745,16 @@ def _generate_asymmetric_distance_matrix(self): except ProcessError: logger.error("could not start process {}".format(i)) - # close all pipes + for pipe in pipes: pipe[1].close() pipe[0].close() - # end all processes + for process in processes: if process.is_alive(): process.terminate() del process + raise distances = [] From f5379ff7485b77de05c102d453a8c3e35a51546a Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 21:32:38 +0100 Subject: [PATCH 099/166] Adding :meth: and `` `` styling for RST --- gensim/models/ensemblelda.py | 68 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index b0e40a6f6c..0f7fec955b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -8,7 +8,7 @@ """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. -Extracts reliable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that +Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time For more details read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). @@ -319,7 +319,7 @@ def generate_gensim_representation(self): ------- :py:class:`gensim.models.LdaModel` A Gensim LDA Model classic_model_representation for which: - classic_model_representation.get_topics() == self.get_topics() + ``classic_model_representation.get_topics() == self.get_topics()`` """ logger.info("generating classic gensim model representation based on results from the ensemble") @@ -374,7 +374,7 @@ def generate_gensim_representation(self): # the factor, that will be used when get_topics() is used, for normalization # will never change, because the sum for eta as well as the sum for sstats is constant. # Therefore predicting normalization_factor becomes super easy. 
- # corpus is a mapping of id to occurences + # corpus is a mapping of id to occurrences # so one can also easily calculate the # right sstats, so that get_topics() will return the stable topics no @@ -394,7 +394,7 @@ def generate_gensim_representation(self): return classic_model_representation def add_model(self, target, num_new_models=None): - """Add the ttda of another model to the ensemble. + """Add the topic term distribution array (ttda) of another model to the ensemble. This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use the exact same dictionary/idword mapping. @@ -403,12 +403,12 @@ def add_model(self, target, num_new_models=None): self._generate_asymmetric_distance_matrix() self.recluster() - The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter - of the ensemble, that means the number of classic models in the ensemble that generated the ttda. This is - important, because that information is used to estimate "min_samples" for _generate_topic_clusters. + The ttda of another ensemble can also be used, in that case set ``num_new_models`` to the ``num_models`` + parameter of the ensemble, that means the number of classic models in the ensemble that generated the ttda. + This is important, because that information is used to estimate "min_samples" for _generate_topic_clusters. If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other - models, you can get it from: self.id2word. + models, you can get it from: ``self.id2word``. Parameters ---------- @@ -435,7 +435,7 @@ def add_model(self, target, num_new_models=None): If target is a 2D-array of float values, it assumes 1. - If the ensemble has memory_friendly_ttda set to False, then it will always use the number of models in + If the ensemble has ``memory_friendly_ttda`` set to False, then it will always use the number of models in the target parameter. """ @@ -775,7 +775,7 @@ def _generate_asymmetric_distance_matrix(self): return self.asymmetric_distance_matrix def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): - """Calculate an (asymmetric) distance from each topic in ttda1 to each topic in ttda2. + """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. Parameters ---------- @@ -783,10 +783,10 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one topic. Each cell in the resulting matrix corresponds to the distance between a topic pair. threshold : float, optional - threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected method + threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method start_index : int this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the - complete ttda in that case. start_index would be 0 if ttda1 == self.ttda. When self.ttda is split into two + complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into two pieces, each 100 ttdas long, then start_index should be be 100. 
default is 0 method : {'mass', 'rank}, optional method can be "mass" for the original masking method or "rank" for a faster masking method that selects @@ -796,7 +796,7 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s Returns ------- 2D Numpy.numpy.ndarray of floats - Asymmetric distance matrix of size len(ttda1) by len(ttda2). + Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``. """ # initialize the distance matrix. ndarray is faster than zeros @@ -860,14 +860,15 @@ def rank_masking(a): def _generate_topic_clusters(self, eps=0.1, min_samples=None): """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices. - The final approval and generation of stable topics is done in _generate_stable_topics(). + The final approval and generation of stable topics is done in ``_generate_stable_topics()``. Parameters ---------- eps : float - eps is 0.1 by default. - min_samples : int - min_samples is int(self.num_models / 2) + dbscan distance scale + min_samples : int, optional + defaults to ``int(self.num_models / 2)``, dbscan min neighbours threshold which corresponds to finding + stable topics and should scale with the number of models, ``self.num_models`` """ if min_samples is None: @@ -878,18 +879,16 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) - def _validate_core(self, core): - """Check if the core has only a single valid parent, which is also the label assigned to the core. - - If the presumed core is not even a core returns False. + def _is_valid_core(self, topic): + """Check if the topic is a valid core. Parameters ---------- - core : {'is_core', 'valid_parents', 'labels'} - eps is 0.1 by default. + topic : {'is_core', 'valid_parents', 'label'} + topic to validate """ - return core["is_core"] and (core["valid_parents"] == {core["label"]}) + return topic["is_core"] and (topic["valid_parents"] == {topic["label"]}) def _group_by_labels(self, results): """Group all the learned cores by their label, which was assigned in the cluster_model. @@ -969,31 +968,32 @@ def _aggregate_topics(self, grouped_by_labels): return sorted_clusters def _remove_from_all_sets(self, label, clusters): - """Remove a label from every set in "neighboring_labels" for each core in clusters.""" + """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" for cluster in clusters: for neighboring_labels_set in cluster["neighboring_labels"]: if label in neighboring_labels_set: neighboring_labels_set.remove(label) def _contains_isolated_cores(self, label, cluster, min_cores): - """Check if the cluster has at least min_cores of cores that belong to no other cluster.""" + """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores def _generate_stable_topics(self, min_cores=None): """Generate stable topics out of the clusters. The function finds clusters of topics using a variant of DBScan. If a cluster has enough core topics - (c.f. parameter min_cores), then this cluster represents a stable topic. The stable topic is specifically + (c.f. parameter ``min_cores``), then this cluster represents a stable topic. The stable topic is specifically calculated as the average over all topic-term distributions of the core topics in the cluster. 
This function is the last step that has to be done in the ensemble. After this step is complete, - Stable topics can be retrieved afterwards using the get_topics() method. + Stable topics can be retrieved afterwards using the :meth:`gensim.models.ensemblelda.EnsembleLda.get_topics` + method. Parameters ---------- min_cores : int Minimum number of core topics needed to form a cluster that represents a stable topic. - Using None defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) + Using ``None`` defaults to ``min_cores = min(3, max(1, int(self.num_models /4 +1)))`` """ # min_cores being 0 makes no sense. there has to be a core for a cluster @@ -1039,7 +1039,7 @@ def _generate_stable_topics(self, min_cores=None): topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels} # keeping only VALID cores - valid_core_mask = np.vectorize(self._validate_core)(results) + valid_core_mask = np.vectorize(self._is_valid_core)(results) valid_topics = self.ttda[valid_core_mask] topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask] unique_labels = np.unique(topic_labels) @@ -1059,20 +1059,20 @@ def _generate_stable_topics(self, min_cores=None): def recluster(self, eps=0.1, min_samples=None, min_cores=None): """Reapply CBDBSCAN clustering and stable topic generation. - Stable topics can be retrieved using get_topics(). + Stable topics can be retrieved using :meth:`gensim.models.ensemblelda.EnsembleLda.get_topics`. Parameters ---------- eps : float epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering. - default: 0.1 + default: ``0.1`` min_samples : int The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN. - default: int(self.num_models / 2) + default: ``int(self.num_models / 2)`` min_cores : int how many cores a cluster has to have, to be treated as stable topic. That means, how many topics that look similar have to be present, so that the average topic in those is used as stable topic. 
- default: min(3, max(1, int(self.num_models /4 +1))) + default: ``min(3, max(1, int(self.num_models /4 +1)))`` """ # if new models were added to the ensemble, the distance matrix needs to be generated again From 591cf77bffddf35ad1badae408b4ac007a8b042b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 6 Feb 2020 21:37:50 +0100 Subject: [PATCH 100/166] a bunch of reviews --- .../ensemble_lda_with_opinosis.ipynb | 16 +++++++--- gensim/corpora/opinosiscorpus.py | 30 ++++++------------- gensim/models/ensemblelda.py | 14 +++++---- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb index c79f2a1416..7380a87f71 100644 --- a/docs/notebooks/ensemble_lda_with_opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -2,11 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "####\n" + ] + } + ], "source": [ "import logging\n", "from gensim.models import EnsembleLda, LdaMulticore\n", @@ -23,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index ffc0da8eba..1a68ec670c 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -54,33 +54,21 @@ def __init__(self, path): stemmer = PorterStemmer() for directory, b, filenames in os.walk(path): - # iterates over folders, so in this - # scope a new topic is prepared - - # root directory? - if len(filenames) == 0: - continue - - # folder = directory.split(os.sep)[-1] - # print("processing folder ", "'"+folder+"'", "...") - + # each subdirectory of path is one collection of reviews to a specific product # now get the corpus/documents for filename in filenames: - filepath = directory + os.sep + filename # write down the document and the topicId and split into train and testdata with open(filepath) as file: - doc = file.read() - # overwrite dictionary, add corpus to test or train afterwards - # the following function takes an array of documents, so wrap it in square braces - processed = [ - stemmer.stem(token) for token in re.findall(r'\w+', doc.lower()) - if token not in STOPWORDS - ] - - dictionary.add_documents([processed]) - corpus += [dictionary.doc2bow(processed)] + + preprocessed_doc = [ + stemmer.stem(token) for token in re.findall(r'\w+', doc.lower()) + if token not in STOPWORDS + ] + + dictionary.add_documents([preprocessed_doc]) + corpus += [dictionary.doc2bow(preprocessed_doc)] # and return the results the same way the other corpus generating functions do self.corpus = corpus diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1132141a32..ba80a4ab3c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -113,6 +113,8 @@ logger = logging.getLogger(__name__) +print('####') + class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. 
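The preprocessing flow that the opinosiscorpus.py hunk earlier in this patch settles on is easy to lose in the diff noise, so here it is as a self-contained sketch; the `preprocess` wrapper and the `docs` iterable are illustrative only and not part of the patch, and the imports assume gensim's bundled Porter stemmer and stopword list:

    import re

    from gensim.corpora import Dictionary
    from gensim.parsing.porter import PorterStemmer
    from gensim.parsing.preprocessing import STOPWORDS

    def preprocess(docs):
        # lowercase, tokenize on word characters, drop stopwords, stem,
        # then map every document to a bag-of-words vector
        stemmer = PorterStemmer()
        dictionary = Dictionary()
        corpus = []
        for doc in docs:
            tokens = [
                stemmer.stem(token) for token in re.findall(r'\w+', doc.lower())
                if token not in STOPWORDS
            ]
            dictionary.add_documents([tokens])
            corpus.append(dictionary.doc2bow(tokens))
        return corpus, dictionary
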
@@ -274,6 +276,8 @@ def __init__(self, topic_model_class="lda", num_models=3, def get_topic_model_class(self): """Get the class that is used for :meth:`gensim.models.EnsembleLda.generate_gensim_representation`.""" if self.topic_model_class is None: + instruction = 'Try setting topic_model_class manually to what the individual models were based on, ' \ + 'e.g. LdaMulticore.' try: module = importlib.import_module(self.topic_model_module_string) self.topic_model_class = getattr(module, self.topic_model_class_string) @@ -281,12 +285,12 @@ def get_topic_model_class(self): del self.topic_model_class_string except ModuleNotFoundError: logger.error('Could not import the "{}" module in order to provide the "{}" class as ' - '"topic_model_class" attribute. Try setting this manually instead after loading.' - .format(self.topic_model_class_string, self.topic_model_class_string)) + '"topic_model_class" attribute. {}' + .format(self.topic_model_class_string, self.topic_model_class_string, instruction)) except AttributeError: logger.error('Could not import the "{}" class from the "{}" module in order to set the ' - '"topic_model_class" attribute. Try setting this manually instead after loading.' - .format(self.topic_model_class_string, self.topic_model_module_string)) + '"topic_model_class" attribute. {}' + .format(self.topic_model_class_string, self.topic_model_module_string, instruction)) return self.topic_model_class def save(self, *args, **kwargs): @@ -361,7 +365,7 @@ def generate_gensim_representation(self): # to generate the proper sstats for the new gensim model: # transform to dimensionality of stable_topics. axis=1 is summed eta_sum = 0 - if int == type(eta) or float == type(eta): + if isinstance(eta, (int, float)): eta_sum = [eta * len(stable_topics[0])] * num_stable_topics else: if len(eta.shape) == 1: # [e1, e2, e3] From f1aba3e7aa5c2c1f992e9f42ff6fae03dbde2b30 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 21:58:25 +0100 Subject: [PATCH 101/166] * Changed ensemblelda default to use ldamulticore instead of old lda model. * Added more :meth: and `` `` styling for RST, fixed a few typos --- gensim/models/ensemblelda.py | 38 +++++++++++++++++------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 0f7fec955b..b9e6129403 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -122,7 +122,7 @@ class EnsembleLda(SaveLoad): """ - def __init__(self, topic_model_class="lda", num_models=3, + def __init__(self, topic_model_class="ldamulticore", num_models=3, min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, min_samples=None, masking_method="mass", masking_threshold=None, @@ -135,13 +135,13 @@ def __init__(self, topic_model_class="lda", num_models=3, ---------- topic_model_class : str, topic model, optional Examples: - 'ldamulticore' (recommended), 'lda' (default), + * 'ldamulticore' (default, recommended) + * 'lda' ensemble_workers : number, optional Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. - Setting it to 0 or 1 will both use the nonmultiprocessing version. Default:1 - gensim.models.ldamodel, gensim.models.ldamulticore + Setting it to 0 or 1 will both use the non-multiprocessing version. Default: 1 num_models : int, optional How many LDA models to train in this ensemble. 
Default: 3 @@ -153,7 +153,7 @@ def __init__(self, topic_model_class="lda", num_models=3, Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. - Setting it to 0 or 1 will both use the nonmultiprocessing version. Default:1 + Setting it to 0 or 1 will both use the nonmultiprocessing version. Default: 1 memory_friendly_ttda : boolean, optional If True, the models in the ensemble are deleted after training and only a concatenation of each model's topic term distribution (called ttda) is kept to save memory. @@ -179,12 +179,12 @@ def __init__(self, topic_model_class="lda", num_models=3, For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. masking_threshold : float, optional - Default: None, which uses 0.95 for "mass", and 0.11 for masking_method "rank". In general, too small a mask - threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy distance - calculations. Defaults are often a good sweet spot for this hyperparameter. + Default: None, which uses ``0.95`` for "mass", and ``0.11`` for masking_method "rank". In general, too + small a mask threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy + distance calculations. Defaults are often a good sweet spot for this hyperparameter. distance_workers : int, optional - When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which - is not multiprocessed. Set to > 1 to enable multiprocessing. + When ``distance_workers`` is ``None``, it defaults to ``os.cpu_count()`` for maximum performance. Default is + 1, which is not multiprocessed. Set to ``> 1`` to enable multiprocessing. **gensim_kw_args Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble. @@ -399,9 +399,9 @@ def add_model(self, target, num_new_models=None): This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use the exact same dictionary/idword mapping. - In order to generate new stable topics afterwards, use - self._generate_asymmetric_distance_matrix() - self.recluster() + In order to generate new stable topics afterwards, use: + 1. ``self._generate_asymmetric_distance_matrix()`` + 2. ``self.``:meth:`~gensim.models.ensemblelda.EnsembleLda.recluster` The ttda of another ensemble can also be used, in that case set ``num_new_models`` to the ``num_models`` parameter of the ensemble, that means the number of classic models in the ensemble that generated the ttda. @@ -986,7 +986,7 @@ def _generate_stable_topics(self, min_cores=None): calculated as the average over all topic-term distributions of the core topics in the cluster. This function is the last step that has to be done in the ensemble. After this step is complete, - Stable topics can be retrieved afterwards using the :meth:`gensim.models.ensemblelda.EnsembleLda.get_topics` + Stable topics can be retrieved afterwards using the :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics` method. Parameters @@ -1059,7 +1059,7 @@ def _generate_stable_topics(self, min_cores=None): def recluster(self, eps=0.1, min_samples=None, min_cores=None): """Reapply CBDBSCAN clustering and stable topic generation. - Stable topics can be retrieved using :meth:`gensim.models.ensemblelda.EnsembleLda.get_topics`. + Stable topics can be retrieved using :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics`. 
Parameters ---------- @@ -1181,7 +1181,6 @@ def __init__(self, eps, min_samples): ---------- eps : float epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering. - default: 0.1 min_samples : int The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN. @@ -1212,15 +1211,14 @@ def fit(self, amatrix): def scan_topic(topic_index, current_label=None, parent_neighbors=None): """Extend the cluster in one direction. - Results are accumulated in topic_clustering_results, a variable outside of this function. + Results are accumulated to ``self.results``. Parameters ---------- topic_index : int - The topic that might be added to the existing cluster, or which might create a new cluster if - neccessary. + The topic that might be added to the existing cluster, or which might create a new cluster if necessary. current_label : int - The label of the cluster that might be suitable for topic_index + The label of the cluster that might be suitable for ``topic_index`` """ neighbors_sorted = sorted( From a9428de68d97652f90f0bd602cd21d59f19b3072 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 22:02:47 +0100 Subject: [PATCH 102/166] More docstring polish --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 29c7f7fee8..1529d80b6b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -137,7 +137,7 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3, Examples: * 'ldamulticore' (default, recommended) * 'lda' - ensemble_workers : number, optional + ensemble_workers : int, optional Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. @@ -164,7 +164,7 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3, If False, any topic term matrix can be suplied to add_model. min_samples : int, optional Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. - masking_method : {'mass', 'rank'}, optional + masking_method : str, optional Choose one of "mass" (default) or "rank" (percentile, faster). For clustering, distances between topic-term distributions are asymmetric. In particular, the distance From 3bdeaf262bf6bf6a36af4fd1c89ffcc673d73504 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 22:15:13 +0100 Subject: [PATCH 103/166] removing some camel-case vars for pep8 compliance. --- gensim/models/ensemblelda.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1529d80b6b..4cb384c0b1 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -568,7 +568,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): for i in range(workers): try: - parentConn, childConn = Pipe() + parent_conn, child_conn = Pipe() num_subprocess_models = 0 if i == workers - 1: # i is a index, hence -1 # is this the last worker that needs to be created? 
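The variables renamed in this commit all belong to the standard multiprocessing parent/child idiom; stripped of the ensemble-specific logic, the pattern looks roughly like this (a minimal sketch; `_square` and its payload are illustrative, not code from the patch):

    from multiprocessing import Pipe, Process

    def _square(number, child_conn):
        # the worker computes its share and sends the result back through its end of the pipe
        child_conn.send(number * number)
        child_conn.close()

    if __name__ == '__main__':
        parent_conn, child_conn = Pipe()
        process = Process(target=_square, args=(7, child_conn))
        process.start()
        result = parent_conn.recv()  # blocks until the worker has sent its result
        process.join()
        parent_conn.close()
        print(result)  # 49
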
@@ -581,10 +581,10 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] process = Process(target=self._generate_topic_models, - args=(num_subprocess_models, random_states_for_worker, childConn)) + args=(num_subprocess_models, random_states_for_worker, child_conn)) processes.append(process) - pipes.append((parentConn, childConn)) + pipes.append((parent_conn, child_conn)) process.start() num_models_unhandled -= num_subprocess_models @@ -726,7 +726,7 @@ def _generate_asymmetric_distance_matrix(self): for i in range(workers): try: - parentConn, childConn = Pipe() + parent_conn, child_conn = Pipe() # Load Balancing, for example if there are 9 ttdas and 4 workers, the load will be balanced 2, 2, 2, 3. n_ttdas = 0 @@ -738,11 +738,11 @@ def _generate_asymmetric_distance_matrix(self): n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i)) process = Process(target=self._asymmetric_distance_matrix_worker, - args=(i, ttdas_sent, n_ttdas, childConn, threshold, method)) + args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method)) ttdas_sent += n_ttdas processes.append(process) - pipes.append((parentConn, childConn)) + pipes.append((parent_conn, child_conn)) process.start() except ProcessError: From 806952ec97ccd59928a5aa1ef6d2c1c61591a9eb Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 6 Feb 2020 22:49:11 +0100 Subject: [PATCH 104/166] a bunch of reviews --- gensim/models/ensemblelda.py | 80 +++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 2db339522c..5f8a697926 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -530,6 +530,31 @@ def add_model(self, target, num_new_models=None): # tell recluster that the distance matrix needs to be regenerated self.asymmetric_distance_matrix_outdated = True + def _teardown(self, pipes, processes, i): + """close pipes and terminate processes + + Parameters + ---------- + pipes : {list of :class:`multiprocessing.Pipe`} + list of pipes that the processes use to communicate with the parent + processes : {list of :class:`multiprocessing.Process`} + list of worker processes + i : int + index of the process that could not be started + + """ + logger.error("could not start process {}".format(i)) + + for pipe in pipes: + pipe[1].close() + pipe[0].close() + + for process in processes: + if process.is_alive(): + process.terminate() + del process + + def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. @@ -567,21 +592,22 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): num_models_unhandled = num_models # how many more models need to be trained by workers? for i in range(workers): - try: - parentConn, childConn = Pipe() - num_subprocess_models = 0 - if i == workers - 1: # i is a index, hence -1 - # is this the last worker that needs to be created? - # then task that worker with all the remaining models - num_subprocess_models = num_models_unhandled - else: - num_subprocess_models = int(num_models_unhandled / (workers - i)) + parentConn, childConn = Pipe() + num_subprocess_models = 0 + if i == workers - 1: # i is a index, hence -1 + # is this the last worker that needs to be created? 
+ # then task that worker with all the remaining models + num_subprocess_models = num_models_unhandled + else: + num_subprocess_models = int(num_models_unhandled / (workers - i)) - # get the chunk from the random states that is meant to be for those models - random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] + # get the chunk from the random states that is meant to be for those models + random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] - process = Process(target=self._generate_topic_models, - args=(num_subprocess_models, random_states_for_worker, childConn)) + try: + process = Process( + target=self._generate_topic_models, + args=(num_subprocess_models, random_states_for_worker, childConn)) processes.append(process) pipes.append((parentConn, childConn)) @@ -590,18 +616,8 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): num_models_unhandled -= num_subprocess_models except ProcessError: - logger.error("could not start process {}".format(i)) - # close all pipes - for pipe in pipes: - pipe[1].close() - pipe[0].close() - # end all processes - for process in processes: - if process.is_alive(): - process.terminate() - del process - # stop - raise + self._teardown(pipes, processes, i) + raise ProcessError # aggregate results # will also block until workers are finished @@ -746,18 +762,8 @@ def _generate_asymmetric_distance_matrix(self): process.start() except ProcessError: - logger.error("could not start process {}".format(i)) - - for pipe in pipes: - pipe[1].close() - pipe[0].close() - - for process in processes: - if process.is_alive(): - process.terminate() - del process - - raise + self._teardown(pipes, processes, i) + raise ProcessError distances = [] # note, that the following loop maintains order in how the ttda will be concatenated From 3d24f62ef229a7cb4acb884811c032e19120e077 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 9 Feb 2020 20:27:52 +0100 Subject: [PATCH 105/166] fixed linter --- gensim/models/ensemblelda.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 6ce0289c76..e793a9edcc 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -522,9 +522,8 @@ def add_model(self, target, num_new_models=None): logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) if self.ttda.shape[1] != ttda.shape[1]: - raise ValueError( - "target ttda dimensions do not match. Topics must be {} but was {} elements large" - .format(self.ttda.shape[-1], ttda.shape[-1])) + raise ValueError("target ttda dimensions do not match. Topics must be {} but was {} elements large" + .format(self.ttda.shape[-1], ttda.shape[-1])) self.ttda = np.append(self.ttda, ttda, axis=0) # tell recluster that the distance matrix needs to be regenerated @@ -554,7 +553,6 @@ def _teardown(self, pipes, processes, i): process.terminate() del process - def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. @@ -794,8 +792,8 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method start_index : int this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the - complete ttda in that case. 
start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into two
-            pieces, each 100 ttdas long, then start_index should be be 100. default is 0
+            complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
+            two pieces, each 100 ttdas long, then start_index should be 100. default is 0
         method : {'mass', 'rank}, optional
             method can be "mass" for the original masking method or "rank" for a faster masking method that selects
             by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the

From 619922d28d34ad113a340a8beb22439887902501 Mon Sep 17 00:00:00 2001
From: sezanzeb 
Date: Sat, 24 Oct 2020 11:57:00 +0200
Subject: [PATCH 106/166] less precision for windows

---
 gensim/test/test_ensemblelda.py | 34 ++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 6794cb71a0..84ca9804cb 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -7,6 +7,7 @@
 Automated tests for checking the EnsembleLda Class
 """

+import os
 import logging
 import unittest

@@ -20,6 +21,9 @@
 num_models = 4
 passes = 50

+# windows tests fail due to the required assertion precision being too high
+rtol = 1e-04 if os.name == 'nt' else 1e-05
+

 class TestModel(unittest.TestCase):
     def setUp(self):
@@ -59,7 +63,7 @@ def test_eLDA(self):

         # compare with a pre-trained reference model
         reference = EnsembleLda.load(datapath('ensemblelda'))
-        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
+        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=rtol)
         # small values in the distance matrix tend to vary quite a bit around 2%,
         # so use some absolute tolerance measurement to check if the matrix is at least
         # close to the target.
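The platform-dependent `rtol` introduced above changes what counts as "equal" in every assertion of this test module; as a rough standalone illustration (the sample numbers are invented for the example):

    import os

    import numpy as np

    rtol = 1e-04 if os.name == 'nt' else 1e-05

    actual = np.array([0.1000009, 0.2000000])
    desired = np.array([0.1000000, 0.1999998])

    # the relative error here is below 1e-05, so this passes on every platform,
    # while numpy's default rtol of 1e-07 would raise an AssertionError
    np.testing.assert_allclose(actual, desired, rtol=rtol)
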
@@ -84,7 +88,7 @@ def test_clustering(self): self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) self.assertEqual(reference.sorted_clusters, sorted_clusters) - np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) + np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=rtol) def test_not_trained(self): # should not throw errors and no training should happen @@ -115,14 +119,14 @@ def test_memory_unfriendly(self): # both should be 100% similar (but floats cannot # be compared that easily, so check for threshold) self.assertEqual(len(self.eLDA_mu.tms), num_models) - np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=rtol) + np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=rtol) self.check_ttda(self.eLDA_mu) def test_generate_gensim_rep(self): gensimModel = self.eLDA.generate_gensim_representation() topics = gensimModel.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) def assert_cluster_results_equal(self, a, b): """compares important attributes of the cluster results""" @@ -152,9 +156,9 @@ def test_persisting(self): topics = loaded_eLDA_representation.get_topics() ttda = loaded_eLDA.ttda amatrix = loaded_eLDA.asymmetric_distance_matrix - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) + np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=rtol) + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=rtol) a = self.eLDA.cluster_model.results b = loaded_eLDA.cluster_model.results @@ -164,7 +168,7 @@ def test_persisting(self): # memory unfriendly loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() topics = loaded_eLDA_mu_representation.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) def test_multiprocessing(self): # same configuration @@ -186,8 +190,8 @@ def test_multiprocessing(self): random_state=random_state, ensemble_workers=workers, distance_workers=workers, memory_friendly_ttda=False) - np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) - np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=rtol) + np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=rtol) def test_add_models(self): # same configuration @@ -310,14 +314,14 @@ def test_add_and_recluster(self): iterations=30, random_state=1, topic_model_class='ldamulticore', distance_workers=4, memory_friendly_ttda=False) # both should be similar - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) + np.testing.assert_allclose(new_eLDA.get_topics(), 
new_eLDA_mu.get_topics(), rtol=rtol) # and every next step applied to both should result in similar results # 1. adding to ttda and tms new_eLDA.add_model(self.eLDA) new_eLDA_mu.add_model(self.eLDA_mu) - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) @@ -347,7 +351,7 @@ def test_add_and_recluster(self): new_eLDA_mu.generate_gensim_representation() # same random state, hence topics should be still similar - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) def test_inference(self): import numpy as np From 3f0041434eb9c33ab37c7ef108f39e5219b74c4d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 12:10:45 +0200 Subject: [PATCH 107/166] hanging indents --- gensim/models/ensemblelda.py | 68 +++++++++++------ gensim/test/test_ensemblelda.py | 126 ++++++++++++++++++++------------ 2 files changed, 123 insertions(+), 71 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index e793a9edcc..a501fb01df 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -122,11 +122,13 @@ class EnsembleLda(SaveLoad): """ - def __init__(self, topic_model_class="ldamulticore", num_models=3, - min_cores=None, # default value from _generate_stable_topics() - epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, - min_samples=None, masking_method="mass", masking_threshold=None, - distance_workers=1, random_state=None, **gensim_kw_args): + def __init__( + self, topic_model_class="ldamulticore", num_models=3, + min_cores=None, # default value from _generate_stable_topics() + epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, + min_samples=None, masking_method="mass", masking_threshold=None, + distance_workers=1, random_state=None, **gensim_kw_args, + ): """Create and train a new EnsembleLda model. Will start training immediatelly, except if iterations, passes or num_models is 0 or if the corpus is missing. @@ -210,7 +212,8 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3, raise ValueError( "at least one of corpus/id2word must be specified, to establish " "input space dimensionality. Corpus should be provided using the " - "`corpus` keyword argument.") + "`corpus` keyword argument." 
+                "`corpus` keyword argument."
+            )

         if type(topic_model_class) == type and issubclass(topic_model_class, ldamodel.LdaModel):
             self.topic_model_class = topic_model_class
@@ -221,7 +224,9 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3,
             }
             if topic_model_class not in kinds:
                 raise ValueError(
-                    "topic_model_class should be one of 'lda', 'ldamulticode' or a model inheriting from LdaModel")
+                    "topic_model_class should be one of 'lda', 'ldamulticore' or a model "
+                    "inheriting from LdaModel"
+                )
             self.topic_model_class = kinds[topic_model_class]

         self.num_models = num_models
@@ -274,8 +279,10 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3,
     def get_topic_model_class(self):
         """Get the class that is used for :meth:`gensim.models.EnsembleLda.generate_gensim_representation`."""
         if self.topic_model_class is None:
-            instruction = 'Try setting topic_model_class manually to what the individual models were based on, ' \
-                'e.g. LdaMulticore.'
+            instruction = (
+                'Try setting topic_model_class manually to what the individual models were based on, '
+                'e.g. LdaMulticore.'
+            )
             try:
                 module = importlib.import_module(self.topic_model_module_string)
                 self.topic_model_class = getattr(module, self.topic_model_class_string)
@@ -285,12 +292,14 @@ def get_topic_model_class(self):
                 logger.error(
                     'Could not import the "{}" module in order to provide the "{}" class as '
                     '"topic_model_class" attribute. {}'
-                    .format(self.topic_model_class_string, self.topic_model_class_string, instruction))
+                    .format(self.topic_model_module_string, self.topic_model_class_string, instruction)
+                )
             except AttributeError:
                 logger.error(
                     'Could not import the "{}" class from the "{}" module in order to set the '
                     '"topic_model_class" attribute. {}'
-                    .format(self.topic_model_class_string, self.topic_model_module_string, instruction))
+                    .format(self.topic_model_class_string, self.topic_model_module_string, instruction)
+                )
         return self.topic_model_class

     def save(self, *args, **kwargs):
@@ -516,14 +525,18 @@ def add_model(self, target, num_new_models=None):
             if num_new_models is not None and num_new_models + self.num_models != len(self.tms):
                 logger.info(
                     'num_new_models will be ignored. num_models should match the number of '
-                    'stored models for a memory unfriendly ensemble')
+                    'stored models for a memory unfriendly ensemble'
+                )
             self.num_models = len(self.tms)

         logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))

         if self.ttda.shape[1] != ttda.shape[1]:
-            raise ValueError("target ttda dimensions do not match. Topics must be {} but was {} elements large"
-                             .format(self.ttda.shape[-1], ttda.shape[-1]))
+            raise ValueError(
+                "target ttda dimensions do not match. Topics must be {} but was {} elements large".format(
+                    self.ttda.shape[-1], ttda.shape[-1],
+                )
+            )
         self.ttda = np.append(self.ttda, ttda, axis=0)

         # tell recluster that the distance matrix needs to be regenerated
@@ -605,7 +618,8 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
             try:
                 process = Process(
                     target=self._generate_topic_models,
-                    args=(num_subprocess_models, random_states_for_worker, child_conn))
+                    args=(num_subprocess_models, random_states_for_worker, child_conn),
+                )

                 processes.append(process)
                 pipes.append((parent_conn, child_conn))
@@ -663,7 +677,6 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None):
        # loop, in order to collect some properties from it afterwards.
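        # each model is trained with its own entry from random_states, which is what keeps separate runs reproducible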
for i in range(num_models): - kwargs["random_state"] = random_states[i] tm = self.get_topic_model_class()(**kwargs) @@ -698,7 +711,8 @@ def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pip # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( - ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method) + ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method, + ) pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory pipe.close() @@ -725,7 +739,8 @@ def _generate_asymmetric_distance_matrix(self): # singlecore if workers is not None and workers <= 1: self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( - ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method) + ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method, + ) return self.asymmetric_distance_matrix # else, if workers > 1 use multiprocessing @@ -751,8 +766,10 @@ def _generate_asymmetric_distance_matrix(self): else: n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i)) - process = Process(target=self._asymmetric_distance_matrix_worker, - args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method)) + process = Process( + target=self._asymmetric_distance_matrix_worker, + args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method), + ) ttdas_sent += n_ttdas processes.append(process) @@ -962,14 +979,17 @@ def _aggregate_topics(self, grouped_by_labels): "max_num_neighboring_labels": max_num_neighboring_labels, "neighboring_labels": neighboring_labels, "label": label, - "num_cores": len([topic for topic in group if topic["is_core"]]) + "num_cores": len([topic for topic in group if topic["is_core"]]), }) - sorted_clusters = sorted(sorted_clusters, + sorted_clusters = sorted( + sorted_clusters, key=lambda cluster: ( cluster["max_num_neighboring_labels"], - cluster["label"] - ), reverse=False) + cluster["label"], + ), + reverse=False, + ) return sorted_clusters @@ -1202,7 +1222,7 @@ def fit(self, amatrix): "is_core": False, "neighboring_labels": set(), "neighboring_topic_indices": set(), - "label": None + "label": None, } for _ in range(len(amatrix))] amatrix_copy = amatrix.copy() diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 84ca9804cb..d637af9616 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -31,13 +31,17 @@ def setUp(self): # the topics are equal random_state = 0 - self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, - topic_model_class=LdaModel) - - self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, - memory_friendly_ttda=False, topic_model_class=LdaModel) + self.eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + topic_model_class=LdaModel, + ) + + self.eLDA_mu = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + memory_friendly_ttda=False, topic_model_class=LdaModel, + ) def 
check_ttda(self, ensemble): """tests the integrity of the ttda of any ensemble""" @@ -68,8 +72,10 @@ def test_eLDA(self): # so use some absolute tolerance measurement to check if the matrix is at least # close to the target. atol = reference.asymmetric_distance_matrix.max() * 1e-05 - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, - reference.asymmetric_distance_matrix, atol=atol) + np.testing.assert_allclose( + self.eLDA.asymmetric_distance_matrix, + reference.asymmetric_distance_matrix, atol=atol, + ) def test_clustering(self): # the following test is quite specific to the current implementation and not part of any api, @@ -94,23 +100,31 @@ def test_not_trained(self): # should not throw errors and no training should happen # 0 passes - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=0, num_models=num_models, random_state=0) + eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=0, num_models=num_models, random_state=0, + ) self.assertEqual(len(eLDA.ttda), 0) # no corpus - eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=0) + eLDA = EnsembleLda( + id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=0, + ) self.assertEqual(len(eLDA.ttda), 0) # 0 iterations - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - iterations=0, num_models=num_models, random_state=0) + eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + iterations=0, num_models=num_models, random_state=0, + ) self.assertEqual(len(eLDA.ttda), 0) # 0 models - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=0, random_state=0) + eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=0, random_state=0 + ) self.assertEqual(len(eLDA.ttda), 0) def test_memory_unfriendly(self): @@ -130,10 +144,14 @@ def test_generate_gensim_rep(self): def assert_cluster_results_equal(self, a, b): """compares important attributes of the cluster results""" - np.testing.assert_array_equal([row["label"] for row in a], - [row["label"] for row in b]) - np.testing.assert_array_equal([row["is_core"] for row in a], - [row["is_core"] for row in b]) + np.testing.assert_array_equal( + [row["label"] for row in a], + [row["label"] for row in b], + ) + np.testing.assert_array_equal( + [row["is_core"] for row in a], + [row["is_core"] for row in b], + ) def test_persisting(self): fname = get_tmpfile('gensim_models_ensemblelda') @@ -180,15 +198,19 @@ def test_multiprocessing(self): workers = 3 # memory friendly. contains List of topic word distributions - eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, - num_topics=num_topics, passes=passes, num_models=num_models, - random_state=random_state, ensemble_workers=workers, distance_workers=workers) + eLDA_multi = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, + num_topics=num_topics, passes=passes, num_models=num_models, + random_state=random_state, ensemble_workers=workers, distance_workers=workers, + ) # memory unfriendly. 
contains List of models - eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, - num_topics=num_topics, passes=passes, num_models=num_models, - random_state=random_state, ensemble_workers=workers, distance_workers=workers, - memory_friendly_ttda=False) + eLDA_multi_mu = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, + num_topics=num_topics, passes=passes, num_models=num_models, + random_state=random_state, ensemble_workers=workers, distance_workers=workers, + memory_friendly_ttda=False, + ) np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=rtol) np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=rtol) @@ -208,10 +230,12 @@ def test_add_models(self): num_new_topics = 3 # 1. memory friendly - eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, - workers=3, ensemble_workers=2) + eLDA_base = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_class=LdaMulticore, + workers=3, ensemble_workers=2, + ) # 1.1 ttda a = len(eLDA_base.ttda) @@ -254,10 +278,12 @@ def test_add_models(self): self.check_ttda(eLDA_base) # 2. memory unfriendly - eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, - workers=3, ensemble_workers=2, memory_friendly_ttda=False) + eLDA_base_mu = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_class=LdaMulticore, + workers=3, ensemble_workers=2, memory_friendly_ttda=False, + ) # 2.1 a single ensemble a = len(eLDA_base_mu.tms) @@ -305,14 +331,18 @@ def test_add_and_recluster(self): num_new_topics = 3 # train - new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='ldamulticore', - distance_workers=4) - new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='ldamulticore', - distance_workers=4, memory_friendly_ttda=False) + new_eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_class='ldamulticore', + distance_workers=4, + ) + new_eLDA_mu = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_class='ldamulticore', + distance_workers=4, memory_friendly_ttda=False, + ) # both should be similar np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) @@ -331,8 +361,10 @@ def test_add_and_recluster(self): # 2. 
distance matrix new_eLDA._generate_asymmetric_distance_matrix() new_eLDA_mu._generate_asymmetric_distance_matrix() - np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, - new_eLDA_mu.asymmetric_distance_matrix) + np.testing.assert_allclose( + new_eLDA.asymmetric_distance_matrix, + new_eLDA_mu.asymmetric_distance_matrix, + ) # 3. CBDBSCAN results new_eLDA._generate_topic_clusters() @@ -345,7 +377,7 @@ def test_add_and_recluster(self): new_eLDA._generate_stable_topics() new_eLDA_mu._generate_stable_topics() np.testing.assert_allclose(new_eLDA.get_topics(), - new_eLDA_mu.get_topics()) + new_eLDA_mu.get_topics()) new_eLDA.generate_gensim_representation() new_eLDA_mu.generate_gensim_representation() From a15bb42728401f66e6d9f3c792896d741f82fcfd Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 12:19:48 +0200 Subject: [PATCH 108/166] somehow recognized some old versions of unrelated files as current changes --- CHANGELOG.md | 69 +++++++++++++++++++++++++++- docs/src/conf.py | 115 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 164 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc0ce80727..52bf5a989f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,73 @@ Changes ======= +## Unreleased + +This release contains a major refactoring. + +### :+1: Improvements + +* Refactor ldamulticore to serialize less data (PR [#2300](https://github.com/RaRe-Technologies/gensim/pull/2300), __[@horpto](https://github.com/horpto)__) +* KeyedVectors & X2Vec API streamlining, consistency (PR [#2698](https://github.com/RaRe-Technologies/gensim/pull/2698), __[@gojomo](https://github.com/gojomo)__) +* No more wheels for x32 platforms (if you need x32 binaries, please build them yourself). + (__[menshikh-iv](https://github.com/menshikh-iv)__, [#6](https://github.com/RaRe-Technologies/gensim-wheels/pull/6)) +* Speed up random number generation in word2vec model (PR [#2864](https://github.com/RaRe-Technologies/gensim/pull/2864), __[@zygm0nt](https://github.com/zygm0nt)__) +* Fix deprecations in SoftCosineSimilarity (PR [#2940](https://github.com/RaRe-Technologies/gensim/pull/2940), __[@Witiko](https://github.com/Witiko)__) +* Remove Keras dependency (PR [#2937](https://github.com/RaRe-Technologies/gensim/pull/2937), __[@piskvorky](https://github.com/piskvorky)__) +* Bump minimum Python version to 3.6 (PR [#2947](https://github.com/RaRe-Technologies/gensim/pull/2947), __[@gojomo](https://github.com/gojomo)__) + +### :books: Tutorial and doc improvements + + * Clear up LdaModel documentation - remove claim that it accepts CSC matrix as input (PR [#2832](https://github.com/RaRe-Technologies/gensim/pull/2832), [@FyzHsn](https://github.com/FyzHsn)) + * Fix "generator" language in word2vec docs (PR [#2935](https://github.com/RaRe-Technologies/gensim/pull/2935), __[@polm](https://github.com/polm)__) + +### :warning: Removed functionality + + * Remove gensim.summarization subpackage, docs and test data (PR [#2958](https://github.com/RaRe-Technologies/gensim/pull/2958), __[@mpenkov](https://github.com/mpenkov)__) + +## :warning: 3.8.x will be the last gensim version to support Py2.7. Starting with 4.0.0, gensim will only support Py3.5 and above + +## 3.8.3, 2020-05-03 + +This is primarily a bugfix release to bring back Py2.7 compatibility to gensim 3.8. 
+ +### :red_circle: Bug fixes + +* Bring back Py27 support (PR [#2812](https://github.com/RaRe-Technologies/gensim/pull/2812), __[@mpenkov](https://github.com/mpenkov)__) +* Fix wrong version reported by setup.py (Issue [#2796](https://github.com/RaRe-Technologies/gensim/issues/2796)) +* Fix missing C extensions (Issues [#2794](https://github.com/RaRe-Technologies/gensim/issues/2794) and [#2802](https://github.com/RaRe-Technologies/gensim/issues/2802)) + +### :+1: Improvements + +* Wheels for Python 3.8 (__[@menshikh-iv](https://github.com/menshikh-iv)__) +* Prepare for removal of deprecated `lxml.etree.cElementTree` (PR [#2777](https://github.com/RaRe-Technologies/gensim/pull/2777), __[@tirkarthi](https://github.com/tirkarthi)__) + +### :books: Tutorial and doc improvements + +* Update test instructions in README (PR [#2814](https://github.com/RaRe-Technologies/gensim/pull/2814), __[@piskvorky](https://github.com/piskvorky)__) + +### :warning: Deprecations (will be removed in the next major release) + +* Remove + - `gensim.models.FastText.load_fasttext_format`: use load_facebook_vectors to load embeddings only (faster, less CPU/memory usage, does not support training continuation) and load_facebook_model to load full model (slower, more CPU/memory intensive, supports training continuation) + - `gensim.models.wrappers.fasttext` (obsoleted by the new native `gensim.models.fasttext` implementation) + - `gensim.examples` + - `gensim.nosy` + - `gensim.scripts.word2vec_standalone` + - `gensim.scripts.make_wiki_lemma` + - `gensim.scripts.make_wiki_online` + - `gensim.scripts.make_wiki_online_lemma` + - `gensim.scripts.make_wiki_online_nodebug` + - `gensim.scripts.make_wiki` (all of these obsoleted by the new native `gensim.scripts.segment_wiki` implementation) + - "deprecated" functions and attributes + +* Move + - `gensim.scripts.make_wikicorpus` ➡ `gensim.scripts.make_wiki.py` + - `gensim.summarization` ➡ `gensim.models.summarization` + - `gensim.topic_coherence` ➡ `gensim.models._coherence` + - `gensim.utils` ➡ `gensim.utils.utils` (old imports will continue to work) + - `gensim.parsing.*` ➡ `gensim.utils.text_utils` + ## 3.8.2, 2020-04-10 ### :red_circle: Bug fixes @@ -71,8 +138,6 @@ Changes ## 3.8.0, 2019-07-08 -## :warning: 3.8.x will be the last gensim version to support Py2.7. Starting with 4.0.0, gensim will only support Py3.5 and above - ### :star2: New Features * Enable online training of Poincare models (__[koiizukag](https://github.com/koiizukag)__, [#2505](https://github.com/RaRe-Technologies/gensim/pull/2505)) diff --git a/docs/src/conf.py b/docs/src/conf.py index 84d2722c27..61ebaa76b1 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -21,11 +21,17 @@ # -- General configuration ----------------------------------------------------- -html_theme = 'gensim_theme' +html_theme = 'sphinx_rtd_theme' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'sphinx.ext.imgmath', 'sphinxcontrib.programoutput'] +extensions = [ + 'sphinx.ext.autodoc', + 'sphinxcontrib.napoleon', + 'sphinx.ext.imgmath', + 'sphinxcontrib.programoutput', + 'sphinx_gallery.gen_gallery', +] autoclass_content = "both" napoleon_google_docstring = False # Disable support for google-style docstring @@ -48,16 +54,16 @@ # General information about the project. 
project = u'gensim' -copyright = u'2009-now, Radim Řehůřek ' +copyright = u'2009-now' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '3.8' +version = '4.0' # The full version, including alpha/beta/rc tags. -release = '3.8.2' +release = '4.0.0.dev0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -76,6 +82,8 @@ # for source files. exclude_trees = ['_build'] +exclude_patterns = ['gallery/README.rst'] + # The reST default role (used for this markup: `text`) to use for all documents. # default_role = None @@ -109,19 +117,19 @@ # main_colour = "#ffbbbb" html_theme_options = { -# "rightsidebar": "false", -# "stickysidebar": "true", -# "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", -# "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", -# "sidebarbgcolor": "fuckyou", -# "footerbgcolor": "#771111", -# "relbarbgcolor": "#993333", -# "sidebartextcolor": "#000000", -# "sidebarlinkcolor": "#330000", -# "codebgcolor": "#fffff0", -# "headtextcolor": "#000080", -# "headbgcolor": "#f0f0ff", -# "bgcolor": "#ffffff", + # "rightsidebar": "false", + # "stickysidebar": "true", + # "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", + # "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", + # "sidebarbgcolor": "fuckyou", + # "footerbgcolor": "#771111", + # "relbarbgcolor": "#993333", + # "sidebartextcolor": "#000000", + # "sidebarlinkcolor": "#330000", + # "codebgcolor": "#fffff0", + # "headtextcolor": "#000080", + # "headbgcolor": "#f0f0ff", + # "bgcolor": "#ffffff", } @@ -149,6 +157,15 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +# These paths are either relative to html_static_path +# or fully qualified paths (eg. https://...) +# html_css_files = [ +# 'erp/css/global.css', +# 'erp/css/structure.css', +# 'erp/css/erp2.css', +# 'erp/css/custom.css', +# ] + # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. html_last_updated_fmt = '%b %d, %Y' @@ -218,3 +235,65 @@ # latex_use_modindex = True suppress_warnings = ['image.nonlocal_uri', 'ref.citation', 'ref.footnote'] + + +def sort_key(source_dir): + """Sorts tutorials and guides in a predefined order. + + If the predefined order doesn't include the filename we're looking for, + fallback to alphabetical order. 
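+
+    Used below as the ``within_subsection_order`` of ``sphinx_gallery_conf``.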
+ """ + core_order = [ + 'run_core_concepts.py', + 'run_corpora_and_vector_spaces.py', + 'run_topics_and_transformations.py', + 'run_similarity_queries.py', + ] + + tutorials_order = [ + 'run_word2vec.py', + 'run_doc2vec_lee.py', + 'run_fasttext.py', + 'run_annoy.py', + 'run_lda.py', + 'run_wmd.py', + 'run_summarization.py', + ] + + howto_order = [ + 'run_downloader_api.py', + 'run_binder.py', + 'run_doc.py', + 'run_doc2vec_imdb.py', + 'run_news_classification.py', + 'run_compare_lda.py', + ] + + order = core_order + tutorials_order + howto_order + files = sorted(os.listdir(source_dir)) + + def key(arg): + try: + return order.index(arg) + except ValueError: + return files.index(arg) + + return key + + +import sphinx_gallery.sorting +sphinx_gallery_conf = { + 'examples_dirs': 'gallery', # path to your example scripts + 'gallery_dirs': 'auto_examples', # path where to save gallery generated examples + 'show_memory': True, + 'filename_pattern': 'run', + 'subsection_order': sphinx_gallery.sorting.ExplicitOrder( + [ + 'gallery/core', + 'gallery/tutorials', + 'gallery/howtos', + 'gallery/other', + ], + ), + 'within_subsection_order': sort_key, +} From 17e3e1179f42edb48c262dc33bccbd40d4fb7732 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 12:20:57 +0200 Subject: [PATCH 109/166] fixed autoformat of IDEA --- docs/src/conf.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index 61ebaa76b1..13d26e254b 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -117,19 +117,19 @@ # main_colour = "#ffbbbb" html_theme_options = { - # "rightsidebar": "false", - # "stickysidebar": "true", - # "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", - # "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", - # "sidebarbgcolor": "fuckyou", - # "footerbgcolor": "#771111", - # "relbarbgcolor": "#993333", - # "sidebartextcolor": "#000000", - # "sidebarlinkcolor": "#330000", - # "codebgcolor": "#fffff0", - # "headtextcolor": "#000080", - # "headbgcolor": "#f0f0ff", - # "bgcolor": "#ffffff", +# "rightsidebar": "false", +# "stickysidebar": "true", +# "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", +# "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", +# "sidebarbgcolor": "fuckyou", +# "footerbgcolor": "#771111", +# "relbarbgcolor": "#993333", +# "sidebartextcolor": "#000000", +# "sidebarlinkcolor": "#330000", +# "codebgcolor": "#fffff0", +# "headtextcolor": "#000080", +# "headbgcolor": "#f0f0ff", +# "bgcolor": "#ffffff", } From 1bb0194ccdc4e298934259b37a198c26c5513053 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 12:22:11 +0200 Subject: [PATCH 110/166] same thing for tox.ini --- tox.ini | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/tox.ini b/tox.ini index f30bdc068d..1d0c0b0e09 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] minversion = 2.0 -envlist = {py27,py35,py36,py37}-{win,linux}, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi +envlist = {py36,py37,py38}-{win,linux}, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi skipsdist = True platform = linux: linux win: win64 @@ -17,25 +17,15 @@ ignore = F821 ; TODO remove me when all examples in docstrings will be executab exclude=.venv, .git, .tox, dist, doc, build, gensim/models/deprecated [pytest] 
-addopts = -rfxEXs --durations=20 --showlocals --reruns 3 --reruns-delay 1 +addopts = -rfxEXs --durations=20 --showlocals [testenv] recreate = True -install_command = python most_recent_pip_install.py {env:TOX_PIP_OPTS:} {opts} {packages} +install_command = python -m pip install --timeout=60 {env:TOX_PIP_OPTS:} {opts} {packages} deps = pip>=19.1.1 - py37: numpy==1.14.5 - py37: scipy==1.1.0 - - py27: numpy==1.11.3 - py27: scipy==1.0.0 - py35: numpy==1.11.3 - py35: scipy==1.0.0 - py36: numpy==1.11.3 - py36: scipy==1.0.0 - linux: .[test] win: .[test-win] @@ -47,46 +37,50 @@ setenv = MALLET_HOME={env:MALLET_HOME:} SKIP_NETWORK_TESTS={env:SKIP_NETWORK_TESTS:} BOTO_CONFIG={env:BOTO_CONFIG:} + PIPELINE_WORKSPACE={env:PIPELINE_WORKSPACE:} PYTHONHASHSEED=1 + TOX_PARALLEL_NO_SPINNER=1 commands = python --version pip --version - python -c "from gensim.models.word2vec import FAST_VERSION; print(FAST_VERSION)" python setup.py build_ext --inplace - python -c "from gensim.models.word2vec import FAST_VERSION; print(FAST_VERSION)" pytest {posargs:gensim/test} [testenv:flake8] recreate = True -deps = flake8 +deps = + flake8==3.7.9 # 3.8.0 triggers "AttributeError: 'Namespace' object has no attribute 'output_file'" commands = flake8 gensim/ {posargs} [testenv:flake8-docs] recreate = True -deps = flake8-rst==0.4.3 +deps = + flake8-rst==0.4.3 + flake8==3.7.9 commands = flake8-rst gensim/ docs/ {posargs} [testenv:compile] -basepython = python2 +basepython = python3 recreate = True -deps = numpy==1.11.3 +deps = numpy commands = python setup.py build_ext --inplace [testenv:docs] -basepython = python2 +basepython = python3 recreate = True whitelist_externals = make deps = .[docs] -changedir = docs/src -commands = make clean html +commands = + python setup.py build_ext --inplace + make -C docs/src clean html [testenv:docs-upload] From 9baa79fce0c9f473e71934f4ecf698767b97ee29 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 14:16:32 +0200 Subject: [PATCH 111/166] hanging indents in opinosis notebook --- .../ensemble_lda_with_opinosis.ipynb | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb index 7380a87f71..cf96d3d74a 100644 --- a/docs/notebooks/ensemble_lda_with_opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -2,19 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "####\n" - ] - } - ], + "outputs": [], "source": [ "import logging\n", "from gensim.models import EnsembleLda, LdaMulticore\n", @@ -31,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -42,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -135,9 +127,11 @@ "metadata": {}, "outputs": [], "source": [ - "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", - " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", - " topic_model_class='ldamulticore', masking_method='rank')\n", + "elda = EnsembleLda(\n", + " corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", + " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", + " topic_model_class='ldamulticore', 
masking_method='rank',\n",
+    ")\n",
    "pretty_print_topics()"
   ]
  },
@@ -175,7 +169,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.1"
+   "version": "3.8.3"
  }
 },
 "nbformat": 4,

From 1fa37292bb79af26afb561ea55d1563947e995b2 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 24 Oct 2020 14:45:06 +0200
Subject: [PATCH 112/166] I hate windows

---
 gensim/test/test_ensemblelda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index d637af9616..f297e2e6e6 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -22,7 +22,7 @@
 passes = 50

 # windows tests fail due to the required assertion precision being too high
-rtol = 1e-04 if os.name == 'nt' else 1e-05
+rtol = 1e-03 if os.name == 'nt' else 1e-05

 class TestModel(unittest.TestCase):

From d3a615198dd2e78c43879530f8290ee288206ee2 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 24 Oct 2020 14:55:32 +0200
Subject: [PATCH 113/166] test for LdaMulticore ensemble similarity

---
 gensim/test/test_ensemblelda.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index f297e2e6e6..966269783d 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -22,7 +22,7 @@
 passes = 50

 # windows tests fail due to the required assertion precision being too high
-rtol = 1e-03 if os.name == 'nt' else 1e-05
+rtol = 1e-04 if os.name == 'nt' else 1e-05

 class TestModel(unittest.TestCase):
@@ -325,8 +325,25 @@ def test_add_models(self):
         self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms))
         self.check_ttda(eLDA_base_mu)

+    def test_ldamulticore_reproducibility(self):
+        # Two LdaMulticore ensembles should be similar, but since LdaMulticore has some
+        # non-determinism due to scheduling, rtol is lower
+        a = EnsembleLda(
+            corpus=common_corpus, id2word=common_dictionary,
+            num_topics=3, passes=10, num_models=3, workers=3,
+            iterations=30, random_state=2, topic_model_class='ldamulticore',
+            distance_workers=4,
+        )
+        b = EnsembleLda(
+            corpus=common_corpus, id2word=common_dictionary,
+            num_topics=3, passes=10, num_models=3, workers=3,
+            iterations=30, random_state=2, topic_model_class=LdaMulticore,
+            distance_workers=4, memory_friendly_ttda=False,
+        )
+        np.testing.assert_allclose(a.ttda, b.ttda, rtol=(rtol * 10))

     def test_add_and_recluster(self):
-        # 6.2: see if after adding a model, the model still makes sense
+        # See if after adding a model, the model still makes sense
         num_new_models = 3

From 216ae64b9e73e34841f92efa0fc3a874bede59e8 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 24 Oct 2020 15:37:19 +0200
Subject: [PATCH 114/166] no

---
 gensim/test/test_ensemblelda.py | 17 
----------------- 1 file changed, 17 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 966269783d..68afaba170 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -325,23 +325,6 @@ def test_add_models(self): self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) self.check_ttda(eLDA_base_mu) - def test_ldamulticore_reproducibility(self): - # Two LdaMulticore ensembles should be similar, but since LdaMulticore has some - # non-determinism due to scheduling rtol is lower - a = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, - num_topics=3, passes=10, num_models=3, workers=3, - iterations=30, random_state=2, topic_model_class='ldamulticore', - distance_workers=4, - ) - b = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, - num_topics=3, passes=10, num_models=3, workers=3, - iterations=30, random_state=2, topic_model_class=LdaMulticore, - distance_workers=4, memory_friendly_ttda=False, - ) - np.testing.assert_allclose(a.ttda, b.ttda, rtol=(rtol * 10)) - def test_add_and_recluster(self): # See if after adding a model, the model still makes sense num_new_models = 3 From e238fc9c26736b42f2d0b10ea345cc47f3277fa1 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 23:59:20 +0200 Subject: [PATCH 115/166] fixed wrong max calculation in loop --- gensim/models/ensemblelda.py | 41 ++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a501fb01df..4aeb7629b8 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -349,8 +349,10 @@ def generate_gensim_representation(self): num_stable_topics = len(stable_topics) if num_stable_topics == 0: - logger.error("the model did not detect any stable topic. You can try to adjust epsilon: " - "recluster(eps=...)") + logger.error( + "the model did not detect any stable topic. You can try to adjust epsilon: " + "recluster(eps=...)" + ) self.classic_model_representation = None return @@ -411,7 +413,6 @@ def add_model(self, target, num_new_models=None): the exact same dictionary/idword mapping. In order to generate new stable topics afterwards, use: - 1. ``self._generate_asymmetric_distance_matrix()`` 2. 
``self.``:meth:`~gensim.models.ensemblelda.EnsembleLda.recluster` The ttda of another ensemble can also be used, in that case set ``num_new_models`` to the ``num_models`` @@ -966,11 +967,11 @@ def _aggregate_topics(self, grouped_by_labels): sorted_clusters = [] for label, group in grouped_by_labels.items(): - num_neighboring_labels = 0 + max_num_neighboring_labels = 0 neighboring_labels = [] # will be a list of sets for topic in group: - max_num_neighboring_labels = max(topic["num_neighboring_labels"], num_neighboring_labels) + max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels) neighboring_labels.append(topic["neighboring_labels"]) neighboring_labels = [x for x in neighboring_labels if len(x) > 0] @@ -982,15 +983,6 @@ def _aggregate_topics(self, grouped_by_labels): "num_cores": len([topic for topic in group if topic["is_core"]]), }) - sorted_clusters = sorted( - sorted_clusters, - key=lambda cluster: ( - cluster["max_num_neighboring_labels"], - cluster["label"], - ), - reverse=False, - ) - return sorted_clusters def _remove_from_all_sets(self, label, clusters): @@ -1038,7 +1030,20 @@ def _generate_stable_topics(self, min_cores=None): grouped_by_labels = self._group_by_labels(results) - sorted_clusters = self._aggregate_topics(grouped_by_labels) + clusters = self._aggregate_topics(grouped_by_labels) + + # Start with the one that is the easiest to verify for sufficient isolated cores. + # Over time, invalid clusters will drop out and therefore clusters that once had a few (noise) neighbors + # will become more isolated, which makes it possible to mark those as stable. + sorted_clusters = sorted( + clusters, + key=lambda cluster: ( + cluster["max_num_neighboring_labels"], + cluster["num_cores"], + cluster["label"], + ), + reverse=False, + ) for cluster in sorted_clusters: cluster["is_valid"] = None @@ -1054,8 +1059,6 @@ def _generate_stable_topics(self, min_cores=None): cluster["is_valid"] = True else: cluster["is_valid"] = False - # This modifies parent labels which is important in _contains_isolated_cores, so the result depends on - # where to start. self._remove_from_all_sets(label, sorted_clusters) # list of all the label numbers that are valid @@ -1082,6 +1085,8 @@ def _generate_stable_topics(self, min_cores=None): self.sorted_clusters = sorted_clusters self.stable_topics = stable_topics + logger.info("found %s stable topics", len(stable_topics)) + def recluster(self, eps=0.1, min_samples=None, min_cores=None): """Reapply CBDBSCAN clustering and stable topic generation. @@ -1163,7 +1168,7 @@ def id2word(self): return self.gensim_kw_args["id2word"] -class CBDBSCAN(): +class CBDBSCAN: """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). The algorithm works based on DBSCAN-like parameters 'eps' and 'min_samples' that respectively define how far a From 68d5814a58c513e11f53e88e9e1e10c2b5c7f960 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 00:02:36 +0200 Subject: [PATCH 116/166] improved comment for sorting clusters --- gensim/models/ensemblelda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 4aeb7629b8..a5ecb5d314 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1032,9 +1032,9 @@ def _generate_stable_topics(self, min_cores=None): clusters = self._aggregate_topics(grouped_by_labels) - # Start with the one that is the easiest to verify for sufficient isolated cores. 
-        # Over time, invalid clusters will drop out and therefore clusters that once had a few (noise) neighbors
-        # will become more isolated, which makes it possible to mark those as stable.
+        # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the
+        # easy ones and potentially already remove some noise by also sorting smaller clusters to the front.
+        # This clears up the clusters a bit before checking the ones with many neighbors.
         sorted_clusters = sorted(
             clusters,
             key=lambda cluster: (
                 cluster["max_num_neighboring_labels"],
                 cluster["num_cores"],
                 cluster["label"],
             ),
             reverse=False,
         )

From d299f07091bd761f7ffd11242839f060dcd15a9f Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 25 Oct 2020 00:57:42 +0200
Subject: [PATCH 117/166] added ensemblelda tutorial

---
 docs/src/gallery/tutorials/run_ensemblelda.py | 155 ++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 docs/src/gallery/tutorials/run_ensemblelda.py

diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py
new file mode 100644
index 0000000000..a50e14e355
--- /dev/null
+++ b/docs/src/gallery/tutorials/run_ensemblelda.py
@@ -0,0 +1,155 @@
+r"""
+Ensemble LDA
+============
+
+Introduces Gensim's EnsembleLda model
+
+"""
+
+import logging
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+###############################################################################
+# This tutorial will explain how to use the EnsembleLDA model class.
+#
+# EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models.
+# It can therefore be used to remove topics from your results that are noise and are not reproducible.
+#
+
+###############################################################################
+# Corpus
+# ------
+# We will use the gensim downloader API to get a small corpus for training our ensemble.
+#
+# The preprocessing is similar to :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`,
+# so it won't be explained again in detail.
+#
+
+import gensim.downloader as api
+from gensim.corpora import Dictionary
+from nltk.stem.wordnet import WordNetLemmatizer
+
+lemmatizer = WordNetLemmatizer()
+docs = api.load('text8')
+docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
+dictionary = Dictionary(docs)
+dictionary.filter_extremes(no_below=1, no_above=0.5)
+corpus = [dictionary.doc2bow(doc) for doc in docs]
+
+###############################################################################
+# Training
+# --------
+#
+# Training the ensemble works very similarly to training a single model.
+#
+# You can use any model that is based on LdaModel, such as LdaMulticore, to train the ensemble.
+# In experiments, LdaMulticore showed better results.
+#
+
+from gensim.models import LdaModel
+topic_model_class = LdaModel
+
+###############################################################################
+# Any arbitrary number of models can be used, but it should be a multiple of your workers so that the
+# load can be distributed properly. In this example, 4 processes will train 2 models each.
+#
+
+ensemble_workers = 4
+num_models = 8
+
+###############################################################################
+# After training all the models, some distance computations are required which can take quite some
+# time as well. You can speed this up by using workers for that step, too.
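+# The distances are computed between every pair of topics in the ensemble, so with 8 models of
+# 20 topics each this amounts to a 160 x 160 distance matrix (see further below).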
+#
+
+distance_workers = 4
+
+###############################################################################
+# All other parameters that are unknown to EnsembleLda are forwarded to each LDA model, such as:
+#
+num_topics = 20
+passes = 2
+
+###############################################################################
+# Now start the training.
+#
+# Since 20 topics were trained on each of the 8 models, we expect there to be 160 different topics.
+# The number of stable topics which are clustered from all those topics is smaller.
+#
+
+from gensim.models import EnsembleLda
+ensemble = EnsembleLda(
+    corpus=corpus,
+    id2word=dictionary,
+    num_topics=num_topics,
+    passes=passes,
+    num_models=num_models,
+    topic_model_class=LdaModel,
+    ensemble_workers=ensemble_workers,
+    distance_workers=distance_workers,
+)
+
+print(len(ensemble.ttda))
+print(len(ensemble.get_topics()))
+
+###############################################################################
+# Tuning
+# ------
+#
+# Different from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.
+#
+# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around
+# until you get as many topics as you desire, though forcing a specific number may reduce their quality.
+# If your ensemble doesn't have enough topics to begin with, make sure it is large enough.
+#
+# Having an epsilon that is smaller than the smallest distance doesn't make sense.
+# Make sure to choose one that is within the range of distance values.
+#
+
+import numpy as np
+shape = ensemble.asymmetric_distance_matrix.shape
+without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0], dtype=bool)].reshape(shape[0], -1)
+without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()
+
+ensemble.recluster(eps=0.1, min_samples=2, min_cores=2)
+
+print(len(ensemble.get_topics()))
+
+###############################################################################
+# Increasing the Size
+# -------------------
+#
+# If you have some models lying around that were trained on a corpus based on the same dictionary,
+# they are compatible and you can add them to the ensemble.
+#
+# By setting num_models of the EnsembleLda constructor to 0 you can also create an ensemble that is
+# entirely made out of your existing topic models with the following method.
+#
+# Afterwards the number and quality of stable topics might be different depending on your added topics and parameters.
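+# The sketch below trains two more models with arbitrary topic counts and merges them in;
+# besides trained models, ``add_model`` also accepts the ttda of another ensemble.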
+# + +from gensim.models import LdaMulticore + +model1 = LdaMulticore( + corpus=corpus, + id2word=dictionary, + num_topics=9, + passes=4, +) + +model2 = LdaModel( + corpus=corpus, + id2word=dictionary, + num_topics=11, + passes=2, +) + +# add_model supports various types of input, check out its docstring +ensemble.add_model(model1) +ensemble.add_model(model2) + +ensemble.recluster() + +print(len(ensemble.ttda)) +print(len(ensemble.get_topics())) From 149339626a0ca7e186cd07fa23c760e56849e0b3 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 00:58:06 +0200 Subject: [PATCH 118/166] added test that starts with an empty ensemble --- gensim/test/test_ensemblelda.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 68afaba170..01c1ffa085 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -215,6 +215,20 @@ def test_multiprocessing(self): np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=rtol) np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=rtol) + def test_add_models_to_empty(self): + ensemble = EnsembleLda(id2word=common_dictionary, num_models=0) + ensemble.add_model(self.eLDA.ttda[0:1]) + ensemble.add_model(self.eLDA.ttda[1:]) + ensemble.recluster() + np.testing.assert_allclose(ensemble.get_topics(), self.eLDA.get_topics(), rtol=rtol) + + # persisting an ensemble that is entirely built from existing ttdas + fname = get_tmpfile('gensim_models_ensemblelda') + ensemble.save(fname) + loaded_ensemble = EnsembleLda.load(fname) + np.testing.assert_allclose(loaded_ensemble.get_topics(), self.eLDA.get_topics(), rtol=rtol) + self.test_inference(loaded_ensemble) + def test_add_models(self): # same configuration num_models = self.eLDA.num_models @@ -376,8 +390,10 @@ def test_add_and_recluster(self): # 4. finally, the stable topics new_eLDA._generate_stable_topics() new_eLDA_mu._generate_stable_topics() - np.testing.assert_allclose(new_eLDA.get_topics(), - new_eLDA_mu.get_topics()) + np.testing.assert_allclose( + new_eLDA.get_topics(), + new_eLDA_mu.get_topics(), + ) new_eLDA.generate_gensim_representation() new_eLDA_mu.generate_gensim_representation() @@ -385,14 +401,16 @@ def test_add_and_recluster(self): # same random state, hence topics should be still similar np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) - def test_inference(self): - import numpy as np + def test_inference(self, eLDA=None): + if eLDA is None: + eLDA = self.eLDA + # get the most likely token id from topic 0 - max_id = np.argmax(self.eLDA.get_topics()[0, :]) - self.assertGreater(self.eLDA.classic_model_representation.iterations, 0) + max_id = np.argmax(eLDA.get_topics()[0, :]) + self.assertGreater(eLDA.classic_model_representation.iterations, 0) # topic 0 should be dominant in the inference. 
# the difference between the probabilities should be significant and larger than 0.3 - infered = self.eLDA[[(max_id, 1)]] + infered = eLDA[[(max_id, 1)]] self.assertGreater(infered[0][1] - 0.3, infered[1][1]) From 87a11c5000b4d3d807760667ad3d7c867dd67a04 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 00:58:55 +0200 Subject: [PATCH 119/166] some logging of auto parameters --- gensim/models/ensemblelda.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a5ecb5d314..417b43e441 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -466,7 +466,6 @@ def add_model(self, target, num_new_models=None): # for memory friendly models/ttdas, append the ttdas to itself detected_num_models = 0 - ttda = [] # 1. ttda array, because that's the only accepted input that contains numbers @@ -496,7 +495,6 @@ def add_model(self, target, num_new_models=None): self.num_models += num_new_models else: # memory unfriendly ensembles - ttda = [] # 1. ttda array @@ -891,14 +889,15 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): eps : float dbscan distance scale min_samples : int, optional - defaults to ``int(self.num_models / 2)``, dbscan min neighbours threshold which corresponds to finding - stable topics and should scale with the number of models, ``self.num_models`` + defaults to ``int(self.num_models / 2)``, dbscan min neighbours threshold required to consider + a topic to be a core. Should scale with the number of models, ``self.num_models`` """ if min_samples is None: min_samples = int(self.num_models / 2) - - logger.info("fitting the clustering model") + logger.info("fitting the clustering model, using %s for min_samples", min_samples) + else: + logger.info("fitting the clustering model") self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) @@ -964,7 +963,7 @@ def _aggregate_topics(self, grouped_by_labels): order. There is one single element for each cluster. 
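            Each element is a dict with the keys 'max_num_neighboring_labels', 'neighboring_labels', 'label' and 'num_cores'.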
""" - sorted_clusters = [] + clusters = [] for label, group in grouped_by_labels.items(): max_num_neighboring_labels = 0 @@ -976,14 +975,16 @@ def _aggregate_topics(self, grouped_by_labels): neighboring_labels = [x for x in neighboring_labels if len(x) > 0] - sorted_clusters.append({ + clusters.append({ "max_num_neighboring_labels": max_num_neighboring_labels, "neighboring_labels": neighboring_labels, "label": label, "num_cores": len([topic for topic in group if topic["is_core"]]), }) - return sorted_clusters + logger.info("found %s clusters", len(clusters)) + + return clusters def _remove_from_all_sets(self, label, clusters): """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" @@ -1022,7 +1023,7 @@ def _generate_stable_topics(self, min_cores=None): if min_cores is None: # min_cores is a number between 1 and 3, depending on the number of models min_cores = min(3, max(1, int(self.num_models / 4 + 1))) - logger.info("generating stable topics, each cluster needs at least {} cores".format(min_cores)) + logger.info("generating stable topics, using %s for min_cores", min_cores) else: logger.info("generating stable topics") From beedbd3822ec76a8f931a93ec1733cdab9633433 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 01:15:37 +0200 Subject: [PATCH 120/166] update auto_examples --- docs/src/auto_examples/index.rst | 37 +- .../thumb/sphx_glr_run_ensemblelda_thumb.png | Bin 0 -> 26786 bytes .../tutorials/run_ensemblelda.ipynb | 205 +++++++++ .../tutorials/run_ensemblelda.py | 155 +++++++ .../tutorials/run_ensemblelda.py.md5 | 1 + .../tutorials/run_ensemblelda.rst | 420 ++++++++++++++++++ .../tutorials/run_ensemblelda_codeobj.pickle | Bin 0 -> 8641 bytes .../tutorials/sg_execution_times.rst | 30 +- docs/src/gallery/tutorials/run_ensemblelda.py | 2 +- 9 files changed, 827 insertions(+), 23 deletions(-) create mode 100644 docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda.ipynb create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda.py create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda.rst create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index ca3c1ec019..dfee557a6c 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -71,7 +71,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -92,7 +92,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -169,7 +169,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html @@ -190,7 +190,28 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
+ +.. only:: html + + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA + + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /auto_examples/tutorials/run_ensemblelda + +.. raw:: html + +
.. only:: html @@ -267,7 +288,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -288,7 +309,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -405,13 +426,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. container:: sphx-glr-download sphx-glr-download-python - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png new file mode 100644 index 0000000000000000000000000000000000000000..233f8e605efca4bef384a7c603d53fdc385428bc GIT binary patch literal 26786 zcmdRV^;2BU^L2u2U~zXzaCgrl!EKS?!Gl|HcbDK2+!l9tx8Uwhki}V?FVCmmf8zb& zR^6I=Yo=zVr~8~U-QmiL(eC?7t2K>sEKRQ>SbW8C{gfQ0bg;;5VB{^5g#S%F4jS*yo)sIw9jjnBhR8wuiv*@haoMbsRi`-;s) z9j%)m&qag`YmLl;-VH-wNFilp)`}=RE6iLb8P1ZXot7lPS$|;j zQ6%{N?}NV#OoGh&8zR;Jmp{}RVNldC0&_43DEv|>!G?j^q;OxAyX7Gs_2A9hI-55o zJ)F=U@-pAz+w*?0hK)p%z(s{qJ1Q+Wv_8Z|z3!m1Cs=&-?)J_5Y815AaDH?$jYLqo(p$kO2iW|9P%S0_{dC|2h=c{Q!&bPfpfv0!@c%J z#_Pjh7|{hTI0kIgMg zxdOWkb%vZV^4lVL61EU=I9L*Tl+?h$RKCIz*L82w5j+y@957yB-O`t_fpVnvVkw>7 zP^nvW>L7(+HJXphZ(3gXtZxLHZqdPVywxY{9!|2K&AEn|@;JMhAZj7?4y>5SAxJrx zurYaog3=|Jtznf^wPDLtdAwz{-~VzM&<1f=cKIarDeaAWbp~AUPnA+mO6+`%gWKCS zLp?PEPTuCPmyhdAo0>J$4)AJv7RQniq@AMH$$_|;sq({O!`Uktc6UT#*dpB361N~I z%}5Y5rnnv`0pPZ?4^O==rd+Ct+d=y*e@vt1VbH@M#e;)=cJ$F$(v2B^Vh2r+Vx~*+Z!G1Zgmq&GRTTsGq2_{@Nw>D#DWRt=`t`6c z7T8oA;ZHP!1QLB9M_mrUm>*io2P8vwgJhjCQtU#iZyN}MKEQUSOs}$gCgi)#1GuGMe*f#+Z&G1I+vYi6K9g)e*Db>=_;17hfr#kK~YJsMnHz>ujQG zarIZk$|9PqN6mLW?3l6YSILh*)(}TG6B!@=Y&v9B#NjQ@cyJA{AT*Rq{1RO3KfRt& zITYih)&2p&c2`~JyxOI7rZ8o2AVPcMJ0duaT+CTY?{Jzelx=^QZxo4TA}5e|0FnJx z2h9h%+q97bj6^`YgpWV zZE^bI*Om;wM(XMc%-aGKa}>axr_VZ=;zDWt8O2-J^Bxv}Ut2%&^MW1SJF|SbFz&kG zy&cWX=>0)&urGzaisqDY!NnC_{E{vDTOHp*`@+VJID&^uD425yvR;&V7JOge@>7l;gt``i}FMDWyMVa`YI-%kD!E>S> zL|JS2ysS@b3@)Qb=BIHHjmnEDA|!p~!={`J0RJ#+a8dEKKlk|iw9a|2w`u3eeq7J2 zf-BAe-~bY*&0(nx$VAm(#iNTWBKx>hl>cQ(d|miS=*dXB;Oo=bVeiP)u35s(Z6STU zHpM@WqxIr<<7Z_^I!@Fq-s0HMekIN5ZdZuA8p<$z5M*`7l{e(44IGp#EYwEGo9l@@ z6GbbJ5wt%PcTH`mE9=xg_LPL9p}aIOXebdn84NVw&M*s5qmw zSAf$>HX1)|Ed8?UGO(gY6zH<4agAW*RQ-Ux7>@~#J zmS$%j>>%DF4Y=98ql77up`%Kpu;DPvUrv*EhV?47yMlP0yl6hzZ|fFAuTWA614#IsaO$_k zcURoeoZI7P`+2*MMC}ycnKbq zdhu&vPwa4VxmvM1`j?s0O}oTP&vV-WMtq+90RkmnkO&!Yvc#TgvQ`BFLbdrz?S;T~ zehN^!jvpxcNekpZ#eH3Pq#LsoY`b@>QN$aAV{~(&#mAEtd`K+bsZ4IGv=%(zq*3Xs z;3Rfjc zkktoX6uL9`be86_LQik`_Ya-b3(;-vd_U$lnA*wemDJ|?HO9AdUpJdGrMmL_VXdOB z53f;FLk`&=fS`S&PG^o`)InjY@-!!OH&@QVDqhZ(i_7rANfL{LH&pQ<#6nd=k@R&QX0zW~~Wq*_`b9rNZYq=4x|Qbh_5fW+|0jZXKA2Gglg!KYS4dpoJq9281$ ztI5PC9ZkFni;lbsSRI+`dV_<2vI#7bicpH+{d`{I&a(w2bj5n;~YRsn$5{c$bbM zLp`gi<}PqEx5=fu{@$`b8|&0pik;)ojkp*SGu9J-aY?~8d}E6YnF?L^Q*N$;h)+)M zM%+AaC)QY@X5SI#N78RF95e}jrx?i{u%sUH( z&F-em&Q+v^3Hfvted*?*6eORtd44he9$m_2uf_-u6H^xFve=)EF>h`gg3$ic6_)ea zQjmCUv-pK|(_$10b)X&)yZeaX`^%g0HIP?RAgl7^kiyoT z2Sc+M8OsBmIm<3h^&9K>>%hL~oMI^=-xk0PY9Hx^Ewq?q#4JliEENe7 zE(%!vQ~SSRB+U}ICT{;Xs%G|W-``$ zTF?jz$B)r(#XFeWV2$xsxW0HgSK7HZ@UUu-{kkE@Lky5%Pt6>X5`q95F(?F5Eu70V z!Y>OcUe9E%E7;u4;(y!~BRy<>z1^!Gdg6;?irc1Kk2vwYfn1HrnS{p1m`s{7WIG?q zAmwgD`j59UQbjRcnPP5pEhm5`vOhBW)AUYPuYPW_hIU&}F7HbX3fy+AX{6>soT1g# zD2<)GielaDxB>of?4Ec<(TnLceijD3Ze7h0suV%Imdj$OeLqMG6_R z2~|YF_uo{q{8~vyb`SpCf4k4BOkZ)C9Z#YMq^*{p-nuS;<;qOG_UAM1&feCCiddBx 
diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb new file mode 100644 index 0000000000..f4aea41496 --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n# Ensemble LDA\n\nIntroduces Gensim's EnsembleLda model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import logging\nlogging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial will explain how to use the EnsembleLda model class.\n\nEnsembleLda is a method of finding and generating stable topics from the results of multiple topic models.\nIt can therefore be used to remove topics from your results that are noise and are not reproducible.\n\n\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Corpus\nWe will use the gensim downloader api to get a small corpus for training our ensemble.\n\nThe preprocessing is similar to `sphx_glr_auto_examples_tutorials_run_word2vec.py`,\nso it won't be explained again in detail.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import gensim.downloader as api\nfrom gensim.corpora import Dictionary\nfrom nltk.stem.wordnet import WordNetLemmatizer\n\nlemmatizer = WordNetLemmatizer()\ndocs = api.load('text8')\ndocs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]\ndictionary = 
Dictionary(docs)\ndictionary.filter_extremes(no_below=20, no_above=0.5)\ncorpus = [dictionary.doc2bow(doc) for doc in docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n\nTraining the ensemble works very similarly to training a single model.\n\nYou can use any model that is based on LdaModel, such as LdaMulticore, to train the Ensemble.\nIn experiments, LdaMulticore showed better results.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models import LdaModel\ntopic_model_class = LdaModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Any number of models can be used, but it should be a multiple of your workers so that the\nload can be distributed properly. In this example, 4 worker processes will train the 8 models, i.e. 2 models each.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ensemble_workers = 4\nnum_models = 8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After training all the models, some distance computations are required which can take quite some\ntime as well. You can speed this up by using workers for that as well.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "distance_workers = 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All other parameters that are unknown to EnsembleLda are forwarded to each LDA model, such as ``num_topics`` and ``passes``:\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "num_topics = 20\npasses = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now start the training.\n\nSince each of the 8 models is trained with 20 topics, we expect there to be 160 different topics.\nThe number of stable topics which are clustered from all those topics is smaller.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models import EnsembleLda\nensemble = EnsembleLda(\n    corpus=corpus,\n    id2word=dictionary,\n    num_topics=num_topics,\n    passes=passes,\n    num_models=num_models,\n    topic_model_class=LdaModel,\n    ensemble_workers=ensemble_workers,\n    distance_workers=distance_workers\n)\n\nprint(len(ensemble.ttda))\nprint(len(ensemble.get_topics()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tuning\n\nUnlike with LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.\n\nYou can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. 
Play around\nuntil you get as many topics as you desire, though forcing a specific number this way may reduce their quality.\nIf your ensemble doesn't have enough topics to begin with, make sure the ensemble itself is large enough.\n\nHaving an epsilon that is smaller than the smallest distance doesn't make sense.\nMake sure to choose one that is within the range of values.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nwithout_diagonal.min(), without_diagonal.mean(), without_diagonal.max()\n\nensemble.recluster(eps=0.1, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Increasing the Size\n\nIf you have some models lying around that were trained on a corpus based on the same dictionary,\nthey are compatible and you can add them to the ensemble.\n\nBy setting num_models of the EnsembleLda constructor to 0, you can also create an ensemble that\nconsists entirely of your existing topic models, using the following method.\n\nAfterwards the number and quality of stable topics might be different depending on your added topics and parameters.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models import LdaMulticore\n\nmodel1 = LdaMulticore(\n    corpus=corpus,\n    id2word=dictionary,\n    num_topics=9,\n    passes=4,\n)\n\nmodel2 = LdaModel(\n    corpus=corpus,\n    id2word=dictionary,\n    num_topics=11,\n    passes=2,\n)\n\n# add_model supports various types of input, check out its docstring\nensemble.add_model(model1)\nensemble.add_model(model2)\n\nensemble.recluster()\n\nprint(len(ensemble.ttda))\nprint(len(ensemble.get_topics()))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py b/docs/src/auto_examples/tutorials/run_ensemblelda.py new file mode 100644 index 0000000000..b8e8e20887 --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py @@ -0,0 +1,155 @@ +r""" +Ensemble LDA +============ + +Introduces Gensim's EnsembleLda model + +""" + +import logging +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + +############################################################################### +# This tutorial will explain how to use the EnsembleLda model class. +# +# EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models. +# It can therefore be used to remove topics from your results that are noise and are not reproducible. +# + +############################################################################### +# Corpus +# ------ +# We will use the gensim downloader api to get a small corpus for training our ensemble. 
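+# (``text8`` is the first 100 MB of a cleaned English Wikipedia dump, which the
+# downloader yields as 1701 plain-text documents.)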
+# +# The preprocessing is similar to :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`, +# so it won't be explained again in detail. +# + +import gensim.downloader as api +from gensim.corpora import Dictionary +from nltk.stem.wordnet import WordNetLemmatizer + +lemmatizer = WordNetLemmatizer() +docs = api.load('text8') +docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] +dictionary = Dictionary(docs) +dictionary.filter_extremes(no_below=20, no_above=0.5) +corpus = [dictionary.doc2bow(doc) for doc in docs] + +############################################################################### +# Training +# -------- +# +# Training the ensemble works very similarly to training a single model. +# +# You can use any model that is based on LdaModel, such as LdaMulticore, to train the Ensemble. +# In experiments, LdaMulticore showed better results. +# + +from gensim.models import LdaModel +topic_model_class = LdaModel + +############################################################################### +# Any number of models can be used, but it should be a multiple of your workers so that the +# load can be distributed properly. In this example, 4 worker processes will train the 8 models, i.e. 2 models each. +# + +ensemble_workers = 4 +num_models = 8 + +############################################################################### +# After training all the models, some distance computations are required which can take quite some +# time as well. You can speed this up by using workers for that as well. +# + +distance_workers = 4 + +############################################################################### +# All other parameters that are unknown to EnsembleLda are forwarded to each LDA model, such as ``num_topics`` and ``passes``: +# +num_topics = 20 +passes = 2 + +############################################################################### +# Now start the training. +# +# Since each of the 8 models is trained with 20 topics, we expect there to be 160 different topics. +# The number of stable topics which are clustered from all those topics is smaller. +# + +from gensim.models import EnsembleLda +ensemble = EnsembleLda( + corpus=corpus, + id2word=dictionary, + num_topics=num_topics, + passes=passes, + num_models=num_models, + topic_model_class=LdaModel, + ensemble_workers=ensemble_workers, + distance_workers=distance_workers +) + +print(len(ensemble.ttda)) +print(len(ensemble.get_topics())) + +############################################################################### +# Tuning +# ------ +# +# Unlike with LdaModel, the number of resulting topics varies greatly depending on the clustering parameters. +# +# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around +# until you get as many topics as you desire, though forcing a specific number this way may reduce their quality. +# If your ensemble doesn't have enough topics to begin with, make sure the ensemble itself is large enough. +# +# Having an epsilon that is smaller than the smallest distance doesn't make sense. +# Make sure to choose one that is within the range of values.
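+#
+# A rough editorial sketch (not part of the original tutorial): once
+# ``without_diagonal`` has been computed below, a low percentile of the observed
+# distances is a reasonable starting point for the epsilon, for example
+#
+#     eps_guess = float(np.percentile(without_diagonal, 10))
+#
+# which you could then pass to ``recluster(eps=eps_guess, min_samples=2, min_cores=2)``.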
+# + +import numpy as np +shape = ensemble.asymmetric_distance_matrix.shape +without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) +without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() + +ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) + +print(len(ensemble.get_topics())) + +############################################################################### +# Increasing the Size +# ------------------- +# +# If you have some models lying around that were trained on a corpus based on the same dictionary, +# they are compatible and you can add them to the ensemble. +# +# By setting num_models of the EnsembleLda constructor to 0, you can also create an ensemble that +# consists entirely of your existing topic models, using the following method. +# +# Afterwards the number and quality of stable topics might be different depending on your added topics and parameters. +# + +from gensim.models import LdaMulticore + +model1 = LdaMulticore( + corpus=corpus, + id2word=dictionary, + num_topics=9, + passes=4, +) + +model2 = LdaModel( + corpus=corpus, + id2word=dictionary, + num_topics=11, + passes=2, +) + +# add_model supports various types of input, check out its docstring +ensemble.add_model(model1) +ensemble.add_model(model2) + +ensemble.recluster() + +print(len(ensemble.ttda)) +print(len(ensemble.get_topics())) diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 new file mode 100644 index 0000000000..d779810a22 --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 @@ -0,0 +1 @@ +cebdae557712e17fb52871e2a012e9c8 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst new file mode 100644 index 0000000000..a22e937444 --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst @@ -0,0 +1,420 @@ +.. only:: html + + .. note:: + :class: sphx-glr-download-link-note + + Click :ref:`here ` to download the full example code + .. rst-class:: sphx-glr-example-title + + .. _sphx_glr_auto_examples_tutorials_run_ensemblelda.py: + + +Ensemble LDA +============ + +Introduces Gensim's EnsembleLda model + + +.. code-block:: default + + + import logging + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + + + + + + + + +This tutorial will explain how to use the EnsembleLda model class. + +EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models. +It can therefore be used to remove topics from your results that are noise and are not reproducible. + + +Corpus +------ +We will use the gensim downloader api to get a small corpus for training our ensemble. + +The preprocessing is similar to :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`, +so it won't be explained again in detail. + + + +.. code-block:: default + + + import gensim.downloader as api + from gensim.corpora import Dictionary + from nltk.stem.wordnet import WordNetLemmatizer + + lemmatizer = WordNetLemmatizer() + docs = api.load('text8') + docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] + dictionary = Dictionary(docs) + dictionary.filter_extremes(no_below=20, no_above=0.5) + corpus = [dictionary.doc2bow(doc) for doc in docs] + + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + ..
code-block:: none + + 2020-10-25 01:05:04,690 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2020-10-25 01:05:10,080 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) + 2020-10-25 01:05:10,262 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... + 2020-10-25 01:05:10,263 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents + 2020-10-25 01:05:10,338 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) + + + + +Training +-------- + +Training the ensemble works very similarly to training a single model. + +You can use any model that is based on LdaModel, such as LdaMulticore, to train the Ensemble. +In experiments, LdaMulticore showed better results. + + + +.. code-block:: default + + + from gensim.models import LdaModel + topic_model_class = LdaModel + + + + + + + + +Any number of models can be used, but it should be a multiple of your workers so that the +load can be distributed properly. In this example, 4 worker processes will train the 8 models, i.e. 2 models each. + + + +.. code-block:: default + + + ensemble_workers = 4 + num_models = 8 + + + + + + + + +After training all the models, some distance computations are required which can take quite some +time as well. You can speed this up by using workers for that as well. + + + +.. code-block:: default + + + distance_workers = 4 + + + + + + + + +All other parameters that are unknown to EnsembleLda are forwarded to each LDA model, such as ``num_topics`` and ``passes``: + + + +.. code-block:: default + + num_topics = 20 + passes = 2 + + + + + + + + +Now start the training. + +Since each of the 8 models is trained with 20 topics, we expect there to be 160 different topics. +The number of stable topics which are clustered from all those topics is smaller. + + + +.. code-block:: default + + + from gensim.models import EnsembleLda + ensemble = EnsembleLda( + corpus=corpus, + id2word=dictionary, + num_topics=num_topics, + passes=passes, + num_models=num_models, + topic_model_class=LdaModel, + ensemble_workers=ensemble_workers, + distance_workers=distance_workers + ) + + print(len(ensemble.ttda)) + print(len(ensemble.get_topics())) + + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + .. code-block:: none + + 2020-10-25 01:05:13,988 : INFO : generating 8 topic models... + 2020-10-25 01:09:45,442 : INFO : generating a 160 x 160 asymmetric distance matrix... 
+ 2020-10-25 01:09:46,332 : INFO : fitting the clustering model, using 4 for min_samples + 2020-10-25 01:09:46,348 : INFO : generating stable topics, using 3 for min_cores + 2020-10-25 01:09:46,348 : INFO : found 32 clusters + 2020-10-25 01:09:46,350 : INFO : found 2 stable topics + 2020-10-25 01:09:46,350 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 01:09:46,496 : INFO : using symmetric alpha at 0.5 + 2020-10-25 01:09:46,496 : INFO : using symmetric eta at 0.5 + 2020-10-25 01:09:46,497 : INFO : using serial LDA version on this node + 2020-10-25 01:09:46,500 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:09:46,500 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 160 + 2 + + + + +Tuning +------ + +Unlike with LdaModel, the number of resulting topics varies greatly depending on the clustering parameters. + +You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around +until you get as many topics as you desire, though forcing a specific number this way may reduce their quality. +If your ensemble doesn't have enough topics to begin with, make sure the ensemble itself is large enough. + +Having an epsilon that is smaller than the smallest distance doesn't make sense. +Make sure to choose one that is within the range of values. + + + +.. code-block:: default + + + import numpy as np + shape = ensemble.asymmetric_distance_matrix.shape + without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) + without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() + + ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) + + print(len(ensemble.get_topics())) + + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + .. code-block:: none + + 2020-10-25 01:09:46,802 : INFO : fitting the clustering model + 2020-10-25 01:09:46,816 : INFO : generating stable topics + 2020-10-25 01:09:46,816 : INFO : found 47 clusters + 2020-10-25 01:09:46,819 : INFO : found 2 stable topics + 2020-10-25 01:09:46,819 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 01:09:46,821 : INFO : using symmetric alpha at 0.5 + 2020-10-25 01:09:46,821 : INFO : using symmetric eta at 0.5 + 2020-10-25 01:09:46,823 : INFO : using serial LDA version on this node + 2020-10-25 01:09:46,825 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:09:46,825 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2 + + + + +Increasing the Size +------------------- + +If you have some models lying around that were trained on a corpus based on the same dictionary, +they are compatible and you can add them to the ensemble. + +By setting num_models of the EnsembleLda constructor to 0, you can also create an ensemble that +consists entirely of your existing topic models, using the following method.
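+
+A minimal sketch of that workflow (an editorial illustration, not part of the
+generated gallery output; ``pretrained_model`` is a placeholder for any existing
+LdaModel-based model trained with the same ``dictionary``, and the exact accepted
+arguments are documented in the class docstring):
+
+.. code-block:: python
+
+    fresh_ensemble = EnsembleLda(corpus=corpus, id2word=dictionary,
+                                 num_topics=num_topics, num_models=0)
+    fresh_ensemble.add_model(pretrained_model)
+    fresh_ensemble.recluster()
+    print(len(fresh_ensemble.get_topics()))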
+ +Afterwards the number and quality of stable topics might be different depending on your added topics and parameters. + + + +.. code-block:: default + + + from gensim.models import LdaMulticore + + model1 = LdaMulticore( + corpus=corpus, + id2word=dictionary, + num_topics=9, + passes=4, + ) + + model2 = LdaModel( + corpus=corpus, + id2word=dictionary, + num_topics=11, + passes=2, + ) + + # add_model supports various types of input, check out its docstring + ensemble.add_model(model1) + ensemble.add_model(model2) + + ensemble.recluster() + + print(len(ensemble.ttda)) + print(len(ensemble.get_topics())) + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + .. code-block:: none + + 2020-10-25 01:09:47,300 : INFO : using symmetric alpha at 0.1111111111111111 + 2020-10-25 01:09:47,301 : INFO : using symmetric eta at 0.1111111111111111 + 2020-10-25 01:09:47,304 : INFO : using serial LDA version on this node + 2020-10-25 01:09:47,314 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:09:47,314 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 01:09:47,315 : INFO : training LDA model using 11 processes + 2020-10-25 01:09:47,546 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 01:09:52,514 : INFO : topic #8 (0.111): 0.001*"league" + 0.001*"minister" + 0.001*"jewish" + 0.001*"animal" + 0.001*"km" + 0.001*"soviet" + 0.001*"election" + 0.001*"emperor" + 0.001*"energy" + 0.001*"cell" + 2020-10-25 01:09:52,514 : INFO : topic #2 (0.111): 0.001*"km" + 0.001*"china" + 0.001*"county" + 0.001*"album" + 0.001*"jewish" + 0.001*"california" + 0.001*"actor" + 0.001*"economy" + 0.001*"jew" + 0.001*"band" + 2020-10-25 01:09:52,514 : INFO : topic #6 (0.111): 0.001*"km" + 0.001*"election" + 0.001*"software" + 0.001*"user" + 0.001*"japanese" + 0.001*"female" + 0.001*"minister" + 0.001*"prime" + 0.001*"season" + 0.001*"saint" + 2020-10-25 01:09:52,515 : INFO : topic #0 (0.111): 0.001*"actor" + 0.001*"km" + 0.001*"cell" + 0.001*"minister" + 0.001*"italian" + 0.001*"male" + 0.001*"economy" + 0.001*"energy" + 0.001*"y" + 0.001*"soviet" + 2020-10-25 01:09:52,515 : INFO : topic #5 (0.111): 0.001*"band" + 0.001*"actor" + 0.001*"soviet" + 0.001*"album" + 0.001*"india" + 0.001*"minister" + 0.001*"ship" + 0.001*"km" + 0.001*"energy" + 0.001*"china" + 2020-10-25 01:09:52,515 : INFO : topic diff=1.009308, rho=1.000000 + 2020-10-25 01:10:03,105 : INFO : -9.257 per-word bound, 611.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:10:03,105 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 01:10:07,873 : INFO : topic #4 (0.111): 0.002*"album" + 0.001*"energy" + 0.001*"band" + 0.001*"blue" + 0.001*"cell" + 0.001*"emperor" + 0.001*"bc" + 0.001*"actor" + 0.001*"japanese" + 0.001*"software" + 2020-10-25 01:10:07,873 : INFO : topic #2 (0.111): 0.001*"jewish" + 0.001*"jew" + 0.001*"carbon" + 0.001*"county" + 0.001*"california" + 0.001*"jesus" + 0.001*"china" + 0.001*"km" + 0.001*"cell" + 0.001*"gospel" + 2020-10-25 01:10:07,873 : INFO : topic #8 (0.111): 0.002*"league" + 0.001*"season" + 0.001*"baseball" + 0.001*"jewish" + 0.001*"animal" + 0.001*"cell" + 0.001*"y" + 
0.001*"ball" + 0.001*"china" + 0.001*"minister" + 2020-10-25 01:10:07,874 : INFO : topic #1 (0.111): 0.002*"km" + 0.002*"soviet" + 0.002*"minister" + 0.002*"election" + 0.001*"est" + 0.001*"economy" + 0.001*"male" + 0.001*"chinese" + 0.001*"liberal" + 0.001*"russian" + 2020-10-25 01:10:07,874 : INFO : topic #5 (0.111): 0.002*"band" + 0.001*"album" + 0.001*"window" + 0.001*"actor" + 0.001*"comic" + 0.001*"irish" + 0.001*"india" + 0.001*"user" + 0.001*"microsoft" + 0.001*"minister" + 2020-10-25 01:10:07,874 : INFO : topic diff=0.149585, rho=0.592297 + 2020-10-25 01:10:18,504 : INFO : -9.186 per-word bound, 582.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:10:18,505 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 01:10:23,685 : INFO : topic #2 (0.111): 0.002*"jewish" + 0.001*"jew" + 0.001*"jesus" + 0.001*"carbon" + 0.001*"gospel" + 0.001*"bible" + 0.001*"california" + 0.001*"hebrew" + 0.001*"cell" + 0.001*"translation" + 2020-10-25 01:10:23,685 : INFO : topic #5 (0.111): 0.002*"band" + 0.002*"album" + 0.002*"window" + 0.001*"comic" + 0.001*"microsoft" + 0.001*"irish" + 0.001*"apple" + 0.001*"user" + 0.001*"software" + 0.001*"actor" + 2020-10-25 01:10:23,685 : INFO : topic #4 (0.111): 0.002*"album" + 0.002*"energy" + 0.002*"band" + 0.001*"cell" + 0.001*"blue" + 0.001*"y" + 0.001*"bc" + 0.001*"guitar" + 0.001*"emperor" + 0.001*"universe" + 2020-10-25 01:10:23,686 : INFO : topic #1 (0.111): 0.003*"km" + 0.002*"election" + 0.002*"est" + 0.002*"soviet" + 0.002*"minister" + 0.002*"economy" + 0.002*"male" + 0.001*"territory" + 0.001*"liberal" + 0.001*"elected" + 2020-10-25 01:10:23,686 : INFO : topic #8 (0.111): 0.003*"league" + 0.001*"baseball" + 0.001*"season" + 0.001*"ball" + 0.001*"drug" + 0.001*"football" + 0.001*"cell" + 0.001*"animal" + 0.001*"park" + 0.001*"algorithm" + 2020-10-25 01:10:23,686 : INFO : topic diff=0.178759, rho=0.509614 + 2020-10-25 01:10:34,248 : INFO : -9.123 per-word bound, 557.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:10:34,249 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 01:10:38,910 : INFO : topic #5 (0.111): 0.002*"band" + 0.002*"album" + 0.002*"window" + 0.002*"microsoft" + 0.002*"comic" + 0.002*"irish" + 0.002*"apple" + 0.001*"software" + 0.001*"user" + 0.001*"musical" + 2020-10-25 01:10:38,910 : INFO : topic #2 (0.111): 0.002*"jewish" + 0.002*"jew" + 0.002*"jesus" + 0.001*"carbon" + 0.001*"gospel" + 0.001*"bible" + 0.001*"hebrew" + 0.001*"christ" + 0.001*"translation" + 0.001*"judaism" + 2020-10-25 01:10:38,910 : INFO : topic #3 (0.111): 0.002*"import" + 0.002*"card" + 0.002*"tree" + 0.001*"league" + 0.001*"bit" + 0.001*"ford" + 0.001*"fiction" + 0.001*"season" + 0.001*"fan" + 0.001*"novel" + 2020-10-25 01:10:38,911 : INFO : topic #7 (0.111): 0.003*"actor" + 0.002*"emperor" + 0.002*"alexander" + 0.002*"bc" + 0.002*"chinese" + 0.001*"actress" + 0.001*"jewish" + 0.001*"china" + 0.001*"singer" + 0.001*"japanese" + 2020-10-25 01:10:38,911 : INFO : topic #1 (0.111): 0.003*"km" + 0.003*"election" + 0.003*"est" + 0.003*"soviet" + 0.003*"minister" + 0.003*"economy" + 0.002*"male" + 0.002*"territory" + 0.002*"parliament" + 0.002*"elected" + 2020-10-25 01:10:38,911 : INFO : topic diff=0.154319, rho=0.454053 + 2020-10-25 01:10:49,397 : INFO : -9.086 per-word bound, 543.5 perplexity estimate based on a held-out corpus of 1701 
documents with 4692704 words + 2020-10-25 01:10:49,473 : INFO : using symmetric alpha at 0.09090909090909091 + 2020-10-25 01:10:49,473 : INFO : using symmetric eta at 0.09090909090909091 + 2020-10-25 01:10:49,475 : INFO : using serial LDA version on this node + 2020-10-25 01:10:49,488 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:10:49,488 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 01:11:00,304 : INFO : -10.500 per-word bound, 1448.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:11:00,304 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2020-10-25 01:11:03,567 : INFO : topic #4 (0.091): 0.001*"moon" + 0.001*"actor" + 0.001*"cell" + 0.001*"election" + 0.001*"minister" + 0.001*"prime" + 0.001*"software" + 0.001*"novel" + 0.001*"emperor" + 0.001*"blue" + 2020-10-25 01:11:03,567 : INFO : topic #10 (0.091): 0.001*"emperor" + 0.001*"actor" + 0.001*"election" + 0.001*"italian" + 0.001*"soviet" + 0.001*"irish" + 0.001*"love" + 0.001*"card" + 0.001*"china" + 0.001*"ball" + 2020-10-25 01:11:03,567 : INFO : topic #8 (0.091): 0.001*"league" + 0.001*"territory" + 0.001*"jewish" + 0.001*"minister" + 0.001*"y" + 0.001*"election" + 0.001*"india" + 0.001*"km" + 0.001*"machine" + 0.001*"map" + 2020-10-25 01:11:03,568 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"km" + 0.001*"software" + 0.001*"actor" + 0.001*"india" + 0.001*"indian" + 0.001*"animal" + 0.001*"male" + 0.001*"prime" + 0.001*"philosophy" + 2020-10-25 01:11:03,568 : INFO : topic #2 (0.091): 0.001*"actor" + 0.001*"league" + 0.001*"season" + 0.001*"cell" + 0.001*"band" + 0.001*"emperor" + 0.001*"album" + 0.001*"station" + 0.001*"female" + 0.001*"km" + 2020-10-25 01:11:03,568 : INFO : topic diff=1.023399, rho=1.000000 + 2020-10-25 01:11:14,271 : INFO : -9.279 per-word bound, 621.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:11:14,271 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2020-10-25 01:11:17,511 : INFO : topic #1 (0.091): 0.002*"energy" + 0.001*"season" + 0.001*"league" + 0.001*"africa" + 0.001*"chinese" + 0.001*"football" + 0.001*"blue" + 0.001*"ship" + 0.001*"machine" + 0.001*"symbol" + 2020-10-25 01:11:17,511 : INFO : topic #4 (0.091): 0.002*"cell" + 0.002*"moon" + 0.001*"acid" + 0.001*"disease" + 0.001*"plant" + 0.001*"specie" + 0.001*"surface" + 0.001*"scientific" + 0.001*"software" + 0.001*"energy" + 2020-10-25 01:11:17,512 : INFO : topic #8 (0.091): 0.001*"territory" + 0.001*"league" + 0.001*"jewish" + 0.001*"minister" + 0.001*"election" + 0.001*"machine" + 0.001*"india" + 0.001*"irish" + 0.001*"apple" + 0.001*"y" + 2020-10-25 01:11:17,512 : INFO : topic #3 (0.091): 0.001*"import" + 0.001*"horse" + 0.001*"soviet" + 0.001*"km" + 0.001*"india" + 0.001*"software" + 0.001*"animal" + 0.001*"male" + 0.001*"indian" + 0.001*"lake" + 2020-10-25 01:11:17,512 : INFO : topic #7 (0.091): 0.002*"actor" + 0.002*"emperor" + 0.001*"league" + 0.001*"software" + 0.001*"y" + 0.001*"band" + 0.001*"linux" + 0.001*"italian" + 0.001*"singer" + 0.001*"season" + 2020-10-25 01:11:17,513 : INFO : topic diff=0.165568, rho=0.577350 + 2020-10-25 01:11:17,513 : INFO : ensemble contains 9 models and 160 topics now + 
2020-10-25 01:11:17,519 : INFO : ensemble contains 10 models and 169 topics now + 2020-10-25 01:11:17,525 : INFO : asymmetric distance matrix is outdated due to add_model + 2020-10-25 01:11:17,525 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2020-10-25 01:11:18,609 : INFO : fitting the clustering model, using 5 for min_samples + 2020-10-25 01:11:18,629 : INFO : generating stable topics, using 3 for min_cores + 2020-10-25 01:11:18,629 : INFO : found 19 clusters + 2020-10-25 01:11:18,634 : INFO : found 1 stable topics + 2020-10-25 01:11:18,635 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 01:11:18,635 : INFO : using symmetric alpha at 1.0 + 2020-10-25 01:11:18,635 : INFO : using symmetric eta at 1.0 + 2020-10-25 01:11:18,637 : INFO : using serial LDA version on this node + 2020-10-25 01:11:18,638 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:11:18,638 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 180 + 1 + + + + + +.. rst-class:: sphx-glr-timing + + **Total running time of the script:** ( 6 minutes 52.125 seconds) + +**Estimated memory usage:** 1636 MB + + +.. _sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py: + + +.. only :: html + + .. container:: sphx-glr-footer + :class: sphx-glr-footer-example + + + + .. container:: sphx-glr-download sphx-glr-download-python + + :download:`Download Python source code: run_ensemblelda.py ` + + + + .. container:: sphx-glr-download sphx-glr-download-jupyter + + :download:`Download Jupyter notebook: run_ensemblelda.ipynb ` + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle b/docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle new file mode 100644 index 0000000000000000000000000000000000000000..91585af924ab6004df91e11df745eacc51ffe39e GIT binary patch literal 8641 [base85-encoded pickle data omitted]
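An editorial aside on the output above: ``recluster()`` reuses the distance matrix
that the ensemble has already computed (the logs show that only the clustering and
stable-topic steps are redone), so sweeping ``eps`` to reach a desired number of
stable topics is cheap. A sketch using only the calls shown in the tutorial:

.. code-block:: python

    for eps in (0.05, 0.09, 0.1, 0.2):
        ensemble.recluster(eps=eps, min_samples=2, min_cores=2)
        print(eps, len(ensemble.get_topics()))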
From: sezanzeb Date: Sun, 25 Oct 2020 01:21:34 +0200 Subject: [PATCH 121/166] changed eps in example --- docs/src/auto_examples/tutorials/run_ensemblelda.ipynb | 2 +- docs/src/auto_examples/tutorials/run_ensemblelda.py | 2 +- docs/src/gallery/tutorials/run_ensemblelda.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb index f4aea41496..3952a97d12 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb @@ -159,7 +159,7 @@ }, "outputs": [], "source": [ - "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nwithout_diagonal.min(), without_diagonal.mean(), without_diagonal.max()\n\nensemble.recluster(eps=0.1, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" + "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nwithout_diagonal.min(), without_diagonal.mean(), without_diagonal.max()\n\nensemble.recluster(eps=0.09, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" ] }, { diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py b/docs/src/auto_examples/tutorials/run_ensemblelda.py index b8e8e20887..bde40f98ea 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py @@ -112,7 +112,7 @@ without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() -ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) +ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) print(len(ensemble.get_topics())) diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index b8e8e20887..bde40f98ea 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -112,7 +112,7 @@ without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() -ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) 
+ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) print(len(ensemble.get_topics())) From 0b5aaf298a6b903c4fe477504ed47a4ccd9e043c Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 11:14:52 +0100 Subject: [PATCH 122/166] added to tutorials --- docs/src/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/conf.py b/docs/src/conf.py index 13d26e254b..42ede6d7f8 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -258,6 +258,7 @@ def sort_key(source_dir): 'run_lda.py', 'run_wmd.py', 'run_summarization.py', + 'run_ensemblelda.py', ] howto_order = [ From 3bd838e94001f82b0f9256f88982bcfcc2e1389f Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 11:30:56 +0100 Subject: [PATCH 123/166] rebuilt --- docs/src/auto_examples/index.rst | 40 ++-- .../tutorials/run_ensemblelda.py.md5 | 2 +- .../tutorials/run_ensemblelda.rst | 208 +++++++++--------- .../tutorials/sg_execution_times.rst | 4 +- 4 files changed, 127 insertions(+), 127 deletions(-) diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index dfee557a6c..e46809100c 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -190,14 +190,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + :alt: Fast Similarity Queries with Annoy and Word2Vec - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -207,18 +207,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_annoy .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :alt: Fast Similarity Queries with Annoy and Word2Vec + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + :alt: LDA Model - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -228,18 +228,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_annoy + /auto_examples/tutorials/run_lda .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :alt: LDA Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + :alt: Word Mover's Distance - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -249,18 +249,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_lda + /auto_examples/tutorials/run_wmd .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :alt: Word Mover's Distance + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html @@ -270,7 +270,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_wmd + /auto_examples/tutorials/run_ensemblelda .. raw:: html
diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 index d779810a22..aff4513799 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 @@ -1 +1 @@ -cebdae557712e17fb52871e2a012e9c8 \ No newline at end of file +f5b586678dc10d31de5c29251f4f7ad9 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst index a22e937444..f4cfc8ad5d 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.rst +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst @@ -67,11 +67,11 @@ so it won't be explained again in detail. .. code-block:: none - 2020-10-25 01:05:04,690 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2020-10-25 01:05:10,080 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) - 2020-10-25 01:05:10,262 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... - 2020-10-25 01:05:10,263 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents - 2020-10-25 01:05:10,338 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) + 2020-10-25 11:23:34,467 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2020-10-25 11:23:39,808 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) + 2020-10-25 11:23:39,989 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... + 2020-10-25 11:23:39,989 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents + 2020-10-25 11:23:40,063 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) @@ -185,18 +185,18 @@ The number of stable topics which are clustered from all those topics is smaller .. code-block:: none - 2020-10-25 01:05:13,988 : INFO : generating 8 topic models... - 2020-10-25 01:09:45,442 : INFO : generating a 160 x 160 asymmetric distance matrix... 
- 2020-10-25 01:09:46,332 : INFO : fitting the clustering model, using 4 for min_samples - 2020-10-25 01:09:46,348 : INFO : generating stable topics, using 3 for min_cores - 2020-10-25 01:09:46,348 : INFO : found 32 clusters - 2020-10-25 01:09:46,350 : INFO : found 2 stable topics - 2020-10-25 01:09:46,350 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 01:09:46,496 : INFO : using symmetric alpha at 0.5 - 2020-10-25 01:09:46,496 : INFO : using symmetric eta at 0.5 - 2020-10-25 01:09:46,497 : INFO : using serial LDA version on this node - 2020-10-25 01:09:46,500 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:09:46,500 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 11:23:43,277 : INFO : generating 8 topic models... + 2020-10-25 11:28:16,432 : INFO : generating a 160 x 160 asymmetric distance matrix... + 2020-10-25 11:28:17,287 : INFO : fitting the clustering model, using 4 for min_samples + 2020-10-25 11:28:17,315 : INFO : generating stable topics, using 3 for min_cores + 2020-10-25 11:28:17,315 : INFO : found 21 clusters + 2020-10-25 11:28:17,319 : INFO : found 2 stable topics + 2020-10-25 11:28:17,319 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 11:28:17,487 : INFO : using symmetric alpha at 0.5 + 2020-10-25 11:28:17,487 : INFO : using symmetric eta at 0.5 + 2020-10-25 11:28:17,488 : INFO : using serial LDA version on this node + 2020-10-25 11:28:17,491 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:28:17,491 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 160 2 @@ -226,7 +226,7 @@ Make sure to chose one that is within the range of values. without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() - ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) + ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) print(len(ensemble.get_topics())) @@ -240,17 +240,17 @@ Make sure to chose one that is within the range of values. .. 
code-block:: none - 2020-10-25 01:09:46,802 : INFO : fitting the clustering model - 2020-10-25 01:09:46,816 : INFO : generating stable topics - 2020-10-25 01:09:46,816 : INFO : found 47 clusters - 2020-10-25 01:09:46,819 : INFO : found 2 stable topics - 2020-10-25 01:09:46,819 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 01:09:46,821 : INFO : using symmetric alpha at 0.5 - 2020-10-25 01:09:46,821 : INFO : using symmetric eta at 0.5 - 2020-10-25 01:09:46,823 : INFO : using serial LDA version on this node - 2020-10-25 01:09:46,825 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:09:46,825 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2 + 2020-10-25 11:28:17,675 : INFO : fitting the clustering model + 2020-10-25 11:28:17,687 : INFO : generating stable topics + 2020-10-25 11:28:17,688 : INFO : found 33 clusters + 2020-10-25 11:28:17,690 : INFO : found 4 stable topics + 2020-10-25 11:28:17,691 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 11:28:17,692 : INFO : using symmetric alpha at 0.25 + 2020-10-25 11:28:17,692 : INFO : using symmetric eta at 0.25 + 2020-10-25 11:28:17,694 : INFO : using serial LDA version on this node + 2020-10-25 11:28:17,698 : INFO : running online (multi-pass) LDA training, 4 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:28:17,698 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 4 @@ -305,79 +305,79 @@ Afterwards the number and quality of stable topics might be different depending .. 
code-block:: none - 2020-10-25 01:09:47,300 : INFO : using symmetric alpha at 0.1111111111111111 - 2020-10-25 01:09:47,301 : INFO : using symmetric eta at 0.1111111111111111 - 2020-10-25 01:09:47,304 : INFO : using serial LDA version on this node - 2020-10-25 01:09:47,314 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:09:47,314 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2020-10-25 01:09:47,315 : INFO : training LDA model using 11 processes - 2020-10-25 01:09:47,546 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 01:09:52,514 : INFO : topic #8 (0.111): 0.001*"league" + 0.001*"minister" + 0.001*"jewish" + 0.001*"animal" + 0.001*"km" + 0.001*"soviet" + 0.001*"election" + 0.001*"emperor" + 0.001*"energy" + 0.001*"cell" - 2020-10-25 01:09:52,514 : INFO : topic #2 (0.111): 0.001*"km" + 0.001*"china" + 0.001*"county" + 0.001*"album" + 0.001*"jewish" + 0.001*"california" + 0.001*"actor" + 0.001*"economy" + 0.001*"jew" + 0.001*"band" - 2020-10-25 01:09:52,514 : INFO : topic #6 (0.111): 0.001*"km" + 0.001*"election" + 0.001*"software" + 0.001*"user" + 0.001*"japanese" + 0.001*"female" + 0.001*"minister" + 0.001*"prime" + 0.001*"season" + 0.001*"saint" - 2020-10-25 01:09:52,515 : INFO : topic #0 (0.111): 0.001*"actor" + 0.001*"km" + 0.001*"cell" + 0.001*"minister" + 0.001*"italian" + 0.001*"male" + 0.001*"economy" + 0.001*"energy" + 0.001*"y" + 0.001*"soviet" - 2020-10-25 01:09:52,515 : INFO : topic #5 (0.111): 0.001*"band" + 0.001*"actor" + 0.001*"soviet" + 0.001*"album" + 0.001*"india" + 0.001*"minister" + 0.001*"ship" + 0.001*"km" + 0.001*"energy" + 0.001*"china" - 2020-10-25 01:09:52,515 : INFO : topic diff=1.009308, rho=1.000000 - 2020-10-25 01:10:03,105 : INFO : -9.257 per-word bound, 611.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:10:03,105 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 01:10:07,873 : INFO : topic #4 (0.111): 0.002*"album" + 0.001*"energy" + 0.001*"band" + 0.001*"blue" + 0.001*"cell" + 0.001*"emperor" + 0.001*"bc" + 0.001*"actor" + 0.001*"japanese" + 0.001*"software" - 2020-10-25 01:10:07,873 : INFO : topic #2 (0.111): 0.001*"jewish" + 0.001*"jew" + 0.001*"carbon" + 0.001*"county" + 0.001*"california" + 0.001*"jesus" + 0.001*"china" + 0.001*"km" + 0.001*"cell" + 0.001*"gospel" - 2020-10-25 01:10:07,873 : INFO : topic #8 (0.111): 0.002*"league" + 0.001*"season" + 0.001*"baseball" + 0.001*"jewish" + 0.001*"animal" + 0.001*"cell" + 0.001*"y" + 0.001*"ball" + 0.001*"china" + 0.001*"minister" - 2020-10-25 01:10:07,874 : INFO : topic #1 (0.111): 0.002*"km" + 0.002*"soviet" + 0.002*"minister" + 0.002*"election" + 0.001*"est" + 0.001*"economy" + 0.001*"male" + 0.001*"chinese" + 0.001*"liberal" + 0.001*"russian" - 2020-10-25 01:10:07,874 : INFO : topic #5 (0.111): 0.002*"band" + 0.001*"album" + 0.001*"window" + 0.001*"actor" + 0.001*"comic" + 0.001*"irish" + 0.001*"india" + 0.001*"user" + 0.001*"microsoft" + 0.001*"minister" - 2020-10-25 01:10:07,874 : INFO : topic diff=0.149585, rho=0.592297 - 2020-10-25 01:10:18,504 : INFO : -9.186 per-word bound, 582.3 perplexity estimate based on a held-out corpus of 1701 documents 
with 4692704 words - 2020-10-25 01:10:18,505 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 01:10:23,685 : INFO : topic #2 (0.111): 0.002*"jewish" + 0.001*"jew" + 0.001*"jesus" + 0.001*"carbon" + 0.001*"gospel" + 0.001*"bible" + 0.001*"california" + 0.001*"hebrew" + 0.001*"cell" + 0.001*"translation" - 2020-10-25 01:10:23,685 : INFO : topic #5 (0.111): 0.002*"band" + 0.002*"album" + 0.002*"window" + 0.001*"comic" + 0.001*"microsoft" + 0.001*"irish" + 0.001*"apple" + 0.001*"user" + 0.001*"software" + 0.001*"actor" - 2020-10-25 01:10:23,685 : INFO : topic #4 (0.111): 0.002*"album" + 0.002*"energy" + 0.002*"band" + 0.001*"cell" + 0.001*"blue" + 0.001*"y" + 0.001*"bc" + 0.001*"guitar" + 0.001*"emperor" + 0.001*"universe" - 2020-10-25 01:10:23,686 : INFO : topic #1 (0.111): 0.003*"km" + 0.002*"election" + 0.002*"est" + 0.002*"soviet" + 0.002*"minister" + 0.002*"economy" + 0.002*"male" + 0.001*"territory" + 0.001*"liberal" + 0.001*"elected" - 2020-10-25 01:10:23,686 : INFO : topic #8 (0.111): 0.003*"league" + 0.001*"baseball" + 0.001*"season" + 0.001*"ball" + 0.001*"drug" + 0.001*"football" + 0.001*"cell" + 0.001*"animal" + 0.001*"park" + 0.001*"algorithm" - 2020-10-25 01:10:23,686 : INFO : topic diff=0.178759, rho=0.509614 - 2020-10-25 01:10:34,248 : INFO : -9.123 per-word bound, 557.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:10:34,249 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 01:10:38,910 : INFO : topic #5 (0.111): 0.002*"band" + 0.002*"album" + 0.002*"window" + 0.002*"microsoft" + 0.002*"comic" + 0.002*"irish" + 0.002*"apple" + 0.001*"software" + 0.001*"user" + 0.001*"musical" - 2020-10-25 01:10:38,910 : INFO : topic #2 (0.111): 0.002*"jewish" + 0.002*"jew" + 0.002*"jesus" + 0.001*"carbon" + 0.001*"gospel" + 0.001*"bible" + 0.001*"hebrew" + 0.001*"christ" + 0.001*"translation" + 0.001*"judaism" - 2020-10-25 01:10:38,910 : INFO : topic #3 (0.111): 0.002*"import" + 0.002*"card" + 0.002*"tree" + 0.001*"league" + 0.001*"bit" + 0.001*"ford" + 0.001*"fiction" + 0.001*"season" + 0.001*"fan" + 0.001*"novel" - 2020-10-25 01:10:38,911 : INFO : topic #7 (0.111): 0.003*"actor" + 0.002*"emperor" + 0.002*"alexander" + 0.002*"bc" + 0.002*"chinese" + 0.001*"actress" + 0.001*"jewish" + 0.001*"china" + 0.001*"singer" + 0.001*"japanese" - 2020-10-25 01:10:38,911 : INFO : topic #1 (0.111): 0.003*"km" + 0.003*"election" + 0.003*"est" + 0.003*"soviet" + 0.003*"minister" + 0.003*"economy" + 0.002*"male" + 0.002*"territory" + 0.002*"parliament" + 0.002*"elected" - 2020-10-25 01:10:38,911 : INFO : topic diff=0.154319, rho=0.454053 - 2020-10-25 01:10:49,397 : INFO : -9.086 per-word bound, 543.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:10:49,473 : INFO : using symmetric alpha at 0.09090909090909091 - 2020-10-25 01:10:49,473 : INFO : using symmetric eta at 0.09090909090909091 - 2020-10-25 01:10:49,475 : INFO : using serial LDA version on this node - 2020-10-25 01:10:49,488 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:10:49,488 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to 
improve accuracy - 2020-10-25 01:11:00,304 : INFO : -10.500 per-word bound, 1448.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:11:00,304 : INFO : PROGRESS: pass 0, at document #1701/1701 - 2020-10-25 01:11:03,567 : INFO : topic #4 (0.091): 0.001*"moon" + 0.001*"actor" + 0.001*"cell" + 0.001*"election" + 0.001*"minister" + 0.001*"prime" + 0.001*"software" + 0.001*"novel" + 0.001*"emperor" + 0.001*"blue" - 2020-10-25 01:11:03,567 : INFO : topic #10 (0.091): 0.001*"emperor" + 0.001*"actor" + 0.001*"election" + 0.001*"italian" + 0.001*"soviet" + 0.001*"irish" + 0.001*"love" + 0.001*"card" + 0.001*"china" + 0.001*"ball" - 2020-10-25 01:11:03,567 : INFO : topic #8 (0.091): 0.001*"league" + 0.001*"territory" + 0.001*"jewish" + 0.001*"minister" + 0.001*"y" + 0.001*"election" + 0.001*"india" + 0.001*"km" + 0.001*"machine" + 0.001*"map" - 2020-10-25 01:11:03,568 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"km" + 0.001*"software" + 0.001*"actor" + 0.001*"india" + 0.001*"indian" + 0.001*"animal" + 0.001*"male" + 0.001*"prime" + 0.001*"philosophy" - 2020-10-25 01:11:03,568 : INFO : topic #2 (0.091): 0.001*"actor" + 0.001*"league" + 0.001*"season" + 0.001*"cell" + 0.001*"band" + 0.001*"emperor" + 0.001*"album" + 0.001*"station" + 0.001*"female" + 0.001*"km" - 2020-10-25 01:11:03,568 : INFO : topic diff=1.023399, rho=1.000000 - 2020-10-25 01:11:14,271 : INFO : -9.279 per-word bound, 621.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:11:14,271 : INFO : PROGRESS: pass 1, at document #1701/1701 - 2020-10-25 01:11:17,511 : INFO : topic #1 (0.091): 0.002*"energy" + 0.001*"season" + 0.001*"league" + 0.001*"africa" + 0.001*"chinese" + 0.001*"football" + 0.001*"blue" + 0.001*"ship" + 0.001*"machine" + 0.001*"symbol" - 2020-10-25 01:11:17,511 : INFO : topic #4 (0.091): 0.002*"cell" + 0.002*"moon" + 0.001*"acid" + 0.001*"disease" + 0.001*"plant" + 0.001*"specie" + 0.001*"surface" + 0.001*"scientific" + 0.001*"software" + 0.001*"energy" - 2020-10-25 01:11:17,512 : INFO : topic #8 (0.091): 0.001*"territory" + 0.001*"league" + 0.001*"jewish" + 0.001*"minister" + 0.001*"election" + 0.001*"machine" + 0.001*"india" + 0.001*"irish" + 0.001*"apple" + 0.001*"y" - 2020-10-25 01:11:17,512 : INFO : topic #3 (0.091): 0.001*"import" + 0.001*"horse" + 0.001*"soviet" + 0.001*"km" + 0.001*"india" + 0.001*"software" + 0.001*"animal" + 0.001*"male" + 0.001*"indian" + 0.001*"lake" - 2020-10-25 01:11:17,512 : INFO : topic #7 (0.091): 0.002*"actor" + 0.002*"emperor" + 0.001*"league" + 0.001*"software" + 0.001*"y" + 0.001*"band" + 0.001*"linux" + 0.001*"italian" + 0.001*"singer" + 0.001*"season" - 2020-10-25 01:11:17,513 : INFO : topic diff=0.165568, rho=0.577350 - 2020-10-25 01:11:17,513 : INFO : ensemble contains 9 models and 160 topics now - 2020-10-25 01:11:17,519 : INFO : ensemble contains 10 models and 169 topics now - 2020-10-25 01:11:17,525 : INFO : asymmetric distance matrix is outdated due to add_model - 2020-10-25 01:11:17,525 : INFO : generating a 180 x 180 asymmetric distance matrix... 
- 2020-10-25 01:11:18,609 : INFO : fitting the clustering model, using 5 for min_samples - 2020-10-25 01:11:18,629 : INFO : generating stable topics, using 3 for min_cores - 2020-10-25 01:11:18,629 : INFO : found 19 clusters - 2020-10-25 01:11:18,634 : INFO : found 1 stable topics - 2020-10-25 01:11:18,635 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 01:11:18,635 : INFO : using symmetric alpha at 1.0 - 2020-10-25 01:11:18,635 : INFO : using symmetric eta at 1.0 - 2020-10-25 01:11:18,637 : INFO : using serial LDA version on this node - 2020-10-25 01:11:18,638 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:11:18,638 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 11:28:17,999 : INFO : using symmetric alpha at 0.1111111111111111 + 2020-10-25 11:28:18,000 : INFO : using symmetric eta at 0.1111111111111111 + 2020-10-25 11:28:18,002 : INFO : using serial LDA version on this node + 2020-10-25 11:28:18,013 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:28:18,013 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 11:28:18,013 : INFO : training LDA model using 11 processes + 2020-10-25 11:28:18,129 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 11:28:22,666 : INFO : topic #0 (0.111): 0.001*"km" + 0.001*"band" + 0.001*"league" + 0.001*"album" + 0.001*"season" + 0.001*"emperor" + 0.001*"y" + 0.001*"actor" + 0.001*"ball" + 0.001*"machine" + 2020-10-25 11:28:22,666 : INFO : topic #4 (0.111): 0.001*"league" + 0.001*"jewish" + 0.001*"band" + 0.001*"software" + 0.001*"emperor" + 0.001*"season" + 0.001*"album" + 0.001*"minister" + 0.001*"israel" + 0.001*"australia" + 2020-10-25 11:28:22,666 : INFO : topic #3 (0.111): 0.001*"jewish" + 0.001*"actor" + 0.001*"y" + 0.001*"animal" + 0.001*"energy" + 0.001*"election" + 0.001*"km" + 0.001*"lake" + 0.001*"emperor" + 0.001*"cell" + 2020-10-25 11:28:22,667 : INFO : topic #7 (0.111): 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"cell" + 0.001*"soviet" + 0.001*"energy" + 0.001*"actor" + 0.001*"russian" + 0.001*"emperor" + 0.001*"band" + 2020-10-25 11:28:22,667 : INFO : topic #6 (0.111): 0.001*"soviet" + 0.001*"blue" + 0.001*"actor" + 0.001*"italian" + 0.001*"economy" + 0.001*"car" + 0.001*"animal" + 0.001*"km" + 0.001*"channel" + 0.001*"energy" + 2020-10-25 11:28:22,667 : INFO : topic diff=1.009925, rho=1.000000 + 2020-10-25 11:28:33,290 : INFO : -9.255 per-word bound, 611.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:28:33,290 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 11:28:37,864 : INFO : topic #1 (0.111): 0.002*"minister" + 0.002*"km" + 0.002*"election" + 0.002*"est" + 0.002*"economy" + 0.001*"actor" + 0.001*"male" + 0.001*"india" + 0.001*"china" + 0.001*"elected" + 2020-10-25 11:28:37,865 : INFO : topic #8 
(0.111): 0.001*"software" + 0.001*"user" + 0.001*"apollo" + 0.001*"metal" + 0.001*"cell" + 0.001*"machine" + 0.001*"actor" + 0.001*"soviet" + 0.001*"lincoln" + 0.001*"disk" + 2020-10-25 11:28:37,865 : INFO : topic #3 (0.111): 0.001*"y" + 0.001*"animal" + 0.001*"lake" + 0.001*"energy" + 0.001*"jewish" + 0.001*"window" + 0.001*"aircraft" + 0.001*"cell" + 0.001*"memory" + 0.001*"machine" + 2020-10-25 11:28:37,865 : INFO : topic #7 (0.111): 0.002*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"soviet" + 0.001*"cell" + 0.001*"est" + 0.001*"energy" + 0.001*"economy" + 0.001*"parliament" + 0.001*"russian" + 2020-10-25 11:28:37,865 : INFO : topic #4 (0.111): 0.002*"league" + 0.001*"season" + 0.001*"jewish" + 0.001*"israel" + 0.001*"emperor" + 0.001*"baseball" + 0.001*"band" + 0.001*"software" + 0.001*"album" + 0.001*"irish" + 2020-10-25 11:28:37,865 : INFO : topic diff=0.150801, rho=0.592297 + 2020-10-25 11:28:48,377 : INFO : -9.186 per-word bound, 582.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:28:48,378 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 11:28:52,718 : INFO : topic #4 (0.111): 0.003*"league" + 0.002*"season" + 0.002*"jewish" + 0.001*"baseball" + 0.001*"israel" + 0.001*"emperor" + 0.001*"band" + 0.001*"irish" + 0.001*"jew" + 0.001*"hebrew" + 2020-10-25 11:28:52,718 : INFO : topic #2 (0.111): 0.002*"album" + 0.002*"band" + 0.002*"chinese" + 0.001*"love" + 0.001*"jew" + 0.001*"japanese" + 0.001*"china" + 0.001*"artist" + 0.001*"jewish" + 0.001*"actor" + 2020-10-25 11:28:52,718 : INFO : topic #8 (0.111): 0.002*"software" + 0.002*"apollo" + 0.001*"user" + 0.001*"disk" + 0.001*"cell" + 0.001*"metal" + 0.001*"lincoln" + 0.001*"machine" + 0.001*"alexander" + 0.001*"bit" + 2020-10-25 11:28:52,718 : INFO : topic #3 (0.111): 0.002*"y" + 0.001*"window" + 0.001*"aircraft" + 0.001*"lake" + 0.001*"machine" + 0.001*"energy" + 0.001*"animal" + 0.001*"bit" + 0.001*"memory" + 0.001*"user" + 2020-10-25 11:28:52,719 : INFO : topic #5 (0.111): 0.001*"cell" + 0.001*"japanese" + 0.001*"japan" + 0.001*"energy" + 0.001*"software" + 0.001*"emperor" + 0.001*"blood" + 0.001*"bc" + 0.001*"actor" + 0.001*"brain" + 2020-10-25 11:28:52,719 : INFO : topic diff=0.173393, rho=0.509614 + 2020-10-25 11:29:03,210 : INFO : -9.128 per-word bound, 559.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:29:03,210 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 11:29:07,566 : INFO : topic #3 (0.111): 0.002*"y" + 0.001*"aircraft" + 0.001*"window" + 0.001*"user" + 0.001*"bit" + 0.001*"machine" + 0.001*"memory" + 0.001*"internet" + 0.001*"energy" + 0.001*"lake" + 2020-10-25 11:29:07,566 : INFO : topic #7 (0.111): 0.002*"soviet" + 0.002*"minister" + 0.002*"israel" + 0.002*"election" + 0.002*"parliament" + 0.002*"emperor" + 0.001*"israeli" + 0.001*"cell" + 0.001*"russian" + 0.001*"lebanon" + 2020-10-25 11:29:07,566 : INFO : topic #0 (0.111): 0.003*"ball" + 0.002*"import" + 0.002*"brown" + 0.001*"band" + 0.001*"cat" + 0.001*"iraq" + 0.001*"km" + 0.001*"album" + 0.001*"california" + 0.001*"football" + 2020-10-25 11:29:07,566 : INFO : topic #1 (0.111): 0.003*"est" + 0.003*"km" + 0.003*"minister" + 0.003*"election" + 0.002*"economy" + 0.002*"actor" + 0.002*"india" + 0.002*"male" + 0.002*"china" + 0.002*"constitution" + 2020-10-25 11:29:07,567 : INFO : topic #6 (0.111): 0.002*"blue" + 
0.002*"car" + 0.002*"engine" + 0.001*"soviet" + 0.001*"channel" + 0.001*"flag" + 0.001*"microsoft" + 0.001*"county" + 0.001*"director" + 0.001*"communist" + 2020-10-25 11:29:07,567 : INFO : topic diff=0.148814, rho=0.454053 + 2020-10-25 11:29:18,053 : INFO : -9.094 per-word bound, 546.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:29:18,119 : INFO : using symmetric alpha at 0.09090909090909091 + 2020-10-25 11:29:18,119 : INFO : using symmetric eta at 0.09090909090909091 + 2020-10-25 11:29:18,121 : INFO : using serial LDA version on this node + 2020-10-25 11:29:18,134 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:29:18,134 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 11:29:28,764 : INFO : -10.500 per-word bound, 1448.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:29:28,765 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2020-10-25 11:29:31,978 : INFO : topic #4 (0.091): 0.002*"software" + 0.001*"japanese" + 0.001*"km" + 0.001*"china" + 0.001*"japan" + 0.001*"love" + 0.001*"chinese" + 0.001*"y" + 0.001*"soviet" + 0.001*"emperor" + 2020-10-25 11:29:31,979 : INFO : topic #8 (0.091): 0.001*"actor" + 0.001*"minister" + 0.001*"card" + 0.001*"band" + 0.001*"map" + 0.001*"car" + 0.001*"muslim" + 0.001*"california" + 0.001*"soviet" + 0.001*"lord" + 2020-10-25 11:29:31,979 : INFO : topic #1 (0.091): 0.001*"km" + 0.001*"soviet" + 0.001*"album" + 0.001*"minister" + 0.001*"cell" + 0.001*"energy" + 0.001*"jewish" + 0.001*"band" + 0.001*"jew" + 0.001*"actor" + 2020-10-25 11:29:31,979 : INFO : topic #2 (0.091): 0.001*"election" + 0.001*"km" + 0.001*"band" + 0.001*"energy" + 0.001*"cell" + 0.001*"soviet" + 0.001*"minister" + 0.001*"china" + 0.001*"map" + 0.001*"australia" + 2020-10-25 11:29:31,980 : INFO : topic #10 (0.091): 0.001*"actor" + 0.001*"league" + 0.001*"jewish" + 0.001*"km" + 0.001*"israel" + 0.001*"bc" + 0.001*"y" + 0.001*"ship" + 0.001*"energy" + 0.001*"minister" + 2020-10-25 11:29:31,980 : INFO : topic diff=1.023741, rho=1.000000 + 2020-10-25 11:29:42,609 : INFO : -9.277 per-word bound, 620.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:29:42,609 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2020-10-25 11:29:45,801 : INFO : topic #9 (0.091): 0.002*"km" + 0.001*"election" + 0.001*"economy" + 0.001*"minister" + 0.001*"est" + 0.001*"oil" + 0.001*"spanish" + 0.001*"male" + 0.001*"liberal" + 0.001*"flag" + 2020-10-25 11:29:45,802 : INFO : topic #2 (0.091): 0.001*"energy" + 0.001*"cell" + 0.001*"election" + 0.001*"band" + 0.001*"map" + 0.001*"km" + 0.001*"specie" + 0.001*"australia" + 0.001*"metal" + 0.001*"drug" + 2020-10-25 11:29:45,802 : INFO : topic #6 (0.091): 0.001*"actor" + 0.001*"soviet" + 0.001*"wikipedia" + 0.001*"cell" + 0.001*"jewish" + 0.001*"italian" + 0.001*"football" + 0.001*"band" + 0.001*"acid" + 0.001*"russian" + 2020-10-25 11:29:45,802 : INFO : topic #4 (0.091): 0.002*"software" + 0.001*"japanese" + 0.001*"japan" + 0.001*"y" + 0.001*"apple" + 0.001*"love" + 0.001*"china" + 0.001*"chinese" + 0.001*"philosophy" + 0.001*"classical" + 2020-10-25 11:29:45,803 : INFO : topic #7 (0.091): 
0.002*"emperor" + 0.002*"minister" + 0.001*"male" + 0.001*"saint" + 0.001*"constitution" + 0.001*"jewish" + 0.001*"election" + 0.001*"female" + 0.001*"actor" + 0.001*"jesus" + 2020-10-25 11:29:45,803 : INFO : topic diff=0.168078, rho=0.577350 + 2020-10-25 11:29:45,803 : INFO : ensemble contains 9 models and 160 topics now + 2020-10-25 11:29:45,809 : INFO : ensemble contains 10 models and 169 topics now + 2020-10-25 11:29:45,815 : INFO : asymmetric distance matrix is outdated due to add_model + 2020-10-25 11:29:45,815 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2020-10-25 11:29:46,856 : INFO : fitting the clustering model, using 5 for min_samples + 2020-10-25 11:29:46,890 : INFO : generating stable topics, using 3 for min_cores + 2020-10-25 11:29:46,890 : INFO : found 18 clusters + 2020-10-25 11:29:46,899 : INFO : found 1 stable topics + 2020-10-25 11:29:46,899 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 11:29:46,900 : INFO : using symmetric alpha at 1.0 + 2020-10-25 11:29:46,900 : INFO : using symmetric eta at 1.0 + 2020-10-25 11:29:46,903 : INFO : using serial LDA version on this node + 2020-10-25 11:29:46,906 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:29:46,906 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 180 1 @@ -387,9 +387,9 @@ Afterwards the number and quality of stable topics might be different depending .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 6 minutes 52.125 seconds) + **Total running time of the script:** ( 6 minutes 53.294 seconds) -**Estimated memory usage:** 1636 MB +**Estimated memory usage:** 1629 MB .. 
_sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py: diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index 5b1c444a94..b4845ff578 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -5,10 +5,10 @@ Computation times ================= -**06:52.125** total execution time for **auto_examples_tutorials** files: +**06:53.294** total execution time for **auto_examples_tutorials** files: +-------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 06:52.125 | 1636.4 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 06:53.294 | 1628.6 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ From 3057927ed54f88011d74f6151c6f132d9199d896 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Jan 2021 12:18:55 +0100 Subject: [PATCH 124/166] update docs --- .../tutorials/run_ensemblelda.ipynb | 8 +- .../tutorials/run_ensemblelda.py | 10 +- .../tutorials/run_ensemblelda.py.md5 | 2 +- .../tutorials/run_ensemblelda.rst | 259 ++++++++++-------- .../tutorials/sg_execution_times.rst | 4 +- docs/src/gallery/tutorials/run_ensemblelda.py | 10 +- gensim/downloader.py | 5 + 7 files changed, 170 insertions(+), 128 deletions(-) diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb index 3952a97d12..d9573e5e22 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This tutorial will explain how to use the EnsembleLDA model class.\n\nEnsembleLda is a method of finding and generating stable topics from the results of multiple topic models,\nit can therefore be used to remove topics from your results that are noise and are not reproducible.\n\n\n" + "This tutorial will explain how to use the EnsembleLDA model class.\n\nEnsembleLda is a method of finding and generating stable topics from the results of multiple topic models,\nit can be used to remove topics from your results that are noise and are not reproducible.\n\n\n" ] }, { @@ -148,7 +148,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Tuning\n\nDifferent from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.\n\nYou can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. 
Play around\n\nuntil you get as many topics as you desire, which however may reduce their quality.\nIf your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough.\n\nHaving an epsilon that is smaller than the smallest distance doesn't make sense.\nMake sure to chose one that is within the range of values.\n\n\n" + "## Tuning\n\nDifferent from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.\n\nYou can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor.\n\nPlay around until you get as many topics as you desire, which however may reduce their quality.\nIf your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough.\n\nHaving an epsilon that is smaller than the smallest distance doesn't make sense.\nMake sure to chose one that is within the range of values in ``asymmetric_distance_matrix``.\n\n\n" ] }, { @@ -159,7 +159,7 @@ }, "outputs": [], "source": [ - "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nwithout_diagonal.min(), without_diagonal.mean(), without_diagonal.max()\n\nensemble.recluster(eps=0.09, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" + "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nprint(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max())\n\nensemble.recluster(eps=0.09, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" ] }, { @@ -197,7 +197,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py b/docs/src/auto_examples/tutorials/run_ensemblelda.py index bde40f98ea..086f65fc44 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py @@ -13,7 +13,7 @@ # This tutorial will explain how to use the EnsembleLDA model class. # # EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models, -# it can therefore be used to remove topics from your results that are noise and are not reproducible. +# it can be used to remove topics from your results that are noise and are not reproducible. # ############################################################################### @@ -98,19 +98,19 @@ # # Different from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters. # -# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around +# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. # -# until you get as many topics as you desire, which however may reduce their quality. +# Play around until you get as many topics as you desire, which however may reduce their quality. # If your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough. # # Having an epsilon that is smaller than the smallest distance doesn't make sense. -# Make sure to chose one that is within the range of values. +# Make sure to chose one that is within the range of values in ``asymmetric_distance_matrix``. 
# import numpy as np shape = ensemble.asymmetric_distance_matrix.shape without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) -without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() +print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()) ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 index aff4513799..0b31c8851e 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 @@ -1 +1 @@ -f5b586678dc10d31de5c29251f4f7ad9 \ No newline at end of file +6b332fb6a9968b6e82fa724edbe332c2 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst index f4cfc8ad5d..4ff2735a69 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.rst +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst @@ -1,12 +1,21 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "auto_examples/tutorials/run_ensemblelda.py" +.. LINE NUMBERS ARE GIVEN BELOW. + .. only:: html .. note:: :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title + Click :ref:`here ` + to download the full example code - .. _sphx_glr_auto_examples_tutorials_run_ensemblelda.py: +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_auto_examples_tutorials_run_ensemblelda.py: Ensemble LDA @@ -14,6 +23,7 @@ Ensemble LDA Introduces Gensim's EnsembleLda model +.. GENERATED FROM PYTHON SOURCE LINES 8-12 .. code-block:: default @@ -28,12 +38,16 @@ Introduces Gensim's EnsembleLda model +.. GENERATED FROM PYTHON SOURCE LINES 13-18 + This tutorial will explain how to use the EnsembleLDA model class. EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models, -it can therefore be used to remove topics from your results that are noise and are not reproducible. +it can be used to remove topics from your results that are noise and are not reproducible. +.. GENERATED FROM PYTHON SOURCE LINES 20-27 + Corpus ------ We will use the gensim downloader api to get a small corpus for training our ensemble. @@ -42,6 +56,7 @@ The preprocessing is similar to :ref:`sphx_glr_auto_examples_tutorials_run_word2 so it won't be explained again in detail. +.. GENERATED FROM PYTHON SOURCE LINES 27-39 .. code-block:: default @@ -67,14 +82,16 @@ so it won't be explained again in detail. .. code-block:: none - 2020-10-25 11:23:34,467 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2020-10-25 11:23:39,808 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) - 2020-10-25 11:23:39,989 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... - 2020-10-25 11:23:39,989 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents - 2020-10-25 11:23:40,063 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) 
+ 2021-01-23 12:02:54,189 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2021-01-23 12:03:00,997 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) + 2021-01-23 12:03:01,199 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... + 2021-01-23 12:03:01,199 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents + 2021-01-23 12:03:01,288 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) + +.. GENERATED FROM PYTHON SOURCE LINES 40-48 Training -------- @@ -85,6 +102,7 @@ You can use any model that is based on LdaModel, such as LdaMulticore, to train In experiments, LdaMulticore showed better results. +.. GENERATED FROM PYTHON SOURCE LINES 48-52 .. code-block:: default @@ -99,10 +117,13 @@ In experiments, LdaMulticore showed better results. +.. GENERATED FROM PYTHON SOURCE LINES 53-56 + Any arbitrary number of models can be used, but it should be a multiple of your workers so that the load can be distributed properly. In this example, 4 processes will train 8 models each. +.. GENERATED FROM PYTHON SOURCE LINES 56-60 .. code-block:: default @@ -117,10 +138,13 @@ load can be distributed properly. In this example, 4 processes will train 8 mode +.. GENERATED FROM PYTHON SOURCE LINES 61-64 + After training all the models, some distance computations are required which can take quite some time as well. You can speed this up by using workers for that as well. +.. GENERATED FROM PYTHON SOURCE LINES 64-67 .. code-block:: default @@ -134,9 +158,12 @@ time as well. You can speed this up by using workers for that as well. +.. GENERATED FROM PYTHON SOURCE LINES 68-70 + All other parameters that are unknown to EnsembleLda are forwarded to each LDA Model, such as +.. GENERATED FROM PYTHON SOURCE LINES 70-73 .. code-block:: default @@ -150,12 +177,15 @@ All other parameters that are unknown to EnsembleLda are forwarded to each LDA M +.. GENERATED FROM PYTHON SOURCE LINES 74-79 + Now start the training Since 20 topics were trained on each of the 8 models, we expect there to be 160 different topics. The number of stable topics which are clustered from all those topics is smaller. +.. GENERATED FROM PYTHON SOURCE LINES 79-95 .. code-block:: default @@ -185,38 +215,41 @@ The number of stable topics which are clustered from all those topics is smaller .. code-block:: none - 2020-10-25 11:23:43,277 : INFO : generating 8 topic models... - 2020-10-25 11:28:16,432 : INFO : generating a 160 x 160 asymmetric distance matrix... 
- 2020-10-25 11:28:17,287 : INFO : fitting the clustering model, using 4 for min_samples - 2020-10-25 11:28:17,315 : INFO : generating stable topics, using 3 for min_cores - 2020-10-25 11:28:17,315 : INFO : found 21 clusters - 2020-10-25 11:28:17,319 : INFO : found 2 stable topics - 2020-10-25 11:28:17,319 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 11:28:17,487 : INFO : using symmetric alpha at 0.5 - 2020-10-25 11:28:17,487 : INFO : using symmetric eta at 0.5 - 2020-10-25 11:28:17,488 : INFO : using serial LDA version on this node - 2020-10-25 11:28:17,491 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:28:17,491 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 12:03:05,136 : INFO : generating 8 topic models... + 2021-01-23 12:11:47,694 : INFO : generating a 160 x 160 asymmetric distance matrix... + 2021-01-23 12:11:49,175 : INFO : fitting the clustering model, using 4 for min_samples + 2021-01-23 12:11:49,206 : INFO : generating stable topics, using 3 for min_cores + 2021-01-23 12:11:49,206 : INFO : found 26 clusters + 2021-01-23 12:11:49,209 : INFO : found 1 stable topics + 2021-01-23 12:11:49,209 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-23 12:11:49,398 : INFO : using symmetric alpha at 1.0 + 2021-01-23 12:11:49,398 : INFO : using symmetric eta at 1.0 + 2021-01-23 12:11:49,400 : INFO : using serial LDA version on this node + 2021-01-23 12:11:49,402 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:11:49,402 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 160 - 2 + 1 + +.. GENERATED FROM PYTHON SOURCE LINES 96-109 Tuning ------ Different from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters. -You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around +You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. -until you get as many topics as you desire, which however may reduce their quality. +Play around until you get as many topics as you desire, which however may reduce their quality. If your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough. Having an epsilon that is smaller than the smallest distance doesn't make sense. -Make sure to chose one that is within the range of values. +Make sure to chose one that is within the range of values in ``asymmetric_distance_matrix``. +.. GENERATED FROM PYTHON SOURCE LINES 109-119 .. code-block:: default @@ -224,7 +257,7 @@ Make sure to chose one that is within the range of values. 
import numpy as np shape = ensemble.asymmetric_distance_matrix.shape without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) - without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() + print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()) ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) @@ -240,21 +273,24 @@ Make sure to chose one that is within the range of values. .. code-block:: none - 2020-10-25 11:28:17,675 : INFO : fitting the clustering model - 2020-10-25 11:28:17,687 : INFO : generating stable topics - 2020-10-25 11:28:17,688 : INFO : found 33 clusters - 2020-10-25 11:28:17,690 : INFO : found 4 stable topics - 2020-10-25 11:28:17,691 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 11:28:17,692 : INFO : using symmetric alpha at 0.25 - 2020-10-25 11:28:17,692 : INFO : using symmetric eta at 0.25 - 2020-10-25 11:28:17,694 : INFO : using serial LDA version on this node - 2020-10-25 11:28:17,698 : INFO : running online (multi-pass) LDA training, 4 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:28:17,698 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 0.07280884735152338 0.12813543590172502 0.2606207782381612 + 2021-01-23 12:11:49,575 : INFO : fitting the clustering model + 2021-01-23 12:11:49,589 : INFO : generating stable topics + 2021-01-23 12:11:49,589 : INFO : found 35 clusters + 2021-01-23 12:11:49,593 : INFO : found 4 stable topics + 2021-01-23 12:11:49,593 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-23 12:11:49,594 : INFO : using symmetric alpha at 0.25 + 2021-01-23 12:11:49,594 : INFO : using symmetric eta at 0.25 + 2021-01-23 12:11:49,596 : INFO : using serial LDA version on this node + 2021-01-23 12:11:49,602 : INFO : running online (multi-pass) LDA training, 4 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:11:49,603 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 4 +.. GENERATED FROM PYTHON SOURCE LINES 120-131 + Increasing the Size ------------------- @@ -267,6 +303,7 @@ entirely made out of your existing topic models with the following method. Afterwards the number and quality of stable topics might be different depending on your added topics and parameters. +.. GENERATED FROM PYTHON SOURCE LINES 131-156 .. code-block:: default @@ -305,79 +342,79 @@ Afterwards the number and quality of stable topics might be different depending .. 
code-block:: none - 2020-10-25 11:28:17,999 : INFO : using symmetric alpha at 0.1111111111111111 - 2020-10-25 11:28:18,000 : INFO : using symmetric eta at 0.1111111111111111 - 2020-10-25 11:28:18,002 : INFO : using serial LDA version on this node - 2020-10-25 11:28:18,013 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:28:18,013 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2020-10-25 11:28:18,013 : INFO : training LDA model using 11 processes - 2020-10-25 11:28:18,129 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 11:28:22,666 : INFO : topic #0 (0.111): 0.001*"km" + 0.001*"band" + 0.001*"league" + 0.001*"album" + 0.001*"season" + 0.001*"emperor" + 0.001*"y" + 0.001*"actor" + 0.001*"ball" + 0.001*"machine" - 2020-10-25 11:28:22,666 : INFO : topic #4 (0.111): 0.001*"league" + 0.001*"jewish" + 0.001*"band" + 0.001*"software" + 0.001*"emperor" + 0.001*"season" + 0.001*"album" + 0.001*"minister" + 0.001*"israel" + 0.001*"australia" - 2020-10-25 11:28:22,666 : INFO : topic #3 (0.111): 0.001*"jewish" + 0.001*"actor" + 0.001*"y" + 0.001*"animal" + 0.001*"energy" + 0.001*"election" + 0.001*"km" + 0.001*"lake" + 0.001*"emperor" + 0.001*"cell" - 2020-10-25 11:28:22,667 : INFO : topic #7 (0.111): 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"cell" + 0.001*"soviet" + 0.001*"energy" + 0.001*"actor" + 0.001*"russian" + 0.001*"emperor" + 0.001*"band" - 2020-10-25 11:28:22,667 : INFO : topic #6 (0.111): 0.001*"soviet" + 0.001*"blue" + 0.001*"actor" + 0.001*"italian" + 0.001*"economy" + 0.001*"car" + 0.001*"animal" + 0.001*"km" + 0.001*"channel" + 0.001*"energy" - 2020-10-25 11:28:22,667 : INFO : topic diff=1.009925, rho=1.000000 - 2020-10-25 11:28:33,290 : INFO : -9.255 per-word bound, 611.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:28:33,290 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 11:28:37,864 : INFO : topic #1 (0.111): 0.002*"minister" + 0.002*"km" + 0.002*"election" + 0.002*"est" + 0.002*"economy" + 0.001*"actor" + 0.001*"male" + 0.001*"india" + 0.001*"china" + 0.001*"elected" - 2020-10-25 11:28:37,865 : INFO : topic #8 (0.111): 0.001*"software" + 0.001*"user" + 0.001*"apollo" + 0.001*"metal" + 0.001*"cell" + 0.001*"machine" + 0.001*"actor" + 0.001*"soviet" + 0.001*"lincoln" + 0.001*"disk" - 2020-10-25 11:28:37,865 : INFO : topic #3 (0.111): 0.001*"y" + 0.001*"animal" + 0.001*"lake" + 0.001*"energy" + 0.001*"jewish" + 0.001*"window" + 0.001*"aircraft" + 0.001*"cell" + 0.001*"memory" + 0.001*"machine" - 2020-10-25 11:28:37,865 : INFO : topic #7 (0.111): 0.002*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"soviet" + 0.001*"cell" + 0.001*"est" + 0.001*"energy" + 0.001*"economy" + 0.001*"parliament" + 0.001*"russian" - 2020-10-25 11:28:37,865 : INFO : topic #4 (0.111): 0.002*"league" + 0.001*"season" + 0.001*"jewish" + 0.001*"israel" + 0.001*"emperor" + 0.001*"baseball" + 0.001*"band" + 0.001*"software" + 0.001*"album" + 0.001*"irish" - 2020-10-25 11:28:37,865 : INFO : topic diff=0.150801, rho=0.592297 - 2020-10-25 11:28:48,377 : INFO : -9.186 per-word bound, 582.5 perplexity estimate based on a held-out corpus of 1701 
documents with 4692704 words - 2020-10-25 11:28:48,378 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 11:28:52,718 : INFO : topic #4 (0.111): 0.003*"league" + 0.002*"season" + 0.002*"jewish" + 0.001*"baseball" + 0.001*"israel" + 0.001*"emperor" + 0.001*"band" + 0.001*"irish" + 0.001*"jew" + 0.001*"hebrew" - 2020-10-25 11:28:52,718 : INFO : topic #2 (0.111): 0.002*"album" + 0.002*"band" + 0.002*"chinese" + 0.001*"love" + 0.001*"jew" + 0.001*"japanese" + 0.001*"china" + 0.001*"artist" + 0.001*"jewish" + 0.001*"actor" - 2020-10-25 11:28:52,718 : INFO : topic #8 (0.111): 0.002*"software" + 0.002*"apollo" + 0.001*"user" + 0.001*"disk" + 0.001*"cell" + 0.001*"metal" + 0.001*"lincoln" + 0.001*"machine" + 0.001*"alexander" + 0.001*"bit" - 2020-10-25 11:28:52,718 : INFO : topic #3 (0.111): 0.002*"y" + 0.001*"window" + 0.001*"aircraft" + 0.001*"lake" + 0.001*"machine" + 0.001*"energy" + 0.001*"animal" + 0.001*"bit" + 0.001*"memory" + 0.001*"user" - 2020-10-25 11:28:52,719 : INFO : topic #5 (0.111): 0.001*"cell" + 0.001*"japanese" + 0.001*"japan" + 0.001*"energy" + 0.001*"software" + 0.001*"emperor" + 0.001*"blood" + 0.001*"bc" + 0.001*"actor" + 0.001*"brain" - 2020-10-25 11:28:52,719 : INFO : topic diff=0.173393, rho=0.509614 - 2020-10-25 11:29:03,210 : INFO : -9.128 per-word bound, 559.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:29:03,210 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 11:29:07,566 : INFO : topic #3 (0.111): 0.002*"y" + 0.001*"aircraft" + 0.001*"window" + 0.001*"user" + 0.001*"bit" + 0.001*"machine" + 0.001*"memory" + 0.001*"internet" + 0.001*"energy" + 0.001*"lake" - 2020-10-25 11:29:07,566 : INFO : topic #7 (0.111): 0.002*"soviet" + 0.002*"minister" + 0.002*"israel" + 0.002*"election" + 0.002*"parliament" + 0.002*"emperor" + 0.001*"israeli" + 0.001*"cell" + 0.001*"russian" + 0.001*"lebanon" - 2020-10-25 11:29:07,566 : INFO : topic #0 (0.111): 0.003*"ball" + 0.002*"import" + 0.002*"brown" + 0.001*"band" + 0.001*"cat" + 0.001*"iraq" + 0.001*"km" + 0.001*"album" + 0.001*"california" + 0.001*"football" - 2020-10-25 11:29:07,566 : INFO : topic #1 (0.111): 0.003*"est" + 0.003*"km" + 0.003*"minister" + 0.003*"election" + 0.002*"economy" + 0.002*"actor" + 0.002*"india" + 0.002*"male" + 0.002*"china" + 0.002*"constitution" - 2020-10-25 11:29:07,567 : INFO : topic #6 (0.111): 0.002*"blue" + 0.002*"car" + 0.002*"engine" + 0.001*"soviet" + 0.001*"channel" + 0.001*"flag" + 0.001*"microsoft" + 0.001*"county" + 0.001*"director" + 0.001*"communist" - 2020-10-25 11:29:07,567 : INFO : topic diff=0.148814, rho=0.454053 - 2020-10-25 11:29:18,053 : INFO : -9.094 per-word bound, 546.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:29:18,119 : INFO : using symmetric alpha at 0.09090909090909091 - 2020-10-25 11:29:18,119 : INFO : using symmetric eta at 0.09090909090909091 - 2020-10-25 11:29:18,121 : INFO : using serial LDA version on this node - 2020-10-25 11:29:18,134 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:29:18,134 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to 
improve accuracy - 2020-10-25 11:29:28,764 : INFO : -10.500 per-word bound, 1448.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:29:28,765 : INFO : PROGRESS: pass 0, at document #1701/1701 - 2020-10-25 11:29:31,978 : INFO : topic #4 (0.091): 0.002*"software" + 0.001*"japanese" + 0.001*"km" + 0.001*"china" + 0.001*"japan" + 0.001*"love" + 0.001*"chinese" + 0.001*"y" + 0.001*"soviet" + 0.001*"emperor" - 2020-10-25 11:29:31,979 : INFO : topic #8 (0.091): 0.001*"actor" + 0.001*"minister" + 0.001*"card" + 0.001*"band" + 0.001*"map" + 0.001*"car" + 0.001*"muslim" + 0.001*"california" + 0.001*"soviet" + 0.001*"lord" - 2020-10-25 11:29:31,979 : INFO : topic #1 (0.091): 0.001*"km" + 0.001*"soviet" + 0.001*"album" + 0.001*"minister" + 0.001*"cell" + 0.001*"energy" + 0.001*"jewish" + 0.001*"band" + 0.001*"jew" + 0.001*"actor" - 2020-10-25 11:29:31,979 : INFO : topic #2 (0.091): 0.001*"election" + 0.001*"km" + 0.001*"band" + 0.001*"energy" + 0.001*"cell" + 0.001*"soviet" + 0.001*"minister" + 0.001*"china" + 0.001*"map" + 0.001*"australia" - 2020-10-25 11:29:31,980 : INFO : topic #10 (0.091): 0.001*"actor" + 0.001*"league" + 0.001*"jewish" + 0.001*"km" + 0.001*"israel" + 0.001*"bc" + 0.001*"y" + 0.001*"ship" + 0.001*"energy" + 0.001*"minister" - 2020-10-25 11:29:31,980 : INFO : topic diff=1.023741, rho=1.000000 - 2020-10-25 11:29:42,609 : INFO : -9.277 per-word bound, 620.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:29:42,609 : INFO : PROGRESS: pass 1, at document #1701/1701 - 2020-10-25 11:29:45,801 : INFO : topic #9 (0.091): 0.002*"km" + 0.001*"election" + 0.001*"economy" + 0.001*"minister" + 0.001*"est" + 0.001*"oil" + 0.001*"spanish" + 0.001*"male" + 0.001*"liberal" + 0.001*"flag" - 2020-10-25 11:29:45,802 : INFO : topic #2 (0.091): 0.001*"energy" + 0.001*"cell" + 0.001*"election" + 0.001*"band" + 0.001*"map" + 0.001*"km" + 0.001*"specie" + 0.001*"australia" + 0.001*"metal" + 0.001*"drug" - 2020-10-25 11:29:45,802 : INFO : topic #6 (0.091): 0.001*"actor" + 0.001*"soviet" + 0.001*"wikipedia" + 0.001*"cell" + 0.001*"jewish" + 0.001*"italian" + 0.001*"football" + 0.001*"band" + 0.001*"acid" + 0.001*"russian" - 2020-10-25 11:29:45,802 : INFO : topic #4 (0.091): 0.002*"software" + 0.001*"japanese" + 0.001*"japan" + 0.001*"y" + 0.001*"apple" + 0.001*"love" + 0.001*"china" + 0.001*"chinese" + 0.001*"philosophy" + 0.001*"classical" - 2020-10-25 11:29:45,803 : INFO : topic #7 (0.091): 0.002*"emperor" + 0.002*"minister" + 0.001*"male" + 0.001*"saint" + 0.001*"constitution" + 0.001*"jewish" + 0.001*"election" + 0.001*"female" + 0.001*"actor" + 0.001*"jesus" - 2020-10-25 11:29:45,803 : INFO : topic diff=0.168078, rho=0.577350 - 2020-10-25 11:29:45,803 : INFO : ensemble contains 9 models and 160 topics now - 2020-10-25 11:29:45,809 : INFO : ensemble contains 10 models and 169 topics now - 2020-10-25 11:29:45,815 : INFO : asymmetric distance matrix is outdated due to add_model - 2020-10-25 11:29:45,815 : INFO : generating a 180 x 180 asymmetric distance matrix... 
- 2020-10-25 11:29:46,856 : INFO : fitting the clustering model, using 5 for min_samples - 2020-10-25 11:29:46,890 : INFO : generating stable topics, using 3 for min_cores - 2020-10-25 11:29:46,890 : INFO : found 18 clusters - 2020-10-25 11:29:46,899 : INFO : found 1 stable topics - 2020-10-25 11:29:46,899 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 11:29:46,900 : INFO : using symmetric alpha at 1.0 - 2020-10-25 11:29:46,900 : INFO : using symmetric eta at 1.0 - 2020-10-25 11:29:46,903 : INFO : using serial LDA version on this node - 2020-10-25 11:29:46,906 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:29:46,906 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 12:11:49,879 : INFO : using symmetric alpha at 0.1111111111111111 + 2021-01-23 12:11:49,879 : INFO : using symmetric eta at 0.1111111111111111 + 2021-01-23 12:11:49,881 : INFO : using serial LDA version on this node + 2021-01-23 12:11:49,895 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 14000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:11:49,895 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 12:11:49,896 : INFO : training LDA model using 7 processes + 2021-01-23 12:11:49,969 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 12:11:57,803 : INFO : topic #6 (0.111): 0.001*"emperor" + 0.001*"cell" + 0.001*"minister" + 0.001*"energy" + 0.001*"chinese" + 0.001*"female" + 0.001*"animal" + 0.001*"award" + 0.001*"spanish" + 0.001*"season" + 2021-01-23 12:11:57,803 : INFO : topic #1 (0.111): 0.001*"band" + 0.001*"league" + 0.001*"km" + 0.001*"car" + 0.001*"india" + 0.001*"bc" + 0.001*"actor" + 0.001*"jewish" + 0.001*"album" + 0.001*"minister" + 2021-01-23 12:11:57,804 : INFO : topic #3 (0.111): 0.001*"actor" + 0.001*"band" + 0.001*"novel" + 0.001*"election" + 0.001*"album" + 0.001*"km" + 0.001*"chinese" + 0.001*"energy" + 0.001*"russian" + 0.001*"male" + 2021-01-23 12:11:57,804 : INFO : topic #5 (0.111): 0.001*"soviet" + 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"lord" + 0.001*"software" + 0.001*"plant" + 0.001*"african" + 0.001*"territory" + 0.001*"est" + 2021-01-23 12:11:57,805 : INFO : topic #8 (0.111): 0.001*"km" + 0.001*"soviet" + 0.001*"africa" + 0.001*"japanese" + 0.001*"chinese" + 0.001*"energy" + 0.001*"minister" + 0.001*"band" + 0.001*"user" + 0.001*"china" + 2021-01-23 12:11:57,805 : INFO : topic diff=1.009632, rho=1.000000 + 2021-01-23 12:12:18,557 : INFO : -9.256 per-word bound, 611.4 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:12:18,557 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 12:12:26,238 : INFO : topic #4 (0.111): 0.001*"league" + 0.001*"ball" + 0.001*"minister" + 0.001*"cell" + 0.001*"engine" + 0.001*"bc" + 0.001*"soviet" + 0.001*"software" + 0.001*"election" + 0.001*"album" + 2021-01-23 12:12:26,239 : INFO : 
topic #1 (0.111): 0.001*"league" + 0.001*"band" + 0.001*"car" + 0.001*"window" + 0.001*"album" + 0.001*"microsoft" + 0.001*"km" + 0.001*"india" + 0.001*"ball" + 0.001*"season" + 2021-01-23 12:12:26,240 : INFO : topic #6 (0.111): 0.002*"emperor" + 0.001*"cell" + 0.001*"jewish" + 0.001*"chinese" + 0.001*"animal" + 0.001*"energy" + 0.001*"award" + 0.001*"fiction" + 0.001*"orthodox" + 0.001*"japanese" + 2021-01-23 12:12:26,241 : INFO : topic #5 (0.111): 0.002*"soviet" + 0.001*"election" + 0.001*"minister" + 0.001*"km" + 0.001*"est" + 0.001*"territory" + 0.001*"lord" + 0.001*"economy" + 0.001*"african" + 0.001*"plant" + 2021-01-23 12:12:26,241 : INFO : topic #8 (0.111): 0.002*"km" + 0.001*"est" + 0.001*"africa" + 0.001*"chinese" + 0.001*"japanese" + 0.001*"energy" + 0.001*"election" + 0.001*"minister" + 0.001*"economy" + 0.001*"y" + 2021-01-23 12:12:26,242 : INFO : topic diff=0.148485, rho=0.592297 + 2021-01-23 12:12:46,644 : INFO : -9.193 per-word bound, 585.4 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:12:46,645 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 12:12:54,251 : INFO : topic #6 (0.111): 0.002*"emperor" + 0.002*"jewish" + 0.001*"cell" + 0.001*"jesus" + 0.001*"orthodox" + 0.001*"christ" + 0.001*"animal" + 0.001*"jew" + 0.001*"fiction" + 0.001*"chinese" + 2021-01-23 12:12:54,252 : INFO : topic #5 (0.111): 0.003*"soviet" + 0.002*"election" + 0.002*"minister" + 0.001*"km" + 0.001*"economy" + 0.001*"territory" + 0.001*"parliament" + 0.001*"est" + 0.001*"lord" + 0.001*"elected" + 2021-01-23 12:12:54,253 : INFO : topic #2 (0.111): 0.002*"actor" + 0.001*"love" + 0.001*"lincoln" + 0.001*"mary" + 0.001*"emperor" + 0.001*"bc" + 0.001*"album" + 0.001*"energy" + 0.001*"band" + 0.001*"ford" + 2021-01-23 12:12:54,253 : INFO : topic #3 (0.111): 0.002*"band" + 0.002*"album" + 0.001*"bass" + 0.001*"instrument" + 0.001*"novel" + 0.001*"actor" + 0.001*"blue" + 0.001*"card" + 0.001*"kong" + 0.001*"chinese" + 2021-01-23 12:12:54,254 : INFO : topic #7 (0.111): 0.002*"software" + 0.001*"horse" + 0.001*"y" + 0.001*"blue" + 0.001*"file" + 0.001*"actor" + 0.001*"moon" + 0.001*"apollo" + 0.001*"video" + 0.001*"season" + 2021-01-23 12:12:54,254 : INFO : topic diff=0.174918, rho=0.509614 + 2021-01-23 12:13:15,179 : INFO : -9.137 per-word bound, 563.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:13:15,179 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 12:13:22,431 : INFO : topic #2 (0.111): 0.002*"actor" + 0.002*"love" + 0.001*"lincoln" + 0.001*"mary" + 0.001*"henry" + 0.001*"emperor" + 0.001*"bc" + 0.001*"ford" + 0.001*"louis" + 0.001*"singer" + 2021-01-23 12:13:22,431 : INFO : topic #7 (0.111): 0.002*"software" + 0.002*"y" + 0.001*"horse" + 0.001*"file" + 0.001*"moon" + 0.001*"blue" + 0.001*"apollo" + 0.001*"video" + 0.001*"user" + 0.001*"color" + 2021-01-23 12:13:22,432 : INFO : topic #0 (0.111): 0.002*"india" + 0.002*"actor" + 0.002*"aircraft" + 0.002*"import" + 0.002*"km" + 0.002*"ship" + 0.002*"minister" + 0.002*"italian" + 0.002*"irish" + 0.001*"indian" + 2021-01-23 12:13:22,433 : INFO : topic #5 (0.111): 0.003*"soviet" + 0.002*"election" + 0.002*"minister" + 0.002*"economy" + 0.002*"parliament" + 0.002*"km" + 0.001*"territory" + 0.001*"liberal" + 0.001*"vote" + 0.001*"elected" + 2021-01-23 12:13:22,433 : INFO : topic #8 (0.111): 0.003*"km" + 0.002*"est" + 
0.002*"africa" + 0.002*"chinese" + 0.002*"economy" + 0.002*"election" + 0.002*"energy" + 0.001*"lake" + 0.001*"male" + 0.001*"african" + 2021-01-23 12:13:22,434 : INFO : topic diff=0.152037, rho=0.454053 + 2021-01-23 12:13:42,802 : INFO : -9.102 per-word bound, 549.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:13:42,889 : INFO : using symmetric alpha at 0.09090909090909091 + 2021-01-23 12:13:42,890 : INFO : using symmetric eta at 0.09090909090909091 + 2021-01-23 12:13:42,895 : INFO : using serial LDA version on this node + 2021-01-23 12:13:42,928 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:13:42,929 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 12:14:03,840 : INFO : -10.500 per-word bound, 1447.8 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:14:03,840 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2021-01-23 12:14:09,469 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"emperor" + 0.001*"band" + 0.001*"india" + 0.001*"energy" + 0.001*"actor" + 0.001*"indian" + 0.001*"canadian" + 0.001*"minister" + 0.001*"italian" + 2021-01-23 12:14:09,470 : INFO : topic #4 (0.091): 0.001*"import" + 0.001*"minister" + 0.001*"bc" + 0.001*"lord" + 0.001*"ball" + 0.001*"km" + 0.001*"chinese" + 0.001*"season" + 0.001*"internet" + 0.001*"japanese" + 2021-01-23 12:14:09,470 : INFO : topic #5 (0.091): 0.001*"km" + 0.001*"chinese" + 0.001*"cell" + 0.001*"actor" + 0.001*"israel" + 0.001*"election" + 0.001*"minister" + 0.001*"china" + 0.001*"economy" + 0.001*"energy" + 2021-01-23 12:14:09,471 : INFO : topic #1 (0.091): 0.001*"lake" + 0.001*"y" + 0.001*"soviet" + 0.001*"irish" + 0.001*"km" + 0.001*"actor" + 0.001*"league" + 0.001*"band" + 0.001*"male" + 0.001*"jewish" + 2021-01-23 12:14:09,471 : INFO : topic #6 (0.091): 0.001*"cell" + 0.001*"actor" + 0.001*"emperor" + 0.001*"election" + 0.001*"album" + 0.001*"band" + 0.001*"lake" + 0.001*"china" + 0.001*"minister" + 0.001*"la" + 2021-01-23 12:14:09,472 : INFO : topic diff=1.023212, rho=1.000000 + 2021-01-23 12:14:30,022 : INFO : -9.278 per-word bound, 620.9 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:14:30,023 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2021-01-23 12:14:35,544 : INFO : topic #7 (0.091): 0.001*"emperor" + 0.001*"blue" + 0.001*"bear" + 0.001*"soviet" + 0.001*"actor" + 0.001*"alexander" + 0.001*"constitution" + 0.001*"bc" + 0.001*"mary" + 0.001*"county" + 2021-01-23 12:14:35,545 : INFO : topic #4 (0.091): 0.002*"import" + 0.001*"lord" + 0.001*"ball" + 0.001*"bc" + 0.001*"internet" + 0.001*"chinese" + 0.001*"season" + 0.001*"japanese" + 0.001*"jewish" + 0.001*"carbon" + 2021-01-23 12:14:35,545 : INFO : topic #3 (0.091): 0.002*"band" + 0.001*"india" + 0.001*"instrument" + 0.001*"soviet" + 0.001*"emperor" + 0.001*"actor" + 0.001*"bass" + 0.001*"indian" + 0.001*"album" + 0.001*"canadian" + 2021-01-23 12:14:35,546 : INFO : topic #8 (0.091): 0.002*"jewish" + 0.002*"ball" + 0.002*"jew" + 0.002*"actor" + 0.001*"soviet" + 0.001*"minister" + 0.001*"football" + 0.001*"russian" + 0.001*"australian" + 0.001*"japanese" + 2021-01-23 12:14:35,547 : INFO : topic #5 
(0.091): 0.001*"km" + 0.001*"chinese" + 0.001*"israel" + 0.001*"energy" + 0.001*"flag" + 0.001*"moon" + 0.001*"cell" + 0.001*"speed" + 0.001*"china" + 0.001*"software" + 2021-01-23 12:14:35,547 : INFO : topic diff=0.163866, rho=0.577350 + 2021-01-23 12:14:35,548 : INFO : ensemble contains 9 models and 160 topics now + 2021-01-23 12:14:35,557 : INFO : ensemble contains 10 models and 169 topics now + 2021-01-23 12:14:35,564 : INFO : asymmetric distance matrix is outdated due to add_model + 2021-01-23 12:14:35,565 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2021-01-23 12:14:37,408 : INFO : fitting the clustering model, using 5 for min_samples + 2021-01-23 12:14:37,438 : INFO : generating stable topics, using 3 for min_cores + 2021-01-23 12:14:37,438 : INFO : found 21 clusters + 2021-01-23 12:14:37,442 : INFO : found 1 stable topics + 2021-01-23 12:14:37,442 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-23 12:14:37,444 : INFO : using symmetric alpha at 1.0 + 2021-01-23 12:14:37,444 : INFO : using symmetric eta at 1.0 + 2021-01-23 12:14:37,446 : INFO : using serial LDA version on this node + 2021-01-23 12:14:37,448 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:14:37,448 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 180 1 @@ -387,9 +424,9 @@ Afterwards the number and quality of stable topics might be different depending .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 6 minutes 53.294 seconds) + **Total running time of the script:** ( 12 minutes 22.200 seconds) -**Estimated memory usage:** 1629 MB +**Estimated memory usage:** 1637 MB .. _sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py: diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index b4845ff578..9ac6299492 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -5,10 +5,10 @@ Computation times ================= -**06:53.294** total execution time for **auto_examples_tutorials** files: +**12:22.200** total execution time for **auto_examples_tutorials** files: +-------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 06:53.294 | 1628.6 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 12:22.200 | 1637.2 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index bde40f98ea..086f65fc44 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -13,7 +13,7 @@ # This tutorial will explain how to use the EnsembleLDA model class. 
diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py
index bde40f98ea..086f65fc44 100644
--- a/docs/src/gallery/tutorials/run_ensemblelda.py
+++ b/docs/src/gallery/tutorials/run_ensemblelda.py
@@ -13,7 +13,7 @@
 # This tutorial will explain how to use the EnsembleLDA model class.
 #
 # EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models,
-# it can therefore be used to remove topics from your results that are noise and are not reproducible.
+# it can be used to remove topics from your results that are noise and are not reproducible.
 #

 ###############################################################################
@@ -98,19 +98,19 @@
 #
 # Different from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.
 #
-# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around
+# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor.
 #
-# until you get as many topics as you desire, which however may reduce their quality.
+# Play around until you get as many topics as you desire, which, however, may reduce their quality.
 # If your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough.
 #
 # Having an epsilon that is smaller than the smallest distance doesn't make sense.
-# Make sure to chose one that is within the range of values.
+# Make sure to choose one that is within the range of values in ``asymmetric_distance_matrix``.
 #

 import numpy as np
 shape = ensemble.asymmetric_distance_matrix.shape
 without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)
-without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()
+print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max())

 ensemble.recluster(eps=0.09, min_samples=2, min_cores=2)

diff --git a/gensim/downloader.py b/gensim/downloader.py
index 8a4395440e..47061ec793 100644
--- a/gensim/downloader.py
+++ b/gensim/downloader.py
@@ -357,6 +357,7 @@ def _download(name):
         If md5sum on client and in repo are different.

     """
+    print('_download')
     url_load_file = "{base}/{fname}/__init__.py".format(base=DOWNLOAD_BASE_URL, fname=name)
     data_folder_dir = os.path.join(BASE_DIR, name)
     data_folder_dir_tmp = data_folder_dir + '_tmp'
@@ -372,6 +373,8 @@ def _download(name):
             fname = "{f}.gz_0{p}".format(f=name, p=part)
             dst_path = os.path.join(tmp_dir, fname)
+            print('downloading', url_data)
+            print()
             urllib.urlretrieve(
                 url_data, dst_path,
                 reporthook=partial(_progress, part=part, total_parts=total_parts)
@@ -393,6 +396,8 @@ def _download(name):
         url_data = "{base}/{fname}/{fname}.gz".format(base=DOWNLOAD_BASE_URL, fname=name)
         fname = "{fname}.gz".format(fname=name)
         dst_path = os.path.join(tmp_dir, fname)
+        print('downloading', url_data, 'to', dst_path, data_folder_dir)
+        print()
        urllib.urlretrieve(url_data, dst_path, reporthook=_progress)
        if _calculate_md5_checksum(dst_path) == _get_checksum(name):
            sys.stdout.write("\n")

From 847874a92e2055f65c5b9482003b99b5f9d77842 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 23 Jan 2021 15:29:02 +0100
Subject: [PATCH 125/166] attempt at fixing that pickle problem

---
 .../tutorials/run_ensemblelda.rst             | 208 +++++++++---------
 .../tutorials/sg_execution_times.rst          |   4 +-
 2 files changed, 106 insertions(+), 106 deletions(-)

diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst
index 4ff2735a69..9b959482d9 100644
--- a/docs/src/auto_examples/tutorials/run_ensemblelda.rst
+++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst
@@ -82,11 +82,11 @@ so it won't be explained again in detail.
 .. code-block:: none

-    2021-01-23 12:02:54,189 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
-    2021-01-23 12:03:00,997 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions)
-    2021-01-23 12:03:01,199 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]...
-    2021-01-23 12:03:01,199 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents
-    2021-01-23 12:03:01,288 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...)
+    2021-01-23 15:15:53,986 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
+    2021-01-23 15:16:00,703 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions)
+    2021-01-23 15:16:00,901 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]...
+    2021-01-23 15:16:00,901 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents
+    2021-01-23 15:16:00,984 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...)

@@ -215,18 +215,18 @@ The number of stable topics which are clustered from all those topics is smaller

 .. code-block:: none

-    2021-01-23 12:03:05,136 : INFO : generating 8 topic models...
-    2021-01-23 12:11:47,694 : INFO : generating a 160 x 160 asymmetric distance matrix...
-    2021-01-23 12:11:49,175 : INFO : fitting the clustering model, using 4 for min_samples
-    2021-01-23 12:11:49,206 : INFO : generating stable topics, using 3 for min_cores
-    2021-01-23 12:11:49,206 : INFO : found 26 clusters
-    2021-01-23 12:11:49,209 : INFO : found 1 stable topics
-    2021-01-23 12:11:49,209 : INFO : generating classic gensim model representation based on results from the ensemble
-    2021-01-23 12:11:49,398 : INFO : using symmetric alpha at 1.0
-    2021-01-23 12:11:49,398 : INFO : using symmetric eta at 1.0
-    2021-01-23 12:11:49,400 : INFO : using serial LDA version on this node
-    2021-01-23 12:11:49,402 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000
-    2021-01-23 12:11:49,402 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
+    2021-01-23 15:16:04,680 : INFO : generating 8 topic models...
+    2021-01-23 15:24:25,880 : INFO : generating a 160 x 160 asymmetric distance matrix...
+    2021-01-23 15:24:27,354 : INFO : fitting the clustering model, using 4 for min_samples
+    2021-01-23 15:24:27,387 : INFO : generating stable topics, using 3 for min_cores
+    2021-01-23 15:24:27,387 : INFO : found 25 clusters
+    2021-01-23 15:24:27,391 : INFO : found 1 stable topics
+    2021-01-23 15:24:27,391 : INFO : generating classic gensim model representation based on results from the ensemble
+    2021-01-23 15:24:27,592 : INFO : using symmetric alpha at 1.0
+    2021-01-23 15:24:27,592 : INFO : using symmetric eta at 1.0
+    2021-01-23 15:24:27,594 : INFO : using serial LDA version on this node
+    2021-01-23 15:24:27,596 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000
+    2021-01-23 15:24:27,596 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
     160
     1

@@ -273,18 +273,18 @@ Make sure to choose one that is within the range of values in ``asymmetric_distan

 .. code-block:: none

-    0.07280884735152338 0.12813543590172502 0.2606207782381612
-    2021-01-23 12:11:49,575 : INFO : fitting the clustering model
-    2021-01-23 12:11:49,589 : INFO : generating stable topics
-    2021-01-23 12:11:49,589 : INFO : found 35 clusters
-    2021-01-23 12:11:49,593 : INFO : found 4 stable topics
-    2021-01-23 12:11:49,593 : INFO : generating classic gensim model representation based on results from the ensemble
-    2021-01-23 12:11:49,594 : INFO : using symmetric alpha at 0.25
-    2021-01-23 12:11:49,594 : INFO : using symmetric eta at 0.25
-    2021-01-23 12:11:49,596 : INFO : using serial LDA version on this node
-    2021-01-23 12:11:49,602 : INFO : running online (multi-pass) LDA training, 4 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000
-    2021-01-23 12:11:49,603 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
-    4
+    0.0664596363067147 0.12630570073794478 0.2773686345664633
+    2021-01-23 15:24:27,763 : INFO : fitting the clustering model
+    2021-01-23 15:24:27,777 : INFO : generating stable topics
+    2021-01-23 15:24:27,777 : INFO : found 22 clusters
+    2021-01-23 15:24:27,782 : INFO : found 3 stable topics
+    2021-01-23 15:24:27,782 : INFO : generating classic gensim model representation based on results from the ensemble
+    2021-01-23 15:24:27,784 : INFO : using symmetric alpha at 0.3333333333333333
+    2021-01-23 15:24:27,784 : INFO : using symmetric eta at 0.3333333333333333
+    2021-01-23 15:24:27,785 : INFO : using serial LDA version on this node
+    2021-01-23 15:24:27,790 : INFO : running online (multi-pass) LDA training, 3 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000
+    2021-01-23 15:24:27,790 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
+    3

@@ -342,79 +342,79 @@ Afterwards the number and quality of stable topics might be different depending

 ..
code-block:: none - 2021-01-23 12:11:49,879 : INFO : using symmetric alpha at 0.1111111111111111 - 2021-01-23 12:11:49,879 : INFO : using symmetric eta at 0.1111111111111111 - 2021-01-23 12:11:49,881 : INFO : using serial LDA version on this node - 2021-01-23 12:11:49,895 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 14000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 12:11:49,895 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2021-01-23 12:11:49,896 : INFO : training LDA model using 7 processes - 2021-01-23 12:11:49,969 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 12:11:57,803 : INFO : topic #6 (0.111): 0.001*"emperor" + 0.001*"cell" + 0.001*"minister" + 0.001*"energy" + 0.001*"chinese" + 0.001*"female" + 0.001*"animal" + 0.001*"award" + 0.001*"spanish" + 0.001*"season" - 2021-01-23 12:11:57,803 : INFO : topic #1 (0.111): 0.001*"band" + 0.001*"league" + 0.001*"km" + 0.001*"car" + 0.001*"india" + 0.001*"bc" + 0.001*"actor" + 0.001*"jewish" + 0.001*"album" + 0.001*"minister" - 2021-01-23 12:11:57,804 : INFO : topic #3 (0.111): 0.001*"actor" + 0.001*"band" + 0.001*"novel" + 0.001*"election" + 0.001*"album" + 0.001*"km" + 0.001*"chinese" + 0.001*"energy" + 0.001*"russian" + 0.001*"male" - 2021-01-23 12:11:57,804 : INFO : topic #5 (0.111): 0.001*"soviet" + 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"lord" + 0.001*"software" + 0.001*"plant" + 0.001*"african" + 0.001*"territory" + 0.001*"est" - 2021-01-23 12:11:57,805 : INFO : topic #8 (0.111): 0.001*"km" + 0.001*"soviet" + 0.001*"africa" + 0.001*"japanese" + 0.001*"chinese" + 0.001*"energy" + 0.001*"minister" + 0.001*"band" + 0.001*"user" + 0.001*"china" - 2021-01-23 12:11:57,805 : INFO : topic diff=1.009632, rho=1.000000 - 2021-01-23 12:12:18,557 : INFO : -9.256 per-word bound, 611.4 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:12:18,557 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 12:12:26,238 : INFO : topic #4 (0.111): 0.001*"league" + 0.001*"ball" + 0.001*"minister" + 0.001*"cell" + 0.001*"engine" + 0.001*"bc" + 0.001*"soviet" + 0.001*"software" + 0.001*"election" + 0.001*"album" - 2021-01-23 12:12:26,239 : INFO : topic #1 (0.111): 0.001*"league" + 0.001*"band" + 0.001*"car" + 0.001*"window" + 0.001*"album" + 0.001*"microsoft" + 0.001*"km" + 0.001*"india" + 0.001*"ball" + 0.001*"season" - 2021-01-23 12:12:26,240 : INFO : topic #6 (0.111): 0.002*"emperor" + 0.001*"cell" + 0.001*"jewish" + 0.001*"chinese" + 0.001*"animal" + 0.001*"energy" + 0.001*"award" + 0.001*"fiction" + 0.001*"orthodox" + 0.001*"japanese" - 2021-01-23 12:12:26,241 : INFO : topic #5 (0.111): 0.002*"soviet" + 0.001*"election" + 0.001*"minister" + 0.001*"km" + 0.001*"est" + 0.001*"territory" + 0.001*"lord" + 0.001*"economy" + 0.001*"african" + 0.001*"plant" - 2021-01-23 12:12:26,241 : INFO : topic #8 (0.111): 0.002*"km" + 0.001*"est" + 0.001*"africa" + 0.001*"chinese" + 0.001*"japanese" + 0.001*"energy" + 0.001*"election" + 0.001*"minister" + 0.001*"economy" + 0.001*"y" - 2021-01-23 12:12:26,242 : INFO : topic diff=0.148485, rho=0.592297 - 2021-01-23 12:12:46,644 : INFO : -9.193 per-word bound, 585.4 perplexity estimate based on a held-out corpus of 
1701 documents with 4692704 words - 2021-01-23 12:12:46,645 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 12:12:54,251 : INFO : topic #6 (0.111): 0.002*"emperor" + 0.002*"jewish" + 0.001*"cell" + 0.001*"jesus" + 0.001*"orthodox" + 0.001*"christ" + 0.001*"animal" + 0.001*"jew" + 0.001*"fiction" + 0.001*"chinese" - 2021-01-23 12:12:54,252 : INFO : topic #5 (0.111): 0.003*"soviet" + 0.002*"election" + 0.002*"minister" + 0.001*"km" + 0.001*"economy" + 0.001*"territory" + 0.001*"parliament" + 0.001*"est" + 0.001*"lord" + 0.001*"elected" - 2021-01-23 12:12:54,253 : INFO : topic #2 (0.111): 0.002*"actor" + 0.001*"love" + 0.001*"lincoln" + 0.001*"mary" + 0.001*"emperor" + 0.001*"bc" + 0.001*"album" + 0.001*"energy" + 0.001*"band" + 0.001*"ford" - 2021-01-23 12:12:54,253 : INFO : topic #3 (0.111): 0.002*"band" + 0.002*"album" + 0.001*"bass" + 0.001*"instrument" + 0.001*"novel" + 0.001*"actor" + 0.001*"blue" + 0.001*"card" + 0.001*"kong" + 0.001*"chinese" - 2021-01-23 12:12:54,254 : INFO : topic #7 (0.111): 0.002*"software" + 0.001*"horse" + 0.001*"y" + 0.001*"blue" + 0.001*"file" + 0.001*"actor" + 0.001*"moon" + 0.001*"apollo" + 0.001*"video" + 0.001*"season" - 2021-01-23 12:12:54,254 : INFO : topic diff=0.174918, rho=0.509614 - 2021-01-23 12:13:15,179 : INFO : -9.137 per-word bound, 563.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:13:15,179 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 12:13:22,431 : INFO : topic #2 (0.111): 0.002*"actor" + 0.002*"love" + 0.001*"lincoln" + 0.001*"mary" + 0.001*"henry" + 0.001*"emperor" + 0.001*"bc" + 0.001*"ford" + 0.001*"louis" + 0.001*"singer" - 2021-01-23 12:13:22,431 : INFO : topic #7 (0.111): 0.002*"software" + 0.002*"y" + 0.001*"horse" + 0.001*"file" + 0.001*"moon" + 0.001*"blue" + 0.001*"apollo" + 0.001*"video" + 0.001*"user" + 0.001*"color" - 2021-01-23 12:13:22,432 : INFO : topic #0 (0.111): 0.002*"india" + 0.002*"actor" + 0.002*"aircraft" + 0.002*"import" + 0.002*"km" + 0.002*"ship" + 0.002*"minister" + 0.002*"italian" + 0.002*"irish" + 0.001*"indian" - 2021-01-23 12:13:22,433 : INFO : topic #5 (0.111): 0.003*"soviet" + 0.002*"election" + 0.002*"minister" + 0.002*"economy" + 0.002*"parliament" + 0.002*"km" + 0.001*"territory" + 0.001*"liberal" + 0.001*"vote" + 0.001*"elected" - 2021-01-23 12:13:22,433 : INFO : topic #8 (0.111): 0.003*"km" + 0.002*"est" + 0.002*"africa" + 0.002*"chinese" + 0.002*"economy" + 0.002*"election" + 0.002*"energy" + 0.001*"lake" + 0.001*"male" + 0.001*"african" - 2021-01-23 12:13:22,434 : INFO : topic diff=0.152037, rho=0.454053 - 2021-01-23 12:13:42,802 : INFO : -9.102 per-word bound, 549.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:13:42,889 : INFO : using symmetric alpha at 0.09090909090909091 - 2021-01-23 12:13:42,890 : INFO : using symmetric eta at 0.09090909090909091 - 2021-01-23 12:13:42,895 : INFO : using serial LDA version on this node - 2021-01-23 12:13:42,928 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 12:13:42,929 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 
2021-01-23 12:14:03,840 : INFO : -10.500 per-word bound, 1447.8 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:14:03,840 : INFO : PROGRESS: pass 0, at document #1701/1701 - 2021-01-23 12:14:09,469 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"emperor" + 0.001*"band" + 0.001*"india" + 0.001*"energy" + 0.001*"actor" + 0.001*"indian" + 0.001*"canadian" + 0.001*"minister" + 0.001*"italian" - 2021-01-23 12:14:09,470 : INFO : topic #4 (0.091): 0.001*"import" + 0.001*"minister" + 0.001*"bc" + 0.001*"lord" + 0.001*"ball" + 0.001*"km" + 0.001*"chinese" + 0.001*"season" + 0.001*"internet" + 0.001*"japanese" - 2021-01-23 12:14:09,470 : INFO : topic #5 (0.091): 0.001*"km" + 0.001*"chinese" + 0.001*"cell" + 0.001*"actor" + 0.001*"israel" + 0.001*"election" + 0.001*"minister" + 0.001*"china" + 0.001*"economy" + 0.001*"energy" - 2021-01-23 12:14:09,471 : INFO : topic #1 (0.091): 0.001*"lake" + 0.001*"y" + 0.001*"soviet" + 0.001*"irish" + 0.001*"km" + 0.001*"actor" + 0.001*"league" + 0.001*"band" + 0.001*"male" + 0.001*"jewish" - 2021-01-23 12:14:09,471 : INFO : topic #6 (0.091): 0.001*"cell" + 0.001*"actor" + 0.001*"emperor" + 0.001*"election" + 0.001*"album" + 0.001*"band" + 0.001*"lake" + 0.001*"china" + 0.001*"minister" + 0.001*"la" - 2021-01-23 12:14:09,472 : INFO : topic diff=1.023212, rho=1.000000 - 2021-01-23 12:14:30,022 : INFO : -9.278 per-word bound, 620.9 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:14:30,023 : INFO : PROGRESS: pass 1, at document #1701/1701 - 2021-01-23 12:14:35,544 : INFO : topic #7 (0.091): 0.001*"emperor" + 0.001*"blue" + 0.001*"bear" + 0.001*"soviet" + 0.001*"actor" + 0.001*"alexander" + 0.001*"constitution" + 0.001*"bc" + 0.001*"mary" + 0.001*"county" - 2021-01-23 12:14:35,545 : INFO : topic #4 (0.091): 0.002*"import" + 0.001*"lord" + 0.001*"ball" + 0.001*"bc" + 0.001*"internet" + 0.001*"chinese" + 0.001*"season" + 0.001*"japanese" + 0.001*"jewish" + 0.001*"carbon" - 2021-01-23 12:14:35,545 : INFO : topic #3 (0.091): 0.002*"band" + 0.001*"india" + 0.001*"instrument" + 0.001*"soviet" + 0.001*"emperor" + 0.001*"actor" + 0.001*"bass" + 0.001*"indian" + 0.001*"album" + 0.001*"canadian" - 2021-01-23 12:14:35,546 : INFO : topic #8 (0.091): 0.002*"jewish" + 0.002*"ball" + 0.002*"jew" + 0.002*"actor" + 0.001*"soviet" + 0.001*"minister" + 0.001*"football" + 0.001*"russian" + 0.001*"australian" + 0.001*"japanese" - 2021-01-23 12:14:35,547 : INFO : topic #5 (0.091): 0.001*"km" + 0.001*"chinese" + 0.001*"israel" + 0.001*"energy" + 0.001*"flag" + 0.001*"moon" + 0.001*"cell" + 0.001*"speed" + 0.001*"china" + 0.001*"software" - 2021-01-23 12:14:35,547 : INFO : topic diff=0.163866, rho=0.577350 - 2021-01-23 12:14:35,548 : INFO : ensemble contains 9 models and 160 topics now - 2021-01-23 12:14:35,557 : INFO : ensemble contains 10 models and 169 topics now - 2021-01-23 12:14:35,564 : INFO : asymmetric distance matrix is outdated due to add_model - 2021-01-23 12:14:35,565 : INFO : generating a 180 x 180 asymmetric distance matrix... 
- 2021-01-23 12:14:37,408 : INFO : fitting the clustering model, using 5 for min_samples - 2021-01-23 12:14:37,438 : INFO : generating stable topics, using 3 for min_cores - 2021-01-23 12:14:37,438 : INFO : found 21 clusters - 2021-01-23 12:14:37,442 : INFO : found 1 stable topics - 2021-01-23 12:14:37,442 : INFO : generating classic gensim model representation based on results from the ensemble - 2021-01-23 12:14:37,444 : INFO : using symmetric alpha at 1.0 - 2021-01-23 12:14:37,444 : INFO : using symmetric eta at 1.0 - 2021-01-23 12:14:37,446 : INFO : using serial LDA version on this node - 2021-01-23 12:14:37,448 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 12:14:37,448 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 15:24:28,057 : INFO : using symmetric alpha at 0.1111111111111111 + 2021-01-23 15:24:28,058 : INFO : using symmetric eta at 0.1111111111111111 + 2021-01-23 15:24:28,059 : INFO : using serial LDA version on this node + 2021-01-23 15:24:28,074 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 14000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 15:24:28,074 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 15:24:28,074 : INFO : training LDA model using 7 processes + 2021-01-23 15:24:28,150 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 15:24:36,133 : INFO : topic #3 (0.111): 0.001*"minister" + 0.001*"actor" + 0.001*"china" + 0.001*"band" + 0.001*"km" + 0.001*"blue" + 0.001*"album" + 0.001*"economy" + 0.001*"soviet" + 0.001*"bc" + 2021-01-23 15:24:36,133 : INFO : topic #1 (0.111): 0.001*"software" + 0.001*"album" + 0.001*"minister" + 0.001*"election" + 0.001*"emperor" + 0.001*"male" + 0.001*"league" + 0.001*"lord" + 0.001*"band" + 0.001*"jewish" + 2021-01-23 15:24:36,134 : INFO : topic #2 (0.111): 0.001*"soviet" + 0.001*"china" + 0.001*"league" + 0.001*"actor" + 0.001*"km" + 0.001*"prime" + 0.001*"band" + 0.001*"energy" + 0.001*"economy" + 0.001*"season" + 2021-01-23 15:24:36,134 : INFO : topic #4 (0.111): 0.001*"election" + 0.001*"league" + 0.001*"band" + 0.001*"km" + 0.001*"cell" + 0.001*"soviet" + 0.001*"philosophy" + 0.001*"lake" + 0.001*"software" + 0.001*"economy" + 2021-01-23 15:24:36,135 : INFO : topic #8 (0.111): 0.001*"soviet" + 0.001*"jewish" + 0.001*"energy" + 0.001*"jew" + 0.001*"minister" + 0.001*"band" + 0.001*"russian" + 0.001*"album" + 0.001*"emperor" + 0.001*"lord" + 2021-01-23 15:24:36,135 : INFO : topic diff=1.009252, rho=1.000000 + 2021-01-23 15:24:55,469 : INFO : -9.258 per-word bound, 612.2 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:24:55,469 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 15:25:02,694 : INFO : topic #4 (0.111): 0.001*"election" + 0.001*"lake" + 0.001*"apple" + 0.001*"philosophy" + 0.001*"software" + 0.001*"league" + 0.001*"km" + 0.001*"user" + 0.001*"band" + 0.001*"window" + 2021-01-23 15:25:02,695 : INFO : 
topic #1 (0.111): 0.002*"album" + 0.001*"software" + 0.001*"lord" + 0.001*"emperor" + 0.001*"band" + 0.001*"league" + 0.001*"minister" + 0.001*"saint" + 0.001*"novel" + 0.001*"season" + 2021-01-23 15:25:02,695 : INFO : topic #2 (0.111): 0.002*"soviet" + 0.002*"league" + 0.001*"china" + 0.001*"season" + 0.001*"actor" + 0.001*"km" + 0.001*"ball" + 0.001*"prime" + 0.001*"chinese" + 0.001*"economy" + 2021-01-23 15:25:02,696 : INFO : topic #6 (0.111): 0.001*"israel" + 0.001*"km" + 0.001*"cell" + 0.001*"user" + 0.001*"minister" + 0.001*"video" + 0.001*"jewish" + 0.001*"machine" + 0.001*"import" + 0.001*"internet" + 2021-01-23 15:25:02,697 : INFO : topic #0 (0.111): 0.001*"energy" + 0.001*"actor" + 0.001*"soviet" + 0.001*"blue" + 0.001*"tree" + 0.001*"ball" + 0.001*"russian" + 0.001*"software" + 0.001*"fiction" + 0.001*"file" + 2021-01-23 15:25:02,697 : INFO : topic diff=0.143090, rho=0.592297 + 2021-01-23 15:25:21,875 : INFO : -9.195 per-word bound, 586.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:25:21,876 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 15:25:29,503 : INFO : topic #7 (0.111): 0.001*"economy" + 0.001*"africa" + 0.001*"bc" + 0.001*"diamond" + 0.001*"horse" + 0.001*"irish" + 0.001*"county" + 0.001*"election" + 0.001*"file" + 0.001*"car" + 2021-01-23 15:25:29,503 : INFO : topic #5 (0.111): 0.003*"km" + 0.002*"est" + 0.002*"actor" + 0.002*"election" + 0.002*"minister" + 0.002*"india" + 0.001*"italian" + 0.001*"male" + 0.001*"female" + 0.001*"constitution" + 2021-01-23 15:25:29,504 : INFO : topic #2 (0.111): 0.002*"league" + 0.002*"season" + 0.002*"soviet" + 0.002*"china" + 0.001*"ball" + 0.001*"chinese" + 0.001*"football" + 0.001*"baseball" + 0.001*"canadian" + 0.001*"nfl" + 2021-01-23 15:25:29,505 : INFO : topic #0 (0.111): 0.001*"energy" + 0.001*"ball" + 0.001*"tree" + 0.001*"engine" + 0.001*"soviet" + 0.001*"blue" + 0.001*"actor" + 0.001*"fiction" + 0.001*"file" + 0.001*"russian" + 2021-01-23 15:25:29,505 : INFO : topic #8 (0.111): 0.002*"jewish" + 0.002*"jew" + 0.002*"emperor" + 0.002*"soviet" + 0.001*"orthodox" + 0.001*"christ" + 0.001*"jesus" + 0.001*"russian" + 0.001*"cell" + 0.001*"holy" + 2021-01-23 15:25:29,506 : INFO : topic diff=0.176887, rho=0.509614 + 2021-01-23 15:25:48,495 : INFO : -9.132 per-word bound, 561.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:25:48,495 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 15:25:56,501 : INFO : topic #2 (0.111): 0.003*"league" + 0.003*"season" + 0.002*"soviet" + 0.002*"ball" + 0.002*"football" + 0.002*"chinese" + 0.002*"baseball" + 0.002*"china" + 0.002*"nfl" + 0.001*"finland" + 2021-01-23 15:25:56,502 : INFO : topic #3 (0.111): 0.002*"card" + 0.002*"aircraft" + 0.002*"blue" + 0.001*"band" + 0.001*"comic" + 0.001*"kong" + 0.001*"album" + 0.001*"bridge" + 0.001*"hong" + 0.001*"love" + 2021-01-23 15:25:56,503 : INFO : topic #6 (0.111): 0.002*"cell" + 0.001*"user" + 0.001*"import" + 0.001*"gun" + 0.001*"disease" + 0.001*"specie" + 0.001*"video" + 0.001*"device" + 0.001*"animal" + 0.001*"weapon" + 2021-01-23 15:25:56,503 : INFO : topic #7 (0.111): 0.001*"irish" + 0.001*"horse" + 0.001*"bc" + 0.001*"diamond" + 0.001*"economy" + 0.001*"liberal" + 0.001*"car" + 0.001*"africa" + 0.001*"county" + 0.001*"file" + 2021-01-23 15:25:56,504 : INFO : topic #0 (0.111): 0.002*"energy" + 0.001*"engine" + 
0.001*"tree" + 0.001*"ball" + 0.001*"acid" + 0.001*"soviet" + 0.001*"file" + 0.001*"fiction" + 0.001*"blue" + 0.001*"classical" + 2021-01-23 15:25:56,505 : INFO : topic diff=0.154924, rho=0.454053 + 2021-01-23 15:26:15,743 : INFO : -9.092 per-word bound, 545.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:26:15,821 : INFO : using symmetric alpha at 0.09090909090909091 + 2021-01-23 15:26:15,822 : INFO : using symmetric eta at 0.09090909090909091 + 2021-01-23 15:26:15,824 : INFO : using serial LDA version on this node + 2021-01-23 15:26:15,855 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 15:26:15,855 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 15:26:35,321 : INFO : -10.499 per-word bound, 1447.2 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:26:35,321 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2021-01-23 15:26:40,838 : INFO : topic #4 (0.091): 0.001*"software" + 0.001*"election" + 0.001*"jewish" + 0.001*"actor" + 0.001*"minister" + 0.001*"territory" + 0.001*"league" + 0.001*"channel" + 0.001*"user" + 0.001*"japan" + 2021-01-23 15:26:40,839 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"cell" + 0.001*"energy" + 0.001*"software" + 0.001*"km" + 0.001*"economy" + 0.001*"minister" + 0.001*"bc" + 0.001*"china" + 0.001*"league" + 2021-01-23 15:26:40,839 : INFO : topic #7 (0.091): 0.001*"album" + 0.001*"irish" + 0.001*"ball" + 0.001*"y" + 0.001*"minister" + 0.001*"actor" + 0.001*"chinese" + 0.001*"election" + 0.001*"band" + 0.001*"la" + 2021-01-23 15:26:40,840 : INFO : topic #8 (0.091): 0.001*"soviet" + 0.001*"album" + 0.001*"league" + 0.001*"minister" + 0.001*"actor" + 0.001*"emperor" + 0.001*"software" + 0.001*"male" + 0.001*"energy" + 0.001*"japanese" + 2021-01-23 15:26:40,841 : INFO : topic #1 (0.091): 0.001*"band" + 0.001*"km" + 0.001*"cell" + 0.001*"actor" + 0.001*"economy" + 0.001*"lord" + 0.001*"energy" + 0.001*"car" + 0.001*"county" + 0.001*"software" + 2021-01-23 15:26:40,841 : INFO : topic diff=1.022883, rho=1.000000 + 2021-01-23 15:27:00,369 : INFO : -9.279 per-word bound, 621.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:27:00,369 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2021-01-23 15:27:05,770 : INFO : topic #2 (0.091): 0.002*"km" + 0.002*"female" + 0.002*"est" + 0.001*"soviet" + 0.001*"election" + 0.001*"male" + 0.001*"minister" + 0.001*"economy" + 0.001*"russian" + 0.001*"russia" + 2021-01-23 15:27:05,771 : INFO : topic #8 (0.091): 0.001*"league" + 0.001*"album" + 0.001*"color" + 0.001*"energy" + 0.001*"love" + 0.001*"actor" + 0.001*"soviet" + 0.001*"frac" + 0.001*"lake" + 0.001*"blue" + 2021-01-23 15:27:05,771 : INFO : topic #6 (0.091): 0.002*"actor" + 0.001*"software" + 0.001*"user" + 0.001*"blue" + 0.001*"emperor" + 0.001*"album" + 0.001*"band" + 0.001*"election" + 0.001*"soviet" + 0.001*"minister" + 2021-01-23 15:27:05,772 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"cell" + 0.001*"novel" + 0.001*"album" + 0.001*"energy" + 0.001*"bc" + 0.001*"philosophy" + 0.001*"knowledge" + 0.001*"band" + 0.001*"japanese" + 2021-01-23 15:27:05,772 : INFO : topic #10 (0.091): 
0.002*"economy" + 0.002*"km" + 0.001*"minister" + 0.001*"est" + 0.001*"election" + 0.001*"import" + 0.001*"russian" + 0.001*"prime" + 0.001*"soviet" + 0.001*"chinese" + 2021-01-23 15:27:05,773 : INFO : topic diff=0.163523, rho=0.577350 + 2021-01-23 15:27:05,774 : INFO : ensemble contains 9 models and 160 topics now + 2021-01-23 15:27:05,782 : INFO : ensemble contains 10 models and 169 topics now + 2021-01-23 15:27:05,789 : INFO : asymmetric distance matrix is outdated due to add_model + 2021-01-23 15:27:05,789 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2021-01-23 15:27:07,632 : INFO : fitting the clustering model, using 5 for min_samples + 2021-01-23 15:27:07,670 : INFO : generating stable topics, using 3 for min_cores + 2021-01-23 15:27:07,670 : INFO : found 23 clusters + 2021-01-23 15:27:07,676 : INFO : found 1 stable topics + 2021-01-23 15:27:07,676 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-23 15:27:07,677 : INFO : using symmetric alpha at 1.0 + 2021-01-23 15:27:07,677 : INFO : using symmetric eta at 1.0 + 2021-01-23 15:27:07,680 : INFO : using serial LDA version on this node + 2021-01-23 15:27:07,682 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 15:27:07,682 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 180 1 @@ -424,9 +424,9 @@ Afterwards the number and quality of stable topics might be different depending .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 12 minutes 22.200 seconds) + **Total running time of the script:** ( 11 minutes 53.863 seconds) -**Estimated memory usage:** 1637 MB +**Estimated memory usage:** 1625 MB .. 
.. _sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py:

diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst
index 9ac6299492..56d9c96eb0 100644
--- a/docs/src/auto_examples/tutorials/sg_execution_times.rst
+++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst
@@ -5,10 +5,10 @@
 Computation times
 =================
-**12:22.200** total execution time for **auto_examples_tutorials** files:
+**11:53.863** total execution time for **auto_examples_tutorials** files:

 +-------------------------------------------------------------------------------------+-----------+-----------+
-| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 12:22.200 | 1637.2 MB |
+| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 11:53.863 | 1625.0 MB |
 +-------------------------------------------------------------------------------------+-----------+-----------+
 | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``)             | 00:00.000 | 0.0 MB    |
 +-------------------------------------------------------------------------------------+-----------+-----------+

From f8634ef1284e9e341a1198578e5d043ce918accf Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 24 Jan 2021 11:23:58 +0100
Subject: [PATCH 126/166] merge

---
 docs/src/auto_examples/howtos/run_doc.ipynb   |  10 +-
 docs/src/auto_examples/howtos/run_doc.py      |   2 +-
 docs/src/auto_examples/howtos/run_doc.py.md5  |   2 +-
 docs/src/auto_examples/howtos/run_doc.rst     |  32 ++-
 .../howtos/sg_execution_times.rst             |  20 +-
 docs/src/auto_examples/index.rst              |  40 ++--
 .../tutorials/run_ensemblelda.ipynb           |   2 +-
 .../tutorials/run_ensemblelda.rst             | 210 +++++++++---------
 .../tutorials/sg_execution_times.rst          |   6 +-
 9 files changed, 173 insertions(+), 151 deletions(-)

diff --git a/docs/src/auto_examples/howtos/run_doc.ipynb b/docs/src/auto_examples/howtos/run_doc.ipynb
index 48820b593a..7ce62d39c5 100644
--- a/docs/src/auto_examples/howtos/run_doc.ipynb
+++ b/docs/src/auto_examples/howtos/run_doc.ipynb
@@ -15,14 +15,14 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "\nHow to Author Gensim Documentation\n==================================\n\nHow to author documentation for Gensim.\n\n"
+    "\n# How to Author Gensim Documentation\n\nHow to author documentation for Gensim.\n"
   ]
  },
 {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Background\n----------\n\nGensim is a large project with a wide range of functionality.\nUnfortunately, not all of this functionality is documented **well**, and some of it is not documented at all.\nWithout good documentation, users are unable to unlock Gensim's full potential.\nTherefore, authoring new documentation and improving existing documentation is of great value to the Gensim project.\n\nIf you implement new functionality in Gensim, please include **helpful** documentation.\nBy \"helpful\", we mean that your documentation answers questions that Gensim users may have.\nFor example:\n\n- What is this new functionality?\n- **Why** is it important?\n- **How** is it relevant to Gensim?\n- **What** can I do with it? What are some real-world applications?\n- **How** do I use it to achieve those things?\n- ...
and others (if you can think of them, please add them here)\n\nBefore you author documentation, I suggest reading\n`\"What nobody tells you about documentation\" `__\nor watching its `accompanying video `__\n(or even both, if you're really keen).\n\nThe summary of the above presentation is: there are four distinct kinds of documentation, and you really need them all:\n\n1. Tutorials\n2. Howto guides\n3. Explanations\n4. References\n\nEach kind has its own intended audience, purpose, and writing style.\nWhen you make a PR with new functionality, please consider authoring each kind of documentation.\nAt the very least, you will (indirectly) author reference documentation through module, class and function docstrings.\n\nMechanisms\n----------\n\nWe keep our documentation as individual Python scripts.\nThese scripts live under :file:`docs/src/gallery` in one of several subdirectories:\n\n- core: core tutorials. We try to keep this part small, avoid putting stuff here.\n- tutorials: tutorials.\n- howtos: howto guides.\n\nPick a subdirectory and save your script under it.\nPrefix the name of the script with ``run_``: this way, the the documentation builder will run your script each time it builds our docs.\n\nThe contents of the script are straightforward.\nAt the very top, you need a docstring describing what your script does.\n\n" + "## Background\n\nGensim is a large project with a wide range of functionality.\nUnfortunately, not all of this functionality is documented **well**, and some of it is not documented at all.\nWithout good documentation, users are unable to unlock Gensim's full potential.\nTherefore, authoring new documentation and improving existing documentation is of great value to the Gensim project.\n\nIf you implement new functionality in Gensim, please include **helpful** documentation.\nBy \"helpful\", we mean that your documentation answers questions that Gensim users may have.\nFor example:\n\n- What is this new functionality?\n- **Why** is it important?\n- **How** is it relevant to Gensim?\n- **What** can I do with it? What are some real-world applications?\n- **How** do I use it to achieve those things?\n- ... and others (if you can think of them, please add them here)\n\nBefore you author documentation, I suggest reading\n`\"What nobody tells you about documentation\" `__\nor watching its `accompanying video `__\n(or even both, if you're really keen).\n\nThe summary of the above presentation is: there are four distinct kinds of documentation, and you really need them all:\n\n1. Tutorials\n2. Howto guides\n3. Explanations\n4. References\n\nEach kind has its own intended audience, purpose, and writing style.\nWhen you make a PR with new functionality, please consider authoring each kind of documentation.\nAt the very least, you will (indirectly) author reference documentation through module, class and function docstrings.\n\n## Mechanisms\n\nWe keep our documentation as individual Python scripts.\nThese scripts live under :file:`docs/src/gallery` in one of several subdirectories:\n\n- core: core tutorials. 
We try to keep this part small, avoid putting stuff here.\n- tutorials: tutorials.\n- howtos: howto guides.\n\nPick a subdirectory and save your script under it.\nPrefix the name of the script with ``run_``: this way, the documentation builder will run your script each time it builds our docs.\n\nThe contents of the script are straightforward.\nAt the very top, you need a docstring describing what your script does.\n\n"
   ]
  },
 {
@@ -54,14 +54,14 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Authoring Workflow\n------------------\n\nThere are several ways to author documentation.\nThe simplest and most straightforward is to author your ``script.py`` from scratch.\nYou'll have the following cycle:\n\n1. Make changes\n2. Run ``python script.py``\n3. Check standard output, standard error and return code\n4. If everything works well, stop.\n5.
Otherwise, go back to step 1).\n\nIf the above is not your cup of tea, you can also author your documentation as a Jupyter notebook.\nThis is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary.\n\nOnce you're happy with the notebook, convert it to a script.py.\nThere's a helpful `script `__ that will do it for you.\nTo use it::\n\n python to_python.py < notebook.ipynb > script.py\n\nYou may have to touch up the resulting ``script.py``.\nMore specifically:\n\n- Update the title\n- Update the description\n- Fix any issues that the markdown-to-RST converter could not deal with\n\nOnce your script.py works, put it in a suitable subdirectory.\nPlease don't include your original Jupyter notebook in the repository - we won't be using it.\n\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Correctness\n-----------\n\nIncorrect documentation can be worse than no documentation at all.\nTake the following steps to ensure correctness:\n\n- Run Python's doctest module on your docstrings\n- Run your documentation scripts from scratch, removing any temporary files/results\n\nUsing data in your documentation\n--------------------------------\n\nSome parts of the documentation require real-world data to be useful.\nFor example, you may need more than just a toy example to demonstrate the benefits of one model over another.\nThis subsection provides some tips for including data in your documentation.\n\nIf possible, use data available via Gensim's\n`downloader API `__.\nThis will reduce the risk of your documentation becoming obsolete because required data is no longer available.\n\nUse the smallest possible dataset: avoid making people unnecessarily load large datasets and models.\nThis will make your documentation faster to run and easier for people to use (they can modify your examples and re-run them quickly).\n\nFinalizing your contribution\n----------------------------\n\nFirst, get Sphinx Gallery to build your documentation::\n\n make -C docs/src html\n\nThis can take a while if your documentation uses a large dataset, or if you've changed many other tutorials or guides.\nOnce this completes successfully, open ``docs/auto_examples/index.html`` in your browser.\nYou should see your new tutorial or guide in the gallery.\n\nOnce your documentation script is working correctly, it's time to add it to the git repository::\n\n git add docs/src/gallery/tutorials/run_example.py\n git add docs/src/auto_examples/tutorials/run_example.{py,py.md5,rst,ipynb}\n git add docs/src/auto_examples/howtos/sg_execution_times.rst\n git commit -m \"enter a helpful commit message here\"\n git push origin branchname\n\n.. 
Note::\n You may be wondering what all those other files are.\n Sphinx Gallery puts a copy of your Python script in ``auto_examples/tutorials``.\n The .md5 contains MD5 hash of the script to enable easy detection of modifications.\n Gallery also generates .rst (RST for Sphinx) and .ipynb (Jupyter notebook) files from the script.\n Finally, ``sg_execution_times.rst`` contains the time taken to run each example.\n\nFinally, make a PR on `github `__.\nOne of our friendly maintainers will review it, make suggestions, and eventually merge it.\nYour documentation will then appear in the gallery alongside the rest of the example.\nAt that stage, give yourself a pat on the back: you're done!\n\n" + "## Correctness\n\nIncorrect documentation can be worse than no documentation at all.\nTake the following steps to ensure correctness:\n\n- Run Python's doctest module on your docstrings\n- Run your documentation scripts from scratch, removing any temporary files/results\n\n## Using data in your documentation\n\nSome parts of the documentation require real-world data to be useful.\nFor example, you may need more than just a toy example to demonstrate the benefits of one model over another.\nThis subsection provides some tips for including data in your documentation.\n\nIf possible, use data available via Gensim's\n`downloader API `__.\nThis will reduce the risk of your documentation becoming obsolete because required data is no longer available.\n\nUse the smallest possible dataset: avoid making people unnecessarily load large datasets and models.\nThis will make your documentation faster to run and easier for people to use (they can modify your examples and re-run them quickly).\n\n## Finalizing your contribution\n\nFirst, get Sphinx Gallery to build your documentation::\n\n make -C docs/src html\n\nThis can take a while if your documentation uses a large dataset, or if you've changed many other tutorials or guides.\nOnce this completes successfully, open ``docs/auto_examples/index.html`` in your browser.\nYou should see your new tutorial or guide in the gallery.\n\nOnce your documentation script is working correctly, it's time to add it to the git repository::\n\n git add docs/src/gallery/tutorials/run_example.py\n git add docs/src/auto_examples/tutorials/run_example.{py,py.md5,rst,ipynb}\n git add docs/src/auto_examples/howtos/sg_execution_times.rst\n git commit -m \"enter a helpful commit message here\"\n git push origin branchname\n\n.. 
Note::\n You may be wondering what all those other files are.\n Sphinx Gallery puts a copy of your Python script in ``auto_examples/tutorials``.\n The .md5 contains MD5 hash of the script to enable easy detection of modifications.\n Gallery also generates .rst (RST for Sphinx) and .ipynb (Jupyter notebook) files from the script.\n Finally, ``sg_execution_times.rst`` contains the time taken to run each example.\n\nFinally, make a PR on `github `__.\nOne of our friendly maintainers will review it, make suggestions, and eventually merge it.\nYour documentation will then appear in the gallery alongside the rest of the example.\nAt that stage, give yourself a pat on the back: you're done!\n\n" ] } ], @@ -81,7 +81,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/howtos/run_doc.py b/docs/src/auto_examples/howtos/run_doc.py index 7e07ab74bd..15e870f1be 100644 --- a/docs/src/auto_examples/howtos/run_doc.py +++ b/docs/src/auto_examples/howtos/run_doc.py @@ -111,7 +111,7 @@ # This is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary. # # Once you're happy with the notebook, convert it to a script.py. -# There's a helpful `script `__ that will do it for you. +# There's a helpful `script `__ that will do it for you. # To use it:: # # python to_python.py < notebook.ipynb > script.py diff --git a/docs/src/auto_examples/howtos/run_doc.py.md5 b/docs/src/auto_examples/howtos/run_doc.py.md5 index d81fe9d241..979aa0eb5e 100644 --- a/docs/src/auto_examples/howtos/run_doc.py.md5 +++ b/docs/src/auto_examples/howtos/run_doc.py.md5 @@ -1 +1 @@ -b3db0b66859316de13e1a36fa6181657 \ No newline at end of file +512a76ce743dd12482d21784a76b60fe \ No newline at end of file diff --git a/docs/src/auto_examples/howtos/run_doc.rst b/docs/src/auto_examples/howtos/run_doc.rst index 88d1904295..aa5257ed87 100644 --- a/docs/src/auto_examples/howtos/run_doc.rst +++ b/docs/src/auto_examples/howtos/run_doc.rst @@ -1,12 +1,21 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "auto_examples/howtos/run_doc.py" +.. LINE NUMBERS ARE GIVEN BELOW. + .. only:: html .. note:: :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title + Click :ref:`here ` + to download the full example code - .. _sphx_glr_auto_examples_howtos_run_doc.py: +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_auto_examples_howtos_run_doc.py: How to Author Gensim Documentation @@ -14,6 +23,8 @@ How to Author Gensim Documentation How to author documentation for Gensim. +.. GENERATED FROM PYTHON SOURCE LINES 9-59 + Background ---------- @@ -65,6 +76,7 @@ Prefix the name of the script with ``run_``: this way, the the documentation bui The contents of the script are straightforward. At the very top, you need a docstring describing what your script does. +.. GENERATED FROM PYTHON SOURCE LINES 59-67 .. code-block:: default @@ -91,6 +103,8 @@ At the very top, you need a docstring describing what your script does. +.. GENERATED FROM PYTHON SOURCE LINES 68-75 + The title is what will show up in the gallery. Keep this short and descriptive. @@ -99,6 +113,8 @@ When people mouse-over the title, they will see the description. Keep this short too. +.. 
GENERATED FROM PYTHON SOURCE LINES 77-95 + The rest of the script is Python, formatted in a special way so that Sphinx Gallery can parse it. The most important properties of this format are: @@ -118,6 +134,8 @@ You should be able to run your script directly from the command line:: and it should run to completion without error, occasionally printing stuff to standard output. +.. GENERATED FROM PYTHON SOURCE LINES 97-128 + Authoring Workflow ------------------ @@ -135,7 +153,7 @@ If the above is not your cup of tea, you can also author your documentation as a This is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary. Once you're happy with the notebook, convert it to a script.py. -There's a helpful `script `__ that will do it for you. +There's a helpful `script `__ that will do it for you. To use it:: python to_python.py < notebook.ipynb > script.py @@ -150,6 +168,8 @@ More specifically: Once your script.py works, put it in a suitable subdirectory. Please don't include your original Jupyter notebook in the repository - we won't be using it. +.. GENERATED FROM PYTHON SOURCE LINES 130-183 + Correctness ----------- @@ -207,9 +227,9 @@ At that stage, give yourself a pat on the back: you're done! .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 0.112 seconds) + **Total running time of the script:** ( 0 minutes 0.325 seconds) -**Estimated memory usage:** 6 MB +**Estimated memory usage:** 9 MB .. _sphx_glr_download_auto_examples_howtos_run_doc.py: diff --git a/docs/src/auto_examples/howtos/sg_execution_times.rst b/docs/src/auto_examples/howtos/sg_execution_times.rst index 628fe13c1f..03a66b6ece 100644 --- a/docs/src/auto_examples/howtos/sg_execution_times.rst +++ b/docs/src/auto_examples/howtos/sg_execution_times.rst @@ -5,14 +5,14 @@ Computation times ================= -**52:12.903** total execution time for **auto_examples_howtos** files: +**00:00.325** total execution time for **auto_examples_howtos** files: -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` (``run_doc2vec_imdb.py``) | 52:12.903 | 3494.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` (``run_compare_lda.py``) | 00:00.000 | 0.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_doc.py` (``run_doc.py``) | 00:00.000 | 0.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` (``run_downloader_api.py``) | 00:00.000 | 0.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ ++----------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_howtos_run_doc.py` (``run_doc.py``) | 00:00.325 | 9.4 MB | ++----------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` (``run_compare_lda.py``) | 00:00.000 | 0.0 MB | 
++----------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` (``run_doc2vec_imdb.py``) | 00:00.000 | 0.0 MB | ++----------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` (``run_downloader_api.py``) | 00:00.000 | 0.0 MB | ++----------------------------------------------------------------------------------------+-----------+--------+ diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index e46809100c..dfee557a6c 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -190,14 +190,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :alt: Fast Similarity Queries with Annoy and Word2Vec + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html @@ -207,18 +207,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_annoy + /auto_examples/tutorials/run_ensemblelda .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :alt: LDA Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + :alt: Fast Similarity Queries with Annoy and Word2Vec - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -228,18 +228,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_lda + /auto_examples/tutorials/run_annoy .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :alt: Word Mover's Distance + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + :alt: LDA Model - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -249,18 +249,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_wmd + /auto_examples/tutorials/run_lda .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + :alt: Word Mover's Distance - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -270,7 +270,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_wmd .. raw:: html
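The ``index.rst`` hunks above only reorder the gallery so that the new Ensemble LDA tutorial thumbnail appears first among the tutorials. For readers following the authoring how-to earlier in this patch, a minimal sketch of the kind of ``run_``-prefixed gallery script these index entries point at could look like the following; the description text and the logging setup are illustrative placeholders, not code taken from this patch::

    """
    Ensemble LDA
    ============

    Introduces Gensim's EnsembleLda model for extracting reliable topics.
    """

    # Sphinx Gallery reads the first docstring line as the gallery title and the
    # paragraph after the underline as the mouse-over description of the thumbnail.
    import logging

    # Print progress to standard output while the documentation builder runs the script.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Everything below the docstring is ordinary Python that the documentation builder executes at build time, which is what produces the generated ``.rst``, ``.ipynb``, ``.py.md5`` and timing files seen in the surrounding diffs.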
diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb index d9573e5e22..b4f0ef8355 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb @@ -197,7 +197,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst index 9b959482d9..d55714a177 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.rst +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst @@ -82,11 +82,11 @@ so it won't be explained again in detail. .. code-block:: none - 2021-01-23 15:15:53,986 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2021-01-23 15:16:00,703 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) - 2021-01-23 15:16:00,901 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... - 2021-01-23 15:16:00,901 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents - 2021-01-23 15:16:00,984 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) + 2021-01-24 11:02:47,097 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2021-01-24 11:02:52,618 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) + 2021-01-24 11:02:52,798 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... + 2021-01-24 11:02:52,798 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents + 2021-01-24 11:02:52,873 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) @@ -215,20 +215,20 @@ The number of stable topics which are clustered from all those topics is smaller .. code-block:: none - 2021-01-23 15:16:04,680 : INFO : generating 8 topic models... - 2021-01-23 15:24:25,880 : INFO : generating a 160 x 160 asymmetric distance matrix... 
- 2021-01-23 15:24:27,354 : INFO : fitting the clustering model, using 4 for min_samples - 2021-01-23 15:24:27,387 : INFO : generating stable topics, using 3 for min_cores - 2021-01-23 15:24:27,387 : INFO : found 25 clusters - 2021-01-23 15:24:27,391 : INFO : found 1 stable topics - 2021-01-23 15:24:27,391 : INFO : generating classic gensim model representation based on results from the ensemble - 2021-01-23 15:24:27,592 : INFO : using symmetric alpha at 1.0 - 2021-01-23 15:24:27,592 : INFO : using symmetric eta at 1.0 - 2021-01-23 15:24:27,594 : INFO : using serial LDA version on this node - 2021-01-23 15:24:27,596 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:24:27,596 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-24 11:02:56,264 : INFO : generating 8 topic models... + 2021-01-24 11:07:21,883 : INFO : generating a 160 x 160 asymmetric distance matrix... + 2021-01-24 11:07:22,745 : INFO : fitting the clustering model, using 4 for min_samples + 2021-01-24 11:07:22,773 : INFO : generating stable topics, using 3 for min_cores + 2021-01-24 11:07:22,773 : INFO : found 30 clusters + 2021-01-24 11:07:22,776 : INFO : found 2 stable topics + 2021-01-24 11:07:22,776 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-24 11:07:22,923 : INFO : using symmetric alpha at 0.5 + 2021-01-24 11:07:22,923 : INFO : using symmetric eta at 0.5 + 2021-01-24 11:07:22,925 : INFO : using serial LDA version on this node + 2021-01-24 11:07:22,927 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:07:22,927 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 160 - 1 + 2 @@ -273,18 +273,18 @@ Make sure to chose one that is within the range of values in ``asymmetric_distan .. 
code-block:: none - 0.0664596363067147 0.12630570073794478 0.2773686345664633 - 2021-01-23 15:24:27,763 : INFO : fitting the clustering model - 2021-01-23 15:24:27,777 : INFO : generating stable topics - 2021-01-23 15:24:27,777 : INFO : found 22 clusters - 2021-01-23 15:24:27,782 : INFO : found 3 stable topics - 2021-01-23 15:24:27,782 : INFO : generating classic gensim model representation based on results from the ensemble - 2021-01-23 15:24:27,784 : INFO : using symmetric alpha at 0.3333333333333333 - 2021-01-23 15:24:27,784 : INFO : using symmetric eta at 0.3333333333333333 - 2021-01-23 15:24:27,785 : INFO : using serial LDA version on this node - 2021-01-23 15:24:27,790 : INFO : running online (multi-pass) LDA training, 3 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:24:27,790 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 3 + 0.06984649987340175 0.12851575757379297 0.31934422319450084 + 2021-01-24 11:07:23,142 : INFO : fitting the clustering model + 2021-01-24 11:07:23,154 : INFO : generating stable topics + 2021-01-24 11:07:23,154 : INFO : found 22 clusters + 2021-01-24 11:07:23,157 : INFO : found 2 stable topics + 2021-01-24 11:07:23,157 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-24 11:07:23,159 : INFO : using symmetric alpha at 0.5 + 2021-01-24 11:07:23,159 : INFO : using symmetric eta at 0.5 + 2021-01-24 11:07:23,161 : INFO : using serial LDA version on this node + 2021-01-24 11:07:23,163 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:07:23,163 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2 @@ -342,79 +342,79 @@ Afterwards the number and quality of stable topics might be different depending .. 
code-block:: none - 2021-01-23 15:24:28,057 : INFO : using symmetric alpha at 0.1111111111111111 - 2021-01-23 15:24:28,058 : INFO : using symmetric eta at 0.1111111111111111 - 2021-01-23 15:24:28,059 : INFO : using serial LDA version on this node - 2021-01-23 15:24:28,074 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 14000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:24:28,074 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2021-01-23 15:24:28,074 : INFO : training LDA model using 7 processes - 2021-01-23 15:24:28,150 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 15:24:36,133 : INFO : topic #3 (0.111): 0.001*"minister" + 0.001*"actor" + 0.001*"china" + 0.001*"band" + 0.001*"km" + 0.001*"blue" + 0.001*"album" + 0.001*"economy" + 0.001*"soviet" + 0.001*"bc" - 2021-01-23 15:24:36,133 : INFO : topic #1 (0.111): 0.001*"software" + 0.001*"album" + 0.001*"minister" + 0.001*"election" + 0.001*"emperor" + 0.001*"male" + 0.001*"league" + 0.001*"lord" + 0.001*"band" + 0.001*"jewish" - 2021-01-23 15:24:36,134 : INFO : topic #2 (0.111): 0.001*"soviet" + 0.001*"china" + 0.001*"league" + 0.001*"actor" + 0.001*"km" + 0.001*"prime" + 0.001*"band" + 0.001*"energy" + 0.001*"economy" + 0.001*"season" - 2021-01-23 15:24:36,134 : INFO : topic #4 (0.111): 0.001*"election" + 0.001*"league" + 0.001*"band" + 0.001*"km" + 0.001*"cell" + 0.001*"soviet" + 0.001*"philosophy" + 0.001*"lake" + 0.001*"software" + 0.001*"economy" - 2021-01-23 15:24:36,135 : INFO : topic #8 (0.111): 0.001*"soviet" + 0.001*"jewish" + 0.001*"energy" + 0.001*"jew" + 0.001*"minister" + 0.001*"band" + 0.001*"russian" + 0.001*"album" + 0.001*"emperor" + 0.001*"lord" - 2021-01-23 15:24:36,135 : INFO : topic diff=1.009252, rho=1.000000 - 2021-01-23 15:24:55,469 : INFO : -9.258 per-word bound, 612.2 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:24:55,469 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 15:25:02,694 : INFO : topic #4 (0.111): 0.001*"election" + 0.001*"lake" + 0.001*"apple" + 0.001*"philosophy" + 0.001*"software" + 0.001*"league" + 0.001*"km" + 0.001*"user" + 0.001*"band" + 0.001*"window" - 2021-01-23 15:25:02,695 : INFO : topic #1 (0.111): 0.002*"album" + 0.001*"software" + 0.001*"lord" + 0.001*"emperor" + 0.001*"band" + 0.001*"league" + 0.001*"minister" + 0.001*"saint" + 0.001*"novel" + 0.001*"season" - 2021-01-23 15:25:02,695 : INFO : topic #2 (0.111): 0.002*"soviet" + 0.002*"league" + 0.001*"china" + 0.001*"season" + 0.001*"actor" + 0.001*"km" + 0.001*"ball" + 0.001*"prime" + 0.001*"chinese" + 0.001*"economy" - 2021-01-23 15:25:02,696 : INFO : topic #6 (0.111): 0.001*"israel" + 0.001*"km" + 0.001*"cell" + 0.001*"user" + 0.001*"minister" + 0.001*"video" + 0.001*"jewish" + 0.001*"machine" + 0.001*"import" + 0.001*"internet" - 2021-01-23 15:25:02,697 : INFO : topic #0 (0.111): 0.001*"energy" + 0.001*"actor" + 0.001*"soviet" + 0.001*"blue" + 0.001*"tree" + 0.001*"ball" + 0.001*"russian" + 0.001*"software" + 0.001*"fiction" + 0.001*"file" - 2021-01-23 15:25:02,697 : INFO : topic diff=0.143090, rho=0.592297 - 2021-01-23 15:25:21,875 : INFO : -9.195 per-word bound, 586.3 perplexity estimate based on a held-out corpus of 1701 
documents with 4692704 words - 2021-01-23 15:25:21,876 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 15:25:29,503 : INFO : topic #7 (0.111): 0.001*"economy" + 0.001*"africa" + 0.001*"bc" + 0.001*"diamond" + 0.001*"horse" + 0.001*"irish" + 0.001*"county" + 0.001*"election" + 0.001*"file" + 0.001*"car" - 2021-01-23 15:25:29,503 : INFO : topic #5 (0.111): 0.003*"km" + 0.002*"est" + 0.002*"actor" + 0.002*"election" + 0.002*"minister" + 0.002*"india" + 0.001*"italian" + 0.001*"male" + 0.001*"female" + 0.001*"constitution" - 2021-01-23 15:25:29,504 : INFO : topic #2 (0.111): 0.002*"league" + 0.002*"season" + 0.002*"soviet" + 0.002*"china" + 0.001*"ball" + 0.001*"chinese" + 0.001*"football" + 0.001*"baseball" + 0.001*"canadian" + 0.001*"nfl" - 2021-01-23 15:25:29,505 : INFO : topic #0 (0.111): 0.001*"energy" + 0.001*"ball" + 0.001*"tree" + 0.001*"engine" + 0.001*"soviet" + 0.001*"blue" + 0.001*"actor" + 0.001*"fiction" + 0.001*"file" + 0.001*"russian" - 2021-01-23 15:25:29,505 : INFO : topic #8 (0.111): 0.002*"jewish" + 0.002*"jew" + 0.002*"emperor" + 0.002*"soviet" + 0.001*"orthodox" + 0.001*"christ" + 0.001*"jesus" + 0.001*"russian" + 0.001*"cell" + 0.001*"holy" - 2021-01-23 15:25:29,506 : INFO : topic diff=0.176887, rho=0.509614 - 2021-01-23 15:25:48,495 : INFO : -9.132 per-word bound, 561.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:25:48,495 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 15:25:56,501 : INFO : topic #2 (0.111): 0.003*"league" + 0.003*"season" + 0.002*"soviet" + 0.002*"ball" + 0.002*"football" + 0.002*"chinese" + 0.002*"baseball" + 0.002*"china" + 0.002*"nfl" + 0.001*"finland" - 2021-01-23 15:25:56,502 : INFO : topic #3 (0.111): 0.002*"card" + 0.002*"aircraft" + 0.002*"blue" + 0.001*"band" + 0.001*"comic" + 0.001*"kong" + 0.001*"album" + 0.001*"bridge" + 0.001*"hong" + 0.001*"love" - 2021-01-23 15:25:56,503 : INFO : topic #6 (0.111): 0.002*"cell" + 0.001*"user" + 0.001*"import" + 0.001*"gun" + 0.001*"disease" + 0.001*"specie" + 0.001*"video" + 0.001*"device" + 0.001*"animal" + 0.001*"weapon" - 2021-01-23 15:25:56,503 : INFO : topic #7 (0.111): 0.001*"irish" + 0.001*"horse" + 0.001*"bc" + 0.001*"diamond" + 0.001*"economy" + 0.001*"liberal" + 0.001*"car" + 0.001*"africa" + 0.001*"county" + 0.001*"file" - 2021-01-23 15:25:56,504 : INFO : topic #0 (0.111): 0.002*"energy" + 0.001*"engine" + 0.001*"tree" + 0.001*"ball" + 0.001*"acid" + 0.001*"soviet" + 0.001*"file" + 0.001*"fiction" + 0.001*"blue" + 0.001*"classical" - 2021-01-23 15:25:56,505 : INFO : topic diff=0.154924, rho=0.454053 - 2021-01-23 15:26:15,743 : INFO : -9.092 per-word bound, 545.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:26:15,821 : INFO : using symmetric alpha at 0.09090909090909091 - 2021-01-23 15:26:15,822 : INFO : using symmetric eta at 0.09090909090909091 - 2021-01-23 15:26:15,824 : INFO : using serial LDA version on this node - 2021-01-23 15:26:15,855 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:26:15,855 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 
2021-01-23 15:26:35,321 : INFO : -10.499 per-word bound, 1447.2 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:26:35,321 : INFO : PROGRESS: pass 0, at document #1701/1701 - 2021-01-23 15:26:40,838 : INFO : topic #4 (0.091): 0.001*"software" + 0.001*"election" + 0.001*"jewish" + 0.001*"actor" + 0.001*"minister" + 0.001*"territory" + 0.001*"league" + 0.001*"channel" + 0.001*"user" + 0.001*"japan" - 2021-01-23 15:26:40,839 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"cell" + 0.001*"energy" + 0.001*"software" + 0.001*"km" + 0.001*"economy" + 0.001*"minister" + 0.001*"bc" + 0.001*"china" + 0.001*"league" - 2021-01-23 15:26:40,839 : INFO : topic #7 (0.091): 0.001*"album" + 0.001*"irish" + 0.001*"ball" + 0.001*"y" + 0.001*"minister" + 0.001*"actor" + 0.001*"chinese" + 0.001*"election" + 0.001*"band" + 0.001*"la" - 2021-01-23 15:26:40,840 : INFO : topic #8 (0.091): 0.001*"soviet" + 0.001*"album" + 0.001*"league" + 0.001*"minister" + 0.001*"actor" + 0.001*"emperor" + 0.001*"software" + 0.001*"male" + 0.001*"energy" + 0.001*"japanese" - 2021-01-23 15:26:40,841 : INFO : topic #1 (0.091): 0.001*"band" + 0.001*"km" + 0.001*"cell" + 0.001*"actor" + 0.001*"economy" + 0.001*"lord" + 0.001*"energy" + 0.001*"car" + 0.001*"county" + 0.001*"software" - 2021-01-23 15:26:40,841 : INFO : topic diff=1.022883, rho=1.000000 - 2021-01-23 15:27:00,369 : INFO : -9.279 per-word bound, 621.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:27:00,369 : INFO : PROGRESS: pass 1, at document #1701/1701 - 2021-01-23 15:27:05,770 : INFO : topic #2 (0.091): 0.002*"km" + 0.002*"female" + 0.002*"est" + 0.001*"soviet" + 0.001*"election" + 0.001*"male" + 0.001*"minister" + 0.001*"economy" + 0.001*"russian" + 0.001*"russia" - 2021-01-23 15:27:05,771 : INFO : topic #8 (0.091): 0.001*"league" + 0.001*"album" + 0.001*"color" + 0.001*"energy" + 0.001*"love" + 0.001*"actor" + 0.001*"soviet" + 0.001*"frac" + 0.001*"lake" + 0.001*"blue" - 2021-01-23 15:27:05,771 : INFO : topic #6 (0.091): 0.002*"actor" + 0.001*"software" + 0.001*"user" + 0.001*"blue" + 0.001*"emperor" + 0.001*"album" + 0.001*"band" + 0.001*"election" + 0.001*"soviet" + 0.001*"minister" - 2021-01-23 15:27:05,772 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"cell" + 0.001*"novel" + 0.001*"album" + 0.001*"energy" + 0.001*"bc" + 0.001*"philosophy" + 0.001*"knowledge" + 0.001*"band" + 0.001*"japanese" - 2021-01-23 15:27:05,772 : INFO : topic #10 (0.091): 0.002*"economy" + 0.002*"km" + 0.001*"minister" + 0.001*"est" + 0.001*"election" + 0.001*"import" + 0.001*"russian" + 0.001*"prime" + 0.001*"soviet" + 0.001*"chinese" - 2021-01-23 15:27:05,773 : INFO : topic diff=0.163523, rho=0.577350 - 2021-01-23 15:27:05,774 : INFO : ensemble contains 9 models and 160 topics now - 2021-01-23 15:27:05,782 : INFO : ensemble contains 10 models and 169 topics now - 2021-01-23 15:27:05,789 : INFO : asymmetric distance matrix is outdated due to add_model - 2021-01-23 15:27:05,789 : INFO : generating a 180 x 180 asymmetric distance matrix... 
- 2021-01-23 15:27:07,632 : INFO : fitting the clustering model, using 5 for min_samples - 2021-01-23 15:27:07,670 : INFO : generating stable topics, using 3 for min_cores - 2021-01-23 15:27:07,670 : INFO : found 23 clusters - 2021-01-23 15:27:07,676 : INFO : found 1 stable topics - 2021-01-23 15:27:07,676 : INFO : generating classic gensim model representation based on results from the ensemble - 2021-01-23 15:27:07,677 : INFO : using symmetric alpha at 1.0 - 2021-01-23 15:27:07,677 : INFO : using symmetric eta at 1.0 - 2021-01-23 15:27:07,680 : INFO : using serial LDA version on this node - 2021-01-23 15:27:07,682 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:27:07,682 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-24 11:07:23,511 : INFO : using symmetric alpha at 0.1111111111111111 + 2021-01-24 11:07:23,512 : INFO : using symmetric eta at 0.1111111111111111 + 2021-01-24 11:07:23,515 : INFO : using serial LDA version on this node + 2021-01-24 11:07:23,526 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:07:23,526 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-24 11:07:23,572 : INFO : training LDA model using 11 processes + 2021-01-24 11:07:23,721 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-24 11:07:28,538 : INFO : topic #1 (0.111): 0.001*"minister" + 0.001*"soviet" + 0.001*"election" + 0.001*"israel" + 0.001*"jewish" + 0.001*"jew" + 0.001*"chinese" + 0.001*"emperor" + 0.001*"actor" + 0.001*"km" + 2021-01-24 11:07:28,538 : INFO : topic #4 (0.111): 0.001*"economy" + 0.001*"female" + 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"est" + 0.001*"jewish" + 0.001*"japanese" + 0.001*"animal" + 0.001*"band" + 2021-01-24 11:07:28,539 : INFO : topic #7 (0.111): 0.001*"election" + 0.001*"minister" + 0.001*"km" + 0.001*"cell" + 0.001*"software" + 0.001*"energy" + 0.001*"actor" + 0.001*"band" + 0.001*"bc" + 0.001*"japanese" + 2021-01-24 11:07:28,539 : INFO : topic #0 (0.111): 0.002*"soviet" + 0.001*"league" + 0.001*"km" + 0.001*"actor" + 0.001*"territory" + 0.001*"album" + 0.001*"emperor" + 0.001*"season" + 0.001*"jewish" + 0.001*"female" + 2021-01-24 11:07:28,539 : INFO : topic #6 (0.111): 0.001*"actor" + 0.001*"india" + 0.001*"software" + 0.001*"band" + 0.001*"bc" + 0.001*"novel" + 0.001*"km" + 0.001*"album" + 0.001*"emperor" + 0.001*"y" + 2021-01-24 11:07:28,539 : INFO : topic diff=1.009234, rho=1.000000 + 2021-01-24 11:07:38,853 : INFO : -9.257 per-word bound, 612.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:07:38,853 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-24 11:07:43,986 : INFO : topic #5 (0.111): 0.001*"chinese" + 0.001*"y" + 0.001*"blue" + 0.001*"spanish" + 0.001*"user" + 0.001*"china" + 0.001*"actor" + 0.001*"canadian" + 0.001*"color" + 0.001*"emperor" + 2021-01-24 11:07:43,986 : INFO : topic 
#4 (0.111): 0.002*"est" + 0.002*"economy" + 0.002*"km" + 0.002*"election" + 0.001*"minister" + 0.001*"female" + 0.001*"male" + 0.001*"prime" + 0.001*"israel" + 0.001*"bank" + 2021-01-24 11:07:43,986 : INFO : topic #7 (0.111): 0.001*"cell" + 0.001*"energy" + 0.001*"color" + 0.001*"map" + 0.001*"horse" + 0.001*"ship" + 0.001*"bc" + 0.001*"election" + 0.001*"card" + 0.001*"specie" + 2021-01-24 11:07:43,987 : INFO : topic #1 (0.111): 0.001*"jewish" + 0.001*"jew" + 0.001*"israel" + 0.001*"aircraft" + 0.001*"minister" + 0.001*"comic" + 0.001*"soviet" + 0.001*"actor" + 0.001*"irish" + 0.001*"election" + 2021-01-24 11:07:43,987 : INFO : topic #6 (0.111): 0.001*"actor" + 0.001*"software" + 0.001*"india" + 0.001*"band" + 0.001*"bc" + 0.001*"novel" + 0.001*"y" + 0.001*"jewish" + 0.001*"album" + 0.001*"emperor" + 2021-01-24 11:07:43,987 : INFO : topic diff=0.143780, rho=0.592297 + 2021-01-24 11:07:54,251 : INFO : -9.190 per-word bound, 583.9 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:07:54,252 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-24 11:07:58,980 : INFO : topic #6 (0.111): 0.002*"actor" + 0.001*"software" + 0.001*"india" + 0.001*"novel" + 0.001*"bc" + 0.001*"band" + 0.001*"hebrew" + 0.001*"jewish" + 0.001*"y" + 0.001*"japanese" + 2021-01-24 11:07:58,980 : INFO : topic #5 (0.111): 0.002*"y" + 0.001*"chinese" + 0.001*"blue" + 0.001*"user" + 0.001*"china" + 0.001*"z" + 0.001*"color" + 0.001*"server" + 0.001*"canadian" + 0.001*"spanish" + 2021-01-24 11:07:58,980 : INFO : topic #4 (0.111): 0.003*"est" + 0.003*"km" + 0.003*"economy" + 0.002*"election" + 0.002*"minister" + 0.002*"male" + 0.002*"territory" + 0.002*"israel" + 0.002*"female" + 0.002*"prime" + 2021-01-24 11:07:58,980 : INFO : topic #2 (0.111): 0.002*"cell" + 0.001*"import" + 0.001*"machine" + 0.001*"software" + 0.001*"user" + 0.001*"league" + 0.001*"memory" + 0.001*"bit" + 0.001*"apple" + 0.001*"aircraft" + 2021-01-24 11:07:58,981 : INFO : topic #8 (0.111): 0.002*"emperor" + 0.001*"chinese" + 0.001*"copyright" + 0.001*"minister" + 0.001*"mary" + 0.001*"software" + 0.001*"y" + 0.001*"animal" + 0.001*"actor" + 0.001*"japanese" + 2021-01-24 11:07:58,981 : INFO : topic diff=0.178240, rho=0.509614 + 2021-01-24 11:08:09,249 : INFO : -9.125 per-word bound, 558.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:08:09,249 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-24 11:08:13,965 : INFO : topic #6 (0.111): 0.002*"actor" + 0.001*"hebrew" + 0.001*"novel" + 0.001*"jewish" + 0.001*"software" + 0.001*"bc" + 0.001*"singer" + 0.001*"india" + 0.001*"italian" + 0.001*"la" + 2021-01-24 11:08:13,965 : INFO : topic #7 (0.111): 0.002*"energy" + 0.002*"cell" + 0.001*"horse" + 0.001*"atom" + 0.001*"color" + 0.001*"specie" + 0.001*"chemical" + 0.001*"acid" + 0.001*"animal" + 0.001*"map" + 2021-01-24 11:08:13,965 : INFO : topic #8 (0.111): 0.003*"emperor" + 0.002*"chinese" + 0.002*"copyright" + 0.001*"mary" + 0.001*"software" + 0.001*"japanese" + 0.001*"minister" + 0.001*"y" + 0.001*"bc" + 0.001*"animal" + 2021-01-24 11:08:13,965 : INFO : topic #4 (0.111): 0.003*"est" + 0.003*"km" + 0.003*"election" + 0.003*"economy" + 0.003*"minister" + 0.002*"territory" + 0.002*"male" + 0.002*"israel" + 0.002*"prime" + 0.002*"female" + 2021-01-24 11:08:13,966 : INFO : topic #3 (0.111): 0.003*"album" + 0.003*"band" + 0.001*"bass" + 
0.001*"alexander" + 0.001*"love" + 0.001*"ball" + 0.001*"fan" + 0.001*"philosophy" + 0.001*"artist" + 0.001*"me" + 2021-01-24 11:08:13,966 : INFO : topic diff=0.153747, rho=0.454053 + 2021-01-24 11:08:24,231 : INFO : -9.087 per-word bound, 543.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:08:24,310 : INFO : using symmetric alpha at 0.09090909090909091 + 2021-01-24 11:08:24,310 : INFO : using symmetric eta at 0.09090909090909091 + 2021-01-24 11:08:24,312 : INFO : using serial LDA version on this node + 2021-01-24 11:08:24,325 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:08:24,325 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-24 11:08:36,120 : INFO : -10.499 per-word bound, 1447.4 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:08:36,120 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2021-01-24 11:08:40,292 : INFO : topic #7 (0.091): 0.001*"band" + 0.001*"minister" + 0.001*"album" + 0.001*"km" + 0.001*"cell" + 0.001*"female" + 0.001*"lake" + 0.001*"election" + 0.001*"africa" + 0.001*"chinese" + 2021-01-24 11:08:40,292 : INFO : topic #5 (0.091): 0.001*"election" + 0.001*"japanese" + 0.001*"cell" + 0.001*"energy" + 0.001*"km" + 0.001*"economy" + 0.001*"actor" + 0.001*"ship" + 0.001*"county" + 0.001*"emperor" + 2021-01-24 11:08:40,293 : INFO : topic #6 (0.091): 0.001*"jewish" + 0.001*"actor" + 0.001*"energy" + 0.001*"cell" + 0.001*"la" + 0.001*"y" + 0.001*"russian" + 0.001*"china" + 0.001*"league" + 0.001*"soviet" + 2021-01-24 11:08:40,293 : INFO : topic #4 (0.091): 0.001*"chinese" + 0.001*"season" + 0.001*"emperor" + 0.001*"ball" + 0.001*"novel" + 0.001*"color" + 0.001*"league" + 0.001*"spanish" + 0.001*"lake" + 0.001*"energy" + 2021-01-24 11:08:40,293 : INFO : topic #8 (0.091): 0.001*"emperor" + 0.001*"km" + 0.001*"band" + 0.001*"software" + 0.001*"actor" + 0.001*"album" + 0.001*"russian" + 0.001*"economy" + 0.001*"male" + 0.001*"minister" + 2021-01-24 11:08:40,294 : INFO : topic diff=1.023860, rho=1.000000 + 2021-01-24 11:08:53,644 : INFO : -9.277 per-word bound, 620.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:08:53,644 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2021-01-24 11:08:57,030 : INFO : topic #3 (0.091): 0.002*"actor" + 0.001*"album" + 0.001*"energy" + 0.001*"india" + 0.001*"software" + 0.001*"love" + 0.001*"singer" + 0.001*"actress" + 0.001*"y" + 0.001*"canadian" + 2021-01-24 11:08:57,030 : INFO : topic #5 (0.091): 0.001*"aircraft" + 0.001*"emperor" + 0.001*"plant" + 0.001*"ireland" + 0.001*"irish" + 0.001*"county" + 0.001*"actor" + 0.001*"henry" + 0.001*"cell" + 0.001*"japanese" + 2021-01-24 11:08:57,030 : INFO : topic #4 (0.091): 0.002*"ball" + 0.002*"season" + 0.001*"chinese" + 0.001*"league" + 0.001*"color" + 0.001*"emperor" + 0.001*"novel" + 0.001*"nfl" + 0.001*"lake" + 0.001*"mary" + 2021-01-24 11:08:57,031 : INFO : topic #0 (0.091): 0.002*"soviet" + 0.002*"election" + 0.001*"japanese" + 0.001*"chinese" + 0.001*"league" + 0.001*"cell" + 0.001*"km" + 0.001*"japan" + 0.001*"card" + 0.001*"minister" + 2021-01-24 11:08:57,031 : INFO : topic #7 (0.091): 0.002*"band" + 0.001*"album" + 
0.001*"lake" + 0.001*"bass" + 0.001*"kong" + 0.001*"minister" + 0.001*"cell" + 0.001*"chinese" + 0.001*"africa" + 0.001*"liberal" + 2021-01-24 11:08:57,031 : INFO : topic diff=0.167986, rho=0.577350 + 2021-01-24 11:08:57,032 : INFO : ensemble contains 9 models and 160 topics now + 2021-01-24 11:08:57,038 : INFO : ensemble contains 10 models and 169 topics now + 2021-01-24 11:08:57,045 : INFO : asymmetric distance matrix is outdated due to add_model + 2021-01-24 11:08:57,045 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2021-01-24 11:08:58,136 : INFO : fitting the clustering model, using 5 for min_samples + 2021-01-24 11:08:58,170 : INFO : generating stable topics, using 3 for min_cores + 2021-01-24 11:08:58,171 : INFO : found 26 clusters + 2021-01-24 11:08:58,175 : INFO : found 1 stable topics + 2021-01-24 11:08:58,175 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-24 11:08:58,176 : INFO : using symmetric alpha at 1.0 + 2021-01-24 11:08:58,176 : INFO : using symmetric eta at 1.0 + 2021-01-24 11:08:58,180 : INFO : using serial LDA version on this node + 2021-01-24 11:08:58,183 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:08:58,183 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 180 1 @@ -424,9 +424,9 @@ Afterwards the number and quality of stable topics might be different depending .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 11 minutes 53.863 seconds) + **Total running time of the script:** ( 6 minutes 48.976 seconds) -**Estimated memory usage:** 1625 MB +**Estimated memory usage:** 1614 MB .. 
_sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py: diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index 7003c2957e..268c3d8887 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -5,10 +5,10 @@ Computation times ================= -**11:26.674** total execution time for **auto_examples_tutorials** files: +**06:48.976** total execution time for **auto_examples_tutorials** files: +-------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``) | 11:26.674 | 7177.5 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 06:48.976 | 1614.4 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ @@ -20,3 +20,5 @@ Computation times +-------------------------------------------------------------------------------------+-----------+-----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` (``run_wmd.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``) | 00:00.000 | 0.0 MB | ++-------------------------------------------------------------------------------------+-----------+-----------+ From f7bc8b20d8bafd51065c3090fe6f68912c542ef1 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 24 Jan 2021 11:29:56 +0100 Subject: [PATCH 127/166] idk --- .../tutorials/run_ensemblelda_codeobj.pickle | Bin 8641 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle b/docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle deleted file mode 100644 index 91585af924ab6004df91e11df745eacc51ffe39e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8641 zcmd5>ONbm*6dlccI@2@B%p{Xfi;B2OR7x@%6*n<4N+2IdBH}{YH&t)Cs#0Cm)W^vf zln9C%3IbZe@Ac!09t9t*c?y73WPgaw9_ndp*eeZtW z?HhCN-+o{w|Ce^n!1J&fJAof%Y4>7i;SgI-J7%1{mmR;99ZMHIJRG9fc?E~ri`lVP z;nS>-hB!-CpASOo0*=oKfpmTtScyj#eJ{RhL@^#3+aTuSI7^p`#eQT5A^dlih-7KU ziTb98q9}VO0d{+#irC(p?&XY`-KAiAc`~L@VR!9=L=vYfmJ`LuH*p_~x{AXHwp8fc z=Ycy>xI_H~JCZ*F?&tvf zkux;RARGlDGAsdst?g4+Gkf`M>KKitB-jEwUnn8Fd{`*xTt+YBa{;o*?v|(>N$hyU z9r_1umV%OqHw5v*c{Cb1{($iAm+qW_HcdjU+{0>^D$=!|5Mqm``7Lg#Ktt*}P6{1! 
zmmvglzqGJlfOO_(1#{v%)R zuk!G!Q}R|FNY`?c0C2kG1%m-3Pvh)`XD*5WsVrHygpS!<971M+;z_c9x1d~s0)&T< zbD9;x4+sjS9flG{bk9(ta~2}$g_Sla_1KFH2JpygH(jPBrC5c^#v)6n(FI#TrHJ{c z*zBU84Et2&#Crd6vCTDRwSg?F0E!K-OB#Pa?VFw($2FPflWsu!T~3 z`m`1z@qbphTRF?3WrP+;`P$xFtOe3(8u(}&RI>alQOlHOM)!yApxPGbYe77PNGk3Rq9N5z2I;QnK*CMtG@E9T1_OBx-)mp%m}^vPN4{22}X{|FWq7F1UEr| z-@5T@+<*$klB?W>IwK^57ZurE4PbXw;hyFWk|2IiRFr6EIK*-2n0@{hg%b^(YY;L; z#4Pw03PZF*&UX$<*9iXVy1m>GQiYE5|N=K;<+z;Qn&$4%bGG=>k!HBS-% z1E;L_6BtW(VSLvxfknC=kI-Ef-aWu8pSlq})w_>2f=PNdC;+D&G;$1&Jfpb_J6nmL z&jrY-%0mYCF-6a84g(n$!&b2ElW*kp)~)=S_lWakmXL^f#;gaEj*_mER2NDQZac9Z zBr)7VG=OP^XAB*mU%?)og7lfy{UGeawY7u%1+3VAhI5K6Cbm6ovHFKHs(UH7ma$jo zLgxD2OsX#^dMi@k_{7;w$FH?ZsxQ?UA>C$Ek=@;->Z;uG9cdK(I9(Kz0}*4|7=opy zrc*Z7=V&yoCIlK*^5^w@&!ia4!lggx69M%7%8OL2gB2_;^PaK!h+sTni<2BrA|2`H^hk@t7f)qn7o~pTizX;Z z%a{D(^h7c;=KIb7HLn*u#5=CTb{|K0+x>;8jcaaCuH#N)0_XZ;uDMCpp&1k@DN znY+moo;(&0u-P%Z(j60AvgHg!bshgInWGFv4(!Tk$TreM+3iqC*tz(r*x-K(W%rb7 zk|YJBbp@v|rH-Y#F0wA54$Uj%I^Wg2KQi!+G|?LGw4UO$Jp={&5^wl>Iq7QNd=;~0 z`s+9E6~r?N!2g3Ol>1O_Ht!bxwL#PgMel#~W_rBN2&u?CsmSi`o9Ptg#) Date: Wed, 5 May 2021 21:15:44 +0200 Subject: [PATCH 128/166] review --- docs/src/gallery/tutorials/run_ensemblelda.py | 2 +- gensim/corpora/opinosiscorpus.py | 4 +- gensim/downloader.py | 1 - gensim/models/ensemblelda.py | 39 +++++++------------ gensim/test/test_ensemblelda.py | 2 +- 5 files changed, 20 insertions(+), 28 deletions(-) diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index 086f65fc44..3519cf7b9f 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -109,7 +109,7 @@ import numpy as np shape = ensemble.asymmetric_distance_matrix.shape -without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) +without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0], dtype=bool)].reshape(shape[0], -1) print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()) ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index 1a68ec670c..eaee6f035a 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -1,7 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Author: Tobias B +# Author: Tobias B +# Copyright (C) 2021 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Creates a corpus and dictionary from the Opinosis dataset. diff --git a/gensim/downloader.py b/gensim/downloader.py index 47061ec793..aac4d0edcf 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -357,7 +357,6 @@ def _download(name): If md5sum on client and in repo are different. """ - print('_download') url_load_file = "{base}/{fname}/__init__.py".format(base=DOWNLOAD_BASE_URL, fname=name) data_folder_dir = os.path.join(BASE_DIR, name) data_folder_dir_tmp = data_folder_dir + '_tmp' diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 417b43e441..6682db3bbd 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -3,20 +3,22 @@ # # Authors: Tobias Brigl , Alex Salles , # Alex Loosley , Data Reply Munich -# +# Copyright (C) 2021 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. 
-Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that -the user does not need to know the exact number of topics the topic model should extract ahead of time +Extracts reliable topics that are consistently learned across multiple LDA models, for higher reproducibility +and less noise. eLDA has the added benefit that the user does not need to know the exact number of topics the topic +model should extract ahead of time. -For more details read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). +For more details read `our thesis `_. Usage examples -------------- -Train an ensemble of LdaModels using a Gensim corpus +Train an ensemble of LdaModels using a Gensim corpus: .. sourcecode:: pycon >>> from gensim.test.utils import common_corpus, common_dictionary >>> >>> # Create a corpus from a list of texts >>> # common_dictionary = Dictionary(common_texts) >>> # common_corpus = [common_dictionary.doc2bow(text) for text in common_texts] >>> >>> # Train the model on the corpus. corpus has to be provided as a >>> # keyword argument, as they are passed through to the children. >>> elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=10, num_models=4) -Save a model to disk, or reload a pre-trained model +Save a model to disk, or reload a pre-trained model: .. sourcecode:: pycon >>> from gensim.test.utils import datapath, get_tmpfile >>> from gensim.models import EnsembleLda >>> >>> # Save model to disk. >>> temp_file = datapath("model") >>> elda.save(temp_file) >>> >>> # Load a potentially pretrained model from disk. >>> elda = EnsembleLda.load(temp_file) -Query, the model using new, unseen documents +Query the model using new, unseen documents: .. sourcecode:: pycon >>> # Create a new corpus, made of previously unseen documents. >>> other_texts = [ ... ['computer', 'time', 'graph'], ... ['survey', 'response', 'eps'], ... ['human', 'system', 'computer'] ... ] >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts] >>> >>> unseen_doc = other_corpus[0] >>> vector = elda[unseen_doc] # get topic probability distribution for a document -Increase the ensemble size by adding a new model. Make sure it uses the same dictionary +Increase the ensemble size by adding a new model. Make sure it uses the same dictionary: .. sourcecode:: pycon >>> elda.add_model(elda2) >>> >>> vector = elda[unseen_doc] To optimize the ensemble for your specific case, the children can be clustered again using -different hyperparameters +different hyperparameters: .. sourcecode:: pycon >>> elda.recluster(eps=0.2) -References ---------- -.. [1] REHUREK, Radim and Sojka, PETR, 2010, Software framework for topic modelling with large corpora. In : THE LREC - 2010 WORKSHOP ON NEW CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. 2010. p. 45-50. - Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 - -.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. - Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from: - https://www.hip70890b.de/machine_learning/ensemble_LDA/ - Citation -------- -At the moment, there is no paper associated to ensemble LDA but we are working on publicly releasing Tobi Brigl's -bachelor thesis on the topic. In the meantime, please include a mention of us (Brigl, T.; Salles, A.; Loosley, A) and -a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if this work -is presented, further developed, or used as the basis for a published result. +BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. +Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from: +https://www.sezanzeb.de/machine_learning/ensemble_LDA/ Other Notes ----------- @@ -118,7 +109,7 @@ class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. Extracts reliable topics that are consistently learned across multiple LDA models. 
eLDA has the added benefit that - the user does not need to know the exact number of topics the topic model should extract ahead of time [2]. + the user does not need to know the exact number of topics the topic model should extract ahead of time. """ diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 01c1ffa085..8d021d3658 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Author: Tobias B +# Author: Tobias B """ Automated tests for checking the EnsembleLda Class From c39848caf23a6dc479b4d34207a7c474d629ea61 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 21:18:03 +0200 Subject: [PATCH 129/166] stream documents instead of loading all into memory --- docs/src/gallery/tutorials/run_ensemblelda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index 086f65fc44..d593f88860 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -31,9 +31,12 @@ lemmatizer = WordNetLemmatizer() docs = api.load('text8') -docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] -dictionary = Dictionary(docs) + +dictionary = Dictionary() +for doc in docs: + dictionary.add_documents([[lemmatizer.lemmatize(token) for token in doc]]) dictionary.filter_extremes(no_below=20, no_above=0.5) + corpus = [dictionary.doc2bow(doc) for doc in docs] ############################################################################### From 05bd52e1b2a5adc214d99025606d1234fe22e64e Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 21:33:58 +0200 Subject: [PATCH 130/166] streaming docs instead of loading all into memory --- docs/src/gallery/tutorials/run_ensemblelda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index 086f65fc44..d593f88860 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -31,9 +31,12 @@ lemmatizer = WordNetLemmatizer() docs = api.load('text8') -docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] -dictionary = Dictionary(docs) + +dictionary = Dictionary() +for doc in docs: + dictionary.add_documents([[lemmatizer.lemmatize(token) for token in doc]]) dictionary.filter_extremes(no_below=20, no_above=0.5) + corpus = [dictionary.doc2bow(doc) for doc in docs] ############################################################################### From 115cd1568f5762cd52a6e04b1b63556853fe27b6 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 22:40:57 +0200 Subject: [PATCH 131/166] .format replaced with f-string --- gensim/models/ensemblelda.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 6682db3bbd..dece967915 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -252,7 +252,7 @@ def __init__( if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return - logger.info("generating {} topic models...".format(num_models)) + logger.info(f"generating {num_models} topic models...") if ensemble_workers > 1: self._generate_topic_models_multiproc(num_models, ensemble_workers) @@ -281,15 +281,14 @@ def 
get_topic_model_class(self): del self.topic_model_class_string except ModuleNotFoundError: logger.error( - 'Could not import the "{}" module in order to provide the "{}" class as ' - '"topic_model_class" attribute. {}' - .format(self.topic_model_class_string, self.topic_model_class_string, instruction) + f'Could not import the "{self.topic_model_class_string}" module in order to provide the ' + f'"{self.topic_model_class_string}" class as "topic_model_class" attribute. {instruction}' ) except AttributeError: logger.error( - 'Could not import the "{}" class from the "{}" module in order to set the ' - '"topic_model_class" attribute. {}' - .format(self.topic_model_class_string, self.topic_model_module_string, instruction) + f'Could not import the "{self.topic_model_class_string}" class from the ' + f'"{self.topic_model_module_string}" module in order to set the "topic_model_class" attribute. ' + f'{instruction}' ) return self.topic_model_class @@ -476,7 +475,7 @@ def add_model(self, target, num_new_models=None): # unknown else: - raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0]))) + raise ValueError(f"target is of unknown type or a list of unknown types: {type(target[0])}") # new models were added, increase num_models # if the user didn't provide a custon numer to use @@ -508,7 +507,7 @@ def add_model(self, target, num_new_models=None): # unknown else: - raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0]))) + raise ValueError(f"target is of unknown type or a list of unknown types: {type(target[0])}") # in this case, len(self.tms) should # always match self.num_models @@ -519,13 +518,12 @@ def add_model(self, target, num_new_models=None): ) self.num_models = len(self.tms) - logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) + logger.info(f"ensemble contains {self.num_models} models and {len(self.ttda)} topics now") if self.ttda.shape[1] != ttda.shape[1]: raise ValueError( - "target ttda dimensions do not match. Topics must be {} but was {} elements large".format( - self.ttda.shape[-1], ttda.shape[-1], - ) + f"target ttda dimensions do not match. 
Topics must be {self.ttda.shape[-1]} but was {ttda.shape[-1]} " + f"elements large" ) self.ttda = np.append(self.ttda, ttda, axis=0) @@ -545,7 +543,7 @@ def _teardown(self, pipes, processes, i): index of the process that could not be started """ - logger.error("could not start process {}".format(i)) + logger.error(f"could not start process {i}") for pipe in pipes: pipe[1].close() @@ -654,7 +652,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): """ if pipe is not None: - logger.info("spawned worker to generate {} topic models".format(num_models)) + logger.info(f"spawned worker to generate {num_models} topic models") if random_states is None: random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] @@ -697,7 +695,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): """Worker that computes the distance to all other nodes from a chunk of nodes.""" - logger.info("spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) + logger.info(f"spawned worker to generate {n_ttdas} rows of the asymmetric distance matrix") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( @@ -724,7 +722,7 @@ def _generate_asymmetric_distance_matrix(self): if threshold is None: threshold = {"mass": 0.95, "rank": 0.11}[method] - logger.info("generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) + logger.info(f"generating a {len(self.ttda)} x {len(self.ttda)} asymmetric distance matrix...") # singlecore if workers is not None and workers <= 1: @@ -819,7 +817,7 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s # the worker might not have received a ttda because it was chunked up too much if method not in ["mass", "rank"]: - raise ValueError("method {} unknown".format(method)) + raise ValueError(f"method {method} unknown") # select masking method: def mass_masking(a): @@ -866,7 +864,7 @@ def rank_masking(a): distances[ttd1_idx][ttd2_idx] = distance percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) - logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) + logger.info(f'the given threshold of {threshold} covered on average {percent}% of tokens') return distances From f56b60de45dd63b256275578186fd5fd47672f46 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 22:45:45 +0200 Subject: [PATCH 132/166] docstring --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index dece967915..79a6fc32fc 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -531,7 +531,7 @@ def add_model(self, target, num_new_models=None): self.asymmetric_distance_matrix_outdated = True def _teardown(self, pipes, processes, i): - """close pipes and terminate processes + """Close pipes and terminate processes. Parameters ---------- @@ -806,7 +806,7 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s Returns ------- - 2D Numpy.numpy.ndarray of floats + 2D numpy.ndarray of floats Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``. 
""" From 28739ce58d752bd22dd6f7b0dfe401fff86da920 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 22:56:50 +0200 Subject: [PATCH 133/166] tuple variable expansion --- gensim/models/ensemblelda.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 79a6fc32fc..af52fc7ce5 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -545,9 +545,9 @@ def _teardown(self, pipes, processes, i): """ logger.error(f"could not start process {i}") - for pipe in pipes: - pipe[1].close() - pipe[0].close() + for parent_conn, child_conn in pipes: + child_conn.close() + parent_conn.close() for process in processes: if process.is_alive(): @@ -621,9 +621,9 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): # aggregate results # will also block until workers are finished - for pipe in pipes: - answer = pipe[0].recv() # [0], because that is the parentConn - pipe[0].close() + for parent_conn, _ in pipes: + answer = parent_conn.recv() # [0], because that is the parentConn + parent_conn.close() # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): if not self.memory_friendly_ttda: self.tms += answer @@ -771,9 +771,9 @@ def _generate_asymmetric_distance_matrix(self): distances = [] # note, that the following loop maintains order in how the ttda will be concatenated # which is very important. Ordering in ttda has to be the same as when using only one process - for pipe in pipes: - answer = pipe[0].recv() # [0], because that is the parentConn - pipe[0].close() # child conn will be closed from inside the worker + for parent_conn, _ in pipes: + answer = parent_conn.recv() # [0], because that is the parentConn + parent_conn.close() # child conn will be closed from inside the worker # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): distances.append(answer[1]) From 336997576e15398a75f4fab28b329817a88c0ae2 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 5 May 2021 23:44:21 +0200 Subject: [PATCH 134/166] removed duplicate string in module docstring --- gensim/models/ensemblelda.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index af52fc7ce5..a6492ccd5b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -9,12 +9,10 @@ """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. -Extracts reliable topics that are consistently learned across multiple LDA models, for higher reproducibility +Extract reliable topics that are consistently learned across multiple LDA models, for higher reproducibility and less noise. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time. -For more details read `our thesis `_. 
- Usage examples -------------- From ed91a3fb434529ccba8ce0cc815d57d381eb7a13 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 5 May 2021 23:45:47 +0200 Subject: [PATCH 135/166] removed obsolete parent connection comment --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a6492ccd5b..04be03a165 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -620,7 +620,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): # aggregate results # will also block until workers are finished for parent_conn, _ in pipes: - answer = parent_conn.recv() # [0], because that is the parentConn + answer = parent_conn.recv() parent_conn.close() # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): if not self.memory_friendly_ttda: @@ -770,7 +770,7 @@ def _generate_asymmetric_distance_matrix(self): # note, that the following loop maintains order in how the ttda will be concatenated # which is very important. Ordering in ttda has to be the same as when using only one process for parent_conn, _ in pipes: - answer = parent_conn.recv() # [0], because that is the parentConn + answer = parent_conn.recv() parent_conn.close() # child conn will be closed from inside the worker # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): distances.append(answer[1]) From 42ac44936e4f2ca4e00335f3b4a03dfe8c510d03 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 5 May 2021 23:54:06 +0200 Subject: [PATCH 136/166] reliable -> stable --- gensim/models/ensemblelda.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 04be03a165..dc79d696a0 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -9,7 +9,7 @@ """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. -Extract reliable topics that are consistently learned across multiple LDA models, for higher reproducibility +Extract stable topics that are consistently learned across multiple LDA models, for higher reproducibility and less noise. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time. @@ -82,11 +82,6 @@ Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from: https://www.sezanzeb.de/machine_learning/ensemble_LDA/ -Other Notes ------------ -The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the doc strings and -comments. - """ import logging import os @@ -106,7 +101,7 @@ class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. - Extracts reliable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that + Extracts stable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time. 
""" @@ -1102,10 +1097,10 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # Run CBDBSCAN to get topic clusters: self._generate_topic_clusters(eps, min_samples) - # Interpret the results of CBDBSCAN to identify reliable topics: + # Interpret the results of CBDBSCAN to identify stable topics: self._generate_stable_topics(min_cores) - # Create gensim LdaModel representation of topic model with reliable topics (can be used for inference): + # Create gensim LdaModel representation of topic model with stable topics (can be used for inference): self.generate_gensim_representation() # GENSIM API @@ -1188,8 +1183,7 @@ class CBDBSCAN: 6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for - unreliable topics made of a composition of multiple reliable topics (something that occurs often LDA models that is - one cause of unreliable topics). + unstable topics made of a composition of multiple stable topics. """ From bbd6c2ff022712d01f8953d69014b36664a26108 Mon Sep 17 00:00:00 2001 From: aloosley Date: Mon, 31 May 2021 20:24:54 +0200 Subject: [PATCH 137/166] topic model intro v1 --- gensim/models/ensemblelda.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index dc79d696a0..1b78621bc1 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -7,11 +7,16 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. +"""Ensemble Latent Dirichlet Allocation (eLDA), an algorithm for extracting reliable topics. -Extract stable topics that are consistently learned across multiple LDA models, for higher reproducibility -and less noise. eLDA has the added benefit that the user does not need to know the exact number of topics the topic -model should extract ahead of time. +In topic modeling, the aim is to find a set of topics that represents global structure of a corpus of documents. One +issue that occur with topics extracted from an NMF or LDA model is reproducibility. That is, if the topic model is +trained repeatedly allowing only the random seed to change, would the same (or similar) topic representation be reliably +learned. + +Ensemble LDA addresses the issue by training an ensemble of topic models and throwing out topics that do not reoccur +across the ensemble. In this regard, the topics extracted are reliable and there is the added benefit over many topic +models that the user does not need to know the exact number of topics ahead of time. Usage examples -------------- From c032b20b72a2648a3bf8cf9922266047188e76de Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 31 May 2021 21:11:00 +0200 Subject: [PATCH 138/166] topic model intro v2 --- gensim/models/ensemblelda.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1b78621bc1..5f170747e9 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -9,14 +9,14 @@ """Ensemble Latent Dirichlet Allocation (eLDA), an algorithm for extracting reliable topics. -In topic modeling, the aim is to find a set of topics that represents global structure of a corpus of documents. One -issue that occur with topics extracted from an NMF or LDA model is reproducibility. 
That is, if the topic model is +The aim of topic modelling is to find a set of topics that represent the global structure of a corpus of documents. One +issue that occurs with topics extracted from an NMF or LDA model is reproducibility. That is, if the topic model is trained repeatedly allowing only the random seed to change, would the same (or similar) topic representation be reliably -learned. +learned. Unreliable topics are undesireable because they are not a good representation of the corpus. Ensemble LDA addresses the issue by training an ensemble of topic models and throwing out topics that do not reoccur -across the ensemble. In this regard, the topics extracted are reliable and there is the added benefit over many topic -models that the user does not need to know the exact number of topics ahead of time. +across the ensemble. In this regard, the topics extracted are more reliable and there is the added benefit over many +topic models that the user does not need to know the exact number of topics ahead of time. Usage examples -------------- From 8c99b51cc1a1eb14b589bc37664b84db3041f07b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 31 May 2021 21:31:35 +0200 Subject: [PATCH 139/166] topic model intro v3 --- gensim/models/ensemblelda.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 5f170747e9..14e67eeeeb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -18,6 +18,12 @@ across the ensemble. In this regard, the topics extracted are more reliable and there is the added benefit over many topic models that the user does not need to know the exact number of topics ahead of time. +For more information see the link in the :ref:`Citation` below, watch our Machine Learning Prague 2019 talk `Solving +the Text Labeling challenge with EnsembleLDA and Active Learning +`_, +or view our `Machine Learning Summer School poster +`_. + Usage examples -------------- From 41e3591c050a56731a6561c36190eacc8a0c0702 Mon Sep 17 00:00:00 2001 From: aloosley Date: Mon, 31 May 2021 22:22:04 +0200 Subject: [PATCH 140/166] elda introduction with references --- docs/src/auto_examples/index.rst | 34 ++++++++++++++++---------------- gensim/models/ensemblelda.py | 5 +++-- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index d3dd2291be..05643de00c 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -71,7 +71,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -92,7 +92,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -169,14 +169,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + :alt: FastText Model - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -186,18 +186,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_fasttext .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :alt: FastText Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html @@ -207,11 +207,11 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_fasttext + /auto_examples/tutorials/run_ensemblelda .. raw:: html -
+
.. only:: html @@ -309,7 +309,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -330,7 +330,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
 .. only:: html
 
@@ -447,13 +447,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from
 
     .. container:: sphx-glr-download sphx-glr-download-python
 
-      :download:`Download all examples in Python source code: auto_examples_python.zip `
+      :download:`Download all examples in Python source code: auto_examples_python.zip `
 
     .. container:: sphx-glr-download sphx-glr-download-jupyter
 
-      :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip `
+      :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip `
 
 .. only:: html
 
diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 14e67eeeeb..318cccf05a 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -18,8 +18,7 @@
 across the ensemble. In this regard, the topics extracted are more reliable and there is the added benefit over many
 topic models that the user does not need to know the exact number of topics ahead of time.
 
-For more information see the link in the :ref:`Citation` below, watch our Machine Learning Prague 2019 talk `Solving
-the Text Labeling challenge with EnsembleLDA and Active Learning
-`_,
+For more information, see the :ref:`citation section <Citation>` below, watch our `Machine Learning Prague 2019 talk
+`_,
 or view our `Machine Learning Summer School poster
 `_.
 
@@ -86,6 +86,8 @@
     >>> elda.recluster(eps=0.2)
 
+.. _Citation:
+
 Citation
 --------
 
From 3ed4522972d81783ccfc736a7550743d14c57453 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Wed, 23 Jun 2021 23:05:14 +0200
Subject: [PATCH 141/166] Update gensim/corpora/opinosiscorpus.py

Co-authored-by: Michael Penkov

---
 gensim/corpora/opinosiscorpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py
index eaee6f035a..b4e25731ce 100644
--- a/gensim/corpora/opinosiscorpus.py
+++ b/gensim/corpora/opinosiscorpus.py
@@ -22,7 +22,7 @@
 from gensim.parsing.preprocessing import STOPWORDS
 
 
-class OpinosisCorpus():
+class OpinosisCorpus:
     """Creates a corpus and dictionary from the Opinosis dataset.
 
     http://kavita-ganesan.com/opinosis-opinion-dataset/
 
From c1d2d5762b782670d9574fddf00ec72f8d9855f8 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Wed, 23 Jun 2021 23:20:13 +0200
Subject: [PATCH 142/166] Update gensim/models/ensemblelda.py

Co-authored-by: Michael Penkov

---
 gensim/models/ensemblelda.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 318cccf05a..80b64a0193 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -135,6 +135,8 @@
             Examples:
                 * 'ldamulticore' (default, recommended)
                 * 'lda'
+                * ldamodel.LdaModel
+                * ldamulticore.LdaMulticore
         ensemble_workers : int, optional
             Spawns that many processes and distributes the models from the ensemble to those as evenly as possible.
             num_models should be a multiple of ensemble_workers.
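For context, the two calling conventions described by the docstring lines added above look roughly as follows.
This is a minimal, illustrative sketch only: it reuses the ``common_corpus`` and ``common_dictionary`` fixtures
from ``gensim.test.utils`` (which the test suite further below also imports), and it assumes the string shorthands
map to the corresponding classes as the docstring suggests.

.. sourcecode:: pycon

    >>> from gensim.models import EnsembleLda, LdaModel
    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>>
    >>> # string shorthand selecting the kind of topic model to train
    >>> elda = EnsembleLda(
    ...     corpus=common_corpus, id2word=common_dictionary,
    ...     num_topics=2, num_models=4, topic_model_class='lda',
    ... )
    >>>
    >>> # or pass the class object directly, as the tests do
    >>> elda = EnsembleLda(
    ...     corpus=common_corpus, id2word=common_dictionary,
    ...     num_topics=2, num_models=4, topic_model_class=LdaModel,
    ... )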
From 128c9dc25b864dd3eadbb0e7f5cff150db74200d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:20:40 +0200 Subject: [PATCH 143/166] update docstrings --- gensim/downloader.py | 4 ---- gensim/models/ensemblelda.py | 7 +++++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index aac4d0edcf..8a4395440e 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -372,8 +372,6 @@ def _download(name): fname = "{f}.gz_0{p}".format(f=name, p=part) dst_path = os.path.join(tmp_dir, fname) - print('downloading', url_data) - print() urllib.urlretrieve( url_data, dst_path, reporthook=partial(_progress, part=part, total_parts=total_parts) @@ -395,8 +393,6 @@ def _download(name): url_data = "{base}/{fname}/{fname}.gz".format(base=DOWNLOAD_BASE_URL, fname=name) fname = "{fname}.gz".format(fname=name) dst_path = os.path.join(tmp_dir, fname) - print('downloading', url_data, 'to', dst_path, data_folder_dir) - print() urllib.urlretrieve(url_data, dst_path, reporthook=_progress) if _calculate_md5_checksum(dst_path) == _get_checksum(name): sys.stdout.write("\n") diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 318cccf05a..ade756bcf1 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -298,7 +298,6 @@ def get_topic_model_class(self): return self.topic_model_class def save(self, *args, **kwargs): - """See :meth:`gensim.utils.SaveLoad.save`.""" if self.get_topic_model_class() is not None: self.topic_model_module_string = self.topic_model_class.__module__ self.topic_model_class_string = self.topic_model_class.__name__ @@ -308,7 +307,11 @@ def save(self, *args, **kwargs): save.__doc__ = SaveLoad.save.__doc__ def convert_to_memory_friendly(self): - """Remove the stored gensim models and only keep their ttdas.""" + """Remove the stored gensim models and only keep their ttdas. + + This frees up memory, but you won't have access to the individual models anymore if you intended to use them + outside of the ensemble. + """ self.tms = [] self.memory_friendly_ttda = True From e07906a5523c0c35e60eeb52ca1d3cafef6fc6f4 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:25:10 +0200 Subject: [PATCH 144/166] static functions --- gensim/models/ensemblelda.py | 260 ++++++++++++++++++----------------- 1 file changed, 133 insertions(+), 127 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index f8ea578fb2..8f70de146a 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -110,6 +110,131 @@ logger = logging.getLogger(__name__) +def _is_valid_core(topic): + """Check if the topic is a valid core. + + Parameters + ---------- + topic : {'is_core', 'valid_parents', 'label'} + topic to validate + + """ + return topic["is_core"] and (topic["valid_parents"] == {topic["label"]}) + + +def _remove_from_all_sets(label, clusters): + """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" + for cluster in clusters: + for neighboring_labels_set in cluster["neighboring_labels"]: + if label in neighboring_labels_set: + neighboring_labels_set.remove(label) + + +def _contains_isolated_cores(label, cluster, min_cores): + """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" + return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores + + +def _aggregate_topics(grouped_by_labels): + """Aggregate the labeled topics to a list of clusters. 
+
+    Parameters
+    ----------
+    grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
+        The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the
+        label.
+
+    Returns
+    -------
+    list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'}
+        max_num_neighboring_labels is the max number of parent labels among each topic of a given cluster. label
+        refers to the label identifier of the cluster. neighboring_labels is a concatenated list of the
+        neighboring_labels sets of each topic. It's sorted by max_num_neighboring_labels in descending
+        order. There is one single element for each cluster.
+
+    """
+    clusters = []
+
+    for label, group in grouped_by_labels.items():
+        max_num_neighboring_labels = 0
+        neighboring_labels = []  # will be a list of sets
+
+        for topic in group:
+            max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels)
+            neighboring_labels.append(topic["neighboring_labels"])
+
+        neighboring_labels = [x for x in neighboring_labels if len(x) > 0]
+
+        clusters.append({
+            "max_num_neighboring_labels": max_num_neighboring_labels,
+            "neighboring_labels": neighboring_labels,
+            "label": label,
+            "num_cores": len([topic for topic in group if topic["is_core"]]),
+        })
+
+    logger.info("found %s clusters", len(clusters))
+
+    return clusters
+
+
+def _group_by_labels(results):
+    """Group all the learned cores by their label, which was assigned in the cluster_model.
+
+    Parameters
+    ----------
+    results : {list of {'is_core', 'neighboring_labels', 'label'}}
+        After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results
+        member, which can be used as the argument to this function. It's a list of infos gathered during
+        the clustering step and each element in the list corresponds to a single topic.
+
+    Returns
+    -------
+    dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
+        A mapping of the label to a list of topics that belong to that particular label. Also adds
+        a new member to each topic called num_neighboring_labels, which is the number of
+        neighboring_labels of that topic.
+
+    """
+    grouped_by_labels = {}
+    for topic in results:
+        if topic["is_core"]:
+            topic = topic.copy()
+
+            # counts how many different labels a core has as parents
+            topic["num_neighboring_labels"] = len(topic["neighboring_labels"])
+
+            label = topic["label"]
+            if label not in grouped_by_labels:
+                grouped_by_labels[label] = []
+            grouped_by_labels[label].append(topic)
+    return grouped_by_labels
+
+
+def _teardown(pipes, processes, i):
+    """Close pipes and terminate processes.
+
+    Parameters
+    ----------
+    pipes : {list of :class:`multiprocessing.Pipe`}
+        list of pipes that the processes use to communicate with the parent
+    processes : {list of :class:`multiprocessing.Process`}
+        list of worker processes
+    i : int
+        index of the process that could not be started
+
+    """
+    logger.error(f"could not start process {i}")
+
+    for parent_conn, child_conn in pipes:
+        child_conn.close()
+        parent_conn.close()
+
+    for process in processes:
+        if process.is_alive():
+            process.terminate()
+        del process
+
+
 class EnsembleLda(SaveLoad):
     """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
@@ -540,30 +665,6 @@ def add_model(self, target, num_new_models=None): # tell recluster that the distance matrix needs to be regenerated self.asymmetric_distance_matrix_outdated = True - def _teardown(self, pipes, processes, i): - """Close pipes and terminate processes. - - Parameters - ---------- - pipes : {list of :class:`multiprocessing.Pipe`} - list of pipes that the processes use to communicate with the parent - processes : {list of :class:`multiprocessing.Process`} - list of worker processes - i : int - index of the process that could not be started - - """ - logger.error(f"could not start process {i}") - - for parent_conn, child_conn in pipes: - child_conn.close() - parent_conn.close() - - for process in processes: - if process.is_alive(): - process.terminate() - del process - def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. @@ -626,7 +727,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): num_models_unhandled -= num_subprocess_models except ProcessError: - self._teardown(pipes, processes, i) + _teardown(pipes, processes, i) raise ProcessError # aggregate results @@ -775,7 +876,7 @@ def _generate_asymmetric_distance_matrix(self): process.start() except ProcessError: - self._teardown(pipes, processes, i) + _teardown(pipes, processes, i) raise ProcessError distances = [] @@ -901,101 +1002,6 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) - def _is_valid_core(self, topic): - """Check if the topic is a valid core. - - Parameters - ---------- - topic : {'is_core', 'valid_parents', 'label'} - topic to validate - - """ - return topic["is_core"] and (topic["valid_parents"] == {topic["label"]}) - - def _group_by_labels(self, results): - """Group all the learned cores by their label, which was assigned in the cluster_model. - - Parameters - ---------- - results : {list of {'is_core', 'neighboring_labels', 'label'}} - After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results - member, which can be used as the argument to this function. It's a list of infos gathered during - the clustering step and each element in the list corresponds to a single topic. - - Returns - ------- - dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'}) - A mapping of the label to a list of topics that belong to that particular label. Also adds - a new member to each topic called num_neighboring_labels, which is the number of - neighboring_labels of that topic. - - """ - grouped_by_labels = {} - for topic in results: - if topic["is_core"]: - topic = topic.copy() - - # counts how many different labels a core has as parents - topic["num_neighboring_labels"] = len(topic["neighboring_labels"]) - - label = topic["label"] - if label not in grouped_by_labels: - grouped_by_labels[label] = [] - grouped_by_labels[label].append(topic) - return grouped_by_labels - - def _aggregate_topics(self, grouped_by_labels): - """Aggregate the labeled topics to a list of clusters. - - Parameters - ---------- - grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'}) - The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the - label. 
- - Returns - ------- - list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'} - max_num_neighboring_labels is the max number of parent labels among each topic of a given cluster. label - refers to the label identifier of the cluster. neighboring_labels is a concatenated list of the - neighboring_labels sets of each topic. Its sorted by max_num_neighboring_labels in descending - order. There is one single element for each cluster. - - """ - clusters = [] - - for label, group in grouped_by_labels.items(): - max_num_neighboring_labels = 0 - neighboring_labels = [] # will be a list of sets - - for topic in group: - max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels) - neighboring_labels.append(topic["neighboring_labels"]) - - neighboring_labels = [x for x in neighboring_labels if len(x) > 0] - - clusters.append({ - "max_num_neighboring_labels": max_num_neighboring_labels, - "neighboring_labels": neighboring_labels, - "label": label, - "num_cores": len([topic for topic in group if topic["is_core"]]), - }) - - logger.info("found %s clusters", len(clusters)) - - return clusters - - def _remove_from_all_sets(self, label, clusters): - """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" - for cluster in clusters: - for neighboring_labels_set in cluster["neighboring_labels"]: - if label in neighboring_labels_set: - neighboring_labels_set.remove(label) - - def _contains_isolated_cores(self, label, cluster, min_cores): - """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" - return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores - def _generate_stable_topics(self, min_cores=None): """Generate stable topics out of the clusters. @@ -1028,9 +1034,9 @@ def _generate_stable_topics(self, min_cores=None): results = self.cluster_model.results - grouped_by_labels = self._group_by_labels(results) + grouped_by_labels = _group_by_labels(results) - clusters = self._aggregate_topics(grouped_by_labels) + clusters = _aggregate_topics(grouped_by_labels) # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. @@ -1049,17 +1055,17 @@ def _generate_stable_topics(self, min_cores=None): cluster["is_valid"] = None if cluster["num_cores"] < min_cores: cluster["is_valid"] = False - self._remove_from_all_sets(cluster["label"], sorted_clusters) + _remove_from_all_sets(cluster["label"], sorted_clusters) # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any # other cluster. 
for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: label = cluster["label"] - if self._contains_isolated_cores(label, cluster, min_cores): + if _contains_isolated_cores(label, cluster, min_cores): cluster["is_valid"] = True else: cluster["is_valid"] = False - self._remove_from_all_sets(label, sorted_clusters) + _remove_from_all_sets(label, sorted_clusters) # list of all the label numbers that are valid valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]]) @@ -1068,7 +1074,7 @@ def _generate_stable_topics(self, min_cores=None): topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels} # keeping only VALID cores - valid_core_mask = np.vectorize(self._is_valid_core)(results) + valid_core_mask = np.vectorize(_is_valid_core)(results) valid_topics = self.ttda[valid_core_mask] topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask] unique_labels = np.unique(topic_labels) From 8647aea9a30b1089dbd9c687846d8584684b37a0 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:26:39 +0200 Subject: [PATCH 145/166] module-level constant --- gensim/models/ensemblelda.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 8f70de146a..7e0fe76f59 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -109,6 +109,10 @@ logger = logging.getLogger(__name__) +# _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by skipping +# distance calculations for highly masked topic-term distributions +_COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 + def _is_valid_core(topic): """Check if the topic is a valid core. @@ -319,10 +323,6 @@ def __init__( # nps max random state of 2**32 - 1 is too large for windows: self._MAX_RANDOM_STATE = np.iinfo(np.int32).max - # _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by skipping - # distance calculations for highly masked topic-term distributions - self._COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 - if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None if "corpus" not in gensim_kw_args: @@ -967,7 +967,7 @@ def rank_masking(a): # Smart distance calculation avoids calculating cosine distance for highly masked topic-term # distributions that will have distance values near 1. 
- if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD: + if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD: distance = 1 else: distance = cosine(ttd1_masked, ttd2_masked) From 5109799d1432c99767bc0beda51709394042ffe5 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:28:35 +0200 Subject: [PATCH 146/166] assert and no return --- gensim/models/ensemblelda.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 7e0fe76f59..aa610412eb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -840,7 +840,6 @@ def _generate_asymmetric_distance_matrix(self): self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method, ) - return self.asymmetric_distance_matrix # else, if workers > 1 use multiprocessing # best performance on 2-core machine: 2 workers @@ -894,8 +893,6 @@ def _generate_asymmetric_distance_matrix(self): self.asymmetric_distance_matrix = np.concatenate(distances) - return self.asymmetric_distance_matrix - def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. @@ -1140,7 +1137,7 @@ def get_topics(self): """ return self.stable_topics - def _has_gensim_representation(self): + def _ensure_gensim_representation(self): """Check if stable topics and the internal gensim representation exist. Raise an error if not.""" if self.classic_model_representation is None: if len(self.stable_topics) == 0: @@ -1150,22 +1147,22 @@ def _has_gensim_representation(self): def __getitem__(self, i): """See :meth:`gensim.models.LdaModel.__getitem__`.""" - self._has_gensim_representation() + self._ensure_gensim_representation() return self.classic_model_representation[i] def inference(self, *posargs, **kwargs): """See :meth:`gensim.models.LdaModel.inference`.""" - self._has_gensim_representation() + self._ensure_gensim_representation() return self.classic_model_representation.inference(*posargs, **kwargs) def log_perplexity(self, *posargs, **kwargs): """See :meth:`gensim.models.LdaModel.log_perplexity`.""" - self._has_gensim_representation() + self._ensure_gensim_representation() return self.classic_model_representation.log_perplexity(*posargs, **kwargs) def print_topics(self, *posargs, **kwargs): """See :meth:`gensim.models.LdaModel.print_topics`.""" - self._has_gensim_representation() + self._ensure_gensim_representation() return self.classic_model_representation.print_topics(*posargs, **kwargs) @property From 0dd9f29c76cbd16b7a1f78631b9873d39fa6d185 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:30:54 +0200 Subject: [PATCH 147/166] static _calculate_asymmetric_distance_matrix_chunk --- gensim/models/ensemblelda.py | 171 ++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 85 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index aa610412eb..126db88622 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -239,6 +239,90 @@ def _teardown(pipes, processes, i): del process +def _calculate_asymmetric_distance_matrix_chunk(ttda1, ttda2, threshold, start_index, method): + """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. 
+
+    Parameters
+    ----------
+    ttda1 and ttda2: 2D arrays of floats
+        Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+        topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
+    threshold : float, optional
+        threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method
+    start_index : int
+        this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+        complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
+        two pieces, each 100 ttdas long, then start_index should be 100. default is 0
+    method : {'mass', 'rank'}, optional
+        method can be "mass" for the original masking method or "rank" for a faster masking method that selects
+        by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the
+        topic.
+
+    Returns
+    -------
+    2D numpy.ndarray of floats
+        Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
+
+    """
+    # initialize the distance matrix. ndarray is faster than zeros
+    distances = np.ndarray((len(ttda1), len(ttda2)))
+
+    if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
+        # the worker might not have received a ttda because it was chunked up too much
+
+        if method not in ["mass", "rank"]:
+            raise ValueError(f"method {method} unknown")
+
+        # select masking method:
+        def mass_masking(a):
+            """Original masking method. Returns a new binary mask."""
+            sorted_a = np.sort(a)[::-1]
+            largest_mass = sorted_a.cumsum() < threshold
+            smallest_valid = sorted_a[largest_mass][-1]
+            return a >= smallest_valid
+
+        def rank_masking(a):
+            """Faster masking method. Returns a new binary mask."""
+            return a > np.sort(a)[::-1][int(len(a) * threshold)]
+
+        create_mask = {"mass": mass_masking, "rank": rank_masking}[method]
+
+        # some help to find a better threshold by useful log messages
+        avg_mask_size = 0
+
+        # now iterate over each topic
+        for ttd1_idx, ttd1 in enumerate(ttda1):
+            # create mask from ttd1 that removes noise from a and keeps the largest terms
+            mask = create_mask(ttd1)
+            ttd1_masked = ttd1[mask]
+
+            avg_mask_size += mask.sum()
+
+            # now look at every possible pair for topic a:
+            for ttd2_idx, ttd2 in enumerate(ttda2):
+                # distance to itself is 0
+                if ttd1_idx + start_index == ttd2_idx:
+                    distances[ttd1_idx][ttd2_idx] = 0
+                    continue
+
+                # now mask b based on a, which will force the shape of a onto b
+                ttd2_masked = ttd2[mask]
+
+                # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
+                # distributions that will have distance values near 1.
+                if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD:
+                    distance = 1
+                else:
+                    distance = cosine(ttd1_masked, ttd2_masked)
+
+                distances[ttd1_idx][ttd2_idx] = distance
+
+        percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
+        logger.info(f'the given threshold of {threshold} covered on average {percent}% of tokens')
+
+    return distances
+
+
 class EnsembleLda(SaveLoad):
     """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
@@ -809,7 +893,7 @@ def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pip logger.info(f"spawned worker to generate {n_ttdas} rows of the asymmetric distance matrix") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] - distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( + distance_chunk = _calculate_asymmetric_distance_matrix_chunk( ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method, ) pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory @@ -837,7 +921,7 @@ def _generate_asymmetric_distance_matrix(self): # singlecore if workers is not None and workers <= 1: - self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( + self.asymmetric_distance_matrix = _calculate_asymmetric_distance_matrix_chunk( ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method, ) @@ -893,89 +977,6 @@ def _generate_asymmetric_distance_matrix(self): self.asymmetric_distance_matrix = np.concatenate(distances) - def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): - """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. - - Parameters - ---------- - ttda1 and ttda2: 2D arrays of floats - Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one - topic. Each cell in the resulting matrix corresponds to the distance between a topic pair. - threshold : float, optional - threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method - start_index : int - this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the - complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into - two pieces, each 100 ttdas long, then start_index should be be 100. default is 0 - method : {'mass', 'rank}, optional - method can be "mass" for the original masking method or "rank" for a faster masking method that selects - by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the - topic. - - Returns - ------- - 2D numpy.ndarray of floats - Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``. - - """ - # initialize the distance matrix. ndarray is faster than zeros - distances = np.ndarray((len(ttda1), len(ttda2))) - - if ttda1.shape[0] > 0 and ttda2.shape[0] > 0: - # the worker might not have received a ttda because it was chunked up too much - - if method not in ["mass", "rank"]: - raise ValueError(f"method {method} unknown") - - # select masking method: - def mass_masking(a): - """Original masking method. Returns a new binary mask.""" - sorted_a = np.sort(a)[::-1] - largest_mass = sorted_a.cumsum() < threshold - smallest_valid = sorted_a[largest_mass][-1] - return a >= smallest_valid - - def rank_masking(a): - """Faster masking method. 
Returns a new binary mask.""" - return a > np.sort(a)[::-1][int(len(a) * threshold)] - - create_mask = {"mass": mass_masking, "rank": rank_masking}[method] - - # some help to find a better threshold by useful log messages - avg_mask_size = 0 - - # now iterate over each topic - for ttd1_idx, ttd1 in enumerate(ttda1): - # create mask from ttd1 that removes noise from a and keeps the largest terms - mask = create_mask(ttd1) - ttd1_masked = ttd1[mask] - - avg_mask_size += mask.sum() - - # now look at every possible pair for topic a: - for ttd2_idx, ttd2 in enumerate(ttda2): - # distance to itself is 0 - if ttd1_idx + start_index == ttd2_idx: - distances[ttd1_idx][ttd2_idx] = 0 - continue - - # now mask b based on a, which will force the shape of a onto b - ttd2_masked = ttd2[mask] - - # Smart distance calculation avoids calculating cosine distance for highly masked topic-term - # distributions that will have distance values near 1. - if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD: - distance = 1 - else: - distance = cosine(ttd1_masked, ttd2_masked) - - distances[ttd1_idx][ttd2_idx] = distance - - percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) - logger.info(f'the given threshold of {threshold} covered on average {percent}% of tokens') - - return distances - def _generate_topic_clusters(self, eps=0.1, min_samples=None): """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices. From 78a16b4855d46d520639ecf3056ee800b60008cc Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 21:09:04 +0200 Subject: [PATCH 148/166] better variable names and data types --- gensim/models/ensemblelda.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 126db88622..17654fc676 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -181,12 +181,13 @@ def _aggregate_topics(grouped_by_labels): return clusters -def _group_by_labels(results): +def _group_by_labels(cbdbscan_topics): """Group all the learned cores by their label, which was assigned in the cluster_model. Parameters ---------- - results : {list of {'is_core', 'neighboring_labels', 'label'}} + cbdbscan_topics : {list of {'is_core', 'neighboring_labels', 'label'}} + A list of topic data resulting from fitting a :class:`~CBDBSCAN` object. After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results member, which can be used as the argument to this function. It's a list of infos gathered during the clustering step and each element in the list corresponds to a single topic. 
@@ -200,7 +201,7 @@ def _group_by_labels(results): """ grouped_by_labels = {} - for topic in results: + for topic in cbdbscan_topics: if topic["is_core"]: topic = topic.copy() @@ -1030,9 +1031,9 @@ def _generate_stable_topics(self, min_cores=None): else: logger.info("generating stable topics") - results = self.cluster_model.results + cbdbscan_topics = self.cluster_model.results - grouped_by_labels = _group_by_labels(results) + grouped_by_labels = _group_by_labels(cbdbscan_topics) clusters = _aggregate_topics(grouped_by_labels) @@ -1066,15 +1067,15 @@ def _generate_stable_topics(self, min_cores=None): _remove_from_all_sets(label, sorted_clusters) # list of all the label numbers that are valid - valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]]) + valid_labels = {cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]} - for topic in results: + for topic in cbdbscan_topics: topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels} # keeping only VALID cores - valid_core_mask = np.vectorize(_is_valid_core)(results) + valid_core_mask = np.vectorize(_is_valid_core)(cbdbscan_topics) valid_topics = self.ttda[valid_core_mask] - topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask] + topic_labels = np.array([topic["label"] for topic in cbdbscan_topics])[valid_core_mask] unique_labels = np.unique(topic_labels) num_stable_topics = len(unique_labels) From 8c10975814ff6bfe905f5038592d018c8ffc5bf4 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 30 Jun 2021 21:45:00 +0200 Subject: [PATCH 149/166] refactoring --- gensim/models/ensemblelda.py | 110 +++++++++++++++++--------------- gensim/test/test_ensemblelda.py | 4 +- 2 files changed, 62 insertions(+), 52 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 17654fc676..71c9874c2e 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -115,15 +115,15 @@ def _is_valid_core(topic): - """Check if the topic is a valid core. + """Check if the topic is a valid core, i.e. no neighboring valid cluster is overlapping with it. Parameters ---------- - topic : {'is_core', 'valid_parents', 'label'} + topic : {'is_core', 'valid_neighboring_labels', 'label'} topic to validate """ - return topic["is_core"] and (topic["valid_parents"] == {topic["label"]}) + return topic["is_core"] and (topic["valid_neighboring_labels"] == {topic["label"]}) def _remove_from_all_sets(label, clusters): @@ -201,6 +201,7 @@ def _group_by_labels(cbdbscan_topics): """ grouped_by_labels = {} + for topic in cbdbscan_topics: if topic["is_core"]: topic = topic.copy() @@ -212,6 +213,7 @@ def _group_by_labels(cbdbscan_topics): if label not in grouped_by_labels: grouped_by_labels[label] = [] grouped_by_labels[label].append(topic) + return grouped_by_labels @@ -240,6 +242,19 @@ def _teardown(pipes, processes, i): del process +def mass_masking(a, threshold): + """Original masking method. Returns a new binary mask.""" + sorted_a = np.sort(a)[::-1] + largest_mass = sorted_a.cumsum() < threshold + smallest_valid = sorted_a[largest_mass][-1] + return a >= smallest_valid + + +def rank_masking(a, threshold): + """Faster masking method. Returns a new binary mask.""" + return a > np.sort(a)[::-1][int(len(a) * threshold)] + + def _calculate_asymmetric_distance_matrix_chunk(ttda1, ttda2, threshold, start_index, method): """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. 
@@ -275,17 +290,6 @@ def _calculate_asymmetric_distance_matrix_chunk(ttda1, ttda2, threshold, start_i raise ValueError(f"method {method} unknown") # select masking method: - def mass_masking(a): - """Original masking method. Returns a new binary mask.""" - sorted_a = np.sort(a)[::-1] - largest_mass = sorted_a.cumsum() < threshold - smallest_valid = sorted_a[largest_mass][-1] - return a >= smallest_valid - - def rank_masking(a): - """Faster masking method. Returns a new binary mask.""" - return a > np.sort(a)[::-1][int(len(a) * threshold)] - create_mask = {"mass": mass_masking, "rank": rank_masking}[method] # some help to find a better threshold by useful log messages @@ -294,7 +298,7 @@ def rank_masking(a): # now iterate over each topic for ttd1_idx, ttd1 in enumerate(ttda1): # create mask from ttd1 that removes noise from a and keeps the largest terms - mask = create_mask(ttd1) + mask = create_mask(ttd1, threshold) ttd1_masked = ttd1[mask] avg_mask_size += mask.sum() @@ -324,6 +328,40 @@ def rank_masking(a): return distances +def _validate_clusters(clusters, min_cores): + """Check which clusters from the cbdbscan step are significant enough. is_valid is set accordingly.""" + # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the + # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. + # This clears up the clusters a bit before checking the ones with many neighbors. + sorted_clusters = sorted( + clusters, + key=lambda cluster: ( + cluster["max_num_neighboring_labels"], + cluster["num_cores"], + cluster["label"], + ), + reverse=False, + ) + + for cluster in sorted_clusters: + cluster["is_valid"] = None + if cluster["num_cores"] < min_cores: + cluster["is_valid"] = False + _remove_from_all_sets(cluster["label"], sorted_clusters) + + # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any + # other cluster. + for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: + label = cluster["label"] + if _contains_isolated_cores(label, cluster, min_cores): + cluster["is_valid"] = True + else: + cluster["is_valid"] = False + _remove_from_all_sets(label, sorted_clusters) + + return [cluster for cluster in sorted_clusters if cluster["is_valid"]] + + class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. @@ -1034,43 +1072,15 @@ def _generate_stable_topics(self, min_cores=None): cbdbscan_topics = self.cluster_model.results grouped_by_labels = _group_by_labels(cbdbscan_topics) - clusters = _aggregate_topics(grouped_by_labels) - - # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the - # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. - # This clears up the clusters a bit before checking the ones with many neighbors. - sorted_clusters = sorted( - clusters, - key=lambda cluster: ( - cluster["max_num_neighboring_labels"], - cluster["num_cores"], - cluster["label"], - ), - reverse=False, - ) - - for cluster in sorted_clusters: - cluster["is_valid"] = None - if cluster["num_cores"] < min_cores: - cluster["is_valid"] = False - _remove_from_all_sets(cluster["label"], sorted_clusters) - - # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any - # other cluster. 
- for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: - label = cluster["label"] - if _contains_isolated_cores(label, cluster, min_cores): - cluster["is_valid"] = True - else: - cluster["is_valid"] = False - _remove_from_all_sets(label, sorted_clusters) - - # list of all the label numbers that are valid - valid_labels = {cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]} + valid_clusters = _validate_clusters(clusters, min_cores) + valid_cluster_labels = {cluster["label"] for cluster in valid_clusters} for topic in cbdbscan_topics: - topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels} + topic["valid_neighboring_labels"] = { + label for label in topic["neighboring_labels"] + if label in valid_cluster_labels + } # keeping only VALID cores valid_core_mask = np.vectorize(_is_valid_core)(cbdbscan_topics) @@ -1087,7 +1097,7 @@ def _generate_stable_topics(self, min_cores=None): topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label]) stable_topics[l] = topics_of_cluster.mean(axis=0) - self.sorted_clusters = sorted_clusters + self.valid_clusters = valid_clusters self.stable_topics = stable_topics logger.info("found %s stable topics", len(stable_topics)) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 8d021d3658..4288f9ef98 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -84,7 +84,7 @@ def test_clustering(self): reference = EnsembleLda.load(datapath('ensemblelda')) cluster_model_results = deepcopy(reference.cluster_model.results) - sorted_clusters = deepcopy(reference.sorted_clusters) + valid_clusters = deepcopy(reference.valid_clusters) stable_topics = deepcopy(reference.get_topics()) # continue training with the distance matrix of the pretrained reference and see if @@ -93,7 +93,7 @@ def test_clustering(self): reference.recluster() self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) - self.assertEqual(reference.sorted_clusters, sorted_clusters) + self.assertEqual(reference.valid_clusters, valid_clusters) np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=rtol) def test_not_trained(self): From dbb758121d8177d67bb362c6669782d4d52c4ebe Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 21:26:23 +0200 Subject: [PATCH 150/166] pythonic varnames + pytest style asserts --- gensim/models/ensemblelda.py | 4 +- gensim/test/test_ensemblelda.py | 347 ++++++++++++++++---------------- 2 files changed, 177 insertions(+), 174 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 17654fc676..e480ba07a2 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -327,7 +327,7 @@ def rank_masking(a): class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. - Extracts stable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that + Extracts stable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time. 
""" @@ -454,7 +454,7 @@ def __init__( self.sstats_sum = 0 self.eta = None self.tms = [] - # initialize empty topic term distribution array + # initialize empty 2D topic term distribution array (ttda) (number of topics x number of terms) self.ttda = np.empty((0, len(gensim_kw_args["id2word"]))) self.asymmetric_distance_matrix_outdated = True diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 8d021d3658..44afb0e4a6 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -14,15 +14,17 @@ import numpy as np from copy import deepcopy +import pytest + from gensim.models import EnsembleLda, LdaMulticore, LdaModel from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary -num_topics = 2 -num_models = 4 -passes = 50 +NUM_TOPICS = 2 +NUM_MODELS = 4 +PASSES = 50 # windows tests fail due to the required assertion precision being too high -rtol = 1e-04 if os.name == 'nt' else 1e-05 +RTOL = 1e-04 if os.name == 'nt' else 1e-05 class TestModel(unittest.TestCase): @@ -31,49 +33,49 @@ def setUp(self): # the topics are equal random_state = 0 - self.eLDA = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, + self.elda = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=PASSES, num_models=NUM_MODELS, random_state=random_state, topic_model_class=LdaModel, ) - self.eLDA_mu = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, + self.elda_mem_unfriendly = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=PASSES, num_models=NUM_MODELS, random_state=random_state, memory_friendly_ttda=False, topic_model_class=LdaModel, ) - def check_ttda(self, ensemble): - """tests the integrity of the ttda of any ensemble""" - self.assertGreater(len(ensemble.ttda), 0) - a = ensemble.ttda.sum(axis=1) - b = np.ones(len(ensemble.ttda)).astype(np.float32) - np.testing.assert_allclose(a, b, rtol=1e-04) + def assert_ttda_is_valid(self, ensemble): + """Check that ttda has one or more topic and that term probabilities add to one.""" + assert len(ensemble.ttda) > 0 + sum_over_terms = ensemble.ttda.sum(axis=1) + expected_sum_over_terms = np.ones(len(ensemble.ttda)).astype(np.float32) + np.testing.assert_allclose(sum_over_terms, expected_sum_over_terms, rtol=1e-04) - def test_eLDA(self): + def test_elda(self): # given that the random_state doesn't change, it should # always be 2 detected topics in this setup. 
- self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) - self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) - self.check_ttda(self.eLDA) + assert self.elda.stable_topics.shape[1] == len(common_dictionary) + assert len(self.elda.ttda) == NUM_MODELS * NUM_TOPICS + self.assert_ttda_is_valid(self.elda) # reclustering shouldn't change anything without # added models or different parameters - self.eLDA.recluster() + self.elda.recluster() - self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) - self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) - self.check_ttda(self.eLDA) + assert self.elda.stable_topics.shape[1] == len(common_dictionary) + assert len(self.elda.ttda) == NUM_MODELS * NUM_TOPICS + self.assert_ttda_is_valid(self.elda) # compare with a pre-trained reference model reference = EnsembleLda.load(datapath('ensemblelda')) - np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=rtol) + np.testing.assert_allclose(self.elda.ttda, reference.ttda, rtol=RTOL) # small values in the distance matrix tend to vary quite a bit around 2%, # so use some absolute tolerance measurement to check if the matrix is at least # close to the target. atol = reference.asymmetric_distance_matrix.max() * 1e-05 np.testing.assert_allclose( - self.eLDA.asymmetric_distance_matrix, + self.elda.asymmetric_distance_matrix, reference.asymmetric_distance_matrix, atol=atol, ) @@ -93,54 +95,54 @@ def test_clustering(self): reference.recluster() self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) - self.assertEqual(reference.sorted_clusters, sorted_clusters) - np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=rtol) + assert reference.sorted_clusters == sorted_clusters + np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=RTOL) def test_not_trained(self): # should not throw errors and no training should happen # 0 passes - eLDA = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=0, num_models=num_models, random_state=0, + elda = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=0, num_models=NUM_MODELS, random_state=0, ) - self.assertEqual(len(eLDA.ttda), 0) + assert len(elda.ttda) == 0 # no corpus - eLDA = EnsembleLda( - id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=0, + elda = EnsembleLda( + id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=PASSES, num_models=NUM_MODELS, random_state=0, ) - self.assertEqual(len(eLDA.ttda), 0) + assert len(elda.ttda) == 0 # 0 iterations - eLDA = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - iterations=0, num_models=num_models, random_state=0, + elda = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + iterations=0, num_models=NUM_MODELS, random_state=0, ) - self.assertEqual(len(eLDA.ttda), 0) + assert len(elda.ttda) == 0 # 0 models - eLDA = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=0, random_state=0 + elda = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=PASSES, num_models=0, random_state=0 ) - self.assertEqual(len(eLDA.ttda), 0) + assert len(elda.ttda) == 0 - def test_memory_unfriendly(self): - # at this point, self.eLDA_mu and self.eLDA are already trained + def 
test_mem_unfriendly(self): + # at this point, self.elda_mem_unfriendly and self.eLDA are already trained # in the setUpClass function # both should be 100% similar (but floats cannot # be compared that easily, so check for threshold) - self.assertEqual(len(self.eLDA_mu.tms), num_models) - np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=rtol) - np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=rtol) - self.check_ttda(self.eLDA_mu) + assert len(self.elda_mem_unfriendly.tms) == NUM_MODELS + np.testing.assert_allclose(self.elda.ttda, self.elda_mem_unfriendly.ttda, rtol=RTOL) + np.testing.assert_allclose(self.elda.get_topics(), self.elda_mem_unfriendly.get_topics(), rtol=RTOL) + self.assert_ttda_is_valid(self.elda_mem_unfriendly) def test_generate_gensim_rep(self): - gensimModel = self.eLDA.generate_gensim_representation() - topics = gensimModel.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) + gensim_model = self.elda.generate_gensim_representation() + topics = gensim_model.get_topics() + np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) def assert_cluster_results_equal(self, a, b): """compares important attributes of the cluster results""" @@ -155,38 +157,38 @@ def assert_cluster_results_equal(self, a, b): def test_persisting(self): fname = get_tmpfile('gensim_models_ensemblelda') - self.eLDA.save(fname) - loaded_eLDA = EnsembleLda.load(fname) + self.elda.save(fname) + loaded_elda = EnsembleLda.load(fname) # storing the ensemble without memory_friendy_ttda - self.eLDA_mu.save(fname) - loaded_eLDA_mu = EnsembleLda.load(fname) + self.elda_mem_unfriendly.save(fname) + loaded_elda_mu = EnsembleLda.load(fname) # topic_model_class will be lazy loaded and should be None first - assert loaded_eLDA.topic_model_class is None + assert loaded_elda.topic_model_class is None # was it stored and loaded correctly? # memory friendly. 
- loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() + loaded_elda_representation = loaded_elda.generate_gensim_representation() # generating the representation also lazily loads the topic_model_class - assert loaded_eLDA.topic_model_class == LdaModel + assert loaded_elda.topic_model_class == LdaModel - topics = loaded_eLDA_representation.get_topics() - ttda = loaded_eLDA.ttda - amatrix = loaded_eLDA.asymmetric_distance_matrix - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) - np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=rtol) - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=rtol) + topics = loaded_elda_representation.get_topics() + ttda = loaded_elda.ttda + amatrix = loaded_elda.asymmetric_distance_matrix + np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) + np.testing.assert_allclose(self.elda.ttda, ttda, rtol=RTOL) + np.testing.assert_allclose(self.elda.asymmetric_distance_matrix, amatrix, rtol=RTOL) - a = self.eLDA.cluster_model.results - b = loaded_eLDA.cluster_model.results + a = self.elda.cluster_model.results + b = loaded_elda.cluster_model.results self.assert_cluster_results_equal(a, b) # memory unfriendly - loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() - topics = loaded_eLDA_mu_representation.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) + loaded_elda_mem_unfriendly_representation = loaded_elda_mu.generate_gensim_representation() + topics = loaded_elda_mem_unfriendly_representation.get_topics() + np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) def test_multiprocessing(self): # same configuration @@ -198,40 +200,40 @@ def test_multiprocessing(self): workers = 3 # memory friendly. contains List of topic word distributions - eLDA_multi = EnsembleLda( + elda_multiprocessing = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, - num_topics=num_topics, passes=passes, num_models=num_models, + num_topics=NUM_TOPICS, passes=PASSES, num_models=NUM_MODELS, random_state=random_state, ensemble_workers=workers, distance_workers=workers, ) # memory unfriendly. 
contains List of models - eLDA_multi_mu = EnsembleLda( + elda_multiprocessing_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, - num_topics=num_topics, passes=passes, num_models=num_models, + num_topics=NUM_TOPICS, passes=PASSES, num_models=NUM_MODELS, random_state=random_state, ensemble_workers=workers, distance_workers=workers, memory_friendly_ttda=False, ) - np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=rtol) - np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=rtol) + np.testing.assert_allclose(self.elda.get_topics(), elda_multiprocessing.get_topics(), rtol=RTOL) + np.testing.assert_allclose(self.elda_mem_unfriendly.get_topics(), elda_multiprocessing_mem_unfriendly.get_topics(), rtol=RTOL) def test_add_models_to_empty(self): ensemble = EnsembleLda(id2word=common_dictionary, num_models=0) - ensemble.add_model(self.eLDA.ttda[0:1]) - ensemble.add_model(self.eLDA.ttda[1:]) + ensemble.add_model(self.elda.ttda[0:1]) + ensemble.add_model(self.elda.ttda[1:]) ensemble.recluster() - np.testing.assert_allclose(ensemble.get_topics(), self.eLDA.get_topics(), rtol=rtol) + np.testing.assert_allclose(ensemble.get_topics(), self.elda.get_topics(), rtol=RTOL) # persisting an ensemble that is entirely built from existing ttdas fname = get_tmpfile('gensim_models_ensemblelda') ensemble.save(fname) loaded_ensemble = EnsembleLda.load(fname) - np.testing.assert_allclose(loaded_ensemble.get_topics(), self.eLDA.get_topics(), rtol=rtol) + np.testing.assert_allclose(loaded_ensemble.get_topics(), self.elda.get_topics(), rtol=RTOL) self.test_inference(loaded_ensemble) def test_add_models(self): # same configuration - num_models = self.eLDA.num_models + num_models = self.elda.num_models # make sure countings and sizes after adding are correct # create new models and add other models to them. @@ -244,7 +246,7 @@ def test_add_models(self): num_new_topics = 3 # 1. 
memory friendly - eLDA_base = EnsembleLda( + elda_base = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -252,47 +254,47 @@ def test_add_models(self): ) # 1.1 ttda - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(self.eLDA.ttda) - self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix + a = len(elda_base.ttda) + b = elda_base.num_models + elda_base.add_model(self.elda.ttda) + assert len(elda_base.ttda) == a + len(self.elda.ttda) + assert elda_base.num_models == b + 1 # defaults to 1 for one ttda matrix # 1.2 an ensemble - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(self.eLDA, 5) - self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 5) + a = len(elda_base.ttda) + b = elda_base.num_models + elda_base.add_model(self.elda, 5) + assert len(elda_base.ttda) == a + len(self.elda.ttda) + assert elda_base.num_models == b + 5 # 1.3 a list of ensembles - a = len(eLDA_base.ttda) - b = eLDA_base.num_models + a = len(elda_base.ttda) + b = elda_base.num_models # it should be totally legit to add a memory unfriendly object to a memory friendly one - eLDA_base.add_model([self.eLDA, self.eLDA_mu]) - self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 2 * num_models) + elda_base.add_model([self.elda, self.elda_mem_unfriendly]) + assert len(elda_base.ttda) == a + 2 * len(self.elda.ttda) + assert elda_base.num_models == b + 2 * num_models # 1.4 a single gensim model - model = self.eLDA.classic_model_representation + model = self.elda.classic_model_representation - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(model) - self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) - self.assertEqual(eLDA_base.num_models, b + 1) + a = len(elda_base.ttda) + b = elda_base.num_models + elda_base.add_model(model) + assert len(elda_base.ttda) == a + len(model.get_topics()) + assert elda_base.num_models == b + 1 # 1.5 a list gensim models - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model([model, model]) - self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) - self.assertEqual(eLDA_base.num_models, b + 2) + a = len(elda_base.ttda) + b = elda_base.num_models + elda_base.add_model([model, model]) + assert len(elda_base.ttda) == a + 2 * len(model.get_topics()) + assert elda_base.num_models == b + 2 - self.check_ttda(eLDA_base) + self.assert_ttda_is_valid(elda_base) # 2. 
memory unfriendly - eLDA_base_mu = EnsembleLda( + elda_base_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -300,44 +302,45 @@ def test_add_models(self): ) # 2.1 a single ensemble - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu) - self.assertEqual(len(eLDA_base_mu.tms), a + num_models) - self.assertEqual(eLDA_base_mu.num_models, b + num_models) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly) + assert len(elda_base_mem_unfriendly.tms) == a + num_models + assert elda_base_mem_unfriendly.num_models == b + num_models # 2.2 a list of ensembles - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) - self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) - self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + elda_base_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) + assert len(elda_base_mem_unfriendly.tms) == a + 2 * num_models + assert elda_base_mem_unfriendly.num_models == b + 2 * num_models # 2.3 a single gensim model - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) - self.assertEqual(len(eLDA_base_mu.tms), a + 1) - self.assertEqual(eLDA_base_mu.num_models, b + 1) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) + assert len(elda_base_mem_unfriendly.tms) == a + 1 + assert elda_base_mem_unfriendly.num_models == b + 1 # 2.4 a list of gensim models - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu.tms) - self.assertEqual(len(eLDA_base_mu.tms), a + num_models) - self.assertEqual(eLDA_base_mu.num_models, b + num_models) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) + assert len(elda_base_mem_unfriendly.tms) == a + num_models + assert elda_base_mem_unfriendly.num_models == b + num_models # 2.5 topic term distributions should throw errors, because the # actual models are needed for the memory unfriendly ensemble - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + with pytest.raises(ValueError): + elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) # remains unchanged - self.assertEqual(len(eLDA_base_mu.tms), a) - self.assertEqual(eLDA_base_mu.num_models, b) + assert len(elda_base_mem_unfriendly.tms) == a + assert elda_base_mem_unfriendly.num_models == b - self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) - self.check_ttda(eLDA_base_mu) + assert elda_base_mem_unfriendly.num_models == len(elda_base_mem_unfriendly.tms) + self.assert_ttda_is_valid(elda_base_mem_unfriendly) def test_add_and_recluster(self): # See if after adding a model, the model still makes sense @@ -345,73 +348,73 @@ def test_add_and_recluster(self): num_new_topics = 3 # train - new_eLDA = EnsembleLda( + elda = 
EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, iterations=30, random_state=1, topic_model_class='lda', distance_workers=4, ) - new_eLDA_mu = EnsembleLda( + elda_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, iterations=30, random_state=1, topic_model_class=LdaModel, distance_workers=4, memory_friendly_ttda=False, ) # both should be similar - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) + np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL) # and every next step applied to both should result in similar results # 1. adding to ttda and tms - new_eLDA.add_model(self.eLDA) - new_eLDA_mu.add_model(self.eLDA_mu) - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) - self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) - self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) - self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) - self.check_ttda(new_eLDA) - self.check_ttda(new_eLDA_mu) + elda.add_model(self.elda) + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) + np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) + assert len(elda.ttda) == len(self.elda.ttda) + num_new_models * num_new_topics + assert len(elda_mem_unfriendly.ttda) == len(self.elda_mem_unfriendly.ttda) + num_new_models * num_new_topics + assert len(elda_mem_unfriendly.tms) == NUM_MODELS + num_new_models + self.assert_ttda_is_valid(elda) + self.assert_ttda_is_valid(elda_mem_unfriendly) # 2. distance matrix - new_eLDA._generate_asymmetric_distance_matrix() - new_eLDA_mu._generate_asymmetric_distance_matrix() + elda._generate_asymmetric_distance_matrix() + elda_mem_unfriendly._generate_asymmetric_distance_matrix() np.testing.assert_allclose( - new_eLDA.asymmetric_distance_matrix, - new_eLDA_mu.asymmetric_distance_matrix, + elda.asymmetric_distance_matrix, + elda_mem_unfriendly.asymmetric_distance_matrix, ) # 3. CBDBSCAN results - new_eLDA._generate_topic_clusters() - new_eLDA_mu._generate_topic_clusters() - a = new_eLDA.cluster_model.results - b = new_eLDA_mu.cluster_model.results + elda._generate_topic_clusters() + elda_mem_unfriendly._generate_topic_clusters() + a = elda.cluster_model.results + b = elda_mem_unfriendly.cluster_model.results self.assert_cluster_results_equal(a, b) # 4. 
finally, the stable topics - new_eLDA._generate_stable_topics() - new_eLDA_mu._generate_stable_topics() + elda._generate_stable_topics() + elda_mem_unfriendly._generate_stable_topics() np.testing.assert_allclose( - new_eLDA.get_topics(), - new_eLDA_mu.get_topics(), + elda.get_topics(), + elda_mem_unfriendly.get_topics(), ) - new_eLDA.generate_gensim_representation() - new_eLDA_mu.generate_gensim_representation() + elda.generate_gensim_representation() + elda_mem_unfriendly.generate_gensim_representation() # same random state, hence topics should be still similar - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) + np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL) - def test_inference(self, eLDA=None): - if eLDA is None: - eLDA = self.eLDA + def test_inference(self, elda=None): + if elda is None: + elda = self.elda # get the most likely token id from topic 0 - max_id = np.argmax(eLDA.get_topics()[0, :]) - self.assertGreater(eLDA.classic_model_representation.iterations, 0) + max_id = np.argmax(elda.get_topics()[0, :]) + assert elda.classic_model_representation.iterations > 0 # topic 0 should be dominant in the inference. # the difference between the probabilities should be significant and larger than 0.3 - infered = eLDA[[(max_id, 1)]] - self.assertGreater(infered[0][1] - 0.3, infered[1][1]) + inferred = elda[[(max_id, 1)]] + assert inferred[0][1] - 0.3 > inferred[1][1] if __name__ == '__main__': From 06cc33a49882506087383b16f1c438c277915431 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 22:24:22 +0200 Subject: [PATCH 151/166] better var names --- gensim/test/test_ensemblelda.py | 113 ++++++++++++++++---------------- 1 file changed, 55 insertions(+), 58 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 38ada1caaa..98720ea7d1 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -232,9 +232,6 @@ def test_add_models_to_empty(self): self.test_inference(loaded_ensemble) def test_add_models(self): - # same configuration - num_models = self.elda.num_models - # make sure countings and sizes after adding are correct # create new models and add other models to them. @@ -246,7 +243,7 @@ def test_add_models(self): num_new_topics = 3 # 1. 
memory friendly - elda_base = EnsembleLda( + elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -254,47 +251,47 @@ def test_add_models(self): ) # 1.1 ttda - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(self.elda.ttda) - assert len(elda_base.ttda) == a + len(self.elda.ttda) - assert elda_base.num_models == b + 1 # defaults to 1 for one ttda matrix + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(self.elda.ttda) + assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 1 # defaults to 1 for one ttda matrix # 1.2 an ensemble - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(self.elda, 5) - assert len(elda_base.ttda) == a + len(self.elda.ttda) - assert elda_base.num_models == b + 5 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(self.elda, 5) + assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 5 # 1.3 a list of ensembles - a = len(elda_base.ttda) - b = elda_base.num_models + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models # it should be totally legit to add a memory unfriendly object to a memory friendly one - elda_base.add_model([self.elda, self.elda_mem_unfriendly]) - assert len(elda_base.ttda) == a + 2 * len(self.elda.ttda) - assert elda_base.num_models == b + 2 * num_models + elda.add_model([self.elda, self.elda_mem_unfriendly]) + assert len(elda.ttda) == num_topics_before_add_model + 2 * len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 2 * NUM_MODELS # 1.4 a single gensim model model = self.elda.classic_model_representation - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(model) - assert len(elda_base.ttda) == a + len(model.get_topics()) - assert elda_base.num_models == b + 1 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(model) + assert len(elda.ttda) == num_topics_before_add_model + len(model.get_topics()) + assert elda.num_models == num_models_before_add_model + 1 # 1.5 a list gensim models - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model([model, model]) - assert len(elda_base.ttda) == a + 2 * len(model.get_topics()) - assert elda_base.num_models == b + 2 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model([model, model]) + assert len(elda.ttda) == num_topics_before_add_model + 2 * len(model.get_topics()) + assert elda.num_models == num_models_before_add_model + 2 - self.assert_ttda_is_valid(elda_base) + self.assert_ttda_is_valid(elda) # 2. 
memory unfriendly - elda_base_mem_unfriendly = EnsembleLda( + elda_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -302,45 +299,45 @@ def test_add_models(self): ) # 2.1 a single ensemble - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly) - assert len(elda_base_mem_unfriendly.tms) == a + num_models - assert elda_base_mem_unfriendly.num_models == b + num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.2 a list of ensembles - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) - assert len(elda_base_mem_unfriendly.tms) == a + 2 * num_models - assert elda_base_mem_unfriendly.num_models == b + 2 * num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 2 * NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + 2 * NUM_MODELS # 2.3 a single gensim model - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) - assert len(elda_base_mem_unfriendly.tms) == a + 1 - assert elda_base_mem_unfriendly.num_models == b + 1 + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 1 + assert elda_mem_unfriendly.num_models == num_models_before_add_model + 1 # 2.4 a list of gensim models - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) - assert len(elda_base_mem_unfriendly.tms) == a + num_models - assert elda_base_mem_unfriendly.num_models == b + num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.5 topic term distributions should throw errors, because the # actual models are needed for the memory unfriendly ensemble - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models with pytest.raises(ValueError): - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) # remains unchanged - assert len(elda_base_mem_unfriendly.tms) == a - 
assert elda_base_mem_unfriendly.num_models == b + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + assert elda_mem_unfriendly.num_models == num_models_before_add_model - assert elda_base_mem_unfriendly.num_models == len(elda_base_mem_unfriendly.tms) - self.assert_ttda_is_valid(elda_base_mem_unfriendly) + assert elda_mem_unfriendly.num_models == len(elda_mem_unfriendly.tms) + self.assert_ttda_is_valid(elda_mem_unfriendly) def test_add_and_recluster(self): # See if after adding a model, the model still makes sense From ec4b487a2fcc44661e7ea2cfbfe3286279853fa1 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 22:24:22 +0200 Subject: [PATCH 152/166] better var names --- gensim/test/test_ensemblelda.py | 134 ++++++++++++++++---------------- 1 file changed, 65 insertions(+), 69 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 38ada1caaa..1c0004f51b 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -27,7 +27,7 @@ RTOL = 1e-04 if os.name == 'nt' else 1e-05 -class TestModel(unittest.TestCase): +class TestEnsembleLda(unittest.TestCase): def setUp(self): # same configuration for each model to make sure # the topics are equal @@ -130,10 +130,9 @@ def test_not_trained(self): assert len(elda.ttda) == 0 def test_mem_unfriendly(self): - # at this point, self.elda_mem_unfriendly and self.eLDA are already trained - # in the setUpClass function - # both should be 100% similar (but floats cannot - # be compared that easily, so check for threshold) + # at this point, self.elda_mem_unfriendly and self.elda are already trained + # in the setUp function and the topics extracted should be equal up to floating + # point variations between the two implementations. assert len(self.elda_mem_unfriendly.tms) == NUM_MODELS np.testing.assert_allclose(self.elda.ttda, self.elda_mem_unfriendly.ttda, rtol=RTOL) np.testing.assert_allclose(self.elda.get_topics(), self.elda_mem_unfriendly.get_topics(), rtol=RTOL) @@ -144,15 +143,15 @@ def test_generate_gensim_rep(self): topics = gensim_model.get_topics() np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) - def assert_cluster_results_equal(self, a, b): - """compares important attributes of the cluster results""" + def assert_cluster_results_equal(self, cluster_model_results_1, cluster_model_results_2): + """Assert important attributes of the cluster results""" np.testing.assert_array_equal( - [row["label"] for row in a], - [row["label"] for row in b], + [row["label"] for row in cluster_model_results_1], + [row["label"] for row in cluster_model_results_2], ) np.testing.assert_array_equal( - [row["is_core"] for row in a], - [row["is_core"] for row in b], + [row["is_core"] for row in cluster_model_results_1], + [row["is_core"] for row in cluster_model_results_2], ) def test_persisting(self): @@ -232,9 +231,6 @@ def test_add_models_to_empty(self): self.test_inference(loaded_ensemble) def test_add_models(self): - # same configuration - num_models = self.elda.num_models - # make sure countings and sizes after adding are correct # create new models and add other models to them. @@ -246,7 +242,7 @@ def test_add_models(self): num_new_topics = 3 # 1. 
memory friendly - elda_base = EnsembleLda( + elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -254,47 +250,47 @@ def test_add_models(self): ) # 1.1 ttda - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(self.elda.ttda) - assert len(elda_base.ttda) == a + len(self.elda.ttda) - assert elda_base.num_models == b + 1 # defaults to 1 for one ttda matrix + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(self.elda.ttda) + assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 1 # defaults to 1 for one ttda matrix # 1.2 an ensemble - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(self.elda, 5) - assert len(elda_base.ttda) == a + len(self.elda.ttda) - assert elda_base.num_models == b + 5 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(self.elda, 5) + assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 5 # 1.3 a list of ensembles - a = len(elda_base.ttda) - b = elda_base.num_models + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models # it should be totally legit to add a memory unfriendly object to a memory friendly one - elda_base.add_model([self.elda, self.elda_mem_unfriendly]) - assert len(elda_base.ttda) == a + 2 * len(self.elda.ttda) - assert elda_base.num_models == b + 2 * num_models + elda.add_model([self.elda, self.elda_mem_unfriendly]) + assert len(elda.ttda) == num_topics_before_add_model + 2 * len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 2 * NUM_MODELS # 1.4 a single gensim model model = self.elda.classic_model_representation - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(model) - assert len(elda_base.ttda) == a + len(model.get_topics()) - assert elda_base.num_models == b + 1 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(model) + assert len(elda.ttda) == num_topics_before_add_model + len(model.get_topics()) + assert elda.num_models == num_models_before_add_model + 1 # 1.5 a list gensim models - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model([model, model]) - assert len(elda_base.ttda) == a + 2 * len(model.get_topics()) - assert elda_base.num_models == b + 2 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model([model, model]) + assert len(elda.ttda) == num_topics_before_add_model + 2 * len(model.get_topics()) + assert elda.num_models == num_models_before_add_model + 2 - self.assert_ttda_is_valid(elda_base) + self.assert_ttda_is_valid(elda) # 2. 
memory unfriendly - elda_base_mem_unfriendly = EnsembleLda( + elda_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -302,45 +298,45 @@ def test_add_models(self): ) # 2.1 a single ensemble - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly) - assert len(elda_base_mem_unfriendly.tms) == a + num_models - assert elda_base_mem_unfriendly.num_models == b + num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.2 a list of ensembles - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) - assert len(elda_base_mem_unfriendly.tms) == a + 2 * num_models - assert elda_base_mem_unfriendly.num_models == b + 2 * num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 2 * NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + 2 * NUM_MODELS # 2.3 a single gensim model - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) - assert len(elda_base_mem_unfriendly.tms) == a + 1 - assert elda_base_mem_unfriendly.num_models == b + 1 + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 1 + assert elda_mem_unfriendly.num_models == num_models_before_add_model + 1 # 2.4 a list of gensim models - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) - assert len(elda_base_mem_unfriendly.tms) == a + num_models - assert elda_base_mem_unfriendly.num_models == b + num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.5 topic term distributions should throw errors, because the # actual models are needed for the memory unfriendly ensemble - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models with pytest.raises(ValueError): - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) # remains unchanged - assert len(elda_base_mem_unfriendly.tms) == a - 
assert elda_base_mem_unfriendly.num_models == b + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + assert elda_mem_unfriendly.num_models == num_models_before_add_model - assert elda_base_mem_unfriendly.num_models == len(elda_base_mem_unfriendly.tms) - self.assert_ttda_is_valid(elda_base_mem_unfriendly) + assert elda_mem_unfriendly.num_models == len(elda_mem_unfriendly.tms) + self.assert_ttda_is_valid(elda_mem_unfriendly) def test_add_and_recluster(self): # See if after adding a model, the model still makes sense From 531ac6aa77a4e63f1f960dd655c7dc26d094ea29 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 30 Jun 2021 23:15:31 +0200 Subject: [PATCH 153/166] simplified some function calls to use attributes instead of parameters --- gensim/models/ensemblelda.py | 172 +++++++++++++++++------------------ 1 file changed, 81 insertions(+), 91 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 71c9874c2e..53dcbb99d4 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -242,90 +242,23 @@ def _teardown(pipes, processes, i): del process -def mass_masking(a, threshold): +def mass_masking(a, threshold=None): """Original masking method. Returns a new binary mask.""" + if threshold is None: + threshold = 0.95 + sorted_a = np.sort(a)[::-1] largest_mass = sorted_a.cumsum() < threshold smallest_valid = sorted_a[largest_mass][-1] return a >= smallest_valid -def rank_masking(a, threshold): +def rank_masking(a, threshold=None): """Faster masking method. Returns a new binary mask.""" - return a > np.sort(a)[::-1][int(len(a) * threshold)] - - -def _calculate_asymmetric_distance_matrix_chunk(ttda1, ttda2, threshold, start_index, method): - """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. - - Parameters - ---------- - ttda1 and ttda2: 2D arrays of floats - Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one - topic. Each cell in the resulting matrix corresponds to the distance between a topic pair. - threshold : float, optional - threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method - start_index : int - this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the - complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into - two pieces, each 100 ttdas long, then start_index should be be 100. default is 0 - method : {'mass', 'rank}, optional - method can be "mass" for the original masking method or "rank" for a faster masking method that selects - by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the - topic. - - Returns - ------- - 2D numpy.ndarray of floats - Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``. - - """ - # initialize the distance matrix. 
ndarray is faster than zeros - distances = np.ndarray((len(ttda1), len(ttda2))) - - if ttda1.shape[0] > 0 and ttda2.shape[0] > 0: - # the worker might not have received a ttda because it was chunked up too much - - if method not in ["mass", "rank"]: - raise ValueError(f"method {method} unknown") - - # select masking method: - create_mask = {"mass": mass_masking, "rank": rank_masking}[method] - - # some help to find a better threshold by useful log messages - avg_mask_size = 0 - - # now iterate over each topic - for ttd1_idx, ttd1 in enumerate(ttda1): - # create mask from ttd1 that removes noise from a and keeps the largest terms - mask = create_mask(ttd1, threshold) - ttd1_masked = ttd1[mask] - - avg_mask_size += mask.sum() - - # now look at every possible pair for topic a: - for ttd2_idx, ttd2 in enumerate(ttda2): - # distance to itself is 0 - if ttd1_idx + start_index == ttd2_idx: - distances[ttd1_idx][ttd2_idx] = 0 - continue - - # now mask b based on a, which will force the shape of a onto b - ttd2_masked = ttd2[mask] - - # Smart distance calculation avoids calculating cosine distance for highly masked topic-term - # distributions that will have distance values near 1. - if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD: - distance = 1 - else: - distance = cosine(ttd1_masked, ttd2_masked) - - distances[ttd1_idx][ttd2_idx] = distance - - percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) - logger.info(f'the given threshold of {threshold} covered on average {percent}% of tokens') + if threshold is None: + threshold = 0.11 - return distances + return a > np.sort(a)[::-1][int(len(a) * threshold)] def _validate_clusters(clusters, min_cores): @@ -374,7 +307,7 @@ def __init__( self, topic_model_class="ldamulticore", num_models=3, min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, - min_samples=None, masking_method="mass", masking_threshold=None, + min_samples=None, masking_method=mass_masking, masking_threshold=None, distance_workers=1, random_state=None, **gensim_kw_args, ): """Create and train a new EnsembleLda model. @@ -416,8 +349,9 @@ def __init__( If False, any topic term matrix can be suplied to add_model. min_samples : int, optional Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. - masking_method : str, optional - Choose one of "mass" (default) or "rank" (percentile, faster). + masking_method : function, optional + Choose one of :meth:`~gensim.models.ensemblelda.mass_masking` (default) or + :meth:`~gensim.models.ensemblelda.rank_masking` (percentile, faster). For clustering, distances between topic-term distributions are asymmetric. In particular, the distance (technically a divergence) from distribution A to B is more of a measure of if A is contained in B. At a @@ -429,7 +363,6 @@ def __init__( 2. rank: forms mask by taking the top ranked terms (by mass) until the 'masking_threshold' is reached. For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. - masking_threshold : float, optional Default: None, which uses ``0.95`` for "mass", and ``0.11`` for masking_method "rank". 
In general, too small a mask threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy
@@ -750,7 +683,8 @@ def add_model(self, target, num_new_models=None):
                 raise ValueError(
                     'ttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, '
                     'you can call convert_to_memory_friendly, but it will discard the stored gensim '
-                    'models and only keep the relevant topic term distributions from them.')
+                    'models and only keep the relevant topic term distributions from them.'
+                )

         # 2. list of ensembles
         elif isinstance(target[0], type(self)):
@@ -783,6 +717,7 @@ def add_model(self, target, num_new_models=None):
                 f"target ttda dimensions do not match. Topics must be {self.ttda.shape[-1]} but was {ttda.shape[-1]} "
                 f"elements large"
             )
+
         self.ttda = np.append(self.ttda, ttda, axis=0)

         # tell recluster that the distance matrix needs to be regenerated
@@ -927,13 +862,73 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None):

         pipe.close()

-    def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method):
+    def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, start_index):
+        """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``.
+
+        Parameters
+        ----------
+        ttda1 and ttda2: 2D arrays of floats
+            Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+            topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
+        start_index : int
+            this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+            complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
+            two pieces, each 100 ttdas long, then start_index should be 100. default is 0
+
+        Returns
+        -------
+        2D numpy.ndarray of floats
+            Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
+
+        """
+        # initialize the distance matrix. ndarray is faster than zeros
+        distances = np.ndarray((len(ttda1), len(ttda2)))
+
+        if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
+            # the worker might not have received a ttda because it was chunked up too much
+
+            # some help to find a better threshold by useful log messages
+            avg_mask_size = 0
+
+            # now iterate over each topic
+            for ttd1_idx, ttd1 in enumerate(ttda1):
+                # create mask from ttd1 that removes noise from it and keeps the largest terms
+                mask = self.masking_method(ttd1, self.masking_threshold)
+                ttd1_masked = ttd1[mask]
+
+                avg_mask_size += mask.sum()
+
+                # now look at every possible pair for ttd1:
+                for ttd2_idx, ttd2 in enumerate(ttda2):
+                    # distance to itself is 0
+                    if ttd1_idx + start_index == ttd2_idx:
+                        distances[ttd1_idx][ttd2_idx] = 0
+                        continue
+
+                    # now mask ttd2 based on ttd1, which will force the shape of ttd1 onto ttd2
+                    ttd2_masked = ttd2[mask]
+
+                    # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
+                    # distributions that will have distance values near 1.
+ if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD: + distance = 1 + else: + distance = cosine(ttd1_masked, ttd2_masked) + + distances[ttd1_idx][ttd2_idx] = distance + + percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) + logger.info(f'the given threshold of {self.masking_threshold} covered on average {percent}% of tokens') + + return distances + + def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe): """Worker that computes the distance to all other nodes from a chunk of nodes.""" logger.info(f"spawned worker to generate {n_ttdas} rows of the asymmetric distance matrix") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] - distance_chunk = _calculate_asymmetric_distance_matrix_chunk( - ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method, + distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( + ttda1=ttda1, ttda2=self.ttda, start_index=ttdas_sent ) pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory pipe.close() @@ -947,21 +942,16 @@ def _generate_asymmetric_distance_matrix(self): """ workers = self.distance_workers - threshold = self.masking_threshold - method = self.masking_method # matrix is up to date afterwards self.asymmetric_distance_matrix_outdated = False - if threshold is None: - threshold = {"mass": 0.95, "rank": 0.11}[method] - logger.info(f"generating a {len(self.ttda)} x {len(self.ttda)} asymmetric distance matrix...") # singlecore if workers is not None and workers <= 1: - self.asymmetric_distance_matrix = _calculate_asymmetric_distance_matrix_chunk( - ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method, + self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( + ttda1=self.ttda, ttda2=self.ttda, start_index=0, ) # else, if workers > 1 use multiprocessing @@ -989,7 +979,7 @@ def _generate_asymmetric_distance_matrix(self): process = Process( target=self._asymmetric_distance_matrix_worker, - args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method), + args=(i, ttdas_sent, n_ttdas, child_conn), ) ttdas_sent += n_ttdas From 0311d94cd24206980fb88411068a4234881e9dbb Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 23:17:02 +0200 Subject: [PATCH 154/166] sort key function --- gensim/models/ensemblelda.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1b01faccf7..007443edfe 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -136,7 +136,7 @@ def _remove_from_all_sets(label, clusters): def _contains_isolated_cores(label, cluster, min_cores): """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" - return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores + return sum(map(lambda neighboring_labels: neighboring_labels == {label}, cluster["neighboring_labels"])) >= min_cores def _aggregate_topics(grouped_by_labels): @@ -333,15 +333,10 @@ def _validate_clusters(clusters, min_cores): # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. # This clears up the clusters a bit before checking the ones with many neighbors. 
-    sorted_clusters = sorted(
-        clusters,
-        key=lambda cluster: (
-            cluster["max_num_neighboring_labels"],
-            cluster["num_cores"],
-            cluster["label"],
-        ),
-        reverse=False,
-    )
+    def _cluster_sort_key(cluster):
+        return (cluster["max_num_neighboring_labels"], cluster["num_cores"], cluster["label"])
+
+    sorted_clusters = sorted(clusters, key=_cluster_sort_key, reverse=False)

     for cluster in sorted_clusters:
         cluster["is_valid"] = None

From 9617e8f2e746eb5e6dfef9fd6fd6e630181f1d72 Mon Sep 17 00:00:00 2001
From: aloosley
Date: Thu, 1 Jul 2021 00:18:15 +0200
Subject: [PATCH 155/166] more efficient tests with better case names

---
 gensim/test/test_ensemblelda.py | 326 +++++++++++++++++----------------
 1 file changed, 174 insertions(+), 152 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 1c0004f51b..6cd18ab550 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -22,145 +22,154 @@
 NUM_TOPICS = 2
 NUM_MODELS = 4
 PASSES = 50
+RANDOM_STATE = 0

 # windows tests fail due to the required assertion precision being too high
 RTOL = 1e-04 if os.name == 'nt' else 1e-05


 class TestEnsembleLda(unittest.TestCase):
-    def setUp(self):
-        # same configuration for each model to make sure
-        # the topics are equal
-        random_state = 0
-
-        self.elda = EnsembleLda(
+    def get_elda(self):
+        return EnsembleLda(
             corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS,
-            passes=PASSES, num_models=NUM_MODELS, random_state=random_state,
+            passes=PASSES, num_models=NUM_MODELS, random_state=RANDOM_STATE,
             topic_model_class=LdaModel,
         )

-        self.elda_mem_unfriendly = EnsembleLda(
+    def get_elda_mem_unfriendly(self):
+        return EnsembleLda(
             corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS,
-            passes=PASSES, num_models=NUM_MODELS, random_state=random_state,
+            passes=PASSES, num_models=NUM_MODELS, random_state=RANDOM_STATE,
             memory_friendly_ttda=False, topic_model_class=LdaModel,
         )

     def assert_ttda_is_valid(self, elda):
         """Check that ttda has one or more topics and that term probabilities add to one."""
         assert len(elda.ttda) > 0
         sum_over_terms = elda.ttda.sum(axis=1)
         expected_sum_over_terms = np.ones(len(elda.ttda)).astype(np.float32)
         np.testing.assert_allclose(sum_over_terms, expected_sum_over_terms, rtol=1e-04)

     def test_elda(self):
+        elda = self.get_elda()
+
         # given that the random_state doesn't change, it should
         # always be 2 detected topics in this setup.
- assert self.elda.stable_topics.shape[1] == len(common_dictionary) - assert len(self.elda.ttda) == NUM_MODELS * NUM_TOPICS - self.assert_ttda_is_valid(self.elda) - - # reclustering shouldn't change anything without - # added models or different parameters - self.elda.recluster() + assert elda.stable_topics.shape[1] == len(common_dictionary) + assert len(elda.ttda) == NUM_MODELS * NUM_TOPICS + self.assert_ttda_is_valid(elda) - assert self.elda.stable_topics.shape[1] == len(common_dictionary) - assert len(self.elda.ttda) == NUM_MODELS * NUM_TOPICS - self.assert_ttda_is_valid(self.elda) + def test_backwards_compatibility_with_persisted_model(self): + elda = self.get_elda() # compare with a pre-trained reference model - reference = EnsembleLda.load(datapath('ensemblelda')) - np.testing.assert_allclose(self.elda.ttda, reference.ttda, rtol=RTOL) - # small values in the distance matrix tend to vary quite a bit around 2%, - # so use some absolute tolerance measurement to check if the matrix is at least - # close to the target. - atol = reference.asymmetric_distance_matrix.max() * 1e-05 + loaded_elda = EnsembleLda.load(datapath('ensemblelda')) + np.testing.assert_allclose(elda.ttda, loaded_elda.ttda, rtol=RTOL) + atol = loaded_elda.asymmetric_distance_matrix.max() * 1e-05 np.testing.assert_allclose( - self.elda.asymmetric_distance_matrix, - reference.asymmetric_distance_matrix, atol=atol, + elda.asymmetric_distance_matrix, + loaded_elda.asymmetric_distance_matrix, atol=atol, ) - def test_clustering(self): + def test_recluster(self): # the following test is quite specific to the current implementation and not part of any api, # but it makes improving those sections of the code easier as long as sorted_clusters and the # cluster_model results are supposed to stay the same. Potentially this test will deprecate. - reference = EnsembleLda.load(datapath('ensemblelda')) - cluster_model_results = deepcopy(reference.cluster_model.results) - valid_clusters = deepcopy(reference.valid_clusters) - stable_topics = deepcopy(reference.get_topics()) + elda = EnsembleLda.load(datapath('ensemblelda')) + loaded_cluster_model_results = deepcopy(elda.cluster_model.results) + loaded_valid_clusters = deepcopy(elda.valid_clusters) + loaded_stable_topics = deepcopy(elda.get_topics()) # continue training with the distance matrix of the pretrained reference and see if # the generated clusters match. 
- reference.asymmetric_distance_matrix_outdated = True - reference.recluster() + elda.asymmetric_distance_matrix_outdated = True + elda.recluster() - self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) - assert reference.valid_clusters == valid_clusters - np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=RTOL) + self.assert_clustering_results_equal(elda.cluster_model.results, loaded_cluster_model_results) + assert elda.valid_clusters == loaded_valid_clusters + np.testing.assert_allclose(elda.get_topics(), loaded_stable_topics, rtol=RTOL) - def test_not_trained(self): - # should not throw errors and no training should happen + def test_recluster_does_nothing_when_stable_topics_already_found(self): + elda = self.get_elda() - # 0 passes + # reclustering shouldn't change anything without + # added models or different parameters + elda.recluster() + + assert elda.stable_topics.shape[1] == len(common_dictionary) + assert len(elda.ttda) == NUM_MODELS * NUM_TOPICS + self.assert_ttda_is_valid(elda) + + def test_not_trained_given_zero_passes(self): elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, - passes=0, num_models=NUM_MODELS, random_state=0, + passes=0, num_models=NUM_MODELS, random_state=RANDOM_STATE, ) assert len(elda.ttda) == 0 - # no corpus + def test_not_trained_given_no_corpus(self): elda = EnsembleLda( id2word=common_dictionary, num_topics=NUM_TOPICS, - passes=PASSES, num_models=NUM_MODELS, random_state=0, + passes=PASSES, num_models=NUM_MODELS, random_state=RANDOM_STATE, ) assert len(elda.ttda) == 0 - # 0 iterations + def test_not_trained_given_zero_iterations(self): elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, - iterations=0, num_models=NUM_MODELS, random_state=0, + iterations=0, num_models=NUM_MODELS, random_state=RANDOM_STATE, ) assert len(elda.ttda) == 0 - # 0 models + def test_not_trained_given_zero_models(self): elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, - passes=PASSES, num_models=0, random_state=0 + passes=PASSES, num_models=0, random_state=RANDOM_STATE ) assert len(elda.ttda) == 0 def test_mem_unfriendly(self): - # at this point, self.elda_mem_unfriendly and self.elda are already trained - # in the setUp function and the topics extracted should be equal up to floating - # point variations between the two implementations. 
- assert len(self.elda_mem_unfriendly.tms) == NUM_MODELS - np.testing.assert_allclose(self.elda.ttda, self.elda_mem_unfriendly.ttda, rtol=RTOL) - np.testing.assert_allclose(self.elda.get_topics(), self.elda_mem_unfriendly.get_topics(), rtol=RTOL) - self.assert_ttda_is_valid(self.elda_mem_unfriendly) - - def test_generate_gensim_rep(self): - gensim_model = self.elda.generate_gensim_representation() + # elda_mem_unfriendly and self.elda should have topics that are + # the same up to floating point variations caused by the two different + # implementations + + elda = self.get_elda() + elda_mem_unfriendly = self.get_elda_mem_unfriendly() + + assert len(elda_mem_unfriendly.tms) == NUM_MODELS + np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL) + self.assert_ttda_is_valid(elda_mem_unfriendly) + + def test_generate_gensim_representation(self): + elda = self.get_elda() + + gensim_model = elda.generate_gensim_representation() topics = gensim_model.get_topics() - np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), topics, rtol=RTOL) - def assert_cluster_results_equal(self, cluster_model_results_1, cluster_model_results_2): + def assert_clustering_results_equal(self, clustering_results_1, clustering_results_2): """Assert important attributes of the cluster results""" np.testing.assert_array_equal( - [row["label"] for row in cluster_model_results_1], - [row["label"] for row in cluster_model_results_2], + [element["label"] for element in clustering_results_1], + [element["label"] for element in clustering_results_2], ) np.testing.assert_array_equal( - [row["is_core"] for row in cluster_model_results_1], - [row["is_core"] for row in cluster_model_results_2], + [element["is_core"] for element in clustering_results_1], + [element["is_core"] for element in clustering_results_2], ) def test_persisting(self): + elda = self.get_elda() + elda_mem_unfriendly = self.get_elda_mem_unfriendly() + fname = get_tmpfile('gensim_models_ensemblelda') - self.elda.save(fname) + elda.save(fname) loaded_elda = EnsembleLda.load(fname) # storing the ensemble without memory_friendy_ttda - self.elda_mem_unfriendly.save(fname) - loaded_elda_mu = EnsembleLda.load(fname) + elda_mem_unfriendly.save(fname) + loaded_elda_mem_unfriendly = EnsembleLda.load(fname) # topic_model_class will be lazy loaded and should be None first assert loaded_elda.topic_model_class is None @@ -175,23 +184,23 @@ def test_persisting(self): topics = loaded_elda_representation.get_topics() ttda = loaded_elda.ttda amatrix = loaded_elda.asymmetric_distance_matrix - np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) - np.testing.assert_allclose(self.elda.ttda, ttda, rtol=RTOL) - np.testing.assert_allclose(self.elda.asymmetric_distance_matrix, amatrix, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), topics, rtol=RTOL) + np.testing.assert_allclose(elda.ttda, ttda, rtol=RTOL) + np.testing.assert_allclose(elda.asymmetric_distance_matrix, amatrix, rtol=RTOL) - a = self.elda.cluster_model.results - b = loaded_elda.cluster_model.results + expected_clustering_results = elda.cluster_model.results + loaded_clustering_results = loaded_elda.cluster_model.results - self.assert_cluster_results_equal(a, b) + self.assert_clustering_results_equal(expected_clustering_results, loaded_clustering_results) # memory unfriendly - loaded_elda_mem_unfriendly_representation = 
loaded_elda_mu.generate_gensim_representation() + loaded_elda_mem_unfriendly_representation = loaded_elda_mem_unfriendly.generate_gensim_representation() topics = loaded_elda_mem_unfriendly_representation.get_topics() - np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), topics, rtol=RTOL) def test_multiprocessing(self): # same configuration - random_state = 0 + random_state = RANDOM_STATE # use 3 processes for the ensemble and the distance, # so that the 4 models and 8 topics cannot be distributed @@ -199,6 +208,7 @@ def test_multiprocessing(self): workers = 3 # memory friendly. contains List of topic word distributions + elda = self.get_elda() elda_multiprocessing = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, num_topics=NUM_TOPICS, passes=PASSES, num_models=NUM_MODELS, @@ -206,6 +216,7 @@ def test_multiprocessing(self): ) # memory unfriendly. contains List of models + elda_mem_unfriendly = self.get_elda_mem_unfriendly() elda_multiprocessing_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, num_topics=NUM_TOPICS, passes=PASSES, num_models=NUM_MODELS, @@ -213,21 +224,23 @@ def test_multiprocessing(self): memory_friendly_ttda=False, ) - np.testing.assert_allclose(self.elda.get_topics(), elda_multiprocessing.get_topics(), rtol=RTOL) - np.testing.assert_allclose(self.elda_mem_unfriendly.get_topics(), elda_multiprocessing_mem_unfriendly.get_topics(), rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), elda_multiprocessing.get_topics(), rtol=RTOL) + np.testing.assert_allclose(elda_mem_unfriendly.get_topics(), elda_multiprocessing_mem_unfriendly.get_topics(), rtol=RTOL) def test_add_models_to_empty(self): + elda = self.get_elda() + ensemble = EnsembleLda(id2word=common_dictionary, num_models=0) - ensemble.add_model(self.elda.ttda[0:1]) - ensemble.add_model(self.elda.ttda[1:]) + ensemble.add_model(elda.ttda[0:1]) + ensemble.add_model(elda.ttda[1:]) ensemble.recluster() - np.testing.assert_allclose(ensemble.get_topics(), self.elda.get_topics(), rtol=RTOL) + np.testing.assert_allclose(ensemble.get_topics(), elda.get_topics(), rtol=RTOL) # persisting an ensemble that is entirely built from existing ttdas fname = get_tmpfile('gensim_models_ensemblelda') ensemble.save(fname) loaded_ensemble = EnsembleLda.load(fname) - np.testing.assert_allclose(loaded_ensemble.get_topics(), self.elda.get_topics(), rtol=RTOL) + np.testing.assert_allclose(loaded_ensemble.get_topics(), elda.get_topics(), rtol=RTOL) self.test_inference(loaded_ensemble) def test_add_models(self): @@ -242,86 +255,88 @@ def test_add_models(self): num_new_topics = 3 # 1. 
memory friendly - elda = EnsembleLda( + base_elda = self.get_elda() + cumulative_elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, + iterations=1, random_state=RANDOM_STATE, topic_model_class=LdaMulticore, workers=3, ensemble_workers=2, ) # 1.1 ttda - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models - elda.add_model(self.elda.ttda) - assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) - assert elda.num_models == num_models_before_add_model + 1 # defaults to 1 for one ttda matrix + num_topics_before_add_model = len(cumulative_elda.ttda) + num_models_before_add_model = cumulative_elda.num_models + cumulative_elda.add_model(base_elda.ttda) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + len(base_elda.ttda) + assert cumulative_elda.num_models == num_models_before_add_model + 1 # defaults to 1 for one ttda matrix # 1.2 an ensemble - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models - elda.add_model(self.elda, 5) - assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) - assert elda.num_models == num_models_before_add_model + 5 + num_topics_before_add_model = len(cumulative_elda.ttda) + num_models_before_add_model = cumulative_elda.num_models + cumulative_elda.add_model(base_elda, 5) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + len(base_elda.ttda) + assert cumulative_elda.num_models == num_models_before_add_model + 5 # 1.3 a list of ensembles - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models + num_topics_before_add_model = len(cumulative_elda.ttda) + num_models_before_add_model = cumulative_elda.num_models # it should be totally legit to add a memory unfriendly object to a memory friendly one - elda.add_model([self.elda, self.elda_mem_unfriendly]) - assert len(elda.ttda) == num_topics_before_add_model + 2 * len(self.elda.ttda) - assert elda.num_models == num_models_before_add_model + 2 * NUM_MODELS + base_elda_mem_unfriendly = self.get_elda_mem_unfriendly() + cumulative_elda.add_model([base_elda, base_elda_mem_unfriendly]) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + 2 * len(base_elda.ttda) + assert cumulative_elda.num_models == num_models_before_add_model + 2 * NUM_MODELS # 1.4 a single gensim model - model = self.elda.classic_model_representation + model = base_elda.classic_model_representation - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models - elda.add_model(model) - assert len(elda.ttda) == num_topics_before_add_model + len(model.get_topics()) - assert elda.num_models == num_models_before_add_model + 1 + num_topics_before_add_model = len(cumulative_elda.ttda) + num_models_before_add_model = cumulative_elda.num_models + cumulative_elda.add_model(model) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + len(model.get_topics()) + assert cumulative_elda.num_models == num_models_before_add_model + 1 # 1.5 a list gensim models - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models - elda.add_model([model, model]) - assert len(elda.ttda) == num_topics_before_add_model + 2 * len(model.get_topics()) - assert elda.num_models == num_models_before_add_model + 2 + num_topics_before_add_model = len(cumulative_elda.ttda) + 
num_models_before_add_model = cumulative_elda.num_models + cumulative_elda.add_model([model, model]) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + 2 * len(model.get_topics()) + assert cumulative_elda.num_models == num_models_before_add_model + 2 - self.assert_ttda_is_valid(elda) + self.assert_ttda_is_valid(cumulative_elda) # 2. memory unfriendly elda_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, + iterations=1, random_state=RANDOM_STATE, topic_model_class=LdaMulticore, workers=3, ensemble_workers=2, memory_friendly_ttda=False, ) # 2.1 a single ensemble num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) + elda_mem_unfriendly.add_model(base_elda_mem_unfriendly) assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.2 a list of ensembles num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models - elda_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) + elda_mem_unfriendly.add_model([base_elda_mem_unfriendly, base_elda_mem_unfriendly]) assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 2 * NUM_MODELS assert elda_mem_unfriendly.num_models == num_models_before_add_model + 2 * NUM_MODELS # 2.3 a single gensim model num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) + elda_mem_unfriendly.add_model(base_elda_mem_unfriendly.tms[0]) assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 1 assert elda_mem_unfriendly.num_models == num_models_before_add_model + 1 # 2.4 a list of gensim models num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) + elda_mem_unfriendly.add_model(base_elda_mem_unfriendly.tms) assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS @@ -330,7 +345,7 @@ def test_add_models(self): num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models with pytest.raises(ValueError): - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) + elda_mem_unfriendly.add_model(base_elda_mem_unfriendly.tms[0].get_topics()) # remains unchanged assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model assert elda_mem_unfriendly.num_models == num_models_before_add_model @@ -342,67 +357,74 @@ def test_add_and_recluster(self): # See if after adding a model, the model still makes sense num_new_models = 3 num_new_topics = 3 + random_state = 1 - # train - elda = EnsembleLda( + # train models two sets of models (mem friendly and unfriendly) + elda_1 = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='lda', + iterations=30, random_state=random_state, topic_model_class='lda', distance_workers=4, ) 
- elda_mem_unfriendly = EnsembleLda( + elda_mem_unfriendly_1 = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class=LdaModel, + iterations=30, random_state=random_state, topic_model_class=LdaModel, distance_workers=4, memory_friendly_ttda=False, ) + elda_2 = self.get_elda() + elda_mem_unfriendly_2 = self.get_elda_mem_unfriendly() + assert elda_1.random_state != elda_2.random_state + assert elda_mem_unfriendly_1.random_state != elda_mem_unfriendly_2.random_state + # both should be similar - np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) - np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL) + np.testing.assert_allclose(elda_1.ttda, elda_mem_unfriendly_1.ttda, rtol=RTOL) + np.testing.assert_allclose(elda_1.get_topics(), elda_mem_unfriendly_1.get_topics(), rtol=RTOL) # and every next step applied to both should result in similar results # 1. adding to ttda and tms - elda.add_model(self.elda) - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) - np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) - assert len(elda.ttda) == len(self.elda.ttda) + num_new_models * num_new_topics - assert len(elda_mem_unfriendly.ttda) == len(self.elda_mem_unfriendly.ttda) + num_new_models * num_new_topics - assert len(elda_mem_unfriendly.tms) == NUM_MODELS + num_new_models - self.assert_ttda_is_valid(elda) - self.assert_ttda_is_valid(elda_mem_unfriendly) + elda_1.add_model(elda_2) + elda_mem_unfriendly_1.add_model(elda_mem_unfriendly_2) + + np.testing.assert_allclose(elda_1.ttda, elda_mem_unfriendly_1.ttda, rtol=RTOL) + assert len(elda_1.ttda) == len(elda_2.ttda) + num_new_models * num_new_topics + assert len(elda_mem_unfriendly_1.ttda) == len(elda_mem_unfriendly_2.ttda) + num_new_models * num_new_topics + assert len(elda_mem_unfriendly_1.tms) == NUM_MODELS + num_new_models + self.assert_ttda_is_valid(elda_1) + self.assert_ttda_is_valid(elda_mem_unfriendly_1) # 2. distance matrix - elda._generate_asymmetric_distance_matrix() - elda_mem_unfriendly._generate_asymmetric_distance_matrix() + elda_1._generate_asymmetric_distance_matrix() + elda_mem_unfriendly_1._generate_asymmetric_distance_matrix() np.testing.assert_allclose( - elda.asymmetric_distance_matrix, - elda_mem_unfriendly.asymmetric_distance_matrix, + elda_1.asymmetric_distance_matrix, + elda_mem_unfriendly_1.asymmetric_distance_matrix, ) # 3. CBDBSCAN results - elda._generate_topic_clusters() - elda_mem_unfriendly._generate_topic_clusters() - a = elda.cluster_model.results - b = elda_mem_unfriendly.cluster_model.results - self.assert_cluster_results_equal(a, b) + elda_1._generate_topic_clusters() + elda_mem_unfriendly_1._generate_topic_clusters() + clustering_results = elda_1.cluster_model.results + mem_unfriendly_clustering_results = elda_mem_unfriendly_1.cluster_model.results + self.assert_clustering_results_equal(clustering_results, mem_unfriendly_clustering_results) # 4. 
finally, the stable topics
-        elda._generate_stable_topics()
-        elda_mem_unfriendly._generate_stable_topics()
+        elda_1._generate_stable_topics()
+        elda_mem_unfriendly_1._generate_stable_topics()

         np.testing.assert_allclose(
-            elda.get_topics(),
-            elda_mem_unfriendly.get_topics(),
+            elda_1.get_topics(),
+            elda_mem_unfriendly_1.get_topics(),
         )

-        elda.generate_gensim_representation()
-        elda_mem_unfriendly.generate_gensim_representation()
+        elda_1.generate_gensim_representation()
+        elda_mem_unfriendly_1.generate_gensim_representation()

         # same random state, hence topics should still be similar
-        np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL)
+        np.testing.assert_allclose(elda_1.get_topics(), elda_mem_unfriendly_1.get_topics(), rtol=RTOL)

     def test_inference(self, elda=None):
         if elda is None:
-            elda = self.elda
+            elda = self.get_elda()

         # get the most likely token id from topic 0
         max_id = np.argmax(elda.get_topics()[0, :])

From 07f514827dff0d99d6434f66bfed0aa91c5377e8 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 4 Jul 2021 22:18:20 +0200
Subject: [PATCH 156/166] new reference model

---
 gensim/test/test_data/ensemblelda | Bin 10082 -> 10643 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 127d00888c3c5184ca18d49e7117e0d31352d7c9..9c746b200f18efbf64f9c5b3e514a0abd7456a45 100644
GIT binary patch
delta 7306
[base85-encoded binary delta omitted]
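PATCH 156 above only swaps the pickled reference model that the tests load via EnsembleLda.load(datapath('ensemblelda')). A minimal sketch of how such a fixture could be rebuilt; this is hypothetical, since the actual regeneration script is not part of this series, and it assumes the same constants as gensim/test/test_ensemblelda.py:

    from gensim.models import EnsembleLda, LdaModel
    from gensim.test.utils import common_corpus, common_dictionary

    # same configuration as get_elda() in the test suite, so that the
    # assertions against the persisted model keep holding
    elda = EnsembleLda(
        corpus=common_corpus, id2word=common_dictionary, num_topics=2,
        passes=50, num_models=4, random_state=0, topic_model_class=LdaModel,
    )
    elda.save('gensim/test/test_data/ensemblelda')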
z#c3`cxPNeU!D+#?F51c9?y|GeBad#BeJy*vP=`=q@*cm5fF zSo%kMMcJgA1(M>&(|7bUXJ1d0aT_hKd>s;ctf^|^1i#W~?YB_}W^%4&olq87yXe1X zBxQSdUQSN_yR-9QTKBPa*@L%f4ja?t7jLLDFPz)PZe796B(mQ|s*`>l^r|`TkJIBG zolEjvzM)a?qg?XvgLRB?c=z1*M#!rN??0K$;f%XhwLgAr*9YsYx|~&oM;>Up`#1BG z``HhDn5o`H*DW^gto`C1 z@=bQ&(bbV2E1t-@%!9Ld7F)u+IUDAr$zSiCsqz~&_T3uG3v(CE8hpQaZCIye&SS?d zW%6@9H`!UiqYR^FSL7{QhKJlKt8~by(hLwR6gFQsF0eUU=Dv8vkV+kMnr4@?Ii$Y+wCQ>Fknh@3B=vZMS25hkzz@oVw)g$a_9E z3qK(fhhN={v|XRd3>_7mXg0I@c4jqm_WJ#L%Z`o5EbfmVvc_Wb@pAOA7pH69;m#=~ z12yyWD;zJsKD&A3+JX&odz)+CTXL-@Y@AVdX3481^TUhW#+w}7;qWwJtlP4XTf6HL zu_3oQ{DqfXpQ#VFj+oE2Q~kBA%zK=ou{gTs5KXIGc;?==%qwXV!z=U3MYFeP*X574 ze3~&mxJ=!3*Ex${;NG#Jx~tB$jIrd!nVAE@$e)tCZKappft-zVPZxEtZl2q1 zEnBs8+Jn|pvpMVc%)E2|qj}L6^XAlm^!ADk$Lnes8S$*NAoYW*E)~xEh91TL3ZKlp zl#V@r<@08N<)@~QK1)2>D{cxaLq=?kj&$;|oV3P;zP+DQtcO9{th`-qO&-%K-EI1$ z-D`Riy zUmbLL+W;#|bFbs0Rxf_~2XQH3js7Ru(2bX?W(9n>Q!_f$QIo#@BycVn*;X?Pi(l{c z%ig-y$&Cq5wfncTONL*Xbm1=(q5Js9eSh`Ysn_)?MOptMv%e~ecGxWQ7Z|6Q9g;&N_&ipFDJejanq22ioj z%y{tB>kB^)u&&Lg&+z?C|Csy4ne&ntK5(Y*!k2B0vO1GDy6rw)o-po7_O?;8?vCZw zPw2j@VMa33Y~MTV^B9wBUbxyfg%uxAHDt?WkXvUo!nV!NXVT8;v5n^s_D`2DTEL$A zG*-&UH?25w!YA{2eR$b$L{oFp8PmIw`uvL0Lr|0w*QDND`iAfu_yMgx(_HaY?(yE) z>sG%3D-S1KefjB&CBDqWP*Z%dAas#HanGfwsQrcg>_z;pcUcR|+I(vWldkRhd+%!T z5gXnsj-0ypx8sWH@~<9^cZV*IZ}OcfZo6(+n(wn~g-gBjgBu^6f1RC@rzx7wDtvr? z)14iCPye}aqg|I7?UiS8U*103l~4cj4wwG2er==wwZjf&7T?OzP&8qR-`&yWDOCx~ zDPeJSgX8}AW#YXv+NN)7+}&A6BVOR+7vIocE^7Da|LD_HYgzM!lmq#H6sM`~uI#4s z%QNwf?Pblv&L?F-11s}BRiBimjnu5LiFELO!TK`CanG8fkylD5m42DYxwYiZ-mXcZ zp`VHU&mWE2Tbt6-<7eIQ_MFfAw|2U{uCcnWIxvgcV) zZamnBT2cK9Zt8q4Pmw}^{c3Q4foI}{cq-zJ8pDQ|Tt$F`8V;yoa6m2j?xy8}L-6Sh zhr=*3a5#c-z|mgJF<1t`spGJ>0*5+SD!}0ce3iqY9u^F6I0?y5kA~&oWfR~w2eiYE z^Bo3NNt10u0^EhwOn(lz$G{!&;rMtw7=z^xL6+jMBJ9XR?IpYgWcPzB#Y8DMU#1}Y zV!BQ;0q(=9NY@hvoeaEJ&;tfO0QbcG@o5+y){%!(G&B_kOBu8uH!6tDjSq?riHYLE zZdGC^ct~~>HDp<8^}XvR!IZd62p*w^aE1VnQEj)As}XpTkJ@WrJK1Z%)1R{a85wj_ z#FR|-L_K|&jlDm%X!FN_>K2xv%%{+PWeb_7je0?wn0W`bn1K5u6;~gT^u52VJ?z)>t@Zyt_s$!jiP?SbbPi(x`@=ld*V#cyHOPR6K#h-2;AmPB$cArhm zS*}+cd#;d*C_}$1>@j;N+kN?mXQhgqc#V!O>+PPS_@4g Date: Sun, 4 Jul 2021 22:18:53 +0200 Subject: [PATCH 157/166] updated opinosis example --- docs/notebooks/ensemble_lda_with_opinosis.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb index cf96d3d74a..354a2eeff8 100644 --- a/docs/notebooks/ensemble_lda_with_opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -10,6 +10,7 @@ "source": [ "import logging\n", "from gensim.models import EnsembleLda, LdaMulticore\n", + "from gensim.models.ensemblelda import rank_masking\n", "from gensim.corpora import OpinosisCorpus\n", "import os" ] @@ -130,7 +131,7 @@ "elda = EnsembleLda(\n", " corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", - " topic_model_class='ldamulticore', masking_method='rank',\n", + " topic_model_class='ldamulticore', masking_method=rank_masking,\n", ")\n", "pretty_print_topics()" ] @@ -169,7 +170,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.9.5" } }, "nbformat": 4, From 1e5108ccb6f301d8622d763ab8eaab01ecdaf887 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 4 Jul 2021 22:34:37 +0200 Subject: [PATCH 158/166] tox --- gensim/models/ensemblelda.py | 2 +- gensim/test/test_ensemblelda.py | 12 ++++++++++-- 2 files changed, 11 
insertions(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 48cac7ab6d..856fa2cc26 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -136,7 +136,7 @@ def _remove_from_all_sets(label, clusters): def _contains_isolated_cores(label, cluster, min_cores): """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" - return sum(map(lambda neighboring_labels: neighboring_labels == {label}, cluster["neighboring_labels"])) >= min_cores + return sum([neighboring_labels == {label} for neighboring_labels in cluster["neighboring_labels"]]) >= min_cores def _aggregate_topics(grouped_by_labels): diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 6cd18ab550..5cdccbfc73 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -224,8 +224,16 @@ def test_multiprocessing(self): memory_friendly_ttda=False, ) - np.testing.assert_allclose(elda.get_topics(), elda_multiprocessing.get_topics(), rtol=RTOL) - np.testing.assert_allclose(elda_mem_unfriendly.get_topics(), elda_multiprocessing_mem_unfriendly.get_topics(), rtol=RTOL) + np.testing.assert_allclose( + elda.get_topics(), + elda_multiprocessing.get_topics(), + rtol=RTOL + ) + np.testing.assert_allclose( + elda_mem_unfriendly.get_topics(), + elda_multiprocessing_mem_unfriendly.get_topics(), + rtol=RTOL + ) def test_add_models_to_empty(self): elda = self.get_elda() From 9ac34390e5597baa00c92819d94c91434ed96926 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 5 Jul 2021 23:37:22 +0200 Subject: [PATCH 159/166] using dataclasses --- gensim/models/ensemblelda.py | 133 +++++++++++++++++------------- gensim/test/test_data/ensemblelda | Bin 10643 -> 10825 bytes gensim/test/test_ensemblelda.py | 8 +- 3 files changed, 80 insertions(+), 61 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 856fa2cc26..28684c247c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -99,14 +99,17 @@ import os from multiprocessing import Process, Pipe, ProcessError import importlib +from typing import Set, Optional, List import numpy as np from scipy.spatial.distance import cosine +from dataclasses import dataclass, replace from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel from gensim.utils import SaveLoad + logger = logging.getLogger(__name__) # _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by skipping @@ -114,29 +117,47 @@ _COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 +@dataclass +class Topic: + is_core: bool # if the topic has enough neighbors + neighboring_labels: Set[int] # which other clusters are close by + neighboring_topic_indices: Set[int] # which other topics are close by + label: Optional[int] # to which cluster this topic belongs + num_neighboring_labels: int # how many different labels a core has as parents + valid_neighboring_labels: Set[int] # A set of labels of close by clusters that are large enough + + +@dataclass +class Cluster: + max_num_neighboring_labels: int # the max number of parent labels among each topic of a given cluster + neighboring_labels: List[Set[int]] # a concatenated list of the neighboring_labels sets of each topic + label: int # the unique identifier of the cluster + num_cores: int # how many topics in the cluster are cores + + def _is_valid_core(topic): """Check if the topic is a valid core, i.e. 
no neighboring valid cluster is overlapping with it. Parameters ---------- - topic : {'is_core', 'valid_neighboring_labels', 'label'} + topic : Topic topic to validate """ - return topic["is_core"] and (topic["valid_neighboring_labels"] == {topic["label"]}) + return topic.is_core and (topic.valid_neighboring_labels == {topic.label}) def _remove_from_all_sets(label, clusters): """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" for cluster in clusters: - for neighboring_labels_set in cluster["neighboring_labels"]: + for neighboring_labels_set in cluster.neighboring_labels: if label in neighboring_labels_set: neighboring_labels_set.remove(label) def _contains_isolated_cores(label, cluster, min_cores): """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" - return sum([neighboring_labels == {label} for neighboring_labels in cluster["neighboring_labels"]]) >= min_cores + return sum([neighboring_labels == {label} for neighboring_labels in cluster.neighboring_labels]) >= min_cores def _aggregate_topics(grouped_by_labels): @@ -144,37 +165,34 @@ def _aggregate_topics(grouped_by_labels): Parameters ---------- - grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'}) + grouped_by_labels : dict of (int, list of :class:`Topic`) The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the label. Returns ------- - list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'} - max_num_neighboring_labels is the max number of parent labels among each topic of a given cluster. label - refers to the label identifier of the cluster. neighboring_labels is a concatenated list of the - neighboring_labels sets of each topic. Its sorted by max_num_neighboring_labels in descending - order. There is one single element for each cluster. + list of :class:`Cluster` + It is sorted by max_num_neighboring_labels in descending order. There is one single element for each cluster. """ clusters = [] - for label, group in grouped_by_labels.items(): + for label, topics in grouped_by_labels.items(): max_num_neighboring_labels = 0 neighboring_labels = [] # will be a list of sets - for topic in group: - max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels) - neighboring_labels.append(topic["neighboring_labels"]) + for topic in topics: + max_num_neighboring_labels = max(topic.num_neighboring_labels, max_num_neighboring_labels) + neighboring_labels.append(topic.neighboring_labels) neighboring_labels = [x for x in neighboring_labels if len(x) > 0] - clusters.append({ - "max_num_neighboring_labels": max_num_neighboring_labels, - "neighboring_labels": neighboring_labels, - "label": label, - "num_cores": len([topic for topic in group if topic["is_core"]]), - }) + clusters.append(Cluster( + max_num_neighboring_labels=max_num_neighboring_labels, + neighboring_labels=neighboring_labels, + label=label, + num_cores=len([topic for topic in topics if topic.is_core]), + )) logger.info("found %s clusters", len(clusters)) @@ -186,15 +204,15 @@ def _group_by_labels(cbdbscan_topics): Parameters ---------- - cbdbscan_topics : {list of {'is_core', 'neighboring_labels', 'label'}} + cbdbscan_topics : list of :class:`Topic` A list of topic data resulting from fitting a :class:`~CBDBSCAN` object. 
After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results - member, which can be used as the argument to this function. It's a list of infos gathered during + member, which can be used as the argument to this function. It is a list of infos gathered during the clustering step and each element in the list corresponds to a single topic. Returns ------- - dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'}) + dict of (int, list of :class:`Topic`) A mapping of the label to a list of topics that belong to that particular label. Also adds a new member to each topic called num_neighboring_labels, which is the number of neighboring_labels of that topic. @@ -203,13 +221,10 @@ def _group_by_labels(cbdbscan_topics): grouped_by_labels = {} for topic in cbdbscan_topics: - if topic["is_core"]: - topic = topic.copy() - - # counts how many different labels a core has as parents - topic["num_neighboring_labels"] = len(topic["neighboring_labels"]) + if topic.is_core: + topic.num_neighboring_labels = len(topic.neighboring_labels) - label = topic["label"] + label = topic.label if label not in grouped_by_labels: grouped_by_labels[label] = [] grouped_by_labels[label].append(topic) @@ -267,27 +282,27 @@ def _validate_clusters(clusters, min_cores): # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. # This clears up the clusters a bit before checking the ones with many neighbors. def _cluster_sort_key(cluster): - return (cluster["max_num_neighboring_labels"], cluster["num_cores"], cluster["label"]) + return cluster.max_num_neighboring_labels, cluster.num_cores, cluster.label sorted_clusters = sorted(clusters, key=_cluster_sort_key, reverse=False) for cluster in sorted_clusters: - cluster["is_valid"] = None - if cluster["num_cores"] < min_cores: - cluster["is_valid"] = False - _remove_from_all_sets(cluster["label"], sorted_clusters) + cluster.is_valid = None + if cluster.num_cores < min_cores: + cluster.is_valid = False + _remove_from_all_sets(cluster.label, sorted_clusters) # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any # other cluster. - for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: - label = cluster["label"] + for cluster in [cluster for cluster in sorted_clusters if cluster.is_valid is None]: + label = cluster.label if _contains_isolated_cores(label, cluster, min_cores): - cluster["is_valid"] = True + cluster.is_valid = True else: - cluster["is_valid"] = False + cluster.is_valid = False _remove_from_all_sets(label, sorted_clusters) - return [cluster for cluster in sorted_clusters if cluster["is_valid"]] + return [cluster for cluster in sorted_clusters if cluster.is_valid] class EnsembleLda(SaveLoad): @@ -631,7 +646,7 @@ def add_model(self, target, num_new_models=None): # If it has, append. # Be flexible. 
Can be a single element or a list of elements - # make sure it's a numpy array + # make sure it is a numpy array if not isinstance(target, (np.ndarray, list)): target = np.array([target]) else: @@ -1059,18 +1074,18 @@ def _generate_stable_topics(self, min_cores=None): grouped_by_labels = _group_by_labels(cbdbscan_topics) clusters = _aggregate_topics(grouped_by_labels) valid_clusters = _validate_clusters(clusters, min_cores) - valid_cluster_labels = {cluster["label"] for cluster in valid_clusters} + valid_cluster_labels = {cluster.label for cluster in valid_clusters} for topic in cbdbscan_topics: - topic["valid_neighboring_labels"] = { - label for label in topic["neighboring_labels"] + topic.valid_neighboring_labels = { + label for label in topic.neighboring_labels if label in valid_cluster_labels } # keeping only VALID cores valid_core_mask = np.vectorize(_is_valid_core)(cbdbscan_topics) valid_topics = self.ttda[valid_core_mask] - topic_labels = np.array([topic["label"] for topic in cbdbscan_topics])[valid_core_mask] + topic_labels = np.array([topic.label for topic in cbdbscan_topics])[valid_core_mask] unique_labels = np.unique(topic_labels) num_stable_topics = len(unique_labels) @@ -1222,12 +1237,16 @@ def fit(self, amatrix): """Apply the algorithm to an asymmetric distance matrix.""" self.next_label = 0 - topic_clustering_results = [{ - "is_core": False, - "neighboring_labels": set(), - "neighboring_topic_indices": set(), - "label": None, - } for _ in range(len(amatrix))] + topic_clustering_results = [ + Topic( + is_core=False, + neighboring_labels=set(), + neighboring_topic_indices=set(), + label=None, + num_neighboring_labels=0, + valid_neighboring_labels=set() + ) for i in range(len(amatrix)) + ] amatrix_copy = amatrix.copy() @@ -1266,7 +1285,7 @@ def scan_topic(topic_index, current_label=None, parent_neighbors=None): # This also takes neighbor indices that already are identified as core in count. if num_neighboring_topics >= self.min_samples: # This topic is a core! - topic_clustering_results[topic_index]["is_core"] = True + topic_clustering_results[topic_index].is_core = True # if current_label is none, then this is the first core # of a new cluster (hence next_label is used) @@ -1287,23 +1306,23 @@ def scan_topic(topic_index, current_label=None, parent_neighbors=None): current_label = self.next_label self.next_label += 1 - topic_clustering_results[topic_index]["label"] = current_label + topic_clustering_results[topic_index].label = current_label for neighboring_topic_index in neighboring_topic_indices: - if topic_clustering_results[neighboring_topic_index]["label"] is None: + if topic_clustering_results[neighboring_topic_index].label is None: ordered_min_similarity.remove(neighboring_topic_index) # try to extend the cluster into the direction of the neighbor scan_topic(neighboring_topic_index, current_label, neighboring_topic_indices + [topic_index]) - topic_clustering_results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) - topic_clustering_results[neighboring_topic_index]["neighboring_labels"].add(current_label) + topic_clustering_results[neighboring_topic_index].neighboring_topic_indices.add(topic_index) + topic_clustering_results[neighboring_topic_index].neighboring_labels.add(current_label) else: # this topic is not a core! 
if current_label is None:
-                    topic_clustering_results[topic_index]["label"] = -1
+                    topic_clustering_results[topic_index].label = -1
                 else:
-                    topic_clustering_results[topic_index]["label"] = current_label
+                    topic_clustering_results[topic_index].label = current_label

         # elements are going to be removed from that array in scan_topic, do until it is empty
         while len(ordered_min_similarity) != 0:

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 9c746b200f18efbf64f9c5b3e514a0abd7456a45..cf396af29cb6122e2e198c9a783ba118d40ebd1b 100644
GIT binary patch
delta 3330
[base85-encoded binary delta omitted]
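The test updates below exercise the attribute access introduced by the Topic dataclass. A short usage sketch; the field values here are made up for illustration, in the model they are filled in by CBDBSCAN.fit:

    from gensim.models.ensemblelda import Topic

    topic = Topic(
        is_core=True,
        neighboring_labels={0, 1},
        neighboring_topic_indices={2, 5},
        label=0,
        num_neighboring_labels=2,
        valid_neighboring_labels=set(),
    )
    # attribute access replaces the former dict lookups like topic["label"]
    assert topic.label == 0 and topic.is_core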
diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 5cdccbfc73..ad574108f9 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -152,12 +152,12 @@ def test_generate_gensim_representation(self):
     def assert_clustering_results_equal(self, clustering_results_1, clustering_results_2):
         """Assert important attributes of the cluster results"""
         np.testing.assert_array_equal(
-            [element["label"] for element in clustering_results_1],
-            [element["label"] for element in clustering_results_2],
+            [element.label for element in clustering_results_1],
+            [element.label for element in clustering_results_2],
         )
         np.testing.assert_array_equal(
-            [element["is_core"] for element in clustering_results_1],
-            [element["is_core"] for element in clustering_results_2],
+            [element.is_core for element in clustering_results_1],
+            [element.is_core for element in clustering_results_2],
         )

     def test_persisting(self):

From 773ce173d1bad31b98c0047f579689b5c623fb7e Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Mon, 5 Jul 2021 23:43:51 +0200
Subject: [PATCH 160/166] updated type syntax for docstring

---
gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 28684c247c..c415c7bdfd 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -140,7 +140,7 @@ def _is_valid_core(topic): Parameters ---------- - topic : Topic + topic : :class:`Topic` topic to validate """ From 4d674f98f65ff958f1afde0daedec2b8962e4e0d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 5 Jul 2021 23:47:35 +0200 Subject: [PATCH 161/166] unused import --- gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index c415c7bdfd..88c37845ef 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -103,7 +103,7 @@ import numpy as np from scipy.spatial.distance import cosine -from dataclasses import dataclass, replace +from dataclasses import dataclass from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel From c35fb01bf5406754b569c74c09874ee10ee91eee Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 18 Jul 2021 20:49:23 +0900 Subject: [PATCH 162/166] update sbt install step --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8e3ad48871..41a608ef90 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -39,7 +39,8 @@ jobs: # - name: Update sbt run: | - echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list + echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add sudo apt-get update -y sudo apt-get install -y sbt From 71b33ddfb9ad4e510fe39894fc82efc5ffef5788 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 18 Jul 2021 22:28:37 +0900 Subject: [PATCH 163/166] minor refactoring Decoupled multiprocessing code from EnsembleLda class. This reduces the length of the class by several hundred lines, making it slightly easier to understand. Added _generate_topic_models_worker function to clarify distinction between single-process and multi-process code. 
Fixed flake8 problem (l is an ambiguous variable name) Adjusted _teardown function (removed i parameter, it's only for logs) Moved _MAX_RANDOM_STATE to module level --- gensim/models/ensemblelda.py | 591 ++++++++++++++++++----------------- 1 file changed, 311 insertions(+), 280 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 88c37845ef..9fcb486144 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -116,6 +116,9 @@ # distance calculations for highly masked topic-term distributions _COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 +# nps max random state of 2**32 - 1 is too large for windows +_MAX_RANDOM_STATE = np.iinfo(np.int32).max + @dataclass class Topic: @@ -241,12 +244,7 @@ def _teardown(pipes, processes, i): list of pipes that the processes use to communicate with the parent processes : {list of :class:`multiprocessing.Process`} list of worker processes - i : int - index of the process that could not be started - """ - logger.error(f"could not start process {i}") - for parent_conn, child_conn in pipes: child_conn.close() parent_conn.close() @@ -305,6 +303,292 @@ def _cluster_sort_key(cluster): return [cluster for cluster in sorted_clusters if cluster.is_valid] +def _generate_topic_models_multiproc(ensemble, num_models, ensemble_workers): + """Generate the topic models to form the ensemble in a multiprocessed way. + + Depending on the used topic model this can result in a speedup. + + Parameters + ---------- + ensemble: EnsembleLda + the ensemble + num_models : int + how many models to train in the ensemble + ensemble_workers : int + into how many processes to split the models will be set to max(workers, num_models), to avoid workers that + are supposed to train 0 models. + + to get maximum performance, set to the number of your cores, if non-parallelized models are being used in + the ensemble (LdaModel). + + For LdaMulticore, the performance gain is small and gets larger for a significantly smaller corpus. + In that case, ensemble_workers=2 can be used. + + """ + # the way random_states is handled needs to prevent getting different results when multiprocessing is on, + # or getting the same results in every lda children. so it is solved by generating a list of state seeds before + # multiprocessing is started. + random_states = [ensemble.random_state.randint(_MAX_RANDOM_STATE) for _ in range(num_models)] + + # each worker has to work on at least one model. + # Don't spawn idle workers: + workers = min(ensemble_workers, num_models) + + # create worker processes: + # from what I know this is basically forking with a jump to a target function in each child + # so modifying the ensemble object will not modify the one in the parent because of no shared memory + processes = [] + pipes = [] + num_models_unhandled = num_models # how many more models need to be trained by workers? + + for i in range(workers): + parent_conn, child_conn = Pipe() + num_subprocess_models = 0 + if i == workers - 1: # i is a index, hence -1 + # is this the last worker that needs to be created? 
+            # then task that worker with all the remaining models
+            num_subprocess_models = num_models_unhandled
+        else:
+            num_subprocess_models = int(num_models_unhandled / (workers - i))
+
+        # get the chunk from the random states that is meant to be for those models
+        random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models]
+
+        args = (ensemble, num_subprocess_models, random_states_for_worker, child_conn)
+        try:
+            process = Process(target=_generate_topic_models_worker, args=args)
+            processes.append(process)
+            pipes.append((parent_conn, child_conn))
+            process.start()
+            num_models_unhandled -= num_subprocess_models
+        except ProcessError:
+            logger.error(f"could not start process {i}")
+            _teardown(pipes, processes)
+            raise ProcessError
+
+    # aggregate results
+    # will also block until workers are finished
+    for parent_conn, _ in pipes:
+        answer = parent_conn.recv()
+        parent_conn.close()
+        # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
+        if not ensemble.memory_friendly_ttda:
+            ensemble.tms += answer
+            ttda = np.concatenate([m.get_topics() for m in answer])
+        else:
+            ttda = answer
+        ensemble.ttda = np.concatenate([ensemble.ttda, ttda])
+
+    for process in processes:
+        process.terminate()
+
+
+def _generate_topic_models(ensemble, num_models, random_states=None):
+    """Train the topic models that form the ensemble.
+
+    Parameters
+    ----------
+    ensemble: EnsembleLda
+        the ensemble
+    num_models : int
+        number of models to be generated
+    random_states : list
+        list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensemble's
+        RandomState if None (default).
+    """
+    if random_states is None:
+        random_states = [ensemble.random_state.randint(_MAX_RANDOM_STATE) for _ in range(num_models)]
+
+    assert len(random_states) == num_models
+
+    kwargs = ensemble.gensim_kw_args.copy()
+
+    tm = None  # remember one of the topic models from the following
+    # loop, in order to collect some properties from it afterwards.
+
+    for i in range(num_models):
+        kwargs["random_state"] = random_states[i]
+
+        tm = ensemble.get_topic_model_class()(**kwargs)
+
+        # adds the lambda (that is the unnormalized get_topics) to ttda, which is
+        # a list of all those lambdas
+        ensemble.ttda = np.concatenate([ensemble.ttda, tm.get_topics()])
+
+        # only saves the model if it is not "memory friendly"
+        if not ensemble.memory_friendly_ttda:
+            ensemble.tms += [tm]
+
+    # use one of the tms to get some info that will be needed later
+    ensemble.sstats_sum = tm.state.sstats.sum()
+    ensemble.eta = tm.eta
+
+
+def _generate_topic_models_worker(ensemble, num_models, random_states, pipe):
+    #
+    # Same as _generate_topic_models, but runs in a separate subprocess, and
+    # sends the updated ensemble state to the parent process via a pipe.
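+    #
+    # A hedged illustration of the parent/worker protocol used here (the
+    # names mirror this module; `n` and `seeds` are placeholder values):
+    #
+    #     parent_conn, child_conn = Pipe()
+    #     args = (ensemble, n, seeds, child_conn)
+    #     Process(target=_generate_topic_models_worker, args=args).start()
+    #     ttda_chunk = parent_conn.recv()  # blocks until the worker sends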
+    #
+    logger.info(f"spawned worker to generate {num_models} topic models")
+
+    _generate_topic_models(ensemble=ensemble, num_models=num_models, random_states=random_states)
+
+    # send the ttda that is in the child/worker's version of the memory into the pipe;
+    # it is available after _generate_topic_models has been called in the worker
+    if ensemble.memory_friendly_ttda:
+        # remember that this code is inside the worker process's memory,
+        # so ensemble.ttda is the ttda of only a chunk of models
+        pipe.send(ensemble.ttda)
+    else:
+        pipe.send(ensemble.tms)
+
+    pipe.close()
+
+
+def _calculate_asymmetric_distance_matrix_chunk(
+    ttda1,
+    ttda2,
+    start_index,
+    masking_method,
+    masking_threshold,
+):
+    """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``.
+
+    Parameters
+    ----------
+    ttda1 and ttda2: 2D arrays of floats
+        Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+        topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
+    start_index : int
+        this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+        complete ttda in that case. start_index would be 0 if ``ttda1`` is the complete ttda. When the complete
+        ttda is split into two pieces, each 100 ttdas long, then start_index of the second piece should be 100.
+        default is 0
+    masking_method: function
+        creates a boolean mask for one topic-term row, given the row and masking_threshold
+    masking_threshold: float
+        threshold that is passed through to masking_method
+
+    Returns
+    -------
+    2D numpy.ndarray of floats
+        Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
+
+    """
+    # initialize the distance matrix. ndarray is faster than zeros
+    distances = np.ndarray((len(ttda1), len(ttda2)))
+
+    if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
+        # the worker might not have received a ttda because it was chunked up too much
+
+        # some help to find a better threshold by useful log messages
+        avg_mask_size = 0
+
+        # now iterate over each topic
+        for ttd1_idx, ttd1 in enumerate(ttda1):
+            # create mask from ttd1 that removes noise from a and keeps the largest terms
+            mask = masking_method(ttd1, masking_threshold)
+            ttd1_masked = ttd1[mask]
+
+            avg_mask_size += mask.sum()
+
+            # now look at every possible pair for topic a:
+            for ttd2_idx, ttd2 in enumerate(ttda2):
+                # distance to itself is 0
+                if ttd1_idx + start_index == ttd2_idx:
+                    distances[ttd1_idx][ttd2_idx] = 0
+                    continue
+
+                # now mask b based on a, which will force the shape of a onto b
+                ttd2_masked = ttd2[mask]
+
+                # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
+                # distributions that will have distance values near 1.
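+                # (hedged numeric illustration: with the threshold at 0.05,
+                # a ttd2 whose masked terms sum to e.g. 0.03 has almost no
+                # probability mass on topic a's largest terms, so its
+                # distance is set to 1 without computing the cosine)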
+                if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD:
+                    distance = 1
+                else:
+                    distance = cosine(ttd1_masked, ttd2_masked)
+
+                distances[ttd1_idx][ttd2_idx] = distance
+
+        percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
+        logger.info(f'the given threshold of {masking_threshold} covered on average {percent}% of tokens')
+
+    return distances
+
+
+def _asymmetric_distance_matrix_worker(
+    worker_id,
+    entire_ttda,
+    ttdas_sent,
+    n_ttdas,
+    masking_method,
+    masking_threshold,
+    pipe,
+):
+    """Worker that computes the distance to all other nodes from a chunk of nodes."""
+    logger.info(f"spawned worker {worker_id} to generate {n_ttdas} rows of the asymmetric distance matrix")
+    # the chunk of ttda that's going to be calculated:
+    ttda1 = entire_ttda[ttdas_sent:ttdas_sent + n_ttdas]
+    distance_chunk = _calculate_asymmetric_distance_matrix_chunk(
+        ttda1=ttda1,
+        ttda2=entire_ttda,
+        start_index=ttdas_sent,
+        masking_method=masking_method,
+        masking_threshold=masking_threshold,
+    )
+    pipe.send((worker_id, distance_chunk))  # remember that this code is inside the worker's memory
+    pipe.close()
+
+
+def _calculate_asymmetric_distance_matrix_multiproc(
+    workers,
+    entire_ttda,
+    masking_method,
+    masking_threshold,
+):
+    processes = []
+    pipes = []
+    ttdas_sent = 0
+
+    for i in range(workers):
+        try:
+            parent_conn, child_conn = Pipe()
+
+            # Load Balancing, for example if there are 9 ttdas and 4 workers, the load will be balanced 2, 2, 2, 3.
+            n_ttdas = 0
+            if i == workers - 1:  # i is an index, hence -1
+                # is this the last worker that needs to be created?
+                # then task that worker with all the remaining models
+                n_ttdas = len(entire_ttda) - ttdas_sent
+            else:
+                n_ttdas = int((len(entire_ttda) - ttdas_sent) / (workers - i))
+
+            args = (i, entire_ttda, ttdas_sent, n_ttdas, masking_method, masking_threshold, child_conn)
+            process = Process(target=_asymmetric_distance_matrix_worker, args=args)
+            ttdas_sent += n_ttdas
+
+            processes.append(process)
+            pipes.append((parent_conn, child_conn))
+            process.start()
+        except ProcessError:
+            logger.error(f"could not start process {i}")
+            _teardown(pipes, processes)
+            raise ProcessError
+
+    distances = []
+    # note that the following loop maintains the order in which the distance chunks will be concatenated,
+    # which is very important. The ordering has to be the same as when using only one process
+    for parent_conn, _ in pipes:
+        worker_id, distance_chunk = parent_conn.recv()
+        parent_conn.close()  # child conn will be closed from inside the worker
+        # collect each worker's distance chunk; they are concatenated in order below
+        distances.append(distance_chunk)
+
+    for process in processes:
+        process.terminate()
+
+    return np.concatenate(distances)
+
+
 class EnsembleLda(SaveLoad):
     """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
@@ -384,10 +668,6 @@ def __init__(
             Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble.
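
        Examples
        --------
        A minimal sketch (assumes ``corpus`` and ``dictionary`` already
        exist; both are forwarded through ``gensim_kw_args``)::

            elda = EnsembleLda(
                corpus=corpus, id2word=dictionary,
                num_models=8, ensemble_workers=2,
            )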
""" - # INTERNAL PARAMETERS - # Set random state - # nps max random state of 2**32 - 1 is too large for windows: - self._MAX_RANDOM_STATE = np.iinfo(np.int32).max if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None @@ -450,13 +730,12 @@ def __init__( if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return - logger.info(f"generating {num_models} topic models...") + logger.info(f"generating {num_models} topic models using {ensemble_workers} workers") if ensemble_workers > 1: - self._generate_topic_models_multiproc(num_models, ensemble_workers) + _generate_topic_models_multiproc(self, num_models, ensemble_workers) else: - # singlecore - self._generate_topic_models(num_models) + _generate_topic_models(self, num_models) self._generate_asymmetric_distance_matrix() self._generate_topic_clusters(epsilon, min_samples) @@ -733,216 +1012,6 @@ def add_model(self, target, num_new_models=None): # tell recluster that the distance matrix needs to be regenerated self.asymmetric_distance_matrix_outdated = True - def _generate_topic_models_multiproc(self, num_models, ensemble_workers): - """Generate the topic models to form the ensemble in a multiprocessed way. - - Depending on the used topic model this can result in a speedup. - - Parameters - ---------- - num_models : int - how many models to train in the ensemble - ensemble_workers : int - into how many processes to split the models will be set to max(workers, num_models), to avoid workers that - are supposed to train 0 models. - - to get maximum performance, set to the number of your cores, if non-parallelized models are being used in - the ensemble (LdaModel). - - For LdaMulticore, the performance gain is small and gets larger for a significantly smaller corpus. - In that case, ensemble_workers=2 can be used. - - """ - # the way random_states is handled needs to prevent getting different results when multiprocessing is on, - # or getting the same results in every lda children. so it is solved by generating a list of state seeds before - # multiprocessing is started. - random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] - - # each worker has to work on at least one model. - # Don't spawn idle workers: - workers = min(ensemble_workers, num_models) - - # create worker processes: - # from what I know this is basically forking with a jump to a target function in each child - # so modifying the ensemble object will not modify the one in the parent because of no shared memory - processes = [] - pipes = [] - num_models_unhandled = num_models # how many more models need to be trained by workers? - - for i in range(workers): - parent_conn, child_conn = Pipe() - num_subprocess_models = 0 - if i == workers - 1: # i is a index, hence -1 - # is this the last worker that needs to be created? 
-                # then task that worker with all the remaining models
-                num_subprocess_models = num_models_unhandled
-            else:
-                num_subprocess_models = int(num_models_unhandled / (workers - i))
-
-            # get the chunk from the random states that is meant to be for those models
-            random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models]
-
-            try:
-                process = Process(
-                    target=self._generate_topic_models,
-                    args=(num_subprocess_models, random_states_for_worker, child_conn),
-                )
-
-                processes.append(process)
-                pipes.append((parent_conn, child_conn))
-                process.start()
-
-                num_models_unhandled -= num_subprocess_models
-
-            except ProcessError:
-                _teardown(pipes, processes, i)
-                raise ProcessError
-
-        # aggregate results
-        # will also block until workers are finished
-        for parent_conn, _ in pipes:
-            answer = parent_conn.recv()
-            parent_conn.close()
-            # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
-            if not self.memory_friendly_ttda:
-                self.tms += answer
-                ttda = np.concatenate([model.get_topics() for model in answer])
-            else:
-                ttda = answer
-            self.ttda = np.concatenate([self.ttda, ttda])
-
-        # end all processes
-        for process in processes:
-            process.terminate()
-
-    def _generate_topic_models(self, num_models, random_states=None, pipe=None):
-        """Train the topic models that form the ensemble.
-
-        Parameters
-        ----------
-        num_models : int
-            number of models to be generated
-        random_states : list
-            list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensembles
-            RandomState if None (default).
-        pipe : multiprocessing.pipe
-            Default None. If provided, will send the trained models over this pipe. If memory friendly, it will only
-            send the ttda.
-
-        """
-        if pipe is not None:
-            logger.info(f"spawned worker to generate {num_models} topic models")
-
-        if random_states is None:
-            random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)]
-
-        assert len(random_states) == num_models
-
-        kwargs = self.gensim_kw_args.copy()
-
-        tm = None  # remember one of the topic models from the following
-        # loop, in order to collect some properties from it afterwards.
-
-        for i in range(num_models):
-            kwargs["random_state"] = random_states[i]
-
-            tm = self.get_topic_model_class()(**kwargs)
-
-            # adds the lambda (that is the unnormalized get_topics) to ttda, which is
-            # a list of all those lambdas
-            self.ttda = np.concatenate([self.ttda, tm.get_topics()])
-
-            # only saves the model if it is not "memory friendly"
-            if not self.memory_friendly_ttda:
-                self.tms += [tm]
-
-        # use one of the tms to get some info that will be needed later
-        self.sstats_sum = tm.state.sstats.sum()
-        self.eta = tm.eta
-
-        if pipe is not None:
-            # send the ttda that is in the child/workers version of the memory into the pipe
-            # available, after _generate_topic_models has been called in the worker
-            if self.memory_friendly_ttda:
-                # remember that this code is inside the worker processes memory,
-                # so self.ttda is the ttda of only a chunk of models
-                pipe.send(self.ttda)
-            else:
-                pipe.send(self.tms)
-
-            pipe.close()
-
-    def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, start_index):
-        """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``.
-
-        Parameters
-        ----------
-        ttda1 and ttda2: 2D arrays of floats
-            Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
-            topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
-        start_index : int
-            this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
-            complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
-            two pieces, each 100 ttdas long, then start_index should be be 100. default is 0
-
-        Returns
-        -------
-        2D numpy.ndarray of floats
-            Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
-
-        """
-        # initialize the distance matrix. ndarray is faster than zeros
-        distances = np.ndarray((len(ttda1), len(ttda2)))
-
-        if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
-            # the worker might not have received a ttda because it was chunked up too much
-
-            # some help to find a better threshold by useful log messages
-            avg_mask_size = 0
-
-            # now iterate over each topic
-            for ttd1_idx, ttd1 in enumerate(ttda1):
-                # create mask from ttd1 that removes noise from a and keeps the largest terms
-                mask = self.masking_method(ttd1, self.masking_threshold)
-                ttd1_masked = ttd1[mask]
-
-                avg_mask_size += mask.sum()
-
-                # now look at every possible pair for topic a:
-                for ttd2_idx, ttd2 in enumerate(ttda2):
-                    # distance to itself is 0
-                    if ttd1_idx + start_index == ttd2_idx:
-                        distances[ttd1_idx][ttd2_idx] = 0
-                        continue
-
-                    # now mask b based on a, which will force the shape of a onto b
-                    ttd2_masked = ttd2[mask]
-
-                    # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
-                    # distributions that will have distance values near 1.
-                    if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD:
-                        distance = 1
-                    else:
-                        distance = cosine(ttd1_masked, ttd2_masked)
-
-                    distances[ttd1_idx][ttd2_idx] = distance
-
-            percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
-            logger.info(f'the given threshold of {self.masking_threshold} covered on average {percent}% of tokens')
-
-        return distances
-
-    def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe):
-        """Worker that computes the distance to all other nodes from a chunk of nodes."""
-        logger.info(f"spawned worker to generate {n_ttdas} rows of the asymmetric distance matrix")
-        # the chunk of ttda that's going to be calculated:
-        ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas]
-        distance_chunk = self._calculate_asymmetric_distance_matrix_chunk(
-            ttda1=ttda1, ttda2=self.ttda, start_index=ttdas_sent
-        )
-        pipe.send((worker_id, distance_chunk))  # remember that this code is inside the workers memory
-        pipe.close()
-
     def _generate_asymmetric_distance_matrix(self):
         """Calculate the pairwise distance matrix for all the ttdas from the ensemble.
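For reference while reading the next hunk, a minimal standalone sketch of
the masked distance computed above (the toy arrays and the simple
mass-style mask are assumptions; only `cosine` and the 0.05 threshold come
from this module):

    import numpy as np
    from scipy.spatial.distance import cosine

    ttd1 = np.array([0.50, 0.30, 0.15, 0.05])  # topic a, term weights
    ttd2 = np.array([0.05, 0.10, 0.42, 0.43])  # topic b
    mask = ttd1 >= 0.15                        # keep topic a's largest terms
    if ttd2[mask].sum() <= 0.05:               # b has ~no mass on a's terms
        distance = 1                           # skip the cosine computation
    else:
        distance = cosine(ttd1[mask], ttd2[mask])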
@@ -958,63 +1027,25 @@
         logger.info(f"generating a {len(self.ttda)} x {len(self.ttda)} asymmetric distance matrix...")

-        # singlecore
         if workers is not None and workers <= 1:
-            self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk(
-                ttda1=self.ttda, ttda2=self.ttda, start_index=0,
+            self.asymmetric_distance_matrix = _calculate_asymmetric_distance_matrix_chunk(
+                ttda1=self.ttda,
+                ttda2=self.ttda,
+                start_index=0,
+                masking_method=self.masking_method,
+                masking_threshold=self.masking_threshold,
+            )
+        else:
+            # best performance on 2-core machine: 2 workers
+            if workers is None:
+                workers = os.cpu_count()
+
+            self.asymmetric_distance_matrix = _calculate_asymmetric_distance_matrix_multiproc(
+                workers=workers,
+                entire_ttda=self.ttda,
+                masking_method=self.masking_method,
+                masking_threshold=self.masking_threshold,
             )
-
-        # else, if workers > 1 use multiprocessing
-        # best performance on 2-core machine: 2 workers
-        if workers is None:
-            workers = os.cpu_count()
-
-        # create worker processes:
-        processes = []
-        pipes = []
-        ttdas_sent = 0
-
-        for i in range(workers):
-            try:
-                parent_conn, child_conn = Pipe()
-
-                # Load Balancing, for example if there are 9 ttdas and 4 workers, the load will be balanced 2, 2, 2, 3.
-                n_ttdas = 0
-                if i == workers - 1:  # i is a index, hence -1
-                    # is this the last worker that needs to be created?
-                    # then task that worker with all the remaining models
-                    n_ttdas = len(self.ttda) - ttdas_sent
-                else:
-                    n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i))
-
-                process = Process(
-                    target=self._asymmetric_distance_matrix_worker,
-                    args=(i, ttdas_sent, n_ttdas, child_conn),
-                )
-                ttdas_sent += n_ttdas
-
-                processes.append(process)
-                pipes.append((parent_conn, child_conn))
-                process.start()
-
-            except ProcessError:
-                _teardown(pipes, processes, i)
-                raise ProcessError
-
-        distances = []
-        # note, that the following loop maintains order in how the ttda will be concatenated
-        # which is very important. Ordering in ttda has to be the same as when using only one process
-        for parent_conn, _ in pipes:
-            answer = parent_conn.recv()
-            parent_conn.close()  # child conn will be closed from inside the worker
-            # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
-            distances.append(answer[1])
-
-        # end all processes
-        for process in processes:
-            process.terminate()
-
-        self.asymmetric_distance_matrix = np.concatenate(distances)

     def _generate_topic_clusters(self, eps=0.1, min_samples=None):
         """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices.
@@ -1092,10 +1123,10 @@ def _generate_stable_topics(self, min_cores=None):
         stable_topics = np.empty((num_stable_topics, len(self.id2word)))

         # for each cluster
-        for l, label in enumerate(unique_labels):
+        for label_index, label in enumerate(unique_labels):
             # mean of all the topics that are of that cluster
             topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label])
-            stable_topics[l] = topics_of_cluster.mean(axis=0)
+            stable_topics[label_index] = topics_of_cluster.mean(axis=0)

         self.valid_clusters = valid_clusters
         self.stable_topics = stable_topics

From 444c1909dddcda5bce77ca205a12f9e66f48d102 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Sun, 18 Jul 2021 22:39:22 +0900
Subject: [PATCH 164/166] roll back change to docs/src/Makefile

---
 docs/src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/Makefile b/docs/src/Makefile
index d4ae6caa5f..30cbe5a869 100644
--- a/docs/src/Makefile
+++ b/docs/src/Makefile
@@ -41,7 +41,7 @@ html:
 	@echo "Build finished. The HTML pages are in ../"

 upload:
-	scp -r _build/html/* rr:public_html/gensim_4.0.0/
+	scp -r _build/html/* rr:public_html/gensim/

 dirhtml:
 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml

From f00aca8645fe6f4b3bcd8ad0fdcfd0b92a4fcf37 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Sun, 18 Jul 2021 23:39:21 +0900
Subject: [PATCH 165/166] re-raise caught exception instead of raising a new one

We don't want to hide the details of the problem.

---
 gensim/models/ensemblelda.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 9fcb486144..a4b6eccb9c 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -364,7 +364,7 @@ def _generate_topic_models_multiproc(ensemble, num_models, ensemble_workers):
         except ProcessError:
             logger.error(f"could not start process {i}")
             _teardown(pipes, processes)
-            raise ProcessError
+            raise

     # aggregate results
     # will also block until workers are finished
@@ -572,7 +572,7 @@ def _calculate_asymmetric_distance_matrix_multiproc(
         except ProcessError:
             logger.error(f"could not start process {i}")
             _teardown(pipes, processes)
-            raise ProcessError
+            raise

     distances = []
     # note that the following loop maintains the order in which the distance chunks will be concatenated,

From cac68195bd0a4a462025dcad3784e3d26bf03614 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Thu, 22 Jul 2021 21:31:47 +0900
Subject: [PATCH 166/166] add docstring

---
 gensim/models/ensemblelda.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index a4b6eccb9c..39d7e06620 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -425,6 +425,9 @@ def _generate_topic_models(ensemble, num_models, random_states=None):

 def _generate_topic_models_worker(ensemble, num_models, random_states, pipe):
+    """Wrapper for _generate_topic_models to write the results into a pipe.
+
+    This is intended to be used inside a subprocess."""
     #
     # Same as _generate_topic_models, but runs in a separate subprocess, and
     # sends the updated ensemble state to the parent process via a pipe.
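A closing illustration of the re-raise idiom from patch 165 (a standalone,
hedged demo with toy names, not gensim code):

    def start_worker():
        raise ValueError("details worth keeping")

    try:
        try:
            start_worker()
        except ValueError:
            print("tearing down")  # akin to _teardown(pipes, processes)
            raise                  # bare raise keeps the original message and traceback
    except ValueError as error:
        print(error)               # -> details worth keeping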