From 33b96581c786013993233a5fea80d106ac8509eb Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 1 Jun 2017 18:02:13 -0700 Subject: [PATCH 01/10] added sklearn interface for base_topic_model class --- .../sklearn_wrapper_gensim_basetopicmodel.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 gensim/sklearn_integration/sklearn_wrapper_gensim_basetopicmodel.py diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_basetopicmodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_basetopicmodel.py new file mode 100644 index 0000000000..dbd47f0efb --- /dev/null +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_basetopicmodel.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +# +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +follows on scikit learn API conventions +""" + + +class SklearnWrapperBaseTopicModel(object): + """ + BaseTopicModel module + """ + + def set_params(self, **parameters): + """ + Set all parameters. + """ + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self From edcb7cf4d7f8a1fa3ad0686fb1d32184d87d5755 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 1 Jun 2017 18:02:59 -0700 Subject: [PATCH 02/10] refactored code for LDA and LSI wrappers --- .../sklearn_wrapper_gensim_ldamodel.py | 12 ++---------- .../sklearn_wrapper_gensim_lsimodel.py | 11 ++--------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index 3eae9d0265..d387033f6c 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -9,14 +9,14 @@ follows on scikit learn API conventions """ import numpy as np +import sklearn_wrapper_gensim_basetopicmodel from gensim import models from gensim import matutils from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator - -class SklearnWrapperLdaModel(models.LdaModel, TransformerMixin, BaseEstimator): +class SklearnWrapperLdaModel(models.LdaModel, sklearn_wrapper_gensim_basetopicmodel.SklearnWrapperBaseTopicModel, TransformerMixin, BaseEstimator): """ Base LDA module """ @@ -66,14 +66,6 @@ def get_params(self, deep=True): "gamma_threshold": self.gamma_threshold, "minimum_probability": self.minimum_probability, "random_state": self.random_state} - def set_params(self, **parameters): - """ - Set all parameters. - """ - for parameter, value in parameters.items(): - self.parameter = value - return self - def fit(self, X, y=None): """ For fitting corpus into the class object. diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index 753cbaf899..f444ad2b54 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -9,13 +9,14 @@ Follows scikit-learn API conventions """ import numpy as np +import sklearn_wrapper_gensim_basetopicmodel from gensim import models from gensim import matutils from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator -class SklearnWrapperLsiModel(models.LsiModel, TransformerMixin, BaseEstimator): +class SklearnWrapperLsiModel(models.LsiModel, sklearn_wrapper_gensim_basetopicmodel.SklearnWrapperBaseTopicModel, TransformerMixin, BaseEstimator): """ Base LSI module """ @@ -47,14 +48,6 @@ def get_params(self, deep=True): "chunksize": self.chunksize, "decay": self.decay, "onepass": self.onepass, "extra_samples": self.extra_samples, "power_iters": self.power_iters} - def set_params(self, **parameters): - """ - Set all parameters. - """ - for parameter, value in parameters.items(): - self.parameter = value - return self - def fit(self, X, y=None): """ For fitting corpus into the class object. From addb4baef8fed697a9dc6ad547d263c4d4c5e3ae Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 1 Jun 2017 18:03:23 -0700 Subject: [PATCH 03/10] added tests for set_params and get_params functions --- gensim/test/test_sklearn_integration.py | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index cb6c514612..d482ab5210 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -114,6 +114,19 @@ def testPipeline(self): score = text_lda.score(corpus, data.target) self.assertGreater(score, 0.40) + def testSetGetParams(self): + # updating only one param + self.model.set_params(num_topics=3) + model_params = self.model.get_params() + self.assertEqual(model_params["num_topics"], 3) + + # updating multiple params + param_dict = {"eval_every" : 20 , "decay" : 0.7} + self.model.set_params(**param_dict) + model_params = self.model.get_params() + for key in param_dict.keys(): + self.assertEqual(model_params[key], param_dict[key]) + class TestSklearnLSIWrapper(unittest.TestCase): def setUp(self): @@ -165,5 +178,19 @@ def testPipeline(self): score = text_lda.score(corpus, data.target) self.assertGreater(score, 0.50) + def testSetGetParams(self): + # updating only one param + self.model.set_params(num_topics=3) + model_params = self.model.get_params() + self.assertEqual(model_params["num_topics"], 3) + + # updating multiple params + param_dict = {"chunksize" : 10000 , "decay" : 0.9} + self.model.set_params(**param_dict) + model_params = self.model.get_params() + for key in param_dict.keys(): + self.assertEqual(model_params[key], param_dict[key]) + + if __name__ == '__main__': unittest.main() From 570f3eeef9688b9266a0767fe673343611cbc1bb Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 1 Jun 2017 19:02:23 -0700 Subject: [PATCH 04/10] PEP8 changes --- gensim/test/test_sklearn_integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index d482ab5210..08e2ae9fe7 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -121,7 +121,7 @@ def testSetGetParams(self): self.assertEqual(model_params["num_topics"], 3) # updating multiple params - param_dict = {"eval_every" : 20 , "decay" : 0.7} + param_dict = {"eval_every": 20, "decay": 0.7} self.model.set_params(**param_dict) model_params = self.model.get_params() for key in param_dict.keys(): @@ -185,7 +185,7 @@ def testSetGetParams(self): self.assertEqual(model_params["num_topics"], 3) # updating multiple params - param_dict = {"chunksize" : 10000 , "decay" : 0.9} + param_dict = {"chunksize": 10000, "decay": 0.9} self.model.set_params(**param_dict) model_params = self.model.get_params() for key in param_dict.keys(): From 803db57e26277ea340aee1f60de7413f8de99629 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 1 Jun 2017 19:21:06 -0700 Subject: [PATCH 05/10] more PEP8 changes --- gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py | 1 + gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py | 1 + 2 files changed, 2 insertions(+) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index d387033f6c..5b57eb6e08 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -21,6 +21,7 @@ class SklearnWrapperLdaModel(models.LdaModel, sklearn_wrapper_gensim_basetopicmo Base LDA module """ + def __init__( self, corpus=None, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index f444ad2b54..55b716eb3c 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -21,6 +21,7 @@ class SklearnWrapperLsiModel(models.LsiModel, sklearn_wrapper_gensim_basetopicmo Base LSI module """ + def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100): """ From 7a4794ef3c8755de03480b5cfbce97206662428b Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 1 Jun 2017 19:42:03 -0700 Subject: [PATCH 06/10] added new line before class definition --- gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py | 1 + gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py | 1 + 2 files changed, 2 insertions(+) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index 5b57eb6e08..53052b8fd9 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -16,6 +16,7 @@ from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator + class SklearnWrapperLdaModel(models.LdaModel, sklearn_wrapper_gensim_basetopicmodel.SklearnWrapperBaseTopicModel, TransformerMixin, BaseEstimator): """ Base LDA module diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index 55b716eb3c..b0aeef56d6 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -16,6 +16,7 @@ from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator + class SklearnWrapperLsiModel(models.LsiModel, sklearn_wrapper_gensim_basetopicmodel.SklearnWrapperBaseTopicModel, TransformerMixin, BaseEstimator): """ Base LSI module From f852fda7db343821bca22c8363f9e386c5565106 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 1 Jun 2017 20:23:25 -0700 Subject: [PATCH 07/10] fixed import statements for Python3 --- gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py | 2 +- gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index 53052b8fd9..e56c119706 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -9,10 +9,10 @@ follows on scikit learn API conventions """ import numpy as np -import sklearn_wrapper_gensim_basetopicmodel from gensim import models from gensim import matutils +from gensim.sklearn_integration import sklearn_wrapper_gensim_basetopicmodel from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index b0aeef56d6..f160c1a076 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -9,10 +9,10 @@ Follows scikit-learn API conventions """ import numpy as np -import sklearn_wrapper_gensim_basetopicmodel from gensim import models from gensim import matutils +from gensim.sklearn_integration import sklearn_wrapper_gensim_basetopicmodel from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator From 09eb1827ec6f359d06311c6515801b7dc2f15e10 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 5 Jun 2017 04:09:41 -0700 Subject: [PATCH 08/10] added abstract base class for sklearn wrappers --- ...nsim_basetopicmodel.py => base_sklearn_wrapper.py} | 7 +++++-- .../sklearn_wrapper_gensim_ldamodel.py | 11 ++++++++--- .../sklearn_wrapper_gensim_lsimodel.py | 11 ++++++++--- 3 files changed, 21 insertions(+), 8 deletions(-) rename gensim/sklearn_integration/{sklearn_wrapper_gensim_basetopicmodel.py => base_sklearn_wrapper.py} (77%) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_basetopicmodel.py b/gensim/sklearn_integration/base_sklearn_wrapper.py similarity index 77% rename from gensim/sklearn_integration/sklearn_wrapper_gensim_basetopicmodel.py rename to gensim/sklearn_integration/base_sklearn_wrapper.py index dbd47f0efb..998d6f778b 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_basetopicmodel.py +++ b/gensim/sklearn_integration/base_sklearn_wrapper.py @@ -8,13 +8,16 @@ Scikit learn interface for gensim for easy use of gensim with scikit-learn follows on scikit learn API conventions """ +from abc import ABCMeta, abstractmethod -class SklearnWrapperBaseTopicModel(object): +class BaseSklearnWrapper(object): """ - BaseTopicModel module + Base sklearn wrapper module """ + __metaclass__ = ABCMeta + @abstractmethod def set_params(self, **parameters): """ Set all parameters. diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index e56c119706..609af42889 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -12,17 +12,16 @@ from gensim import models from gensim import matutils -from gensim.sklearn_integration import sklearn_wrapper_gensim_basetopicmodel +from gensim.sklearn_integration import base_sklearn_wrapper from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator -class SklearnWrapperLdaModel(models.LdaModel, sklearn_wrapper_gensim_basetopicmodel.SklearnWrapperBaseTopicModel, TransformerMixin, BaseEstimator): +class SklearnWrapperLdaModel(models.LdaModel, base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base LDA module """ - def __init__( self, corpus=None, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, @@ -68,6 +67,12 @@ def get_params(self, deep=True): "gamma_threshold": self.gamma_threshold, "minimum_probability": self.minimum_probability, "random_state": self.random_state} + def set_params(self, **parameters): + """ + Set all parameters. + """ + super(SklearnWrapperLdaModel, self).set_params(**parameters) + def fit(self, X, y=None): """ For fitting corpus into the class object. diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index f160c1a076..cb7e2b48ef 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -12,17 +12,16 @@ from gensim import models from gensim import matutils -from gensim.sklearn_integration import sklearn_wrapper_gensim_basetopicmodel +from gensim.sklearn_integration import base_sklearn_wrapper from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator -class SklearnWrapperLsiModel(models.LsiModel, sklearn_wrapper_gensim_basetopicmodel.SklearnWrapperBaseTopicModel, TransformerMixin, BaseEstimator): +class SklearnWrapperLsiModel(models.LsiModel, base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base LSI module """ - def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100): """ @@ -50,6 +49,12 @@ def get_params(self, deep=True): "chunksize": self.chunksize, "decay": self.decay, "onepass": self.onepass, "extra_samples": self.extra_samples, "power_iters": self.power_iters} + def set_params(self, **parameters): + """ + Set all parameters. + """ + super(SklearnWrapperLsiModel, self).set_params(**parameters) + def fit(self, X, y=None): """ For fitting corpus into the class object. From 27ead735803a6ccd79770dbf6624a1516622f5f2 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 5 Jun 2017 04:17:48 -0700 Subject: [PATCH 09/10] added other methods in abstract base class --- .../sklearn_integration/base_sklearn_wrapper.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gensim/sklearn_integration/base_sklearn_wrapper.py b/gensim/sklearn_integration/base_sklearn_wrapper.py index 998d6f778b..e80898679c 100644 --- a/gensim/sklearn_integration/base_sklearn_wrapper.py +++ b/gensim/sklearn_integration/base_sklearn_wrapper.py @@ -17,6 +17,11 @@ class BaseSklearnWrapper(object): """ __metaclass__ = ABCMeta + + @abstractmethod + def get_params(self, deep=True): + pass + @abstractmethod def set_params(self, **parameters): """ @@ -25,3 +30,15 @@ def set_params(self, **parameters): for parameter, value in parameters.items(): setattr(self, parameter, value) return self + + @abstractmethod + def fit(self, X, y=None): + pass + + @abstractmethod + def transform(self, docs, minimum_probability=None): + pass + + @abstractmethod + def partial_fit(self, X): + pass From 464d787f84b970b680cd9498e78af70b2129c566 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 5 Jun 2017 04:54:39 -0700 Subject: [PATCH 10/10] PEP8 changes --- gensim/sklearn_integration/base_sklearn_wrapper.py | 3 +-- gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py | 2 +- gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gensim/sklearn_integration/base_sklearn_wrapper.py b/gensim/sklearn_integration/base_sklearn_wrapper.py index e80898679c..e58d705f1c 100644 --- a/gensim/sklearn_integration/base_sklearn_wrapper.py +++ b/gensim/sklearn_integration/base_sklearn_wrapper.py @@ -17,7 +17,6 @@ class BaseSklearnWrapper(object): """ __metaclass__ = ABCMeta - @abstractmethod def get_params(self, deep=True): pass @@ -32,7 +31,7 @@ def set_params(self, **parameters): return self @abstractmethod - def fit(self, X, y=None): + def fit(self, X, y=None): pass @abstractmethod diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index 609af42889..bd2ee1292d 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -73,7 +73,7 @@ def set_params(self, **parameters): """ super(SklearnWrapperLdaModel, self).set_params(**parameters) - def fit(self, X, y=None): + def fit(self, X, y=None): """ For fitting corpus into the class object. Calls gensim.model.LdaModel: diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index cb7e2b48ef..9d0564cfa4 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -55,7 +55,7 @@ def set_params(self, **parameters): """ super(SklearnWrapperLsiModel, self).set_params(**parameters) - def fit(self, X, y=None): + def fit(self, X, y=None): """ For fitting corpus into the class object. Calls gensim.model.LsiModel: