From f70c5830cc9a231867ff3710fb0bf8e10514f4c6 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 5 Jul 2017 02:40:38 -0700 Subject: [PATCH 01/34] created sklearn wrapper for Doc2Vec --- gensim/sklearn_integration/__init__.py | 1 + .../sklearn_wrapper_gensim_d2vmodel.py | 78 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index 673130ff3a..ae9349c548 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -16,4 +16,5 @@ from .sklearn_wrapper_gensim_rpmodel import SklRpModel # noqa: F401 from .sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel # noqa: F401 from .sklearn_wrapper_gensim_w2vmodel import SklW2VModel # noqa: F401 +from .sklearn_wrapper_gensim_d2vmodel import SklD2VModel # noqa: F401 from .sklearn_wrapper_gensim_atmodel import SklATModel # noqa: F401 diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py new file mode 100644 index 0000000000..f30d43437d --- /dev/null +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +from gensim import models +from gensim.sklearn_integration import BaseSklearnWrapper + + +class SklD2VModel(BaseSklearnWrapper, TransformerMixin, BaseEstimator): + """ + Base Doc2Vec module + """ + + def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, + dm_tag_count=1, docvecs=None, docvecs_mapfile=None, + comment=None, trim_rule=None, **other_params): + """ + Sklearn wrapper for Doc2Vec model. Class derived from gensim.models.Doc2Vec + """ + self.gensim_model = None + self.dm_mean = dm_mean + self.dm = dm + self.dbow_words = dbow_words + self.dm_concat = dm_concat + self.dm_tag_count = dm_tag_count + self.docvecs = docvecs + self.docvecs_mapfile = docvecs_mapfile + self.comment = comment + self.trim_rule = trim_rule + self.other_params = other_params + + def get_params(self, deep=True): + """ + Returns all parameters as dictionary. + """ + model_params = {"dm_mean": self.dm_mean, "dm": self.dm, "dbow_words": self.dbow_words, + "dm_concat": self.dm_concat, "dm_tag_count": self.dm_tag_count, "docvecs": self.docvecs, + "docvecs_mapfile": self.docvecs_mapfile, "comment": self.comment, "trim_rule": self.trim_rule} + + model_params.update(self.other_params) + return model_params + + def set_params(self, **parameters): + """ + Set all parameters. + """ + super(SklD2VModel, self).set_params(**parameters) + return self + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. 
+ Calls gensim.models.Doc2Vec + """ + self.gensim_model = models.Doc2Vec(documents=X, dm_mean=self.dm_mean, dm=self.dm, + dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, + docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, + trim_rule=self.trim_rule, **self.other_params) + return self + + def transform(self, words): + """ + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + + def partial_fit(self, X): + raise NotImplementedError("'partial_fit' has not been implemented for SklD2VModel") From 0c675fa355d044899a4635ff3b0fd1fad8bc697d Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 5 Jul 2017 03:14:26 -0700 Subject: [PATCH 02/34] PEP8 fix --- .../sklearn_wrapper_gensim_d2vmodel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py index f30d43437d..8f17c9d9f8 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py @@ -21,8 +21,8 @@ class SklD2VModel(BaseSklearnWrapper, TransformerMixin, BaseEstimator): Base Doc2Vec module """ - def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, - dm_tag_count=1, docvecs=None, docvecs_mapfile=None, + def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, + dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **other_params): """ Sklearn wrapper for Doc2Vec model. Class derived from gensim.models.Doc2Vec @@ -62,9 +62,9 @@ def fit(self, X, y=None): Fit the model according to the given training data. 
Calls gensim.models.Doc2Vec """ - self.gensim_model = models.Doc2Vec(documents=X, dm_mean=self.dm_mean, dm=self.dm, - dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, - docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, + self.gensim_model = models.Doc2Vec(documents=X, dm_mean=self.dm_mean, dm=self.dm, + dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, + docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, trim_rule=self.trim_rule, **self.other_params) return self From 7210c69221f7b32f8027800368c87886a815fe89 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 10 Jul 2017 16:19:14 +0530 Subject: [PATCH 03/34] added 'transform' function and refactored code --- gensim/sklearn_integration/__init__.py | 2 +- ...wrapper_gensim_d2vmodel.py => d2vmodel.py} | 24 +++++++++++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) rename gensim/sklearn_integration/{sklearn_wrapper_gensim_d2vmodel.py => d2vmodel.py} (76%) diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index ae9349c548..f08ac90fe9 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -16,5 +16,5 @@ from .sklearn_wrapper_gensim_rpmodel import SklRpModel # noqa: F401 from .sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel # noqa: F401 from .sklearn_wrapper_gensim_w2vmodel import SklW2VModel # noqa: F401 -from .sklearn_wrapper_gensim_d2vmodel import SklD2VModel # noqa: F401 +from .d2vmodel import SklD2VModel # noqa: F401 from .sklearn_wrapper_gensim_atmodel import SklATModel # noqa: F401 diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py b/gensim/sklearn_integration/d2vmodel.py similarity index 76% rename from gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py rename to gensim/sklearn_integration/d2vmodel.py index 8f17c9d9f8..39413ae79d 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_d2vmodel.py +++ b/gensim/sklearn_integration/d2vmodel.py @@ -9,14 +9,16 @@ Follows scikit-learn API conventions """ +from numpy import integer +from six import string_types, integer_types from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError from gensim import models -from gensim.sklearn_integration import BaseSklearnWrapper +from gensim.sklearn_api import BaseTransformer -class SklD2VModel(BaseSklearnWrapper, TransformerMixin, BaseEstimator): +class D2VTransformer(BaseTransformer, TransformerMixin, BaseEstimator): """ Base Doc2Vec module """ @@ -54,7 +56,7 @@ def set_params(self, **parameters): """ Set all parameters. """ - super(SklD2VModel, self).set_params(**parameters) + super(D2VTransformer, self).set_params(**parameters) return self def fit(self, X, y=None): @@ -68,11 +70,23 @@ def fit(self, X, y=None): trim_rule=self.trim_rule, **self.other_params) return self - def transform(self, words): + def transform(self, docs): """ + Return the vector representations for the input list of documents. """ if self.gensim_model is None: raise NotFittedError("This model has not been fitted yet. 
Call 'fit' with appropriate arguments before using this method.") + # The input as array of array + check = lambda x: [x] if isinstance(x, string_types + integer_types + (integer,)) else x + docs = check(docs) + X = [[] for _ in range(0, len(docs))] + + for k, v in enumerate(docs): + doc_vec = self.gensim_model[v] + X[k] = doc_vec + + return np.reshape(np.array(X), (len(docs), self.gensim_model.vector_size)) + def partial_fit(self, X): - raise NotImplementedError("'partial_fit' has not been implemented for SklD2VModel") + raise NotImplementedError("'partial_fit' has not been implemented for D2VTransformer") From b733e25253bb385d09910c6c74a303267eaf1eec Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 10 Jul 2017 23:45:55 -0700 Subject: [PATCH 04/34] updated d2v skl api code --- gensim/sklearn_integration/__init__.py | 2 +- gensim/sklearn_integration/d2vmodel.py | 63 +++++++++++++++++++------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index f08ac90fe9..950541fa4d 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -16,5 +16,5 @@ from .sklearn_wrapper_gensim_rpmodel import SklRpModel # noqa: F401 from .sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel # noqa: F401 from .sklearn_wrapper_gensim_w2vmodel import SklW2VModel # noqa: F401 -from .d2vmodel import SklD2VModel # noqa: F401 +from .d2vmodel import D2VTransformer # noqa: F401 from .sklearn_wrapper_gensim_atmodel import SklATModel # noqa: F401 diff --git a/gensim/sklearn_integration/d2vmodel.py b/gensim/sklearn_integration/d2vmodel.py index 39413ae79d..e7203a7cf4 100644 --- a/gensim/sklearn_integration/d2vmodel.py +++ b/gensim/sklearn_integration/d2vmodel.py @@ -9,25 +9,29 @@ Follows scikit-learn API conventions """ +import numpy as np from numpy import integer from six import string_types, integer_types from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError from gensim import models -from gensim.sklearn_api import BaseTransformer +from gensim.sklearn_integration import BaseSklearnWrapper - -class D2VTransformer(BaseTransformer, TransformerMixin, BaseEstimator): +class D2VTransformer(BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base Doc2Vec module """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, - comment=None, trim_rule=None, **other_params): + comment=None, trim_rule=None, size=100, alpha=0.025, + window=5, min_count=5, max_vocab_size=None, sample=1e-3, + seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, + cbow_mean=1, hashfxn=hash, iter=5, sorted_vocab=1, + batch_words=10000): """ - Sklearn wrapper for Doc2Vec model. Class derived from gensim.models.Doc2Vec + Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec for parameter details. 
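As a quick illustration of the interface these patches are building, here is a minimal usage sketch of the Doc2Vec wrapper, modelled on the unit tests added later in this series. The corpus is illustrative, and the size/min_count arguments assume the full parameter set introduced in this patch:

    from gensim.models import doc2vec
    from gensim.sklearn_integration.d2vmodel import D2VTransformer

    # Two tiny tokenized documents, each tagged with its index as Doc2Vec expects.
    texts = [['calculus', 'mathematical'], ['geometry', 'operations', 'curves']]
    tagged = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(texts)]

    model = D2VTransformer(min_count=1, size=10)  # tiny vocabulary, small vectors
    model.fit(tagged)                 # trains an underlying gensim.models.Doc2Vec
    vectors = model.transform(texts)  # infers one vector per tokenized document
    print(vectors.shape)              # -> (2, 10)
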
""" self.gensim_model = None self.dm_mean = dm_mean @@ -39,22 +43,41 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, self.docvecs_mapfile = docvecs_mapfile self.comment = comment self.trim_rule = trim_rule - self.other_params = other_params + + # attributes of gensim.models.Word2Vec + self.size = size + self.alpha = alpha + self.window = window + self.min_count = min_count + self.max_vocab_size = max_vocab_size + self.sample = sample + self.seed = seed + self.workers = workers + self.min_alpha = min_alpha + self.hs = hs + self.negative = negative + self.cbow_mean = int(cbow_mean) + self.hashfxn = hashfxn + self.iter = iter + self.sorted_vocab = sorted_vocab + self.batch_words = batch_words def get_params(self, deep=True): """ - Returns all parameters as dictionary. + Return all parameters as dictionary. """ - model_params = {"dm_mean": self.dm_mean, "dm": self.dm, "dbow_words": self.dbow_words, + return {"dm_mean": self.dm_mean, "dm": self.dm, "dbow_words": self.dbow_words, "dm_concat": self.dm_concat, "dm_tag_count": self.dm_tag_count, "docvecs": self.docvecs, - "docvecs_mapfile": self.docvecs_mapfile, "comment": self.comment, "trim_rule": self.trim_rule} - - model_params.update(self.other_params) - return model_params + "docvecs_mapfile": self.docvecs_mapfile, "comment": self.comment, "trim_rule": self.trim_rule, + "size": self.size, "alpha": self.alpha, "window": self.window, "min_count": self.min_count, + "max_vocab_size": self.max_vocab_size, "sample": self.sample, "seed": self.seed, + "workers": self.workers, "min_alpha": self.min_alpha, "hs": self.hs, + "negative": self.negative, "cbow_mean": self.cbow_mean, "hashfxn": self.hashfxn, + "iter": self.iter, "sorted_vocab": self.sorted_vocab, "batch_words": self.batch_words} def set_params(self, **parameters): """ - Set all parameters. + Set parameters """ super(D2VTransformer, self).set_params(**parameters) return self @@ -67,23 +90,29 @@ def fit(self, X, y=None): self.gensim_model = models.Doc2Vec(documents=X, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, **self.other_params) + trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window, + min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, + seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, + negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, + iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words) return self def transform(self, docs): """ - Return the vector representations for the input list of documents. + Return the vector representations for the input documents. + The input `docs` should be a list of lists like : [ ['calculus', 'mathematical'], ['geometry', 'operations', 'curves'] ] + or a single document like : ['calculus', 'mathematical'] """ if self.gensim_model is None: raise NotFittedError("This model has not been fitted yet. 
Call 'fit' with appropriate arguments before using this method.") # The input as array of array - check = lambda x: [x] if isinstance(x, string_types + integer_types + (integer,)) else x + check = lambda x: [x] if isinstance(x[0], string_types) else x docs = check(docs) X = [[] for _ in range(0, len(docs))] for k, v in enumerate(docs): - doc_vec = self.gensim_model[v] + doc_vec = self.gensim_model.infer_vector(v) X[k] = doc_vec return np.reshape(np.array(X), (len(docs), self.gensim_model.vector_size)) From 7f198a186b6261f4188294147f115f1b8fde3fa9 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 10 Jul 2017 23:47:51 -0700 Subject: [PATCH 05/34] added unittests for sklearn api for d2v model --- gensim/test/test_sklearn_integration.py | 73 +++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 6cbfe902a3..e78ae53d28 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -21,7 +21,9 @@ from gensim.sklearn_integration.sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel from gensim.sklearn_integration.sklearn_wrapper_gensim_w2vmodel import SklW2VModel from gensim.sklearn_integration.sklearn_wrapper_gensim_atmodel import SklATModel +from gensim.sklearn_integration.d2vmodel import D2VTransformer from gensim.corpora import mmcorpus, Dictionary +from gensim.models import doc2vec from gensim import matutils module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -97,6 +99,9 @@ ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', 'transformed', 'modern', 'day', 'society'] ] +d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)] + + class TestSklLdaModelWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) # set fixed seed to get similar values everytime @@ -537,5 +542,73 @@ def testPipeline(self): self.assertEqual(len(ret_val), len(author_list)) +class TestD2VTransformerWrapper(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = D2VTransformer(min_count=1) + self.model.fit(d2v_sentences) + + def testTransform(self): + # tranform multiple documents + docs = [] + docs.append(w2v_texts[0]) + docs.append(w2v_texts[1]) + docs.append(w2v_texts[2]) + matrix = self.model.transform(docs) + self.assertEqual(matrix.shape[0], 3) + self.assertEqual(matrix.shape[1], self.model.size) + + # tranform one document + doc = w2v_texts[0] + matrix = self.model.transform(doc) + self.assertEqual(matrix.shape[0], 1) + self.assertEqual(matrix.shape[1], self.model.size) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(negative=20) + model_params = self.model.get_params() + self.assertEqual(model_params["negative"], 20) + + def testPipeline(self): + numpy.random.seed(0) # set fixed seed to get similar values everytime + model = D2VTransformer(min_count=1) + model.fit(d2v_sentences) + + class_dict = {'mathematics': 1, 'physics': 0} + train_data = [ + (['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'), + (['natural', 'nuclear'], 'physics'), (['science', 'electromagnetism', 'natural'], 'physics') + ] + train_input = list(map(lambda x: x[0], train_data)) + train_target = list(map(lambda x: class_dict[x[1]], 
train_data)) + + clf = linear_model.LogisticRegression(penalty='l2', C=0.1) + clf.fit(model.transform(train_input), train_target) + text_w2v = Pipeline((('features', model,), ('classifier', clf))) + score = text_w2v.score(train_input, train_target) + self.assertGreater(score, 0.40) + + def testPersistence(self): + model_dump = pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = w2v_texts[0] + loaded_transformed_vecs = model_load.transform(doc) + + # sanity check for transformation operation + self.assertEqual(loaded_transformed_vecs.shape[0], 1) + self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size) + + # comparing the original and loaded models + original_transformed_vecs = self.model.transform(doc) + passed = numpy.allclose(sorted(loaded_transformed_vecs), sorted(original_transformed_vecs), atol=1e-1) + self.assertTrue(passed) + + def testModelNotFitted(self): + d2vmodel_wrapper = D2VTransformer(min_count=1) + self.assertRaises(NotFittedError, d2vmodel_wrapper.transform, 1) + + if __name__ == '__main__': unittest.main() From 8a12ef53996c62c516a64b058d90231336127ca8 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 11 Jul 2017 00:05:33 -0700 Subject: [PATCH 06/34] fixed flake8 errors --- gensim/sklearn_integration/d2vmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_integration/d2vmodel.py b/gensim/sklearn_integration/d2vmodel.py index e7203a7cf4..9850353f1a 100644 --- a/gensim/sklearn_integration/d2vmodel.py +++ b/gensim/sklearn_integration/d2vmodel.py @@ -10,14 +10,14 @@ """ import numpy as np -from numpy import integer -from six import string_types, integer_types +from six import string_types from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError from gensim import models from gensim.sklearn_integration import BaseSklearnWrapper + class D2VTransformer(BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base Doc2Vec module From 2c18b87f94e0bbc76689dbe0454da2a692b60302 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 12 Jul 2017 16:44:47 +0530 Subject: [PATCH 07/34] added skl api class for Text2Bow model --- .../sklearn_wrapper_gensim_text2bow.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 gensim/sklearn_integration/sklearn_wrapper_gensim_text2bow.py diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_text2bow.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_text2bow.py new file mode 100644 index 0000000000..59e8c73d57 --- /dev/null +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_text2bow.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +from gensim.corpora import Dictionary +from gensim.sklearn_integration import BaseSklearnWrapper + + +class SklText2BowModel(BaseSklearnWrapper, TransformerMixin, BaseEstimator): + """ + Base Text2Bow module + """ + + def __init__(self, prune_at=2000000): + """ + Sklearn wrapper for Text2Bow model. + """ + self.gensim_model = None + self.prune_at = prune_at + + def get_params(self, deep=True): + """ + Returns all parameters as dictionary. 
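The get_params/set_params pair is what lets scikit-learn machinery such as clone and GridSearchCV re-create a wrapper from its constructor arguments, which is why every __init__ argument is stored verbatim on self. A short sketch of that contract, using the class as defined in this patch:

    from sklearn.base import clone

    m = SklText2BowModel(prune_at=1000000)
    print(m.get_params())          # -> {'prune_at': 1000000}
    m.set_params(prune_at=500000)  # updates the attribute in place and returns self
    m2 = clone(m)                  # possible because get_params round-trips __init__
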
+ """ + return {"prune_at": self.prune_at} + + def set_params(self, **parameters): + """ + Set all parameters. + """ + super(SklText2BowModel, self).set_params(**parameters) + return self + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. + """ + self.gensim_model = gensim.corpora.Dictionary(documents=X, prune_at=self.prune_at) + return self + + def transform(self, docs): + """ + Return the BOW format for the input documents. + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + + # The input as array of array + check = lambda x: [x] if isinstance(x[0], six.string_types) else x + docs = check(docs) + X = [[] for _ in range(0, len(docs))] + + for k, v in enumerate(docs): + bow_val = self.gensim_model.doc2bow(v) + X[k] = bow_val + + return X + + def partial_fit(self, X): + if self.gensim_model is None: + self.gensim_model = gensim.corpora.Dictionary(prune_at=self.prune_at) + + self.gensim_model.add_documents(X) + return self From 710d2cedcfa78aaecfd9b55549a622a7bfbd91ce Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 12 Jul 2017 16:47:46 +0530 Subject: [PATCH 08/34] updated docstring for d2vmodel api --- gensim/sklearn_integration/d2vmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_integration/d2vmodel.py b/gensim/sklearn_integration/d2vmodel.py index 9850353f1a..9bbf917717 100644 --- a/gensim/sklearn_integration/d2vmodel.py +++ b/gensim/sklearn_integration/d2vmodel.py @@ -31,7 +31,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, cbow_mean=1, hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): """ - Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec for parameter details. + Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
""" self.gensim_model = None self.dm_mean = dm_mean @@ -44,7 +44,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, self.comment = comment self.trim_rule = trim_rule - # attributes of gensim.models.Word2Vec + # attributes associated with gensim.models.Word2Vec self.size = size self.alpha = alpha self.window = window From fe76d283a6df3f2e3a624a5f3c0a2c9afa0525d3 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 13 Jul 2017 01:59:27 -0700 Subject: [PATCH 09/34] updated text2bow skl api code --- gensim/sklearn_integration/__init__.py | 1 + ...learn_wrapper_gensim_text2bow.py => text2bow.py} | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) rename gensim/sklearn_integration/{sklearn_wrapper_gensim_text2bow.py => text2bow.py} (80%) diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index 950541fa4d..cbd1c56db2 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -18,3 +18,4 @@ from .sklearn_wrapper_gensim_w2vmodel import SklW2VModel # noqa: F401 from .d2vmodel import D2VTransformer # noqa: F401 from .sklearn_wrapper_gensim_atmodel import SklATModel # noqa: F401 +from .text2bow import Text2BowTransformer # noqa: F401 diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_text2bow.py b/gensim/sklearn_integration/text2bow.py similarity index 80% rename from gensim/sklearn_integration/sklearn_wrapper_gensim_text2bow.py rename to gensim/sklearn_integration/text2bow.py index 59e8c73d57..d6493d80d1 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_text2bow.py +++ b/gensim/sklearn_integration/text2bow.py @@ -9,6 +9,7 @@ Follows scikit-learn API conventions """ +from six import string_types from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -16,7 +17,7 @@ from gensim.sklearn_integration import BaseSklearnWrapper -class SklText2BowModel(BaseSklearnWrapper, TransformerMixin, BaseEstimator): +class Text2BowTransformer(BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base Text2Bow module """ @@ -38,14 +39,14 @@ def set_params(self, **parameters): """ Set all parameters. """ - super(SklText2BowModel, self).set_params(**parameters) + super(Text2BowTransformer, self).set_params(**parameters) return self def fit(self, X, y=None): """ Fit the model according to the given training data. """ - self.gensim_model = gensim.corpora.Dictionary(documents=X, prune_at=self.prune_at) + self.gensim_model = Dictionary(documents=X, prune_at=self.prune_at) return self def transform(self, docs): @@ -55,8 +56,8 @@ def transform(self, docs): if self.gensim_model is None: raise NotFittedError("This model has not been fitted yet. 
Call 'fit' with appropriate arguments before using this method.") - # The input as array of array - check = lambda x: [x] if isinstance(x[0], six.string_types) else x + # input as python lists + check = lambda x: [x] if isinstance(x[0], string_types) else x docs = check(docs) X = [[] for _ in range(0, len(docs))] @@ -68,7 +69,7 @@ def transform(self, docs): def partial_fit(self, X): if self.gensim_model is None: - self.gensim_model = gensim.corpora.Dictionary(prune_at=self.prune_at) + self.gensim_model = Dictionary(prune_at=self.prune_at) self.gensim_model.add_documents(X) return self From 9acaba5f31fa8d7cab1152f980a8b4e0ce68d792 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 13 Jul 2017 01:59:53 -0700 Subject: [PATCH 10/34] added unittests for text2bow skl api class --- gensim/test/test_sklearn_integration.py | 73 +++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index e78ae53d28..269b98eb7a 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -22,6 +22,7 @@ from gensim.sklearn_integration.sklearn_wrapper_gensim_w2vmodel import SklW2VModel from gensim.sklearn_integration.sklearn_wrapper_gensim_atmodel import SklATModel from gensim.sklearn_integration.d2vmodel import D2VTransformer +from gensim.sklearn_integration.text2bow import Text2BowTransformer from gensim.corpora import mmcorpus, Dictionary from gensim.models import doc2vec from gensim import matutils @@ -101,6 +102,18 @@ d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)] +dict_texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] + class TestSklLdaModelWrapper(unittest.TestCase): def setUp(self): @@ -610,5 +623,65 @@ def testModelNotFitted(self): self.assertRaises(NotFittedError, d2vmodel_wrapper.transform, 1) +class TestText2BowTransformerWrapper(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = Text2BowTransformer() + self.model.fit(dict_texts) + + def testTransform(self): + # tranform multiple documents + docs = [] + docs.append(dict_texts[0]) + docs.append(dict_texts[1]) + docs.append(dict_texts[2]) + matrix = self.model.transform(docs) + expected_matrix = [[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(0, 1), (6, 1), (7, 1), (8, 1)]] + self.assertEqual(matrix, expected_matrix) + + # tranform one document + doc = dict_texts[0] + matrix = self.model.transform(doc) + expected_matrix = [[(0, 1), (1, 1), (2, 1)]] + self.assertEqual(matrix, expected_matrix) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(prune_at=1000000) + model_params = self.model.get_params() + self.assertEqual(model_params["prune_at"], 1000000) + + def testPipeline(self): + with open(datapath('mini_newsgroup'), 'rb') as f: + compressed_content = f.read() + uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') + cache = pickle.loads(uncompressed_content) + data = cache + input_data = [(i.split()) for i in data.data] + text2bow_model = Text2BowTransformer(map(lambda x: x.split(), data.data)) + lda_model = SklLdaModel(num_topics=2, passes=10, minimum_probability=0, 
random_state=numpy.random.seed(0)) + numpy.random.mtrand.RandomState(1) # set seed for getting same result + clf = linear_model.LogisticRegression(penalty='l2', C=0.1) + text_lda = Pipeline((('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf))) + text_lda.fit(input_data, data.target) + score = text_lda.score(input_data, data.target) + self.assertGreater(score, 0.40) + + def testPersistence(self): + model_dump = pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = dict_texts[0] + loaded_transformed_vecs = model_load.transform(doc) + + # comparing the original and loaded models + original_transformed_vecs = self.model.transform(doc) + self.assertEqual(original_transformed_vecs, loaded_transformed_vecs) + + def testModelNotFitted(self): + text2bow_wrapper = Text2BowTransformer() + self.assertRaises(NotFittedError, text2bow_wrapper.transform, dict_texts[0]) + + if __name__ == '__main__': unittest.main() From 8c5d04e5df84f3137b4fe6f736ad0557d619acaa Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 13 Jul 2017 03:49:16 -0700 Subject: [PATCH 11/34] updated 'testPipeline' and 'testTransform' for text2bow --- gensim/test/test_sklearn_integration.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 269b98eb7a..aa0d4e0c68 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -630,20 +630,12 @@ def setUp(self): self.model.fit(dict_texts) def testTransform(self): - # tranform multiple documents - docs = [] - docs.append(dict_texts[0]) - docs.append(dict_texts[1]) - docs.append(dict_texts[2]) - matrix = self.model.transform(docs) - expected_matrix = [[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(0, 1), (6, 1), (7, 1), (8, 1)]] - self.assertEqual(matrix, expected_matrix) - # tranform one document - doc = dict_texts[0] - matrix = self.model.transform(doc) - expected_matrix = [[(0, 1), (1, 1), (2, 1)]] - self.assertEqual(matrix, expected_matrix) + doc = ['computer', 'system', 'interface', 'time', 'computer', 'system'] + bow_vec = self.model.transform(doc)[0] + expected_values = [1, 1, 2, 2] # comparing only the word-counts + values = list(map(lambda x: x[1], bow_vec)) + self.assertEqual(sorted(expected_values), sorted(values)) def testSetGetParams(self): # updating only one param @@ -658,7 +650,8 @@ def testPipeline(self): cache = pickle.loads(uncompressed_content) data = cache input_data = [(i.split()) for i in data.data] - text2bow_model = Text2BowTransformer(map(lambda x: x.split(), data.data)) + text2bow_model = Text2BowTransformer() + text2bow_model.fit(list(map(lambda x: x.split(), data.data))) lda_model = SklLdaModel(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) From 4101e30ec8eb32e67afdbe882b127505c3083bf9 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 14 Jul 2017 03:44:26 -0700 Subject: [PATCH 12/34] added 'tokenizer' param to text2bow skl api --- gensim/sklearn_integration/text2bow.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/gensim/sklearn_integration/text2bow.py b/gensim/sklearn_integration/text2bow.py index d6493d80d1..12574fa8c5 100644 --- a/gensim/sklearn_integration/text2bow.py +++ 
b/gensim/sklearn_integration/text2bow.py @@ -15,6 +15,7 @@ from gensim.corpora import Dictionary from gensim.sklearn_integration import BaseSklearnWrapper +from gensim.utils import tokenize class Text2BowTransformer(BaseSklearnWrapper, TransformerMixin, BaseEstimator): @@ -22,12 +23,13 @@ class Text2BowTransformer(BaseSklearnWrapper, TransformerMixin, BaseEstimator): Base Text2Bow module """ - def __init__(self, prune_at=2000000): + def __init__(self, prune_at=2000000, tokenizer=tokenize): """ Sklearn wrapper for Text2Bow model. """ self.gensim_model = None self.prune_at = prune_at + self.tokenizer = tokenizer def get_params(self, deep=True): """ @@ -46,7 +48,8 @@ def fit(self, X, y=None): """ Fit the model according to the given training data. """ - self.gensim_model = Dictionary(documents=X, prune_at=self.prune_at) + tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X)) + self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at) return self def transform(self, docs): @@ -57,11 +60,12 @@ def transform(self, docs): raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") # input as python lists - check = lambda x: [x] if isinstance(x[0], string_types) else x + check = lambda x: [x] if isinstance(x, string_types) else x docs = check(docs) - X = [[] for _ in range(0, len(docs))] + tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), docs)) + X = [[] for _ in range(0, len(tokenized_docs))] - for k, v in enumerate(docs): + for k, v in enumerate(tokenized_docs): bow_val = self.gensim_model.doc2bow(v) X[k] = bow_val @@ -71,5 +75,6 @@ def partial_fit(self, X): if self.gensim_model is None: self.gensim_model = Dictionary(prune_at=self.prune_at) - self.gensim_model.add_documents(X) + tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X)) + self.gensim_model.add_documents(tokenized_docs) return self From ed7a57116f022e35aaccd36176a9146e6e33c721 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 14 Jul 2017 03:44:54 -0700 Subject: [PATCH 13/34] updated unittests for text2bow --- gensim/test/test_sklearn_integration.py | 27 ++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index aa0d4e0c68..0fb3405687 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -103,15 +103,15 @@ d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)] dict_texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] + 'human interface computer', + 'survey user computer system response time', + 'eps user interface system', + 'system human system eps', + 'user response time', + 'trees', + 'graph trees', + 'graph minors trees', + 'graph minors survey' ] @@ -631,7 +631,7 @@ def setUp(self): def testTransform(self): # tranform one document - doc = ['computer', 'system', 'interface', 'time', 'computer', 'system'] + doc = ['computer system interface time computer system'] bow_vec = self.model.transform(doc)[0] expected_values = [1, 1, 2, 2] # comparing only the word-counts values = list(map(lambda x: x[1], bow_vec)) @@ -649,15 +649,14 @@ def 
testPipeline(self): uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) data = cache - input_data = [(i.split()) for i in data.data] text2bow_model = Text2BowTransformer() - text2bow_model.fit(list(map(lambda x: x.split(), data.data))) + text2bow_model.fit(data.data) lda_model = SklLdaModel(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) text_lda = Pipeline((('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf))) - text_lda.fit(input_data, data.target) - score = text_lda.score(input_data, data.target) + text_lda.fit(data.data, data.target) + score = text_lda.score(data.data, data.target) self.assertGreater(score, 0.40) def testPersistence(self): From 66a8302f972c5f1a318a81ce9f85e2a7cbed1f37 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 18 Jul 2017 01:59:52 -0700 Subject: [PATCH 14/34] removed get_params and set_params functions from existing classes --- .../base_sklearn_wrapper.py | 13 ------------ gensim/sklearn_integration/d2vmodel.py | 20 ------------------- gensim/sklearn_integration/text2bow.py | 13 ------------ 3 files changed, 46 deletions(-) diff --git a/gensim/sklearn_integration/base_sklearn_wrapper.py b/gensim/sklearn_integration/base_sklearn_wrapper.py index e58d705f1c..4dc352c50a 100644 --- a/gensim/sklearn_integration/base_sklearn_wrapper.py +++ b/gensim/sklearn_integration/base_sklearn_wrapper.py @@ -17,19 +17,6 @@ class BaseSklearnWrapper(object): """ __metaclass__ = ABCMeta - @abstractmethod - def get_params(self, deep=True): - pass - - @abstractmethod - def set_params(self, **parameters): - """ - Set all parameters. - """ - for parameter, value in parameters.items(): - setattr(self, parameter, value) - return self - @abstractmethod def fit(self, X, y=None): pass diff --git a/gensim/sklearn_integration/d2vmodel.py b/gensim/sklearn_integration/d2vmodel.py index 9bbf917717..a1dbc30ebd 100644 --- a/gensim/sklearn_integration/d2vmodel.py +++ b/gensim/sklearn_integration/d2vmodel.py @@ -62,26 +62,6 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, self.sorted_vocab = sorted_vocab self.batch_words = batch_words - def get_params(self, deep=True): - """ - Return all parameters as dictionary. - """ - return {"dm_mean": self.dm_mean, "dm": self.dm, "dbow_words": self.dbow_words, - "dm_concat": self.dm_concat, "dm_tag_count": self.dm_tag_count, "docvecs": self.docvecs, - "docvecs_mapfile": self.docvecs_mapfile, "comment": self.comment, "trim_rule": self.trim_rule, - "size": self.size, "alpha": self.alpha, "window": self.window, "min_count": self.min_count, - "max_vocab_size": self.max_vocab_size, "sample": self.sample, "seed": self.seed, - "workers": self.workers, "min_alpha": self.min_alpha, "hs": self.hs, - "negative": self.negative, "cbow_mean": self.cbow_mean, "hashfxn": self.hashfxn, - "iter": self.iter, "sorted_vocab": self.sorted_vocab, "batch_words": self.batch_words} - - def set_params(self, **parameters): - """ - Set parameters - """ - super(D2VTransformer, self).set_params(**parameters) - return self - def fit(self, X, y=None): """ Fit the model according to the given training data. 
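With the tokenizer parameter from the preceding patches in place, the text2bow transformer accepts raw strings end to end. A brief sketch of that usage (the strings are illustrative; tokens unseen during fit are simply dropped by doc2bow):

    from gensim.sklearn_integration.text2bow import Text2BowTransformer

    raw_docs = ['human interface computer', 'survey user computer system']
    t2b = Text2BowTransformer()  # tokenizer defaults to gensim.utils.tokenize
    t2b.fit(raw_docs)            # tokenizes each string, then builds the Dictionary
    bow = t2b.transform('computer system interface computer system')
    # -> one bag-of-words list: count 2 for 'computer', 2 for 'system',
    #    1 for 'interface'; the token ids depend on the order seen during fit
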
diff --git a/gensim/sklearn_integration/text2bow.py b/gensim/sklearn_integration/text2bow.py index 12574fa8c5..4bc5e9ef8b 100644 --- a/gensim/sklearn_integration/text2bow.py +++ b/gensim/sklearn_integration/text2bow.py @@ -31,19 +31,6 @@ def __init__(self, prune_at=2000000, tokenizer=tokenize): self.prune_at = prune_at self.tokenizer = tokenizer - def get_params(self, deep=True): - """ - Returns all parameters as dictionary. - """ - return {"prune_at": self.prune_at} - - def set_params(self, **parameters): - """ - Set all parameters. - """ - super(Text2BowTransformer, self).set_params(**parameters) - return self - def fit(self, X, y=None): """ Fit the model according to the given training data. From 75faa7a2018fe680e80fe229654b47c12e2d5b09 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 18 Jul 2017 02:54:24 -0700 Subject: [PATCH 15/34] added tfidf api class --- gensim/sklearn_integration/__init__.py | 1 + gensim/sklearn_integration/tfidf.py | 63 ++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 gensim/sklearn_integration/tfidf.py diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index cbd1c56db2..d7838f66c3 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -19,3 +19,4 @@ from .d2vmodel import D2VTransformer # noqa: F401 from .sklearn_wrapper_gensim_atmodel import SklATModel # noqa: F401 from .text2bow import Text2BowTransformer # noqa: F401 +from .tfidf import TfIdfTransformer # noqa: F401 diff --git a/gensim/sklearn_integration/tfidf.py b/gensim/sklearn_integration/tfidf.py new file mode 100644 index 0000000000..93b7b8cf3f --- /dev/null +++ b/gensim/sklearn_integration/tfidf.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +import gensim +from gensim.models import TfidfModel +from gensim.sklearn_integration import BaseSklearnWrapper + +class TfIdfTransformer(BaseSklearnWrapper, TransformerMixin, BaseEstimator): + """ + Base Tf-Idf module + """ + + def __init__(self, corpus=None, id2word=None, dictionary=None, + wlocal=gensim.utils.identity, wglobal=gensim.models.tfidfmodel.df2idf, normalize=True): + """ + Sklearn wrapper for Tf-Idf model. + """ + self.gensim_model = None + self.id2word = id2word + self.dictionary = dictionary + self.wlocal = wlocal + self.wglobal = wglobal + self.normalize = normalize + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. + """ + self.gensim_model = TfidfModel(corpus=X, id2word=self.id2word, dictionary=self.dictionary, + wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize) + return self + + def transform(self, docs): + """ + Return the transformed documents after multiplication with the tf-idf matrix. + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. 
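A minimal usage sketch for the tf-idf wrapper introduced in this patch; the corpus construction follows the pattern used in the unit tests that come next, and the texts are illustrative:

    from gensim.corpora import Dictionary
    from gensim.sklearn_integration.tfidf import TfIdfTransformer

    texts = [['human', 'interface', 'computer'], ['survey', 'user', 'computer']]
    id2word = Dictionary(texts)
    corpus = [id2word.doc2bow(t) for t in texts]  # bag-of-words input

    tfidf = TfIdfTransformer(normalize=True)
    tfidf.fit(corpus)                  # estimates idf weights from the corpus
    print(tfidf.transform(corpus[0]))  # -> [[(term_id, tfidf_weight), ...]]
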
Call 'fit' with appropriate arguments before using this method.") + + # input as python lists + check = lambda x: [x] if isinstance(x[0], tuple) else x + docs = check(docs) + X = [[] for _ in range(0, len(docs))] + + for k, v in enumerate(docs): + transformed_doc = self.gensim_model[v] + X[k] = transformed_doc + + return X + + def partial_fit(self, X): + raise NotImplementedError("'partial_fit' has not been implemented for TfIdfTransformer") From 8cbd2ba508f9e25bb01048ced092f47ab9548e21 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 18 Jul 2017 02:54:54 -0700 Subject: [PATCH 16/34] added unittests for tfidf api class --- gensim/test/test_sklearn_integration.py | 62 +++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 0fb3405687..30e6cc4be2 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -23,6 +23,7 @@ from gensim.sklearn_integration.sklearn_wrapper_gensim_atmodel import SklATModel from gensim.sklearn_integration.d2vmodel import D2VTransformer from gensim.sklearn_integration.text2bow import Text2BowTransformer +from gensim.sklearn_integration.tfidf import TfIdfTransformer from gensim.corpora import mmcorpus, Dictionary from gensim.models import doc2vec from gensim import matutils @@ -675,5 +676,66 @@ def testModelNotFitted(self): self.assertRaises(NotFittedError, text2bow_wrapper.transform, dict_texts[0]) +class TestTfIdfTransformer(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = TfIdfTransformer(normalize=True) + self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.model.fit(self.corpus) + + def testTransform(self): + # tranform one document + doc = corpus[0] + transformed_doc = self.model.transform(doc) + expected_doc = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]] + self.assertTrue(numpy.allclose(transformed_doc, expected_doc)) + + # tranform multiple documents + docs = [corpus[0], corpus[1]] + transformed_docs = self.model.transform(docs) + expected_docs = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], + [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(normalize=False) + model_params = self.model.get_params() + self.assertEqual(model_params["normalize"], False) + + def testPipeline(self): + with open(datapath('mini_newsgroup'), 'rb') as f: + compressed_content = f.read() + uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') + cache = pickle.loads(uncompressed_content) + data = cache + id2word = Dictionary(map(lambda x: x.split(), data.data)) + corpus = [id2word.doc2bow(i.split()) for i in data.data] + tfidf_model = TfIdfTransformer() + tfidf_model.fit(corpus) + lda_model = SklLdaModel(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) + numpy.random.mtrand.RandomState(1) # set seed for getting same result + clf = linear_model.LogisticRegression(penalty='l2', C=0.1) + text_tfidf = Pipeline((('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf))) + text_tfidf.fit(corpus, data.target) + score = 
text_tfidf.score(corpus, data.target) + self.assertGreater(score, 0.40) + + def testPersistence(self): + model_dump = pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = corpus[0] + loaded_transformed_doc = model_load.transform(doc) + + # comparing the original and loaded models + original_transformed_doc = self.model.transform(doc) + self.assertEqual(original_transformed_doc, loaded_transformed_doc) + + def testModelNotFitted(self): + tfidf_wrapper = TfIdfTransformer() + self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0]) + if __name__ == '__main__': unittest.main() From 48958c094dc0ac2ef6316d35a7219e28ec6b1ca3 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 18 Jul 2017 03:25:51 -0700 Subject: [PATCH 17/34] flake8 fixes --- gensim/sklearn_integration/tfidf.py | 3 ++- gensim/test/test_sklearn_integration.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_integration/tfidf.py b/gensim/sklearn_integration/tfidf.py index 93b7b8cf3f..74a06f7f7f 100644 --- a/gensim/sklearn_integration/tfidf.py +++ b/gensim/sklearn_integration/tfidf.py @@ -16,6 +16,7 @@ from gensim.models import TfidfModel from gensim.sklearn_integration import BaseSklearnWrapper + class TfIdfTransformer(BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base Tf-Idf module @@ -37,7 +38,7 @@ def fit(self, X, y=None): """ Fit the model according to the given training data. """ - self.gensim_model = TfidfModel(corpus=X, id2word=self.id2word, dictionary=self.dictionary, + self.gensim_model = TfidfModel(corpus=X, id2word=self.id2word, dictionary=self.dictionary, wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize) return self diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 30e6cc4be2..7c24a015b2 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -693,7 +693,7 @@ def testTransform(self): # tranform multiple documents docs = [corpus[0], corpus[1]] transformed_docs = self.model.transform(docs) - expected_docs = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], + expected_docs = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)]] self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) From a852980cda63c0e0fe769d1db23df649d4e45419 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 19 Jul 2017 03:53:04 -0700 Subject: [PATCH 18/34] added skl api for hdpmodel --- gensim/sklearn_integration/__init__.py | 1 + gensim/sklearn_integration/hdp.py | 76 ++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 gensim/sklearn_integration/hdp.py diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index d7838f66c3..d75132a366 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -20,3 +20,4 @@ from .sklearn_wrapper_gensim_atmodel import SklATModel # noqa: F401 from .text2bow import Text2BowTransformer # noqa: F401 from .tfidf import TfIdfTransformer # noqa: F401 +from .hdp import HdpTransformer # noqa: F401 diff --git a/gensim/sklearn_integration/hdp.py b/gensim/sklearn_integration/hdp.py new file mode 100644 index 
0000000000..4ac26eda3a --- /dev/null +++ b/gensim/sklearn_integration/hdp.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +from gensim import models +from gensim.sklearn_integration import BaseSklearnWrapper + + +class HdpTransformer(BaseSklearnWrapper, TransformerMixin, BaseEstimator): + """ + Base HDP module + """ + + def __init__(self, id2word, max_chunks=None, max_time=None, + chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, + gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, + outputdir=None, random_state=None): + """ + Sklearn api for HDP model. See gensim.models.HdpModel for parameter details. + """ + self.gensim_model = None + self.id2word = id2word + self.max_chunks = max_chunks + self.max_time = max_time + self.chunksize = chunksize + self.kappa = kappa + self.tau = tau + self.K = K + self.T = T + self.alpha = alpha + self.gamma = gamma + self.eta = eta + self.scale = scale + self.var_converge = var_converge + self.outputdir = outputdir + self.random_state = random_state + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. + Calls gensim.models.HdpModel + """ + self.gensim_model = models.HdpModel(corpus=X, id2word=self.id2word, max_chunks=self.max_chunks, + max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, + K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, + var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state) + return self + + def transform(self, docs): + """ + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. 
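As with the other wrappers, fit consumes a bag-of-words corpus. A minimal sketch of the HDP wrapper as defined in this patch (note that a later commit in the series changes transform to return a dense numpy matrix instead of tuple lists); the toy corpus is illustrative:

    from gensim.corpora import Dictionary
    from gensim.sklearn_integration.hdp import HdpTransformer

    texts = [['human', 'interface', 'computer'], ['graph', 'minors', 'trees']]
    id2word = Dictionary(texts)
    corpus = [id2word.doc2bow(t) for t in texts]

    hdp = HdpTransformer(id2word=id2word)
    hdp.fit(corpus)                  # trains gensim.models.HdpModel on the corpus
    print(hdp.transform(corpus[0]))  # -> [[(topic_id, probability), ...]]
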
Call 'fit' with appropriate arguments before using this method.") + + # The input as array of array + check = lambda x: [x] if isinstance(x[0], tuple) else x + docs = check(docs) + X = [[] for _ in range(0, len(docs))] + + for k, v in enumerate(docs): + doc_topics = self.gensim_model[v] + X[k] = doc_topics + return X + + def partial_fit(self, X): + raise NotImplementedError("'partial_fit' has not been implemented for HdpTransformer") From 5a16e77cd1628831710a621b67fceb40e861861e Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 19 Jul 2017 03:53:23 -0700 Subject: [PATCH 19/34] added unittests for hdp model api class --- gensim/test/test_sklearn_integration.py | 45 +++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 7c24a015b2..e209e9b900 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -24,6 +24,7 @@ from gensim.sklearn_integration.d2vmodel import D2VTransformer from gensim.sklearn_integration.text2bow import Text2BowTransformer from gensim.sklearn_integration.tfidf import TfIdfTransformer +from gensim.sklearn_integration.hdp import HdpTransformer from gensim.corpora import mmcorpus, Dictionary from gensim.models import doc2vec from gensim import matutils @@ -737,5 +738,49 @@ def testModelNotFitted(self): tfidf_wrapper = TfIdfTransformer() self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0]) + +class TestHdpTransformer(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = HdpTransformer(id2word=dictionary) + self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.model.fit(self.corpus) + + def testTransform(self): + # tranform one document + doc = corpus[0] + transformed_doc = self.model.transform(doc) + expected_doc = [[(0, 0.81043386270128193), (1, 0.049357139518070477), (2, 0.035840906753517532), (3, 0.026542006926698079), (4, 0.019925705902962578), (5, 0.014776690981729117), (6, 0.011068909979528148)]] + self.assertTrue(numpy.allclose(transformed_doc, expected_doc)) + + # tranform multiple documents + docs = [corpus[0], corpus[1]] + transformed_docs = self.model.transform(docs) + expected_docs = [[(0, 0.81043386270128193), (1, 0.049357139518070477), (2, 0.035840906753517532), (3, 0.026542006926698079), (4, 0.019925705902962578), (5, 0.014776690981729117), (6, 0.011068909979528148)], + [(0, 0.037756672994608199), (1, 0.64897189582154546), (2, 0.25362737170407318), (3, 0.015171827968228756), (4, 0.011386305552649889)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(var_converge=0.05) + model_params = self.model.get_params() + self.assertEqual(model_params["var_converge"], 0.05) + + def testPersistence(self): + model_dump = pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = corpus[0] + loaded_transformed_doc = model_load.transform(doc) + + # comparing the original and loaded models + original_transformed_doc = self.model.transform(doc) + self.assertEqual(original_transformed_doc, loaded_transformed_doc) + + def testModelNotFitted(self): + hdp_wrapper = HdpTransformer(id2word=dictionary) + self.assertRaises(NotFittedError, hdp_wrapper.transform, corpus[0]) + if __name__ == '__main__': unittest.main() From 919105342c9f30f987af48046621b95d150a4685 Mon Sep 17 00:00:00 2001 
From: Chinmaya Pancholi Date: Wed, 19 Jul 2017 04:13:07 -0700 Subject: [PATCH 20/34] flake8 fixes --- gensim/sklearn_integration/hdp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_integration/hdp.py b/gensim/sklearn_integration/hdp.py index 4ac26eda3a..9597c56ccc 100644 --- a/gensim/sklearn_integration/hdp.py +++ b/gensim/sklearn_integration/hdp.py @@ -50,9 +50,9 @@ def fit(self, X, y=None): Fit the model according to the given training data. Calls gensim.models.HdpModel """ - self.gensim_model = models.HdpModel(corpus=X, id2word=self.id2word, max_chunks=self.max_chunks, - max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, - K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, + self.gensim_model = models.HdpModel(corpus=X, id2word=self.id2word, max_chunks=self.max_chunks, + max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, + K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state) return self From 57257bea2e5897a7e39bfd051fb015e06d0039ff Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 20 Jul 2017 01:11:39 -0700 Subject: [PATCH 21/34] updated hdp api class --- gensim/sklearn_integration/hdp.py | 41 ++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_integration/hdp.py b/gensim/sklearn_integration/hdp.py index 9597c56ccc..279b70c0b0 100644 --- a/gensim/sklearn_integration/hdp.py +++ b/gensim/sklearn_integration/hdp.py @@ -9,10 +9,13 @@ Follows scikit-learn API conventions """ +import numpy as np +from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError from gensim import models +from gensim import matutils from gensim.sklearn_integration import BaseSklearnWrapper @@ -50,6 +53,11 @@ def fit(self, X, y=None): Fit the model according to the given training data. Calls gensim.models.HdpModel """ + if sparse.issparse(X): + corpus = matutils.Sparse2Corpus(X) + else: + corpus = X + self.gensim_model = models.HdpModel(corpus=X, id2word=self.id2word, max_chunks=self.max_chunks, max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, @@ -58,6 +66,11 @@ def fit(self, X, y=None): def transform(self, docs): """ + Takes a list of documents as input ('docs'). + Returns a matrix of topic distribution for the given document bow, where a_ij + indicates (topic_i, topic_probability_j). + The input `docs` should be in BOW format and can be a list of documents like : [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ] + or a single document like : [(4, 1), (7, 1)] """ if self.gensim_model is None: raise NotFittedError("This model has not been fitted yet. 
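The sparse-input branch added to fit relies on gensim's matutils.Sparse2Corpus, which reads one document per column. A small standalone sketch of that conversion, with an illustrative count matrix:

    import numpy as np
    from scipy import sparse
    from gensim import matutils

    # Three documents as rows of a term-count matrix; transpose so that
    # documents become columns, which is what Sparse2Corpus expects by default.
    counts = sparse.csc_matrix(np.array([[1, 0, 2, 0],
                                         [0, 1, 0, 1],
                                         [1, 1, 0, 0]]).T)
    corpus = matutils.Sparse2Corpus(counts)
    print(list(corpus))  # three BoW documents; the first -> [(0, 1), (2, 2)]
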
Call 'fit' with appropriate arguments before using this method.") @@ -67,10 +80,32 @@ def transform(self, docs): docs = check(docs) X = [[] for _ in range(0, len(docs))] + max_num_topics = 0 for k, v in enumerate(docs): doc_topics = self.gensim_model[v] - X[k] = doc_topics - return X + probs_docs = list(map(lambda x: x[1], doc_topics)) + X[k] = probs_docs + max_num_topics = max(max_num_topics, len(probs_docs)) + + for k, v in enumerate(X): + if len(v) != max_num_topics: + v.extend([1e-12]*(max_num_topics - len(v))) + X[k] = v + + return np.reshape(np.array(X), (len(docs), max_num_topics)) def partial_fit(self, X): - raise NotImplementedError("'partial_fit' has not been implemented for HdpTransformer") + """ + Train model over X. + """ + if sparse.issparse(X): + X = matutils.Sparse2Corpus(X) + + if self.gensim_model is None: + self.gensim_model = models.HdpModel(id2word=self.id2word, max_chunks=self.max_chunks, + max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, + K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, + var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state) + + self.gensim_model.update(corpus=X) + return self From de2e11da7fed1255766e881938c98e861e37dbab Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 20 Jul 2017 01:12:31 -0700 Subject: [PATCH 22/34] added 'testPartialFit' and 'testPipeline' tests for hdp api class --- gensim/test/test_sklearn_integration.py | 36 ++++++++++++++++++++----- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index e209e9b900..8c7fb26b3b 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -748,25 +748,49 @@ def setUp(self): def testTransform(self): # transform one document - doc = corpus[0] + doc = self.corpus[0] transformed_doc = self.model.transform(doc) - expected_doc = [[(0, 0.81043386270128193), (1, 0.049357139518070477), (2, 0.035840906753517532), (3, 0.026542006926698079), (4, 0.019925705902962578), (5, 0.014776690981729117), (6, 0.011068909979528148)]] + expected_doc = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148]] self.assertTrue(numpy.allclose(transformed_doc, expected_doc)) # transform multiple documents - docs = [corpus[0], corpus[1]] + docs = [self.corpus[0], self.corpus[1]] transformed_docs = self.model.transform(docs) - expected_docs = [[(0, 0.81043386270128193), (1, 0.049357139518070477), (2, 0.035840906753517532), (3, 0.026542006926698079), (4, 0.019925705902962578), (5, 0.014776690981729117), (6, 0.011068909979528148)], - [(0, 0.037756672994608199), (1, 0.64897189582154546), (2, 0.25362737170407318), (3, 0.015171827968228756), (4, 0.011386305552649889)]] + expected_docs = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148], + [0.0368655605, 0.709055041, 0.194436428, 0.0151706795, 0.0113863652, 1.00000000e-12, 1.00000000e-12]] self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + def testPartialFit(self): + for i in range(10): + self.model.partial_fit(X=self.corpus) # fit against the model again + doc = list(self.corpus)[0] # transform only the first document + transformed = 
self.model.transform(doc) + expected = numpy.array([0.76777752, 0.01757334, 0.01600339, 0.01374061, 0.01275931, 0.01126313, 0.01058131, 0.01167185]) + passed = numpy.allclose(sorted(transformed[0]), sorted(expected), atol=1e-1) + self.assertTrue(True) + def testSetGetParams(self): # updating only one param self.model.set_params(var_converge=0.05) model_params = self.model.get_params() self.assertEqual(model_params["var_converge"], 0.05) + def testPipeline(self): + with open(datapath('mini_newsgroup'), 'rb') as f: + compressed_content = f.read() + uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') + cache = pickle.loads(uncompressed_content) + data = cache + id2word = Dictionary(map(lambda x: x.split(), data.data)) + corpus = [id2word.doc2bow(i.split()) for i in data.data] + model = HdpTransformer(id2word=id2word) + clf = linear_model.LogisticRegression(penalty='l2', C=0.1) + text_lda = Pipeline((('features', model,), ('classifier', clf))) + text_lda.fit(corpus, data.target) + score = text_lda.score(corpus, data.target) + self.assertGreater(score, 0.40) + def testPersistence(self): model_dump = pickle.dumps(self.model) model_load = pickle.loads(model_dump) @@ -776,7 +800,7 @@ def testPersistence(self): # comparing the original and loaded models original_transformed_doc = self.model.transform(doc) - self.assertEqual(original_transformed_doc, loaded_transformed_doc) + self.assertTrue(numpy.allclose(original_transformed_doc, loaded_transformed_doc)) def testModelNotFitted(self): hdp_wrapper = HdpTransformer(id2word=dictionary) From acdb6ddc8d506cf3e65e1a7619cac9bcad20164d Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 20 Jul 2017 07:35:56 -0700 Subject: [PATCH 23/34] flake8 fixes --- gensim/sklearn_integration/hdp.py | 4 ++-- gensim/test/test_sklearn_integration.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/sklearn_integration/hdp.py b/gensim/sklearn_integration/hdp.py index 279b70c0b0..ee49840fae 100644 --- a/gensim/sklearn_integration/hdp.py +++ b/gensim/sklearn_integration/hdp.py @@ -58,7 +58,7 @@ def fit(self, X, y=None): else: corpus = X - self.gensim_model = models.HdpModel(corpus=X, id2word=self.id2word, max_chunks=self.max_chunks, + self.gensim_model = models.HdpModel(corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks, max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state) @@ -89,7 +89,7 @@ def transform(self, docs): for k, v in enumerate(X): if len(v) != max_num_topics: - v.extend([1e-12]*(max_num_topics - len(v))) + v.extend([1e-12] * (max_num_topics - len(v))) X[k] = v return np.reshape(np.array(X), (len(docs), max_num_topics)) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 8c7fb26b3b..81b213669f 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -766,9 +766,9 @@ def testPartialFit(self): self.model.partial_fit(X=self.corpus) # fit against the model again doc = list(self.corpus)[0] # transform only the first document transformed = self.model.transform(doc) - expected = numpy.array([0.76777752, 0.01757334, 0.01600339, 0.01374061, 0.01275931, 0.01126313, 0.01058131, 0.01167185]) + expected = numpy.array([0.76777752, 0.01757334, 0.01600339, 0.01374061, 0.01275931, 0.01126313, 0.01058131, 0.01167185]) 
passed = numpy.allclose(sorted(transformed[0]), sorted(expected), atol=1e-1) - self.assertTrue(True) + self.assertTrue(passed) def testSetGetParams(self): # updating only one param From 1c7da8ed19f1f4697a1a32a96c67c021f63dd5f5 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 1 Aug 2017 16:51:22 +0530 Subject: [PATCH 24/34] added skl API class for phrases --- gensim/sklearn_integration/__init__.py | 1 + gensim/sklearn_integration/phrases.py | 69 ++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 gensim/sklearn_integration/phrases.py diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index d75132a366..eaf3390db4 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -21,3 +21,4 @@ from .text2bow import Text2BowTransformer # noqa: F401 from .tfidf import TfIdfTransformer # noqa: F401 from .hdp import HdpTransformer # noqa: F401 +from .phrases import PhrasesTransformer # noqa: F401 diff --git a/gensim/sklearn_integration/phrases.py b/gensim/sklearn_integration/phrases.py new file mode 100644 index 0000000000..e61186f686 --- /dev/null +++ b/gensim/sklearn_integration/phrases.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +from six import string_types +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +from gensim import models +from gensim.sklearn_integration import BaseSklearnWrapper + + +class PhrasesTransformer(BaseSklearnWrapper, TransformerMixin, BaseEstimator): + """ + Base Phrases module + """ + + def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, + delimiter=b'_', progress_per=10000): + """ + Sklearn wrapper for Phrases model. + """ + self.gensim_model = None + self.min_count = min_count + self.threshold = threshold + self.max_vocab_size = max_vocab_size + self.delimiter = delimiter + self.progress_per = progress_per + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. + """ + self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per) + return self + + def transform(self, docs): + """ + Transform the input documents into phrase tokens. + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. 
Call 'fit' with appropriate arguments before using this method.") + + # input as python lists + check = lambda x: [x] if isinstance(x[0], string_types) else x + docs = check(docs) + X = [[] for _ in range(0, len(docs))] + + for k, v in enumerate(docs): + phrase_tokens = self.gensim_model[v] + X[k] = phrase_tokens + + return X + + def partial_fit(self, X): + if self.gensim_model is None: + self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per) + + self.gensim_model.add_vocab(X) + return self From 47a421468ef452ab0aebf304031eabe1b5351809 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 1 Aug 2017 16:51:44 +0530 Subject: [PATCH 25/34] added unit tests for phrases API class --- gensim/test/test_sklearn_integration.py | 52 ++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 81b213669f..f2a5f05d58 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -25,6 +25,7 @@ from gensim.sklearn_integration.text2bow import Text2BowTransformer from gensim.sklearn_integration.tfidf import TfIdfTransformer from gensim.sklearn_integration.hdp import HdpTransformer +from gensim.sklearn_integration.phrases import PhrasesTransformer from gensim.corpora import mmcorpus, Dictionary from gensim.models import doc2vec from gensim import matutils @@ -116,6 +117,19 @@ 'graph minors survey' ] +phrases_sentences = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'], + ['graph', 'minors', 'survey','human','interface'] #test bigrams within same sentence +] + class TestSklLdaModelWrapper(unittest.TestCase): def setUp(self): @@ -557,7 +571,7 @@ def testPipeline(self): self.assertEqual(len(ret_val), len(author_list)) -class TestD2VTransformerWrapper(unittest.TestCase): +class TestD2VTransformer(unittest.TestCase): def setUp(self): numpy.random.seed(0) self.model = D2VTransformer(min_count=1) @@ -625,7 +639,7 @@ def testModelNotFitted(self): self.assertRaises(NotFittedError, d2vmodel_wrapper.transform, 1) -class TestText2BowTransformerWrapper(unittest.TestCase): +class TestText2BowTransformer(unittest.TestCase): def setUp(self): numpy.random.seed(0) self.model = Text2BowTransformer() @@ -806,5 +820,39 @@ def testModelNotFitted(self): hdp_wrapper = HdpTransformer(id2word=dictionary) self.assertRaises(NotFittedError, hdp_wrapper.transform, corpus[0]) +class TestPhrasesTransformer(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = PhrasesTransformer(min_count=1, threshold=1) + self.model.fit(phrases_sentences) + + def testTransform(self): + # transform one document + doc = phrases_sentences[-1] + phrase_tokens = self.model.transform(doc)[0] + expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface'] + self.assertEqual(phrase_tokens, expected_phrase_tokens) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(progress_per=5000) + model_params = self.model.get_params() + self.assertEqual(model_params["progress_per"], 5000) + + def testPersistence(self): + model_dump = 
pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = phrases_sentences[-1] + loaded_phrase_tokens = model_load.transform(doc) + + # comparing the original and loaded models + original_phrase_tokens = self.model.transform(doc) + self.assertEqual(original_phrase_tokens, loaded_phrase_tokens) + + def testModelNotFitted(self): + phrases_transformer = PhrasesTransformer() + self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) + if __name__ == '__main__': unittest.main() From 9b32c4d68ca9410729bf844686087afc5e5abba9 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 1 Aug 2017 17:22:07 +0530 Subject: [PATCH 26/34] flake8 fixes --- gensim/test/test_sklearn_integration.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index f2a5f05d58..8b677a1975 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -127,7 +127,7 @@ ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey'], - ['graph', 'minors', 'survey','human','interface'] #test bigrams within same sentence + ['graph', 'minors', 'survey', 'human', 'interface'] # test bigrams within same sentence ] @@ -820,6 +820,7 @@ def testModelNotFitted(self): hdp_wrapper = HdpTransformer(id2word=dictionary) self.assertRaises(NotFittedError, hdp_wrapper.transform, corpus[0]) + class TestPhrasesTransformer(unittest.TestCase): def setUp(self): numpy.random.seed(0) From 3a0977a2e441ba34fff8df7754239c510069df14 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 1 Aug 2017 17:55:13 +0530 Subject: [PATCH 27/34] added 'testPartialFit' function for 'TestPhrasesTransformer' --- gensim/test/test_sklearn_integration.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 8b677a1975..bf84b39f4d 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -127,7 +127,7 @@ ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey'], - ['graph', 'minors', 'survey', 'human', 'interface'] # test bigrams within same sentence + ['graph', 'minors', 'survey', 'human', 'interface'] ] @@ -834,6 +834,19 @@ def testTransform(self): expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface'] self.assertEqual(phrase_tokens, expected_phrase_tokens) + def testPartialFit(self): + new_sentences = [ + ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'], + ['world', 'peace', 'people'], + ['world', 'peace', 'humans'] + ] + self.model.partial_fit(X=new_sentences) # train model with new sentences + + doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace'] + phrase_tokens = self.model.transform(doc)[0] + expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface', u'world_peace'] + self.assertEqual(phrase_tokens, expected_phrase_tokens) + def testSetGetParams(self): # updating only one param self.model.set_params(progress_per=5000)
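The incremental workflow that the new `testPartialFit` exercises is worth seeing end to end. A minimal sketch follows (an editor's illustration, not part of the patch; the sentences are invented and the low `min_count`/`threshold` mirror the test's settings):

from gensim.sklearn_integration.phrases import PhrasesTransformer

# invented sentences; low thresholds so frequent pairs are promoted quickly
model = PhrasesTransformer(min_count=1, threshold=1)
model.fit([['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']])
# later batches update the collocation statistics in place
model.partial_fit([['world', 'peace', 'people'], ['world', 'peace', 'humans']])
# pairs such as 'graph_minors' and 'world_peace' should now come back joined
print(model.transform([['graph', 'minors', 'world', 'peace']]))

From 687c3d7299f84840db3dcb1cbae753bfbc9825dd Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 1 Aug 2017 18:06:56 +0530 Subject: [PATCH 28/34] updated 'testPipeline' function for 'TestText2BowTransformer' --- gensim/test/test_sklearn_integration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/test/test_sklearn_integration.py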
b/gensim/test/test_sklearn_integration.py index bf84b39f4d..88c52722ab 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -666,7 +666,6 @@ def testPipeline(self): cache = pickle.loads(uncompressed_content) data = cache text2bow_model = Text2BowTransformer() - text2bow_model.fit(data.data) lda_model = SklLdaModel(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) From d42c877113bd60a2c58ca32e4d7769697ac51ad5 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 15 Aug 2017 17:57:00 +0530 Subject: [PATCH 29/34] updated code for transform function for HDP transformer --- gensim/sklearn_api/hdp.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index 09b217d3ab..92265a5e8f 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -81,15 +81,13 @@ def transform(self, docs): max_num_topics = 0 for k, v in enumerate(docs): - doc_topics = self.gensim_model[v] - probs_docs = list(map(lambda x: x[1], doc_topics)) - X[k] = probs_docs - max_num_topics = max(max_num_topics, len(probs_docs)) + X[k] = self.gensim_model[v] + max_num_topics = max(max_num_topics, max(list(map(lambda x: x[0], X[k]))) + 1) for k, v in enumerate(X): - if len(v) != max_num_topics: - v.extend([1e-12] * (max_num_topics - len(v))) - X[k] = v + # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + dense_vec = matutils.sparse2full(v, max_num_topics) + X[k] = dense_vec + return np.reshape(np.array(X), (len(docs), max_num_topics))
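The heart of this change is densification: the gensim model emits a sparse list of (topic_id, probability) pairs per document, while downstream scikit-learn estimators expect fixed-width vectors. A toy illustration of matutils.sparse2full (editor's sketch, not part of the patch):

from gensim import matutils

# one document's topics in gensim's sparse (topic_id, probability) format
doc_topics = [(0, 0.8), (2, 0.2)]
# densify over 4 topics; ids absent from the list become 0.0
print(matutils.sparse2full(doc_topics, 4))
# expected output: [0.8 0.  0.2 0. ] (a float32 numpy array)

From 3037620b31bbcf22553cd5c45968c466c9a35a93 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 15 Aug 2017 17:57:37 +0530 Subject: [PATCH 30/34] updated tests as discussed in PR 1473 --- gensim/test/test_sklearn_api.py | 65 ++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 369a6f216b..17b2dacd8a 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -12,31 +12,19 @@ except ImportError: raise unittest.SkipTest("Test requires scikit-learn to be installed, which is not available") -<<<<<<< HEAD:gensim/test/test_sklearn_integration.py -from gensim.sklearn_integration.sklearn_wrapper_gensim_rpmodel import SklRpModel -from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklLdaModel -from gensim.sklearn_integration.sklearn_wrapper_gensim_lsimodel import SklLsiModel -from gensim.sklearn_integration.sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel -from gensim.sklearn_integration.sklearn_wrapper_gensim_w2vmodel import SklW2VModel -from gensim.sklearn_integration.sklearn_wrapper_gensim_atmodel import SklATModel -from gensim.sklearn_integration.d2vmodel import D2VTransformer -from gensim.sklearn_integration.text2bow import Text2BowTransformer -from gensim.sklearn_integration.tfidf import TfIdfTransformer -from gensim.sklearn_integration.hdp import HdpTransformer -from gensim.sklearn_integration.phrases import PhrasesTransformer -from gensim.corpora import mmcorpus, Dictionary -from gensim.models import doc2vec -======= from gensim.sklearn_api.rpmodel import RpTransformer from gensim.sklearn_api.ldamodel import LdaTransformer from 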
gensim.sklearn_api.lsimodel import LsiTransformer from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer from gensim.sklearn_api.w2vmodel import W2VTransformer from gensim.sklearn_api.atmodel import AuthorTopicTransformer +from gensim.sklearn_api.d2vmodel import D2VTransformer +from gensim.sklearn_api.text2bow import Text2BowTransformer +from gensim.sklearn_api.tfidf import TfIdfTransformer +from gensim.sklearn_api.hdp import HdpTransformer +from gensim.sklearn_api.phrases import PhrasesTransformer from gensim.corpora import mmcorpus, Dictionary -from gensim import models ->>>>>>> 718b1c6bd1a8a98625993d73b83d98baf385752d:gensim/test/test_sklearn_api.py -from gensim import matutils +from gensim import matutils, models module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -110,8 +98,7 @@ ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', 'transformed', 'modern', 'day', 'society'] ] -<<<<<<< HEAD:gensim/test/test_sklearn_integration.py -d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)] +d2v_sentences = [models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)] dict_texts = [ 'human interface computer', @@ -139,11 +126,7 @@ ] -class TestSklLdaModelWrapper(unittest.TestCase): -======= - class TestLdaWrapper(unittest.TestCase): ->>>>>>> 718b1c6bd1a8a98625993d73b83d98baf385752d:gensim/test/test_sklearn_api.py def setUp(self): numpy.random.seed(0) # set fixed seed to get similar values every time self.model = LdaTransformer(id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0)) @@ -685,6 +668,10 @@ def testSetGetParams(self): model_params = self.model.get_params() self.assertEqual(model_params["negative"], 20) + # verify that the attribute values are also changed for `gensim_model` after fitting + self.model.fit(d2v_sentences) + self.assertEqual(getattr(self.model.gensim_model, 'negative'), 20) + def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values every time model = D2VTransformer(min_count=1) @@ -720,6 +707,20 @@ def testPersistence(self): passed = numpy.allclose(sorted(loaded_transformed_vecs), sorted(original_transformed_vecs), atol=1e-1) self.assertTrue(passed) + def testConsistencyWithGensimModel(self): + # training a D2VTransformer + self.model = D2VTransformer(min_count=1) + self.model.fit(d2v_sentences) + + # training a Gensim Doc2Vec model with the same params + gensim_d2vmodel = models.Doc2Vec(d2v_sentences, min_count=1) + + doc = w2v_texts[0] + vec_transformer_api = self.model.transform(doc) # vector returned by D2VTransformer + vec_gensim_model = gensim_d2vmodel[doc] # vector returned by Doc2Vec + passed = numpy.allclose(vec_transformer_api, vec_gensim_model, atol=1e-1) + self.assertTrue(passed) + def testModelNotFitted(self): d2vmodel_wrapper = D2VTransformer(min_count=1) self.assertRaises(NotFittedError, d2vmodel_wrapper.transform, 1) @@ -752,7 +753,7 @@ def testPipeline(self): cache = pickle.loads(uncompressed_content) data = cache text2bow_model = Text2BowTransformer() - lda_model = SklLdaModel(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) + lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, 
random_state=numpy.random.seed(0)) numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) text_lda = Pipeline((('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf))) text_lda.fit(data.data, data.target) score = text_lda.score(data.data, data.target) @@ -804,6 +805,10 @@ def testSetGetParams(self): model_params = self.model.get_params() self.assertEqual(model_params["normalize"], False) + # verify that the attribute values are also changed for `gensim_model` after fitting + self.model.fit(self.corpus) + self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False) + def testPipeline(self): with open(datapath('mini_newsgroup'), 'rb') as f: compressed_content = f.read() @@ -814,7 +819,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in data.data] tfidf_model = TfIdfTransformer() tfidf_model.fit(corpus) - lda_model = SklLdaModel(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) + lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) text_tfidf = Pipeline((('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf))) @@ -875,6 +880,10 @@ def testSetGetParams(self): model_params = self.model.get_params() self.assertEqual(model_params["var_converge"], 0.05) + # verify that the attribute values are also changed for `gensim_model` after fitting + self.model.fit(self.corpus) + self.assertEqual(getattr(self.model.gensim_model, 'm_var_converge'), 0.05) + def testPipeline(self): with open(datapath('mini_newsgroup'), 'rb') as f: compressed_content = f.read() @@ -938,6 +947,10 @@ def testSetGetParams(self): model_params = self.model.get_params() self.assertEqual(model_params["progress_per"], 5000) + # verify that the attribute values are also changed for `gensim_model` after fitting + self.model.fit(phrases_sentences) + self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000) + def testPersistence(self): model_dump = pickle.dumps(self.model) model_load = pickle.loads(model_dump) From c52a0e26a334c533cb44556c9011a40b4595f355 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 16 Aug 2017 17:54:10 +0530 Subject: [PATCH 31/34] added examples for new models in ipynb --- docs/notebooks/sklearn_api.ipynb | 429 +++++++++++++++++++++++++++++-- 1 file changed, 413 insertions(+), 16 deletions(-) diff --git a/docs/notebooks/sklearn_api.ipynb b/docs/notebooks/sklearn_api.ipynb index 3273b70728..3fb9c01d52 100644 --- a/docs/notebooks/sklearn_api.ipynb +++ b/docs/notebooks/sklearn_api.ipynb @@ -44,17 +44,11 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [], "source": [ "from gensim.sklearn_api import LdaTransformer" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 21, "metadata": { "collapsed": true }, @@ -145,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "metadata": { "collapsed": true }, @@ -160,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 28, "metadata": { "collapsed": true }, @@ -180,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 29, 
"metadata": { "collapsed": true }, @@ -314,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": { "collapsed": true }, @@ -367,6 +361,7 @@ "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", + "\n", "print(pipe.score(corpus, data.target))" ] }, @@ -554,9 +549,411 @@ "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", "pipe.fit(corpus, test_target)\n", "print_features_pipe(pipe, id2word.values())\n", + "\n", "print(pipe.score(corpus, test_target))" ] }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### Word2Vec Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use Word2Vec model begin with importing Word2Vec wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "from gensim.sklearn_api import W2VTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7\n" + ] + } + ], + "source": [ + "w2v_texts = [\n", + " ['calculus', 'is', 'the', 'mathematical', 'study', 'of', 'continuous', 'change'],\n", + " ['geometry', 'is', 'the', 'study', 'of', 'shape'],\n", + " ['algebra', 'is', 'the', 'study', 'of', 'generalizations', 'of', 'arithmetic', 'operations'],\n", + " ['differential', 'calculus', 'is', 'related', 'to', 'rates', 'of', 'change', 'and', 'slopes', 'of', 'curves'],\n", + " ['integral', 'calculus', 'is', 'realted', 'to', 'accumulation', 'of', 'quantities', 'and', 'the', 'areas', 'under', 'and', 'between', 'curves'],\n", + " ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter', 'and', 'its', 'motion', 'and', 'behavior', 'through', 'space', 'and', 'time'],\n", + " ['the', 'main', 'goal', 'of', 'physics', 'is', 'to', 'understand', 'how', 'the', 'universe', 'behaves'],\n", + " ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new', 'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'],\n", + " ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', 'transformed', 'modern', 'day', 'society']\n", + "]\n", + "\n", + "model = W2VTransformer(size=10, min_count=1)\n", + "model.fit(w2v_texts)\n", + "\n", + "class_dict = {'mathematics': 1, 'physics': 0}\n", + "train_data = [\n", + " ('calculus', 'mathematics'), ('mathematical', 'mathematics'), ('geometry', 'mathematics'), ('operations', 'mathematics'), ('curves', 'mathematics'),\n", + " ('natural', 'physics'), ('nuclear', 'physics'), ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics')\n", + "]\n", + "\n", + "train_input = list(map(lambda x: x[0], train_data))\n", + "train_target = list(map(lambda x: class_dict[x[1]], train_data))\n", + "\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "clf.fit(model.transform(train_input), train_target)\n", + "text_w2v = Pipeline((('features', model,), ('classifier', clf)))\n", + "score = 
text_w2v.score(train_input, train_target)\n", + "\n", + "print(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### AuthorTopic Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use AuthorTopic model begin with importing AuthorTopic wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import AuthorTopicTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 0 0]\n" + ] + } + ], + "source": [ + "from sklearn import cluster\n", + "\n", + "atm_texts = [\n", + " ['complier', 'system', 'computer'],\n", + " ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],\n", + " ['graph', 'flow', 'network', 'graph'],\n", + " ['loading', 'computer', 'system'],\n", + " ['user', 'server', 'system'],\n", + " ['tree', 'hamiltonian'],\n", + " ['graph', 'trees'],\n", + " ['computer', 'kernel', 'malfunction', 'computer'],\n", + " ['server', 'system', 'computer'],\n", + "]\n", + "atm_dictionary = Dictionary(atm_texts)\n", + "atm_corpus = [atm_dictionary.doc2bow(text) for text in atm_texts]\n", + "author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7]}\n", + "\n", + "model = AuthorTopicTransformer(id2word=atm_dictionary, author2doc=author2doc, num_topics=10, passes=100)\n", + "model.fit(atm_corpus)\n", + "\n", + "# create and train clustering model\n", + "clstr = cluster.MiniBatchKMeans(n_clusters=2)\n", + "authors_full = ['john', 'jane', 'jack', 'jill']\n", + "clstr.fit(model.transform(authors_full))\n", + "\n", + "# stack together the two models in a pipeline\n", + "text_atm = Pipeline((('features', model,), ('cluster', clstr)))\n", + "author_list = ['jane', 'jack', 'jill']\n", + "ret_val = text_atm.predict(author_list)\n", + "\n", + "print(ret_val)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Doc2Vec Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use Doc2Vec model begin with importing Doc2Vec wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import D2VTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0\n" + ] + } + ], + "source": [ + "from gensim.models import doc2vec\n", + "d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)]\n", + "\n", + "model = D2VTransformer(min_count=1)\n", + "model.fit(d2v_sentences)\n", + "\n", + "class_dict = {'mathematics': 1, 'physics': 0}\n", + "train_data = [\n", + " (['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'),\n", + " (['natural', 'nuclear'], 'physics'), (['science', 'electromagnetism', 'natural'], 'physics')\n", + "]\n", + "train_input = list(map(lambda x: x[0], train_data))\n", + "train_target = list(map(lambda x: class_dict[x[1]], train_data))\n", + "\n", + "clf = 
linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "clf.fit(model.transform(train_input), train_target)\n", + "text_d2v = Pipeline((('features', model,), ('classifier', clf)))\n", + "score = text_d2v.score(train_input, train_target)\n", + "\n", + "print(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Text2Bow Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use Text2Bow model begin with importing Text2Bow wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import Text2BowTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.947147651007\n" + ] + } + ], + "source": [ + "text2bow_model = Text2BowTransformer()\n", + "lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=np.random.seed(0))\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "text_t2b = Pipeline((('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)))\n", + "text_t2b.fit(data.data, data.target)\n", + "score = text_t2b.score(data.data, data.target)\n", + "\n", + "print(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TfIdf Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use TfIdf model begin with importing TfIdf wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import TfIdfTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.578859060403\n" + ] + } + ], + "source": [ + "tfidf_model = TfIdfTransformer()\n", + "tfidf_model.fit(corpus)\n", + "lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=np.random.seed(0))\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "text_tfidf = Pipeline((('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf)))\n", + "text_tfidf.fit(corpus, data.target)\n", + "score = text_tfidf.score(corpus, data.target)\n", + "\n", + "print(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### HDP Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use HDP model begin with importing HDP wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import HdpTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.848154362416\n" + ] + } + ], + "source": [ + "model = HdpTransformer(id2word=id2word)\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "text_hdp = Pipeline((('features', model,), ('classifier', clf)))\n", + 
"text_hdp.fit(corpus, data.target)\n", + "score = text_hdp.score(corpus, data.target)\n", + "\n", + "print(score)" + ] + }, { "cell_type": "code", "execution_count": null, From 28a3aa3f05927f1b6daad9e3e2a06a1db2b9e3c7 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 18 Aug 2017 16:52:25 +0530 Subject: [PATCH 32/34] unpinned sklearn version for running unit-tests --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 912fb1075a..6097e34c5a 100644 --- a/setup.py +++ b/setup.py @@ -229,7 +229,7 @@ def finalize_options(self): test_env = [ 'testfixtures', 'Morfessor == 2.0.2a4', - 'scikit-learn == 0.18.2', + 'scikit-learn', 'pyemd', 'annoy', 'tensorflow >= 1.1.0', From 464a496a4d20ff444fab240cb2b397e355912c67 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 18 Aug 2017 16:53:08 +0530 Subject: [PATCH 33/34] updated 'Pipeline' initialization format --- gensim/test/test_sklearn_api.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 17b2dacd8a..d990cc403e 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -194,7 +194,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in data.data] numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lda = Pipeline((('features', model,), ('classifier', clf))) + text_lda = Pipeline([('features', model,), ('classifier', clf)]) text_lda.fit(corpus, data.target) score = text_lda.score(corpus, data.target) self.assertGreater(score, 0.40) @@ -284,7 +284,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in data.data] numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lsi = Pipeline((('features', model,), ('classifier', clf))) + text_lsi = Pipeline([('features', model,), ('classifier', clf)]) text_lsi.fit(corpus, data.target) score = text_lsi.score(corpus, data.target) self.assertGreater(score, 0.50) @@ -367,7 +367,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in test_data] model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim') clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_ldaseq = Pipeline((('features', model,), ('classifier', clf))) + text_ldaseq = Pipeline([('features', model,), ('classifier', clf)]) text_ldaseq.fit(corpus, test_target) score = text_ldaseq.score(corpus, test_target) self.assertGreater(score, 0.50) @@ -437,7 +437,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in data.data] numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_rp = Pipeline((('features', model,), ('classifier', clf))) + text_rp = Pipeline([('features', model,), ('classifier', clf)]) text_rp.fit(corpus, data.target) score = text_rp.score(corpus, data.target) self.assertGreater(score, 0.40) @@ -522,7 +522,7 @@ def testPipeline(self): clf = linear_model.LogisticRegression(penalty='l2', C=0.1) clf.fit(model.transform(train_input), train_target) - text_w2v = Pipeline((('features', model,), ('classifier', clf))) + text_w2v = Pipeline([('features', model,), ('classifier', clf)]) score = text_w2v.score(train_input, train_target) self.assertGreater(score, 0.40) @@ -593,7 +593,7 @@ 
def testPipeline(self): clstr.fit(model.transform(authors_full)) # stack together the two models in a pipeline - text_atm = Pipeline((('features', model,), ('cluster', clstr))) + text_atm = Pipeline([('features', model,), ('cluster', clstr)]) author_list = ['jane', 'jack', 'jill'] ret_val = text_atm.predict(author_list) self.assertEqual(len(ret_val), len(author_list)) @@ -687,7 +687,7 @@ def testPipeline(self): clf = linear_model.LogisticRegression(penalty='l2', C=0.1) clf.fit(model.transform(train_input), train_target) - text_w2v = Pipeline((('features', model,), ('classifier', clf))) + text_w2v = Pipeline([('features', model,), ('classifier', clf)]) score = text_w2v.score(train_input, train_target) @@ -756,7 +756,7 @@ def testPipeline(self): lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lda = Pipeline((('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf))) + text_lda = Pipeline([('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)]) text_lda.fit(data.data, data.target) score = text_lda.score(data.data, data.target) @@ -822,7 +822,7 @@ def testPipeline(self): lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_tfidf = Pipeline((('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf))) + text_tfidf = Pipeline([('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf)]) text_tfidf.fit(corpus, data.target) score = text_tfidf.score(corpus, data.target) @@ -894,7 +894,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in data.data] model = HdpTransformer(id2word=id2word) clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lda = Pipeline((('features', model,), ('classifier', clf))) + text_lda = Pipeline([('features', model,), ('classifier', clf)]) text_lda.fit(corpus, data.target) score = text_lda.score(corpus, data.target) self.assertGreater(score, 0.40)
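A brief aside on why this mechanical change matters: scikit-learn's Pipeline takes its steps as a list of (name, estimator) tuples, and with the scikit-learn version unpinned two patches earlier, the older tuple-of-tuples form can no longer be relied on. A minimal sketch of the two forms (editor's illustration only; the estimators chosen here are arbitrary):

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# old form used throughout the tests and notebook:
# pipe = Pipeline((('scale', StandardScaler()), ('clf', LogisticRegression())))
# portable form, a list of (name, estimator) tuples:
pipe = Pipeline([('scale', StandardScaler()), ('clf', LogisticRegression())])

From 9d65fdfd6797b989677a3446ca8b932a2cb4488b Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 18 Aug 2017 17:18:47 +0530 Subject: [PATCH 34/34] updated 'Pipeline' initialization format in ipynb --- docs/notebooks/sklearn_api.ipynb | 152 +++++++++++++++++-------- 1 file changed, 83 insertions(+), 69 deletions(-) diff --git a/docs/notebooks/sklearn_api.ipynb b/docs/notebooks/sklearn_api.ipynb index 3fb9c01d52..b0e036747c 100644 --- a/docs/notebooks/sklearn_api.ipynb +++ b/docs/notebooks/sklearn_api.ipynb @@ -19,13 +19,25 @@ "metadata": {}, "source": [ "The wrappers available (as of now) are :\n", - "* LdaModel (```gensim.sklearn_api.ldamodel.LdaTransformer```),which implements gensim's ```LDA Model``` in a scikit-learn interface\n", + "* LdaModel (```gensim.sklearn_api.ldamodel.LdaTransformer```), which implements gensim's ```LDA Model``` in a scikit-learn interface\n", "\n", - "* LsiModel (```gensim.sklearn_api.lsimodel.LsiTransformer```),which implements gensim's ```LSI Model``` in a 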
scikit-learn interface\n", "\n", - "* RpModel (```gensim.sklearn_api.rpmodel.RpTransformer```),which implements gensim's ```Random Projections Model``` in a scikit-learn interface\n", + "* RpModel (```gensim.sklearn_api.rpmodel.RpTransformer```), which implements gensim's ```Random Projections Model``` in a scikit-learn interface\n", "\n", - "* LDASeq Model (```gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer```),which implements gensim's ```LdaSeqModel``` in a scikit-learn interface" + "* LDASeq Model (```gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer```), which implements gensim's ```LdaSeqModel``` in a scikit-learn interface\n", + "\n", + "* Word2Vec Model (```gensim.sklearn_api.w2vmodel.W2VTransformer```), which implements gensim's ```Word2Vec``` in a scikit-learn interface\n", + "\n", + "* AuthorTopicModel Model (```gensim.sklearn_api.atmodel.AuthorTopicTransformer```), which implements gensim's ```AuthorTopicModel``` in a scikit-learn interface\n", + "\n", + "* Doc2Vec Model (```gensim.sklearn_api.d2vmodel.D2VTransformer```), which implements gensim's ```Doc2Vec``` in a scikit-learn interface\n", + "\n", + "* Text2Bow Model (```gensim.sklearn_api.text2bow.Text2BowTransformer```), which implements gensim's ```Dictionary``` in a scikit-learn interface\n", + "\n", + "* TfidfModel Model (```gensim.sklearn_api.tfidf.TfIdfTransformer```), which implements gensim's ```TfidfModel``` in a scikit-learn interface\n", + "\n", + "* HdpModel Model (```gensim.sklearn_api.hdp.HdpTransformer```), which implements gensim's ```HdpModel``` in a scikit-learn interface" ] }, { @@ -44,11 +56,17 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": true - }, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], "source": [ "from gensim.sklearn_api import LdaTransformer" ] @@ -62,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -139,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 4, "metadata": { "collapsed": true }, @@ -154,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 5, "metadata": { "collapsed": true }, @@ -174,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 6, "metadata": { "collapsed": true }, @@ -194,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "collapsed": true }, @@ -308,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": { "collapsed": true }, @@ -327,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": { "collapsed": true }, @@ -339,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -358,7 +376,7 @@ "source": [ "model = LdaTransformer(num_topics=15, id2word=id2word, iterations=10, random_state=37)\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", - "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", + "pipe = Pipeline([('features', model,), ('classifier', clf)])\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", "\n", @@ -381,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": { "collapsed": true }, @@ -399,18 
+417,18 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[ 0.13653967 -0.00378269 0.02652037 0.08496786 -0.02401959 -0.60089273\n", - " -1.0708177 -0.03932274 -0.43813039 -0.54848409 -0.20147759 0.21781259\n", - " 1.30378972 -0.08678691 -0.17529122]\n", - "Positive features: internet...:1.30 trawling:0.22 Fame.:0.14 Keach:0.08 Fame,:0.03\n", - "Negative features: 01101001B:-1.07 comp.org.eff.talk.:-0.60 red@redpoll.neoucom.edu:-0.55 circuitry:-0.44 hanging:-0.20 >Pat:-0.18 dome.:-0.09 *best*:-0.04 comp.org.eff.talk,:-0.02 considered,:-0.00\n", + "[ 0.13655775 0.00381287 0.02643593 -0.08499907 -0.02387209 0.6004697\n", + " 1.07090198 0.03926809 0.43769831 0.54886088 -0.20186911 -0.21785685\n", + " 1.30488175 0.08663351 0.17558704]\n", + "Positive features: internet...:1.30 01101001B:1.07 comp.org.eff.talk.:0.60 red@redpoll.neoucom.edu:0.55 circuitry:0.44 >Pat:0.18 Fame.:0.14 dome.:0.09 *best*:0.04 Fame,:0.03\n", + "Negative features: trawling:-0.22 hanging:-0.20 Keach:-0.08 comp.org.eff.talk,:-0.02\n", "0.865771812081\n" ] } @@ -418,9 +436,10 @@ "source": [ "model = LsiTransformer(num_topics=15, id2word=id2word)\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", - "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", + "pipe = Pipeline([('features', model,), ('classifier', clf)])\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", + "\n", "print(pipe.score(corpus, data.target))" ] }, @@ -440,7 +459,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": { "collapsed": true }, @@ -458,17 +477,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[-0.03186506 -0.00872616]\n", - "Positive features: \n", - "Negative features: Fame.:-0.03 considered,:-0.01\n", - "0.621644295302\n" + "[-0.01217523 0.0109422 ]\n", + "Positive features: considered,:0.01\n", + "Negative features: Fame.:-0.01\n", + "0.604865771812\n" ] } ], @@ -476,9 +495,10 @@ "model = RpTransformer(num_topics=2)\n", "np.random.mtrand.RandomState(1) # set seed for getting same result\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", - "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", + "pipe = Pipeline([('features', model,), ('classifier', clf)])\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", + "\n", "print(pipe.score(corpus, data.target))" ] }, @@ -498,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "metadata": { "collapsed": true }, @@ -516,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -541,16 +561,16 @@ "source": [ "test_data = data.data[0:2]\n", "test_target = data.target[0:2]\n", - "id2word = Dictionary(map(lambda x: x.split(), test_data))\n", - "corpus = [id2word.doc2bow(i.split()) for i in test_data]\n", + "id2word_ldaseq = Dictionary(map(lambda x: x.split(), test_data))\n", + "corpus_ldaseq = [id2word_ldaseq.doc2bow(i.split()) for i in test_data]\n", "\n", - "model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')\n", + "model = LdaSeqTransformer(id2word=id2word_ldaseq, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')\n", 
"clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", - "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", - "pipe.fit(corpus, test_target)\n", - "print_features_pipe(pipe, id2word.values())\n", + "pipe = Pipeline([('features', model,), ('classifier', clf)])\n", + "pipe.fit(corpus_ldaseq, test_target)\n", + "print_features_pipe(pipe, id2word_ldaseq.values())\n", "\n", - "print(pipe.score(corpus, test_target))" + "print(pipe.score(corpus_ldaseq, test_target))" ] }, { @@ -571,17 +591,11 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], "source": [ "from gensim.sklearn_api import W2VTransformer" ] @@ -595,7 +609,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -633,7 +647,7 @@ "\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", "clf.fit(model.transform(train_input), train_target)\n", - "text_w2v = Pipeline((('features', model,), ('classifier', clf)))\n", + "text_w2v = Pipeline([('features', model,), ('classifier', clf)])\n", "score = text_w2v.score(train_input, train_target)\n", "\n", "print(score)" @@ -655,7 +669,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 19, "metadata": { "collapsed": true }, @@ -673,7 +687,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -711,7 +725,7 @@ "clstr.fit(model.transform(authors_full))\n", "\n", "# stack together the two models in a pipeline\n", - "text_atm = Pipeline((('features', model,), ('cluster', clstr)))\n", + "text_atm = Pipeline([('features', model,), ('cluster', clstr)])\n", "author_list = ['jane', 'jack', 'jill']\n", "ret_val = text_atm.predict(author_list)\n", "\n", @@ -734,7 +748,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 21, "metadata": { "collapsed": true }, @@ -752,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -780,7 +794,7 @@ "\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", "clf.fit(model.transform(train_input), train_target)\n", - "text_d2v = Pipeline((('features', model,), ('classifier', clf)))\n", + "text_d2v = Pipeline([('features', model,), ('classifier', clf)])\n", "score = text_d2v.score(train_input, train_target)\n", "\n", "print(score)" @@ -802,7 +816,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 23, "metadata": { "collapsed": true }, @@ -820,7 +834,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -835,7 +849,7 @@ "text2bow_model = Text2BowTransformer()\n", "lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=np.random.seed(0))\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", - "text_t2b = Pipeline((('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)))\n", + "text_t2b = Pipeline([('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)])\n", "text_t2b.fit(data.data, data.target)\n", "score = text_t2b.score(data.data, data.target)\n", "\n", @@ -858,7 +872,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "metadata": { "collapsed": true }, @@ 
-876,7 +890,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -915,7 +929,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 27, "metadata": { "collapsed": true }, @@ -933,7 +947,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -947,7 +961,7 @@ "source": [ "model = HdpTransformer(id2word=id2word)\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", - "text_hdp = Pipeline((('features', model,), ('classifier', clf)))\n", + "text_hdp = Pipeline([('features', model,), ('classifier', clf)])\n", "text_hdp.fit(corpus, data.target)\n", "score = text_hdp.score(corpus, data.target)\n", "\n",