diff --git a/docs/notebooks/sklearn_api.ipynb b/docs/notebooks/sklearn_api.ipynb index 3273b70728..b0e036747c 100644 --- a/docs/notebooks/sklearn_api.ipynb +++ b/docs/notebooks/sklearn_api.ipynb @@ -19,13 +19,25 @@ "metadata": {}, "source": [ "The wrappers available (as of now) are :\n", - "* LdaModel (```gensim.sklearn_api.ldamodel.LdaTransformer```),which implements gensim's ```LDA Model``` in a scikit-learn interface\n", + "* LdaModel (```gensim.sklearn_api.ldamodel.LdaTransformer```), which implements gensim's ```LDA Model``` in a scikit-learn interface\n", "\n", - "* LsiModel (```gensim.sklearn_api.lsimodel.LsiTransformer```),which implements gensim's ```LSI Model``` in a scikit-learn interface\n", + "* LsiModel (```gensim.sklearn_api.lsimodel.LsiTransformer```), which implements gensim's ```LSI Model``` in a scikit-learn interface\n", "\n", - "* RpModel (```gensim.sklearn_api.rpmodel.RpTransformer```),which implements gensim's ```Random Projections Model``` in a scikit-learn interface\n", + "* RpModel (```gensim.sklearn_api.rpmodel.RpTransformer```), which implements gensim's ```Random Projections Model``` in a scikit-learn interface\n", "\n", - "* LDASeq Model (```gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer```),which implements gensim's ```LdaSeqModel``` in a scikit-learn interface" + "* LDASeq Model (```gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer```), which implements gensim's ```LdaSeqModel``` in a scikit-learn interface\n", + "\n", + "* Word2Vec Model (```gensim.sklearn_api.w2vmodel.W2VTransformer```), which implements gensim's ```Word2Vec``` in a scikit-learn interface\n", + "\n", + "* AuthorTopic Model (```gensim.sklearn_api.atmodel.AuthorTopicTransformer```), which implements gensim's ```AuthorTopicModel``` in a scikit-learn interface\n", + "\n", + "* Doc2Vec Model (```gensim.sklearn_api.d2vmodel.D2VTransformer```), which implements gensim's ```Doc2Vec``` in a scikit-learn interface\n", + "\n", + "* Text2Bow Model (```gensim.sklearn_api.text2bow.Text2BowTransformer```), which implements gensim's ```Dictionary``` in a scikit-learn interface\n", + "\n", + "* TfIdf Model (```gensim.sklearn_api.tfidf.TfIdfTransformer```), which implements gensim's ```TfidfModel``` in a scikit-learn interface\n", + "\n", + "* HDP Model (```gensim.sklearn_api.hdp.HdpTransformer```), which implements gensim's ```HdpModel``` in a scikit-learn interface" ] }, { @@ -145,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "collapsed": true }, @@ -160,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "collapsed": true }, @@ -180,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "collapsed": true }, @@ -200,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "collapsed": true }, @@ -314,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "metadata": { "collapsed": true }, @@ -333,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": { "collapsed": true }, @@ -345,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -364,9 +376,10 @@ "source": [ "model = LdaTransformer(num_topics=15, id2word=id2word, iterations=10, random_state=37)\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", - "pipe = 
Pipeline((('features', model,), ('classifier', clf)))\n", + "pipe = Pipeline([('features', model,), ('classifier', clf)])\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", + "\n", "print(pipe.score(corpus, data.target))" ] }, @@ -386,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": { "collapsed": true }, @@ -404,18 +417,18 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[ 0.13653967 -0.00378269 0.02652037 0.08496786 -0.02401959 -0.60089273\n", - " -1.0708177 -0.03932274 -0.43813039 -0.54848409 -0.20147759 0.21781259\n", - " 1.30378972 -0.08678691 -0.17529122]\n", - "Positive features: internet...:1.30 trawling:0.22 Fame.:0.14 Keach:0.08 Fame,:0.03\n", - "Negative features: 01101001B:-1.07 comp.org.eff.talk.:-0.60 red@redpoll.neoucom.edu:-0.55 circuitry:-0.44 hanging:-0.20 >Pat:-0.18 dome.:-0.09 *best*:-0.04 comp.org.eff.talk,:-0.02 considered,:-0.00\n", + "[ 0.13655775 0.00381287 0.02643593 -0.08499907 -0.02387209 0.6004697\n", + " 1.07090198 0.03926809 0.43769831 0.54886088 -0.20186911 -0.21785685\n", + " 1.30488175 0.08663351 0.17558704]\n", + "Positive features: internet...:1.30 01101001B:1.07 comp.org.eff.talk.:0.60 red@redpoll.neoucom.edu:0.55 circuitry:0.44 >Pat:0.18 Fame.:0.14 dome.:0.09 *best*:0.04 Fame,:0.03\n", + "Negative features: trawling:-0.22 hanging:-0.20 Keach:-0.08 comp.org.eff.talk,:-0.02\n", "0.865771812081\n" ] } @@ -423,9 +436,10 @@ "source": [ "model = LsiTransformer(num_topics=15, id2word=id2word)\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", - "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", + "pipe = Pipeline([('features', model,), ('classifier', clf)])\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", + "\n", "print(pipe.score(corpus, data.target))" ] }, @@ -445,7 +459,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": { "collapsed": true }, @@ -463,17 +477,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[-0.03186506 -0.00872616]\n", - "Positive features: \n", - "Negative features: Fame.:-0.03 considered,:-0.01\n", - "0.621644295302\n" + "[-0.01217523 0.0109422 ]\n", + "Positive features: considered,:0.01\n", + "Negative features: Fame.:-0.01\n", + "0.604865771812\n" ] } ], @@ -481,9 +495,10 @@ "model = RpTransformer(num_topics=2)\n", "np.random.mtrand.RandomState(1) # set seed for getting same result\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", - "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", + "pipe = Pipeline([('features', model,), ('classifier', clf)])\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", + "\n", "print(pipe.score(corpus, data.target))" ] }, @@ -503,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "metadata": { "collapsed": true }, @@ -521,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -546,15 +561,411 @@ "source": [ "test_data = data.data[0:2]\n", "test_target = data.target[0:2]\n", - "id2word = Dictionary(map(lambda x: x.split(), test_data))\n", - "corpus = 
[id2word.doc2bow(i.split()) for i in test_data]\n", + "id2word_ldaseq = Dictionary(map(lambda x: x.split(), test_data))\n", + "corpus_ldaseq = [id2word_ldaseq.doc2bow(i.split()) for i in test_data]\n", "\n", - "model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')\n", + "model = LdaSeqTransformer(id2word=id2word_ldaseq, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", - "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", - "pipe.fit(corpus, test_target)\n", - "print_features_pipe(pipe, id2word.values())\n", - "print(pipe.score(corpus, test_target))" + "pipe = Pipeline([('features', model,), ('classifier', clf)])\n", + "pipe.fit(corpus_ldaseq, test_target)\n", + "print_features_pipe(pipe, id2word_ldaseq.values())\n", + "\n", + "print(pipe.score(corpus_ldaseq, test_target))" ] }, { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### Word2Vec Model" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the Word2Vec model, begin by importing the Word2Vec wrapper" ] }, { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import W2VTransformer" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" ] }, { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7\n" + ] + } + ], + "source": [ + "w2v_texts = [\n", + " ['calculus', 'is', 'the', 'mathematical', 'study', 'of', 'continuous', 'change'],\n", + " ['geometry', 'is', 'the', 'study', 'of', 'shape'],\n", + " ['algebra', 'is', 'the', 'study', 'of', 'generalizations', 'of', 'arithmetic', 'operations'],\n", + " ['differential', 'calculus', 'is', 'related', 'to', 'rates', 'of', 'change', 'and', 'slopes', 'of', 'curves'],\n", + " ['integral', 'calculus', 'is', 'realted', 'to', 'accumulation', 'of', 'quantities', 'and', 'the', 'areas', 'under', 'and', 'between', 'curves'],\n", + " ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter', 'and', 'its', 'motion', 'and', 'behavior', 'through', 'space', 'and', 'time'],\n", + " ['the', 'main', 'goal', 'of', 'physics', 'is', 'to', 'understand', 'how', 'the', 'universe', 'behaves'],\n", + " ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new', 'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'],\n", + " ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', 'transformed', 'modern', 'day', 'society']\n", + "]\n", + "\n", + "model = W2VTransformer(size=10, min_count=1)\n", + "model.fit(w2v_texts)\n", + "\n", + "class_dict = {'mathematics': 1, 'physics': 0}\n", + "train_data = [\n", + " ('calculus', 'mathematics'), ('mathematical', 'mathematics'), ('geometry', 'mathematics'), ('operations', 'mathematics'), ('curves', 'mathematics'),\n", + " ('natural', 'physics'), ('nuclear', 'physics'), ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics')\n", + "]\n", + "\n", + "train_input = list(map(lambda x: x[0], train_data))\n", + "train_target = list(map(lambda x: class_dict[x[1]], 
train_data))\n", + "\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "clf.fit(model.transform(train_input), train_target)\n", + "text_w2v = Pipeline([('features', model,), ('classifier', clf)])\n", + "score = text_w2v.score(train_input, train_target)\n", + "\n", + "print(score)" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### AuthorTopic Model" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the AuthorTopic model, begin by importing the AuthorTopic wrapper" ] }, { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import AuthorTopicTransformer" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" ] }, { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 0 0]\n" + ] + } + ], + "source": [ + "from sklearn import cluster\n", + "\n", + "atm_texts = [\n", + " ['complier', 'system', 'computer'],\n", + " ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],\n", + " ['graph', 'flow', 'network', 'graph'],\n", + " ['loading', 'computer', 'system'],\n", + " ['user', 'server', 'system'],\n", + " ['tree', 'hamiltonian'],\n", + " ['graph', 'trees'],\n", + " ['computer', 'kernel', 'malfunction', 'computer'],\n", + " ['server', 'system', 'computer'],\n", + "]\n", + "atm_dictionary = Dictionary(atm_texts)\n", + "atm_corpus = [atm_dictionary.doc2bow(text) for text in atm_texts]\n", + "author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7]}\n", + "\n", + "model = AuthorTopicTransformer(id2word=atm_dictionary, author2doc=author2doc, num_topics=10, passes=100)\n", + "model.fit(atm_corpus)\n", + "\n", + "# create and train clustering model\n", + "clstr = cluster.MiniBatchKMeans(n_clusters=2)\n", + "authors_full = ['john', 'jane', 'jack', 'jill']\n", + "clstr.fit(model.transform(authors_full))\n", + "\n", + "# stack together the two models in a pipeline\n", + "text_atm = Pipeline([('features', model,), ('cluster', clstr)])\n", + "author_list = ['jane', 'jack', 'jill']\n", + "ret_val = text_atm.predict(author_list)\n", + "\n", + "print(ret_val)" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Doc2Vec Model" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the Doc2Vec model, begin by importing the Doc2Vec wrapper" ] }, { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import D2VTransformer" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" ] }, { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0\n" + ] + } + ], + "source": [ + "from gensim.models import doc2vec\n", + "d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)]\n", + "\n", + "model = D2VTransformer(min_count=1)\n", + "model.fit(d2v_sentences)\n", + "\n", + "class_dict = {'mathematics': 1, 'physics': 0}\n", + "train_data = [\n", + " (['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'),\n", + " (['natural', 'nuclear'], 'physics'), (['science', 
'electromagnetism', 'natural'], 'physics')\n", + "]\n", + "train_input = list(map(lambda x: x[0], train_data))\n", + "train_target = list(map(lambda x: class_dict[x[1]], train_data))\n", + "\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "clf.fit(model.transform(train_input), train_target)\n", + "text_d2v = Pipeline([('features', model,), ('classifier', clf)])\n", + "score = text_d2v.score(train_input, train_target)\n", + "\n", + "print(score)" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Text2Bow Model" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the Text2Bow model, begin by importing the Text2Bow wrapper" ] }, { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import Text2BowTransformer" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" ] }, { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.947147651007\n" + ] + } + ], + "source": [ + "text2bow_model = Text2BowTransformer()\n", + "lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=np.random.seed(0))\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "text_t2b = Pipeline([('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)])\n", + "text_t2b.fit(data.data, data.target)\n", + "score = text_t2b.score(data.data, data.target)\n", + "\n", + "print(score)" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TfIdf Model" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the TfIdf model, begin by importing the TfIdf wrapper" ] }, { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import TfIdfTransformer" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" ] }, { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.578859060403\n" + ] + } + ], + "source": [ + "tfidf_model = TfIdfTransformer()\n", + "tfidf_model.fit(corpus)\n", + "lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=np.random.seed(0))\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "text_tfidf = Pipeline([('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf)])\n", + "text_tfidf.fit(corpus, data.target)\n", + "score = text_tfidf.score(corpus, data.target)\n", + "\n", + "print(score)" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### HDP Model" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the HDP model, begin by importing the HDP wrapper" ] }, { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_api import HdpTransformer" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" ] }, { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.848154362416\n" + ] + } + ], 
+ "source": [ + "model = HdpTransformer(id2word=id2word)\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1)\n", + "text_hdp = Pipeline([('features', model,), ('classifier', clf)])\n", + "text_hdp.fit(corpus, data.target)\n", + "score = text_hdp.score(corpus, data.target)\n", + "\n", + "print(score)" ] }, { diff --git a/gensim/sklearn_api/__init__.py b/gensim/sklearn_api/__init__.py index eb3e652fed..570fc7e875 100644 --- a/gensim/sklearn_api/__init__.py +++ b/gensim/sklearn_api/__init__.py @@ -17,3 +17,8 @@ from .ldaseqmodel import LdaSeqTransformer # noqa: F401 from .w2vmodel import W2VTransformer # noqa: F401 from .atmodel import AuthorTopicTransformer # noqa: F401 +from .d2vmodel import D2VTransformer # noqa: F401 +from .text2bow import Text2BowTransformer # noqa: F401 +from .tfidf import TfIdfTransformer # noqa: F401 +from .hdp import HdpTransformer # noqa: F401 +from .phrases import PhrasesTransformer # noqa: F401 diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py new file mode 100644 index 0000000000..05d496d9b1 --- /dev/null +++ b/gensim/sklearn_api/d2vmodel.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +import numpy as np +from six import string_types +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +from gensim import models + + +class D2VTransformer(TransformerMixin, BaseEstimator): + """ + Base Doc2Vec module + """ + + def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, + dm_tag_count=1, docvecs=None, docvecs_mapfile=None, + comment=None, trim_rule=None, size=100, alpha=0.025, + window=5, min_count=5, max_vocab_size=None, sample=1e-3, + seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, + cbow_mean=1, hashfxn=hash, iter=5, sorted_vocab=1, + batch_words=10000): + """ + Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. + """ + self.gensim_model = None + self.dm_mean = dm_mean + self.dm = dm + self.dbow_words = dbow_words + self.dm_concat = dm_concat + self.dm_tag_count = dm_tag_count + self.docvecs = docvecs + self.docvecs_mapfile = docvecs_mapfile + self.comment = comment + self.trim_rule = trim_rule + + # attributes associated with gensim.models.Word2Vec + self.size = size + self.alpha = alpha + self.window = window + self.min_count = min_count + self.max_vocab_size = max_vocab_size + self.sample = sample + self.seed = seed + self.workers = workers + self.min_alpha = min_alpha + self.hs = hs + self.negative = negative + self.cbow_mean = int(cbow_mean) + self.hashfxn = hashfxn + self.iter = iter + self.sorted_vocab = sorted_vocab + self.batch_words = batch_words + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. 
+ Calls gensim.models.Doc2Vec + """ + self.gensim_model = models.Doc2Vec(documents=X, dm_mean=self.dm_mean, dm=self.dm, + dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, + docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, + trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window, + min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, + seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, + negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, + iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words) + return self + + def transform(self, docs): + """ + Return the vector representations for the input documents. + The input `docs` should be a list of lists like: [ ['calculus', 'mathematical'], ['geometry', 'operations', 'curves'] ] + or a single document like: ['calculus', 'mathematical'] + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + + # accept a single document (list of words) as well as a list of documents + check = lambda x: [x] if isinstance(x[0], string_types) else x + docs = check(docs) + X = [[] for _ in range(0, len(docs))] + + for k, v in enumerate(docs): + doc_vec = self.gensim_model.infer_vector(v) + X[k] = doc_vec + + return np.reshape(np.array(X), (len(docs), self.gensim_model.vector_size)) diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py new file mode 100644 index 0000000000..92265a5e8f --- /dev/null +++ b/gensim/sklearn_api/hdp.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +import numpy as np +from scipy import sparse +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +from gensim import models +from gensim import matutils + + +class HdpTransformer(TransformerMixin, BaseEstimator): + """ + Base HDP module + """ + + def __init__(self, id2word, max_chunks=None, max_time=None, + chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, + gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, + outputdir=None, random_state=None): + """ + Sklearn api for HDP model. See gensim.models.HdpModel for parameter details. + """ + self.gensim_model = None + self.id2word = id2word + self.max_chunks = max_chunks + self.max_time = max_time + self.chunksize = chunksize + self.kappa = kappa + self.tau = tau + self.K = K + self.T = T + self.alpha = alpha + self.gamma = gamma + self.eta = eta + self.scale = scale + self.var_converge = var_converge + self.outputdir = outputdir + self.random_state = random_state + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. 
+ Calls gensim.models.HdpModel + """ + if sparse.issparse(X): + corpus = matutils.Sparse2Corpus(X) + else: + corpus = X + + self.gensim_model = models.HdpModel(corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks, + max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, + K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, + var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state) + return self + + def transform(self, docs): + """ + Takes a list of documents in BOW format as input ('docs'). + Returns a matrix of topic distributions for the given documents, where entry (i, j) + is the probability of topic j in document i. + The input `docs` can be a list of documents like: [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ] + or a single document like: [(4, 1), (7, 1)] + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + + # accept a single document as well as a list of documents + check = lambda x: [x] if isinstance(x[0], tuple) else x + docs = check(docs) + X = [[] for _ in range(0, len(docs))] + + max_num_topics = 0 + for k, v in enumerate(docs): + X[k] = self.gensim_model[v] + max_num_topics = max(max_num_topics, max(list(map(lambda x: x[0], X[k]))) + 1) + + for k, v in enumerate(X): + # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + dense_vec = matutils.sparse2full(v, max_num_topics) + X[k] = dense_vec + + return np.reshape(np.array(X), (len(docs), max_num_topics)) + + def partial_fit(self, X): + """ + Train model over X. + """ + if sparse.issparse(X): + X = matutils.Sparse2Corpus(X) + + if self.gensim_model is None: + self.gensim_model = models.HdpModel(id2word=self.id2word, max_chunks=self.max_chunks, + max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, + K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, + var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state) + + self.gensim_model.update(corpus=X) + return self diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py new file mode 100644 index 0000000000..8a944f0235 --- /dev/null +++ b/gensim/sklearn_api/phrases.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +from six import string_types +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +from gensim import models + + +class PhrasesTransformer(TransformerMixin, BaseEstimator): + """ + Base Phrases module + """ + + def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, + delimiter=b'_', progress_per=10000): + """ + Sklearn wrapper for Phrases model. + """ + self.gensim_model = None + self.min_count = min_count + self.threshold = threshold + self.max_vocab_size = max_vocab_size + self.delimiter = delimiter + self.progress_per = progress_per + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. 
+ """ + self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per) + return self + + def transform(self, docs): + """ + Return the input documents to return phrase tokens. + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + + # input as python lists + check = lambda x: [x] if isinstance(x[0], string_types) else x + docs = check(docs) + X = [[] for _ in range(0, len(docs))] + + for k, v in enumerate(docs): + phrase_tokens = self.gensim_model[v] + X[k] = phrase_tokens + + return X + + def partial_fit(self, X): + if self.gensim_model is None: + self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per) + + self.gensim_model.add_vocab(X) + return self diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py new file mode 100644 index 0000000000..e5a96e6551 --- /dev/null +++ b/gensim/sklearn_api/text2bow.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +from six import string_types +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +from gensim.corpora import Dictionary +from gensim.utils import tokenize + + +class Text2BowTransformer(TransformerMixin, BaseEstimator): + """ + Base Text2Bow module + """ + + def __init__(self, prune_at=2000000, tokenizer=tokenize): + """ + Sklearn wrapper for Text2Bow model. + """ + self.gensim_model = None + self.prune_at = prune_at + self.tokenizer = tokenizer + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. + """ + tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X)) + self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at) + return self + + def transform(self, docs): + """ + Return the BOW format for the input documents. + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. 
Call 'fit' with appropriate arguments before using this method.") + + # input as python lists + check = lambda x: [x] if isinstance(x, string_types) else x + docs = check(docs) + tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), docs)) + X = [[] for _ in range(0, len(tokenized_docs))] + + for k, v in enumerate(tokenized_docs): + bow_val = self.gensim_model.doc2bow(v) + X[k] = bow_val + + return X + + def partial_fit(self, X): + if self.gensim_model is None: + self.gensim_model = Dictionary(prune_at=self.prune_at) + + tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X)) + self.gensim_model.add_documents(tokenized_docs) + return self diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py new file mode 100644 index 0000000000..ca34af6b40 --- /dev/null +++ b/gensim/sklearn_api/tfidf.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Scikit learn interface for gensim for easy use of gensim with scikit-learn +Follows scikit-learn API conventions +""" + +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.exceptions import NotFittedError + +import gensim +from gensim.models import TfidfModel + + +class TfIdfTransformer(TransformerMixin, BaseEstimator): + """ + Base Tf-Idf module + """ + + def __init__(self, corpus=None, id2word=None, dictionary=None, + wlocal=gensim.utils.identity, wglobal=gensim.models.tfidfmodel.df2idf, normalize=True): + """ + Sklearn wrapper for Tf-Idf model. + """ + self.gensim_model = None + self.id2word = id2word + self.dictionary = dictionary + self.wlocal = wlocal + self.wglobal = wglobal + self.normalize = normalize + + def fit(self, X, y=None): + """ + Fit the model according to the given training data. + """ + self.gensim_model = TfidfModel(corpus=X, id2word=self.id2word, dictionary=self.dictionary, + wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize) + return self + + def transform(self, docs): + """ + Return the documents with their term weights transformed by the fitted tf-idf model. + """ + if self.gensim_model is None: + raise NotFittedError("This model has not been fitted yet. 
Call 'fit' with appropriate arguments before using this method.") + + # input as python lists + check = lambda x: [x] if isinstance(x[0], tuple) else x + docs = check(docs) + X = [[] for _ in range(0, len(docs))] + + for k, v in enumerate(docs): + transformed_doc = self.gensim_model[v] + X[k] = transformed_doc + + return X diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index a4529c3346..d990cc403e 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -18,9 +18,13 @@ from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer from gensim.sklearn_api.w2vmodel import W2VTransformer from gensim.sklearn_api.atmodel import AuthorTopicTransformer +from gensim.sklearn_api.d2vmodel import D2VTransformer +from gensim.sklearn_api.text2bow import Text2BowTransformer +from gensim.sklearn_api.tfidf import TfIdfTransformer +from gensim.sklearn_api.hdp import HdpTransformer +from gensim.sklearn_api.phrases import PhrasesTransformer from gensim.corpora import mmcorpus, Dictionary -from gensim import models -from gensim import matutils +from gensim import matutils, models module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -94,6 +98,33 @@ ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', 'transformed', 'modern', 'day', 'society'] ] +d2v_sentences = [models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)] + +dict_texts = [ + 'human interface computer', + 'survey user computer system response time', + 'eps user interface system', + 'system human system eps', + 'user response time', + 'trees', + 'graph trees', + 'graph minors trees', + 'graph minors survey' +] + +phrases_sentences = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'], + ['graph', 'minors', 'survey', 'human', 'interface'] +] + class TestLdaWrapper(unittest.TestCase): def setUp(self): @@ -163,7 +194,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in data.data] numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lda = Pipeline((('features', model,), ('classifier', clf))) + text_lda = Pipeline([('features', model,), ('classifier', clf)]) text_lda.fit(corpus, data.target) score = text_lda.score(corpus, data.target) self.assertGreater(score, 0.40) @@ -253,7 +284,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in data.data] numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lsi = Pipeline((('features', model,), ('classifier', clf))) + text_lsi = Pipeline([('features', model,), ('classifier', clf)]) text_lsi.fit(corpus, data.target) score = text_lsi.score(corpus, data.target) self.assertGreater(score, 0.50) @@ -336,7 +367,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in test_data] model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim') clf = 
linear_model.LogisticRegression(penalty='l2', C=0.1) - text_ldaseq = Pipeline((('features', model,), ('classifier', clf))) + text_ldaseq = Pipeline([('features', model,), ('classifier', clf)]) text_ldaseq.fit(corpus, test_target) score = text_ldaseq.score(corpus, test_target) self.assertGreater(score, 0.50) @@ -406,7 +437,7 @@ def testPipeline(self): corpus = [id2word.doc2bow(i.split()) for i in data.data] numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_rp = Pipeline((('features', model,), ('classifier', clf))) + text_rp = Pipeline([('features', model,), ('classifier', clf)]) text_rp.fit(corpus, data.target) score = text_rp.score(corpus, data.target) self.assertGreater(score, 0.40) @@ -491,7 +522,7 @@ def testPipeline(self): clf = linear_model.LogisticRegression(penalty='l2', C=0.1) clf.fit(model.transform(train_input), train_target) - text_w2v = Pipeline((('features', model,), ('classifier', clf))) + text_w2v = Pipeline([('features', model,), ('classifier', clf)]) score = text_w2v.score(train_input, train_target) self.assertGreater(score, 0.40) @@ -562,7 +593,7 @@ def testPipeline(self): clstr.fit(model.transform(authors_full)) # stack together the two models in a pipeline - text_atm = Pipeline((('features', model,), ('cluster', clstr))) + text_atm = Pipeline([('features', model,), ('cluster', clstr)]) author_list = ['jane', 'jack', 'jill'] ret_val = text_atm.predict(author_list) self.assertEqual(len(ret_val), len(author_list)) @@ -609,5 +640,331 @@ def testModelNotFitted(self): self.assertRaises(NotFittedError, atmodel_wrapper.transform, author_list) +class TestD2VTransformer(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = D2VTransformer(min_count=1) + self.model.fit(d2v_sentences) + + def testTransform(self): + # transform multiple documents + docs = [] + docs.append(w2v_texts[0]) + docs.append(w2v_texts[1]) + docs.append(w2v_texts[2]) + matrix = self.model.transform(docs) + self.assertEqual(matrix.shape[0], 3) + self.assertEqual(matrix.shape[1], self.model.size) + + # transform one document + doc = w2v_texts[0] + matrix = self.model.transform(doc) + self.assertEqual(matrix.shape[0], 1) + self.assertEqual(matrix.shape[1], self.model.size) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(negative=20) + model_params = self.model.get_params() + self.assertEqual(model_params["negative"], 20) + + # verify that the attributes values are also changed for `gensim_model` after fitting + self.model.fit(d2v_sentences) + self.assertEqual(getattr(self.model.gensim_model, 'negative'), 20) + + def testPipeline(self): + numpy.random.seed(0) # set fixed seed to get similar values every time + model = D2VTransformer(min_count=1) + model.fit(d2v_sentences) + + class_dict = {'mathematics': 1, 'physics': 0} + train_data = [ + (['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'), + (['natural', 'nuclear'], 'physics'), (['science', 'electromagnetism', 'natural'], 'physics') + ] + train_input = list(map(lambda x: x[0], train_data)) + train_target = list(map(lambda x: class_dict[x[1]], train_data)) + + clf = linear_model.LogisticRegression(penalty='l2', C=0.1) + clf.fit(model.transform(train_input), train_target) + text_w2v = Pipeline([('features', model,), ('classifier', clf)]) + score = text_w2v.score(train_input, train_target) + self.assertGreater(score, 0.40) + + def testPersistence(self): + model_dump = 
pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = w2v_texts[0] + loaded_transformed_vecs = model_load.transform(doc) + + # sanity check for transformation operation + self.assertEqual(loaded_transformed_vecs.shape[0], 1) + self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size) + + # comparing the original and loaded models + original_transformed_vecs = self.model.transform(doc) + passed = numpy.allclose(sorted(loaded_transformed_vecs), sorted(original_transformed_vecs), atol=1e-1) + self.assertTrue(passed) + + def testConsistencyWithGensimModel(self): + # training a D2VTransformer + self.model = D2VTransformer(min_count=1) + self.model.fit(d2v_sentences) + + # training a Gensim Doc2Vec model with the same params + gensim_d2vmodel = models.Doc2Vec(d2v_sentences, min_count=1) + + doc = w2v_texts[0] + vec_transformer_api = self.model.transform(doc) # vector returned by D2VTransformer + vec_gensim_model = gensim_d2vmodel[doc] # vector returned by Doc2Vec + passed = numpy.allclose(vec_transformer_api, vec_gensim_model, atol=1e-1) + self.assertTrue(passed) + + def testModelNotFitted(self): + d2vmodel_wrapper = D2VTransformer(min_count=1) + self.assertRaises(NotFittedError, d2vmodel_wrapper.transform, 1) + + +class TestText2BowTransformer(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = Text2BowTransformer() + self.model.fit(dict_texts) + + def testTransform(self): + # transform one document + doc = ['computer system interface time computer system'] + bow_vec = self.model.transform(doc)[0] + expected_values = [1, 1, 2, 2] # comparing only the word-counts + values = list(map(lambda x: x[1], bow_vec)) + self.assertEqual(sorted(expected_values), sorted(values)) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(prune_at=1000000) + model_params = self.model.get_params() + self.assertEqual(model_params["prune_at"], 1000000) + + def testPipeline(self): + with open(datapath('mini_newsgroup'), 'rb') as f: + compressed_content = f.read() + uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') + cache = pickle.loads(uncompressed_content) + data = cache + text2bow_model = Text2BowTransformer() + lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) + numpy.random.mtrand.RandomState(1) # set seed for getting same result + clf = linear_model.LogisticRegression(penalty='l2', C=0.1) + text_lda = Pipeline([('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)]) + text_lda.fit(data.data, data.target) + score = text_lda.score(data.data, data.target) + self.assertGreater(score, 0.40) + + def testPersistence(self): + model_dump = pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = dict_texts[0] + loaded_transformed_vecs = model_load.transform(doc) + + # comparing the original and loaded models + original_transformed_vecs = self.model.transform(doc) + self.assertEqual(original_transformed_vecs, loaded_transformed_vecs) + + def testModelNotFitted(self): + text2bow_wrapper = Text2BowTransformer() + self.assertRaises(NotFittedError, text2bow_wrapper.transform, dict_texts[0]) + + +class TestTfIdfTransformer(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = TfIdfTransformer(normalize=True) + self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.model.fit(self.corpus) + + def testTransform(self): + # transform one document + doc = corpus[0] + transformed_doc = 
self.model.transform(doc) + expected_doc = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]] + self.assertTrue(numpy.allclose(transformed_doc, expected_doc)) + + # transform multiple documents + docs = [corpus[0], corpus[1]] + transformed_docs = self.model.transform(docs) + expected_docs = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], + [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(normalize=False) + model_params = self.model.get_params() + self.assertEqual(model_params["normalize"], False) + + # verify that the attributes values are also changed for `gensim_model` after fitting + self.model.fit(self.corpus) + self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False) + + def testPipeline(self): + with open(datapath('mini_newsgroup'), 'rb') as f: + compressed_content = f.read() + uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') + cache = pickle.loads(uncompressed_content) + data = cache + id2word = Dictionary(map(lambda x: x.split(), data.data)) + corpus = [id2word.doc2bow(i.split()) for i in data.data] + tfidf_model = TfIdfTransformer() + tfidf_model.fit(corpus) + lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) + numpy.random.mtrand.RandomState(1) # set seed for getting same result + clf = linear_model.LogisticRegression(penalty='l2', C=0.1) + text_tfidf = Pipeline([('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf)]) + text_tfidf.fit(corpus, data.target) + score = text_tfidf.score(corpus, data.target) + self.assertGreater(score, 0.40) + + def testPersistence(self): + model_dump = pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = corpus[0] + loaded_transformed_doc = model_load.transform(doc) + + # comparing the original and loaded models + original_transformed_doc = self.model.transform(doc) + self.assertEqual(original_transformed_doc, loaded_transformed_doc) + + def testModelNotFitted(self): + tfidf_wrapper = TfIdfTransformer() + self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0]) + + +class TestHdpTransformer(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = HdpTransformer(id2word=dictionary) + self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.model.fit(self.corpus) + + def testTransform(self): + # transform one document + doc = self.corpus[0] + transformed_doc = self.model.transform(doc) + expected_doc = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148]] + self.assertTrue(numpy.allclose(transformed_doc, expected_doc)) + + # transform multiple documents + docs = [self.corpus[0], self.corpus[1]] + transformed_docs = self.model.transform(docs) + expected_docs = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148], + [0.0368655605, 0.709055041, 0.194436428, 0.0151706795, 0.0113863652, 1.00000000e-12, 1.00000000e-12]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + 
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + def testPartialFit(self): + for i in range(10): + self.model.partial_fit(X=self.corpus) # fit the model again, incrementally + doc = list(self.corpus)[0] # transform only the first document + transformed = self.model.transform(doc) + expected = numpy.array([0.76777752, 0.01757334, 0.01600339, 0.01374061, 0.01275931, 0.01126313, 0.01058131, 0.01167185]) + passed = numpy.allclose(sorted(transformed[0]), sorted(expected), atol=1e-1) + self.assertTrue(passed) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(var_converge=0.05) + model_params = self.model.get_params() + self.assertEqual(model_params["var_converge"], 0.05) + + # verify that the attributes values are also changed for `gensim_model` after fitting + self.model.fit(self.corpus) + self.assertEqual(getattr(self.model.gensim_model, 'm_var_converge'), 0.05) + + def testPipeline(self): + with open(datapath('mini_newsgroup'), 'rb') as f: + compressed_content = f.read() + uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') + cache = pickle.loads(uncompressed_content) + data = cache + id2word = Dictionary(map(lambda x: x.split(), data.data)) + corpus = [id2word.doc2bow(i.split()) for i in data.data] + model = HdpTransformer(id2word=id2word) + clf = linear_model.LogisticRegression(penalty='l2', C=0.1) + text_lda = Pipeline([('features', model,), ('classifier', clf)]) + text_lda.fit(corpus, data.target) + score = text_lda.score(corpus, data.target) + self.assertGreater(score, 0.40) + + def testPersistence(self): + model_dump = pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = corpus[0] + loaded_transformed_doc = model_load.transform(doc) + + # comparing the original and loaded models + original_transformed_doc = self.model.transform(doc) + self.assertTrue(numpy.allclose(original_transformed_doc, loaded_transformed_doc)) + + def testModelNotFitted(self): + hdp_wrapper = HdpTransformer(id2word=dictionary) + self.assertRaises(NotFittedError, hdp_wrapper.transform, corpus[0]) + + +class TestPhrasesTransformer(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = PhrasesTransformer(min_count=1, threshold=1) + self.model.fit(phrases_sentences) + + def testTransform(self): + # transform one document + doc = phrases_sentences[-1] + phrase_tokens = self.model.transform(doc)[0] + expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface'] + self.assertEqual(phrase_tokens, expected_phrase_tokens) + + def testPartialFit(self): + new_sentences = [ + ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'], + ['world', 'peace', 'people'], + ['world', 'peace', 'humans'] + ] + self.model.partial_fit(X=new_sentences) # train model with new sentences + + doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace'] + phrase_tokens = self.model.transform(doc)[0] + expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface', u'world_peace'] + self.assertEqual(phrase_tokens, expected_phrase_tokens) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(progress_per=5000) + model_params = self.model.get_params() + self.assertEqual(model_params["progress_per"], 5000) + + # verify that the attributes values are also changed for `gensim_model` after fitting + self.model.fit(phrases_sentences) + self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000) + + def testPersistence(self): + model_dump = 
pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = phrases_sentences[-1] + loaded_phrase_tokens = model_load.transform(doc) + + # comparing the original and loaded models + original_phrase_tokens = self.model.transform(doc) + self.assertEqual(original_phrase_tokens, loaded_phrase_tokens) + + def testModelNotFitted(self): + phrases_transformer = PhrasesTransformer() + self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) + if __name__ == '__main__': unittest.main() diff --git a/setup.py b/setup.py index 912fb1075a..6097e34c5a 100644 --- a/setup.py +++ b/setup.py @@ -229,7 +229,7 @@ def finalize_options(self): test_env = [ 'testfixtures', 'Morfessor == 2.0.2a4', - 'scikit-learn == 0.18.2', + 'scikit-learn', 'pyemd', 'annoy', 'tensorflow >= 1.1.0',
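Usage notes on the new wrappers follow; the sketches below are illustrative only, with made-up corpora and parameter values, and are not part of the diff. First, the recurring `Pipeline((...))` → `Pipeline([...])` change: scikit-learn documents `steps` as a *list* of `(name, estimator)` tuples, and versions newer than the previously pinned `scikit-learn == 0.18.2` are stricter about this, which is also why `setup.py` drops the exact pin. A minimal sketch of the expected construction (step names arbitrary):

```python
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from gensim.sklearn_api import LdaTransformer

# `steps` should be a list of (name, estimator) tuples; the old
# tuple-of-tuples form is what this diff replaces in notebook and tests.
pipe = Pipeline([
    ('features', LdaTransformer(num_topics=2)),
    ('classifier', linear_model.LogisticRegression(penalty='l2', C=0.1)),
])
```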
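A minimal end-to-end sketch of the new `D2VTransformer` (toy corpus, illustrative values; `fit` expects `TaggedDocument`s, as in the notebook cell):

```python
from gensim.models import doc2vec
from gensim.sklearn_api import D2VTransformer

texts = [
    ['calculus', 'is', 'mathematics'],
    ['physics', 'is', 'natural', 'science'],
]
# Each training document must be tagged, exactly as in the notebook example.
sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(texts)]

model = D2VTransformer(size=10, min_count=1)
model.fit(sentences)

# transform accepts a list of token lists, or a single token list,
# and returns a dense matrix with one row per document.
vecs = model.transform([['calculus'], ['physics', 'science']])
print(vecs.shape)  # (2, 10)
```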
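`Text2BowTransformer` is what lets the Text2Bow notebook pipeline start from raw strings instead of pre-built bag-of-words lists; it tokenizes with `gensim.utils.tokenize` by default and builds a `Dictionary` on `fit`. A short sketch of the contract (toy strings):

```python
from gensim.sklearn_api import Text2BowTransformer

t2b = Text2BowTransformer()
t2b.fit(['human interface computer', 'computer system'])

# transform accepts a list of strings, or a single string, and returns
# gensim-style BoW lists of (token_id, count) tuples.
print(t2b.transform('computer interface computer')[0])
# e.g. [(0, 2), (2, 1)] -- the ids depend on the fitted dictionary
```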
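One asymmetry worth flagging: unlike `LdaTransformer` or `D2VTransformer`, which return dense numpy arrays, `TfIdfTransformer.transform` returns gensim-style sparse BoW lists. That is why both the notebook and the tests chain it into `LdaTransformer` before a classifier rather than feeding it to `LogisticRegression` directly. Sketch (assuming a BoW `corpus` as in the notebook):

```python
from gensim.sklearn_api import TfIdfTransformer

tfidf = TfIdfTransformer()
tfidf.fit(corpus)                      # corpus: a list of BoW documents
weighted = tfidf.transform(corpus[0])  # [[(term_id, weight), ...]] -- not a numpy array
```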
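`HdpTransformer.transform` densifies its output with `matutils.sparse2full`, padding each row to the highest topic id seen *in that batch*, so the number of columns can differ between calls on different inputs. The wrapper also supports incremental training via `partial_fit`. A hedged sketch (assuming `dictionary` and a BoW `corpus` as in the tests):

```python
from gensim.sklearn_api import HdpTransformer

hdp = HdpTransformer(id2word=dictionary)
# partial_fit creates the underlying HdpModel on first call, then updates it.
for chunk in (corpus[:5], corpus[5:]):
    hdp.partial_fit(chunk)

vecs = hdp.transform(corpus[:2])  # dense rows; width = max topic id in this batch + 1
```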
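`PhrasesTransformer` is exported from `gensim.sklearn_api` and covered by the new tests, but has no notebook cell yet; for completeness, a usage sketch (toy sentences; whether a bigram is merged depends on its score against `threshold`, so the shown output is indicative only):

```python
from gensim.sklearn_api import PhrasesTransformer

sentences = [
    ['graph', 'minors', 'survey'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey', 'human', 'interface'],
]
phrases = PhrasesTransformer(min_count=1, threshold=1)
phrases.fit(sentences)

# Sufficiently frequent bigrams are merged with the '_' delimiter.
print(phrases.transform([['graph', 'minors', 'trees']])[0])
# e.g. ['graph_minors', 'trees']
```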
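Finally, because every wrapper subclasses `BaseEstimator` (the behaviour the `testSetGetParams` tests exercise), the standard scikit-learn model-selection machinery should compose with them. An illustrative, untested sketch with a toy grid, reusing `id2word`, `corpus` and `data` from the notebook:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from gensim.sklearn_api import LdaTransformer

pipe = Pipeline([
    ('features', LdaTransformer(id2word=id2word, iterations=10)),
    ('classifier', linear_model.LogisticRegression(penalty='l2', C=0.1)),
])
# get_params()/set_params() from BaseEstimator are what make
# 'features__num_topics' addressable by the grid search.
search = GridSearchCV(pipe, param_grid={'features__num_topics': [5, 10, 15]}, cv=2)
search.fit(corpus, data.target)
print(search.best_params_)
```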