From 3e38e3378443dbd7fd765692b7be6b84b41b9d76 Mon Sep 17 00:00:00 2001 From: robotcator Date: Fri, 7 Jul 2017 13:51:21 +0800 Subject: [PATCH] Fix train error of ConcatenatedDoc2Vec in doc2vec-IMDB (#1377) * fix the compatibility between python2 & 3 * require explicit corpus size, epochs for train() * make all train() calls use explicit count, epochs * add tests to make sure that ValueError is indeed thrown * update test * fix the word2vec's reset_from() * require explicit corpus size, epochs for train() * make all train() calls use explicit count, epochs * fix some error * fix test error * fix the train error of ConcatenatedDoc2Vec * update the ConcatenatedDoc2Vec class * update the parameters * rerun all the cells --- docs/notebooks/doc2vec-IMDB.ipynb | 703 +++++++++++++----------------- gensim/test/test_doc2vec.py | 4 +- 2 files changed, 297 insertions(+), 410 deletions(-) diff --git a/docs/notebooks/doc2vec-IMDB.ipynb b/docs/notebooks/doc2vec-IMDB.ipynb index 18441f9013..9beb99935f 100644 --- a/docs/notebooks/doc2vec-IMDB.ipynb +++ b/docs/notebooks/doc2vec-IMDB.ipynb @@ -2,20 +2,14 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# gensim doc2vec & IMDB sentiment dataset" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "TODO: section on introduction & motivation\n", "\n", @@ -30,38 +24,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Load corpus" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Fetch and prep exactly as in Mikolov's go.sh shell script. (Note this cell tests for existence of required files, so steps won't repeat once the final summary file (`aclImdb/alldata-id.txt`) is available alongside this notebook.)" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 2, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "total running time: 0.0014970000000000816\n" + "total running time: 41.018378\n" ] } ], @@ -153,11 +137,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -167,22 +149,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The data is small enough to be read into memory. " ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -218,20 +193,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Set-up Doc2Vec Training & Evaluation Models" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Approximating experiment of Le & Mikolov [\"Distributed Representations of Sentences and Documents\"](http://cs.stanford.edu/~quocle/paragraph_vector.pdf), also with guidance from Mikolov's [example go.sh](https://groups.google.com/d/msg/word2vec-toolkit/Q49FIrNOQRo/J6KG8mUj45sJ):\n", "\n", @@ -248,20 +217,16 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 4, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec(dm/c,d100,n5,w5,mc2,t4)\n", - "Doc2Vec(dbow,d100,n5,mc2,t4)\n", - "Doc2Vec(dm/m,d100,n5,w10,mc2,t4)\n" + "Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)\n", + "Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)\n", + "Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)\n" ] } ], @@ -295,10 +260,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Following the paper, we also evaluate models in pairs. These wrappers return the concatenation of the vectors from each model. (Only the singular models are trained.)" ] @@ -307,9 +269,7 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -320,33 +280,32 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Predictive Evaluation Methods" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Helper methods for evaluating error rate." ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/lib/python3.4/importlib/_bootstrap.py:321: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", + " return f(*args, **kwds)\n" + ] + } + ], "source": [ "import numpy as np\n", "import statsmodels.api as sm\n", @@ -397,20 +356,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Bulk Training" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Using explicit multiple-pass, alpha-reduction approach as sketched in [gensim doc2vec blog post](http://radimrehurek.com/2014/12/doc2vec-tutorial/) – with added shuffling of corpus on each pass.\n", "\n", @@ -423,11 +376,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -437,159 +388,166 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 8, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "START 2015-06-28 20:34:29.500839\n", - "*0.417080 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 84.5s 1.0s\n", - "*0.363200 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred 84.5s 14.9s\n", - "*0.219520 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.0s 0.6s\n", - "*0.184000 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,t8)_inferred 19.0s 4.6s\n", - "*0.277080 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.0s 0.6s\n", - "*0.230800 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred 35.0s 6.4s\n", - "*0.207840 : 1 passes : dbow+dmm 0.0s 1.5s\n", - "*0.185200 : 1 passes : dbow+dmm_inferred 0.0s 11.2s\n", - "*0.220720 : 1 passes : dbow+dmc 0.0s 1.1s\n", - "*0.189200 : 1 passes : dbow+dmc_inferred 0.0s 19.3s\n", + "START 2017-06-06 15:19:50.208091\n", + "*0.408320 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 131.9s 33.6s\n", + "*0.341600 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)_inferred 131.9s 48.3s\n", + "*0.239960 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 35.3s 45.9s\n", + "*0.193200 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)_inferred 35.3s 48.3s\n", + "*0.268640 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 48.6s 48.5s\n", + "*0.208000 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)_inferred 48.6s 47.4s\n", + "*0.216160 : 1 passes : dbow+dmm 0.0s 168.9s\n", + "*0.176000 : 1 passes : dbow+dmm_inferred 0.0s 176.4s\n", + "*0.237280 : 1 passes : dbow+dmc 0.0s 169.3s\n", + "*0.194400 : 1 passes : dbow+dmc_inferred 0.0s 183.9s\n", "completed pass 1 at alpha 0.025000\n", - "*0.357120 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 73.1s 0.6s\n", - "*0.144360 : 2 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.8s 0.6s\n", - "*0.225640 : 2 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 36.2s 1.0s\n", - "*0.141160 : 2 passes : dbow+dmm 0.0s 1.1s\n", - "*0.144800 : 2 passes : dbow+dmc 0.0s 1.2s\n", + "*0.346760 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 133.4s 42.2s\n", + "*0.145280 : 2 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 29.0s 42.8s\n", + "*0.210920 : 2 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 38.8s 42.2s\n", + "*0.139120 : 2 passes : dbow+dmm 0.0s 173.2s\n", + "*0.147120 : 2 passes : dbow+dmc 0.0s 191.8s\n", "completed pass 2 at alpha 0.023800\n", - "*0.326840 : 3 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 73.6s 0.6s\n", - "*0.125880 : 3 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.1s 0.7s\n", - "*0.202680 : 3 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 36.0s 0.6s\n", - "*0.123280 : 3 passes : dbow+dmm 0.0s 1.6s\n", - "*0.126040 : 3 passes : dbow+dmc 0.0s 1.2s\n", + "*0.314920 : 3 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 112.3s 37.6s\n", + "*0.126720 : 3 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 28.4s 42.6s\n", + "*0.191920 : 3 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 37.9s 42.2s\n", + "*0.121640 : 3 passes : dbow+dmm 0.0s 190.8s\n", + "*0.127040 : 3 passes : dbow+dmc 0.0s 188.1s\n", "completed pass 3 at alpha 0.022600\n", - "*0.302360 : 4 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 72.6s 0.6s\n", - "*0.113640 : 4 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.9s 0.7s\n", - "*0.189880 : 4 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.8s 0.6s\n", - "*0.114200 : 4 passes : dbow+dmm 0.0s 1.2s\n", - "*0.115640 : 4 passes : dbow+dmc 0.0s 1.6s\n", + "*0.282080 : 4 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 104.9s 36.3s\n", + "*0.115520 : 4 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 27.6s 49.9s\n", + "*0.181280 : 4 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 40.7s 42.2s\n", + "*0.114760 : 4 passes : dbow+dmm 0.0s 188.6s\n", + "*0.116040 : 4 passes : dbow+dmc 0.0s 192.5s\n", "completed pass 4 at alpha 0.021400\n", - "*0.281480 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 72.7s 0.7s\n", - "*0.109720 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 21.5s 0.7s\n", - "*0.181360 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 37.8s 0.7s\n", - "*0.109760 : 5 passes : dbow+dmm 0.0s 1.3s\n", - "*0.110400 : 5 passes : dbow+dmc 0.0s 1.6s\n", + "*0.257560 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 102.5s 35.8s\n", + "*0.265200 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)_inferred 102.5s 48.6s\n", + "*0.110880 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 27.0s 46.5s\n", + "*0.117600 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)_inferred 27.0s 50.5s\n", + "*0.171240 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 39.1s 43.7s\n", + "*0.207200 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)_inferred 39.1s 47.5s\n", + "*0.108920 : 5 passes : dbow+dmm 0.0s 203.4s\n", + "*0.114800 : 5 passes : dbow+dmm_inferred 0.0s 213.4s\n", + "*0.111520 : 5 passes : dbow+dmc 0.0s 189.5s\n", + "*0.132000 : 5 passes : dbow+dmc_inferred 0.0s 202.6s\n", "completed pass 5 at alpha 0.020200\n", - "*0.264640 : 6 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 72.0s 0.7s\n", - "*0.292000 : 6 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred 72.0s 13.3s\n", - "*0.107440 : 6 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 21.6s 0.7s\n", - "*0.116000 : 6 passes : Doc2Vec(dbow,d100,n5,mc2,t8)_inferred 21.6s 4.7s\n", - "*0.176040 : 6 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 37.4s 1.1s\n", - "*0.213600 : 6 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred 37.4s 6.4s\n", - "*0.107000 : 6 passes : dbow+dmm 0.0s 1.2s\n", - "*0.108000 : 6 passes : dbow+dmm_inferred 0.0s 11.2s\n", - "*0.107880 : 6 passes : dbow+dmc 0.0s 1.2s\n", - "*0.124400 : 6 passes : dbow+dmc_inferred 0.0s 18.3s\n", + "*0.240440 : 6 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 117.6s 39.2s\n", + "*0.107600 : 6 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 32.3s 52.1s\n", + "*0.166800 : 6 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 46.4s 40.8s\n", + "*0.108160 : 6 passes : dbow+dmm 0.0s 197.8s\n", + "*0.109920 : 6 passes : dbow+dmc 0.0s 189.4s\n", "completed pass 6 at alpha 0.019000\n", - "*0.254200 : 7 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 65.7s 1.1s\n", - "*0.106720 : 7 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.5s 0.7s\n", - "*0.172880 : 7 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.6s 0.7s\n", - "*0.106080 : 7 passes : dbow+dmm 0.0s 1.2s\n", - "*0.106320 : 7 passes : dbow+dmc 0.0s 1.2s\n", + "*0.225280 : 7 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 102.8s 36.0s\n", + "*0.105560 : 7 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 31.0s 47.0s\n", + "*0.164320 : 7 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 38.6s 43.7s\n", + "*0.104760 : 7 passes : dbow+dmm 0.0s 187.1s\n", + "*0.107600 : 7 passes : dbow+dmc 0.0s 182.9s\n", "completed pass 7 at alpha 0.017800\n", - "*0.245880 : 8 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 68.6s 0.7s\n", - "*0.104920 : 8 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.0s 1.0s\n", - "*0.171000 : 8 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.4s 0.7s\n", - "*0.104760 : 8 passes : dbow+dmm 0.0s 1.3s\n", - "*0.105600 : 8 passes : dbow+dmc 0.0s 1.3s\n", + "*0.214280 : 8 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 99.2s 41.1s\n", + "*0.102400 : 8 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 28.6s 47.3s\n", + "*0.161000 : 8 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 36.4s 40.9s\n", + "*0.102720 : 8 passes : dbow+dmm 0.0s 188.2s\n", + "*0.104280 : 8 passes : dbow+dmc 0.0s 187.3s\n", "completed pass 8 at alpha 0.016600\n", - "*0.238400 : 9 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 66.1s 0.6s\n", - "*0.104520 : 9 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 21.2s 1.1s\n", - "*0.167600 : 9 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 37.5s 0.7s\n", - "*0.103680 : 9 passes : dbow+dmm 0.0s 1.2s\n", - "*0.103480 : 9 passes : dbow+dmc 0.0s 1.2s\n", + "*0.206840 : 9 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 96.9s 41.4s\n", + " 0.102920 : 9 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 27.1s 46.4s\n", + "*0.158600 : 9 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 40.3s 40.7s\n", + "*0.101880 : 9 passes : dbow+dmm 0.0s 188.1s\n", + "*0.103960 : 9 passes : dbow+dmc 0.0s 192.2s\n", "completed pass 9 at alpha 0.015400\n", - "*0.232160 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 69.0s 0.7s\n", - "*0.103680 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 21.8s 0.7s\n", - "*0.166000 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.4s 1.1s\n", - "*0.101920 : 10 passes : dbow+dmm 0.0s 1.2s\n", - " 0.103560 : 10 passes : dbow+dmc 0.0s 1.2s\n", + "*0.198960 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 116.0s 43.0s\n", + "*0.194000 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)_inferred 116.0s 54.2s\n", + "*0.102120 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 27.8s 47.1s\n", + "*0.100000 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)_inferred 27.8s 50.4s\n", + "*0.156640 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 38.3s 41.9s\n", + "*0.178400 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)_inferred 38.3s 46.8s\n", + " 0.102520 : 10 passes : dbow+dmm 0.0s 192.5s\n", + "*0.104000 : 10 passes : dbow+dmm_inferred 0.0s 207.3s\n", + "*0.103560 : 10 passes : dbow+dmc 0.0s 191.0s\n", + "*0.115200 : 10 passes : dbow+dmc_inferred 0.0s 203.5s\n", "completed pass 10 at alpha 0.014200\n", - "*0.227760 : 11 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 66.4s 0.7s\n", - "*0.242400 : 11 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred 66.4s 13.0s\n", - "*0.102160 : 11 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.7s 0.6s\n", - "*0.113200 : 11 passes : Doc2Vec(dbow,d100,n5,mc2,t8)_inferred 19.7s 5.0s\n", - "*0.163480 : 11 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.4s 0.6s\n", - "*0.208800 : 11 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred 35.4s 6.2s\n", - "*0.101560 : 11 passes : dbow+dmm 0.0s 1.2s\n", - "*0.102000 : 11 passes : dbow+dmm_inferred 0.0s 11.4s\n", - "*0.101920 : 11 passes : dbow+dmc 0.0s 1.6s\n", - "*0.109600 : 11 passes : dbow+dmc_inferred 0.0s 17.4s\n", + "*0.192000 : 11 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 97.3s 42.7s\n", + " 0.102840 : 11 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 26.8s 45.1s\n", + " 0.156680 : 11 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 36.9s 41.1s\n", + "*0.101600 : 11 passes : dbow+dmm 0.0s 187.8s\n", + " 0.103880 : 11 passes : dbow+dmc 0.0s 187.9s\n", "completed pass 11 at alpha 0.013000\n", - "*0.225960 : 12 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 61.8s 0.7s\n", - "*0.101720 : 12 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.2s 0.7s\n", - "*0.163000 : 12 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.5s 0.7s\n", - "*0.100840 : 12 passes : dbow+dmm 0.0s 1.2s\n", - "*0.101920 : 12 passes : dbow+dmc 0.0s 1.7s\n", + "*0.190440 : 12 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 99.1s 44.5s\n", + " 0.103640 : 12 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 34.7s 45.9s\n", + "*0.154640 : 12 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 37.3s 41.8s\n", + " 0.103400 : 12 passes : dbow+dmm 0.0s 190.1s\n", + " 0.103640 : 12 passes : dbow+dmc 0.0s 190.6s\n", "completed pass 12 at alpha 0.011800\n", - "*0.222360 : 13 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 65.2s 0.7s\n", - " 0.103120 : 13 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.0s 0.7s\n", - "*0.161960 : 13 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.2s 0.6s\n", - " 0.101640 : 13 passes : dbow+dmm 0.0s 1.2s\n", - " 0.102600 : 13 passes : dbow+dmc 0.0s 1.2s\n", + "*0.186840 : 13 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 99.1s 41.0s\n", + " 0.102560 : 13 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 26.7s 44.5s\n", + "*0.153880 : 13 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 35.9s 40.0s\n", + " 0.103760 : 13 passes : dbow+dmm 0.0s 182.8s\n", + " 0.103680 : 13 passes : dbow+dmc 0.0s 174.8s\n", "completed pass 13 at alpha 0.010600\n", - "*0.220960 : 14 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 65.3s 1.1s\n", - " 0.102920 : 14 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.9s 0.7s\n", - "*0.160160 : 14 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 36.0s 0.7s\n", - " 0.101720 : 14 passes : dbow+dmm 0.0s 1.2s\n", - " 0.102560 : 14 passes : dbow+dmc 0.0s 1.2s\n", + "*0.184600 : 14 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 92.0s 38.6s\n", + " 0.103080 : 14 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 26.7s 44.5s\n", + "*0.153760 : 14 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 35.8s 39.0s\n", + " 0.103120 : 14 passes : dbow+dmm 0.0s 177.6s\n", + " 0.103960 : 14 passes : dbow+dmc 0.0s 176.0s\n", "completed pass 14 at alpha 0.009400\n", - "*0.219400 : 15 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 64.0s 1.0s\n", - "*0.101440 : 15 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.5s 0.7s\n", - " 0.160640 : 15 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 38.6s 0.7s\n", - "*0.100160 : 15 passes : dbow+dmm 0.0s 1.2s\n", - "*0.101880 : 15 passes : dbow+dmc 0.0s 1.3s\n", + "*0.182720 : 15 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 91.7s 38.7s\n", + "*0.179600 : 15 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)_inferred 91.7s 50.8s\n", + " 0.103280 : 15 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 26.7s 43.5s\n", + " 0.104400 : 15 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)_inferred 26.7s 47.8s\n", + "*0.153720 : 15 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 36.0s 39.0s\n", + " 0.187200 : 15 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)_inferred 36.0s 43.7s\n", + " 0.103520 : 15 passes : dbow+dmm 0.0s 174.9s\n", + " 0.105600 : 15 passes : dbow+dmm_inferred 0.0s 183.2s\n", + " 0.103680 : 15 passes : dbow+dmc 0.0s 175.9s\n", + "*0.106000 : 15 passes : dbow+dmc_inferred 0.0s 189.9s\n", "completed pass 15 at alpha 0.008200\n", - "*0.216880 : 16 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 64.1s 1.1s\n", - "*0.232400 : 16 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred 64.1s 12.8s\n", - " 0.101760 : 16 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.1s 0.7s\n", - "*0.111600 : 16 passes : Doc2Vec(dbow,d100,n5,mc2,t8)_inferred 19.1s 4.7s\n", - "*0.159800 : 16 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 34.9s 0.6s\n", - "*0.184000 : 16 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred 34.9s 6.5s\n", - " 0.100640 : 16 passes : dbow+dmm 0.0s 1.6s\n", - "*0.094800 : 16 passes : dbow+dmm_inferred 0.0s 11.7s\n", - "*0.101320 : 16 passes : dbow+dmc 0.0s 1.2s\n", - " 0.109600 : 16 passes : dbow+dmc_inferred 0.0s 17.5s\n", + "*0.181040 : 16 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 91.6s 41.2s\n", + " 0.103240 : 16 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 26.7s 45.3s\n", + "*0.153600 : 16 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 36.1s 40.6s\n", + " 0.103960 : 16 passes : dbow+dmm 0.0s 175.9s\n", + "*0.103400 : 16 passes : dbow+dmc 0.0s 175.9s\n", "completed pass 16 at alpha 0.007000\n", - " 0.217160 : 17 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 58.6s 0.6s\n", - " 0.101760 : 17 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.5s 0.7s\n", - "*0.159640 : 17 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 37.0s 1.1s\n", - " 0.100760 : 17 passes : dbow+dmm 0.0s 1.3s\n", - " 0.101480 : 17 passes : dbow+dmc 0.0s 1.3s\n", + "*0.180080 : 17 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 92.1s 40.3s\n", + " 0.102760 : 17 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 26.3s 44.9s\n", + "*0.152880 : 17 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 35.4s 39.0s\n", + " 0.103200 : 17 passes : dbow+dmm 0.0s 182.5s\n", + "*0.103280 : 17 passes : dbow+dmc 0.0s 178.0s\n", "completed pass 17 at alpha 0.005800\n", - "*0.216080 : 18 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 60.7s 0.6s\n", - " 0.101520 : 18 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.6s 0.6s\n", - "*0.158760 : 18 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 34.9s 1.0s\n", - " 0.100800 : 18 passes : dbow+dmm 0.0s 1.2s\n", - " 0.101760 : 18 passes : dbow+dmc 0.0s 1.2s\n", + "*0.178720 : 18 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 91.1s 39.0s\n", + "*0.101640 : 18 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 26.4s 44.3s\n", + "*0.152280 : 18 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 35.6s 39.5s\n", + " 0.102360 : 18 passes : dbow+dmm 0.0s 183.8s\n", + " 0.103320 : 18 passes : dbow+dmc 0.0s 179.0s\n", "completed pass 18 at alpha 0.004600\n", - "*0.215560 : 19 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 62.6s 0.7s\n", - "*0.101000 : 19 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.6s 0.7s\n", - " 0.159080 : 19 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.9s 0.7s\n", - "*0.099920 : 19 passes : dbow+dmm 0.0s 1.7s\n", - " 0.102280 : 19 passes : dbow+dmc 0.0s 1.2s\n", + "*0.178600 : 19 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 91.1s 38.9s\n", + " 0.102320 : 19 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 26.3s 45.7s\n", + "*0.151920 : 19 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 35.5s 40.7s\n", + " 0.102240 : 19 passes : dbow+dmm 0.0s 181.7s\n", + "*0.103000 : 19 passes : dbow+dmc 0.0s 181.7s\n", "completed pass 19 at alpha 0.003400\n", - "*0.215160 : 20 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 58.3s 0.6s\n", - " 0.101360 : 20 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.5s 0.7s\n", - " 0.158920 : 20 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 33.6s 0.6s\n", - " 0.100480 : 20 passes : dbow+dmm 0.0s 1.5s\n", - " 0.102160 : 20 passes : dbow+dmc 0.0s 1.1s\n", + "*0.177360 : 20 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 90.9s 40.0s\n", + " 0.190800 : 20 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)_inferred 90.9s 52.1s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.102520 : 20 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 26.4s 45.2s\n", + " 0.108800 : 20 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)_inferred 26.4s 48.7s\n", + "*0.151680 : 20 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 35.5s 40.8s\n", + " 0.182400 : 20 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)_inferred 35.5s 45.3s\n", + " 0.102320 : 20 passes : dbow+dmm 0.0s 183.5s\n", + " 0.113200 : 20 passes : dbow+dmm_inferred 0.0s 192.3s\n", + "*0.102800 : 20 passes : dbow+dmc 0.0s 183.3s\n", + " 0.111200 : 20 passes : dbow+dmc_inferred 0.0s 196.1s\n", "completed pass 20 at alpha 0.002200\n", - "END 2015-06-28 21:20:48.994706\n" + "END 2017-06-06 19:46:10.508929\n" ] } ], @@ -610,7 +568,7 @@ " duration = 'na'\n", " train_model.alpha, train_model.min_alpha = alpha, alpha\n", " with elapsed_timer() as elapsed:\n", - " train_model.train(doc_list, total_examples=train_model.corpus_count, epochs=train_model.iter)\n", + " train_model.train(doc_list, total_examples=len(doc_list), epochs=1)\n", " duration = '%.1f' % elapsed()\n", " \n", " # evaluate\n", @@ -643,37 +601,30 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Achieved Sentiment-Prediction Accuracy" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 9, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.094800 dbow+dmm_inferred\n", - "0.099920 dbow+dmm\n", - "0.101000 Doc2Vec(dbow,d100,n5,mc2,t8)\n", - "0.101320 dbow+dmc\n", - "0.109600 dbow+dmc_inferred\n", - "0.111600 Doc2Vec(dbow,d100,n5,mc2,t8)_inferred\n", - "0.158760 Doc2Vec(dm/m,d100,n5,w10,mc2,t8)\n", - "0.184000 Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred\n", - "0.215160 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)\n", - "0.232400 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred\n" + "0.100000 Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)_inferred\n", + "0.101600 dbow+dmm\n", + "0.101640 Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)\n", + "0.102800 dbow+dmc\n", + "0.104000 dbow+dmm_inferred\n", + "0.106000 dbow+dmc_inferred\n", + "0.151680 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)\n", + "0.177360 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)\n", + "0.178400 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)_inferred\n", + "0.179600 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)_inferred\n" ] } ], @@ -685,54 +636,41 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "In my testing, unlike the paper's report, DBOW performs best. Concatenating vectors from different models only offers a small predictive improvement. The best results I've seen are still just under 10% error rate, still a ways from the paper's 7.42%.\n" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Examining Results" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Are inferred vectors close to the precalculated ones?" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 10, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "for doc 25430...\n", - "Doc2Vec(dm/c,d100,n5,w5,mc2,t8):\n", - " [(25430, 0.6583491563796997), (27314, 0.4142411947250366), (16479, 0.40846431255340576)]\n", - "Doc2Vec(dbow,d100,n5,mc2,t8):\n", - " [(25430, 0.9325973987579346), (49281, 0.5766637921333313), (79679, 0.5634804964065552)]\n", - "Doc2Vec(dm/m,d100,n5,w10,mc2,t8):\n", - " [(25430, 0.7970066666603088), (97818, 0.6925815343856812), (230, 0.690807580947876)]\n" + "for doc 47495...\n", + "Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4):\n", + " [(47495, 0.8063223361968994), (28683, 0.4661555588245392), (10030, 0.3962923586368561)]\n", + "Doc2Vec(dbow,d100,n5,mc2,s0.001,t4):\n", + " [(47495, 0.9660482406616211), (17469, 0.5925078392028809), (52349, 0.5742233991622925)]\n", + "Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4):\n", + " [(47495, 0.8801028728485107), (60782, 0.5431949496269226), (42472, 0.5375599265098572)]\n" ] } ], @@ -746,46 +684,36 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "(Yes, here the stored vector from 20 epochs of training is usually one of the closest to a freshly-inferred vector for the same words. Note the defaults for inference are very abbreviated – just 3 steps starting at a high alpha – and likely need tuning for other applications.)" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Do close documents seem more related than distant ones?" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 11, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "TARGET (72927): «this is one of the best films of this year . for a year that was fueled by controversy and crap , it was nice to finally see a film that had a true heart to it . from the opening scene to the end , i was so moved by the love that will smith has for his son . basically , if you see this movie and walk out of it feeling nothing , there is something that is very wrong with you . loved this movie , it's the perfect movie to end the year with . the best part was after the movie , my friends and i all got up and realized that this movie had actually made the four of us tear up ! it's an amazing film and if will smith doesn't get at least an oscar nom , then the oscars will just suck . in fact will smith should actually just win an oscar for this role . ! ! ! i loved this movie ! ! ! ! everybody needs to see especially the people in this world that take everything for granted , watch this movie , it will change you !»\n", + "TARGET (43375): «the film \" chaos \" takes its name from gleick's 1988 pop science explanation of chaos theory . what does the book or anything related to the content of the book have to do with the plot of the movie \" chaos \" ? nothing . the film makers seem to have skimmed the book ( obviously without understanding a thing about it ) looking for a \" theme \" to united the series of mundane action sequences that overlie the flimsy string of events that acts in place of a plot in the film . in this respect , the movie \" choas \" resembles the canadian effort \" cube , \" in which prime numbers function as a device to mystify the audience so that the ridiculousness of the plot will not be noticed : in \" cube \" a bunch of prime numbers are tossed in so that viewers will attribute their lack of understanding to lack of knowledge about primes : the same approach is taken in \" chaos \" : disconnected extracts from gleick's books are thrown in make the doings of the bad guy in the film seem fiendishly clever . this , of course , is an insultingly condescending treatment of the audience , and any literate viewer of \" chaos \" who can stand to sit through the entire film will end up bewildered . how could a film so bad be made ? rewritten as a novel , the story in \" chaos \" would probably not even make it past a literary agent's secretary's desk . how could ( at least ) hundreds of thousands ( and probably millions ) of dollars have been thrown away on what can only be considered a waste of time for everyone except those who took home money from the film ? regarding what's in the movie , every performance is phoned in . save for technical glitches , it would be astonishing if more than one take was used for any one scene . the story is uniformly senseless : the last time i saw a story to disconnected it was the production of a literal eight-year-old . among other massive shortcomings are the following : the bad guy leaves hints for the police to follow . he has no reason whatsoever for leaving such hints . police officers do not carry or use radios . dupes of the bad guy have no reason to act in concert with the bad guy . let me strongly recommend that no one watch this film . if there is any other movie you like ( or even simply do not hate ) watch that instead .»\n", "\n", - "SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w10,mc2,t8):\n", + "SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4):\n", "\n", - "MOST (2046, 0.7372332215309143): «i thought this movie would be dumb , but i really liked it . people i know hate it because spirit was the only horse that talked . well , so what ? the songs were good , and the horses didn't need to talk to seem human . i wouldn't care to own the movie , and i would love to see it again . 8/10»\n", + "MOST (48890, 0.5806792378425598): «asmali konak has arguably become one of the best tv series to come out of turkey . with its unique cinematography and visual approach to filming , the series has gained a wide following base with rating records continuously broken . personally i do not agree with singers becoming actors ( hence , ozcan deniz - the lead actor ) but i guess the figures speak for themselves . in relation to the movie , it was disgusting to see how much someone can destroy such a plotline . years in the making , this movie was able to oversee every descent story that existed within the series . not only that , the cultural mistakes were unacceptable , with an idiotic scene involving the family members dancing ( greek style ) and breaking plates , which does not exists anywhere within the turkish culture . some argue the movie should be taken as a stand alone movie not as a continuation of the tv series but this theory has one major fall , the way the movie was marketed was that it will be picking up where the series left off and will conclude the series once and for all . so with that note in mind , me and everyone i know , would have asked for a refund and accepted to stand outside the theatre to warn other victims .»\n", "\n", - "MEDIAN (6999, 0.4129640758037567): «okay , the recent history of star trek has not been good . the next generation faded in its last few seasons , ds9 boldly stayed where no one had stayed before , and voyager started very bad and never really lived up to its promise . so , when they announced a new star trek series , i did not have high expectations . and , the first episode , broken bow , did have some problems . but , overall it was solid trek material and a good romp . i'll get the nits out of the way first . the opening theme is dull and i don't look forward to sitting through it regularly , but that's what remotes are for . what was really bad was the completely gratuitous lotion rubbing scene that just about drove my wife out of the room . they need to cut that nonsense out . but , the plot was strong and moved along well . the characters , though still new , seem to be well rounded and not always what you would expect . the vulcans are clearly being presented very differently than before , with a slightly ominous theme . i particularly liked the linguist , who is the first star trek character to not be able to stand proud in the face of death , but rather has to deal with her phobias and fears . they seemed to stay true to trek lore , something that has been a significant problem in past series , though they have plenty of time to bring us things like shooting through shields , the instant invention of technology that can fix anything , and the inevitable plethora of time-travel stories . anyone want to start a pool on how long before the borg show up ? all in all , the series has enormous potential . they are seeing the universe with fresh eyes . we have the chance to learn how things got the way they were in the later series . how did the klingons go from just insulting to war ? how did we meet the romulans ? how did the federation form and just who put earth in charge . why is the prime directive so important ? if they address these things rather than spitting out time travel episodes , this will be an interesting series . my favorite line : zephram cochran saying \" where no man has gone before \" ( not \" no one \" )»\n", + "MEDIAN (93452, 0.22335509955883026): «this is the second film ( dead men walking ) set in a prison by theasylum . the mythos behind plot is very good , russian mafia has this demon do there dirty work and the rainbow array of inmates have to defend their bars & mortar . jennifer lee ( see interview ) wiggins stars as a prison guard who has a inmate , who maybe a demon . the monster suit is awesome and frightening , and a different look that almost smacks of a toy franchise , hey if full moon and todd mcfarlane can make action figures for any character . . why not the beast from bray road wolfette , shapeshifter with medallion accessory , or the rhett giles everyman hero with removable appendages .»\n", "\n", - "LEAST (16617, 0.015464222989976406): «i saw this movie during a tolkien-themed interim class during my sophomore year of college . i was seated unfortunately close to the screen and my professor chose me to serve as a whipping boy- everyone else was laughing , but they weren't within constant eyesight . let's get it out of the way : the peter jackson 'lord of the rings' films do owe something to the bakshi film . in jackson's version of the fellowship of the ring , for instance , the scene in which the black riders assault the empty inn beds is almost a complete carbon copy of the scene in bakshi's film , shot by shot . you could call this plagiarism or homage , depending on your agenda . i'm sure the similarities don't stop there . i'm not going to do any research to find out what they are , because that would imply i have some mote of respect for this film . i'm sure others have outlined the similarities- look around . this movie is a complete train wreck in every sense of the metaphor , and many , many people died in the accident . i've decided to list what i can remember in a more or less chronological fashion- if i've left out anything else that offended me it's because i'm completely overwhelmed , confronted with a wealth of failure ( and , at high points , mediocrity ) . *due to heavy use of rotoscoping , gandalf is no longer a gentle , wise wizard but a wildly flailing prophet of doom ( whose hat inexplicably changes color once or twice during the course of the film ) . *saruman the white is sometimes referred to as 'aruman' during the film , without explanation . he wears purple and red for some mysterious reason . *sam is flat out hideous . the portrayal of his friendship with frodo is strangely childlike and unsatisfying . yes , hobbits are small like children , but they are not children . *merry and pippin are never introduced--they simply appear during a scene change with a one-sentence explanation . the film is filled with sloppy editing like this . *frodo , sam , pippin and merry are singing merrily as they skip through along the road . one of the hobbits procures a lute at least twice as large as he is from behind his back--which was not visible before--and begins strumming in typical fantasy bard fashion as they all break into \" la-la-la \" s . awful . *aragorn , apparently , is a native american dressed in an extremely stereotypical fantasy tunic ( no pants ) , complete with huge , square pilgrim belt buckle . he is arguably the worst swordsman in the entire movie--oftentimes he gets one wobbly swing in before being knocked flat on his ass . *the black riders appear more like lepers than menacing instruments of evil . they limp everywhere they go at a painfully slow pace . this is disturbing to be sure , but not frightening . *the scene before the black riders attempt to cross the ford of bruinen ( in which they stare at frodo , who is on the other side on horseback ) goes on forever , during which time the riders rear their horses in a vaguely threatening manner and . . . do nothing else . the scene was probably intended to illustrate frodo's hallucinatory decline as he succumbs to his wound . it turns out to be more plodding than anything else . *gimli the dwarf is just as tall as legolas the elf . he's a dwarf . there is simply no excuse for that . he also looks like a bastardized david the gnome . it's a crude but accurate description . *boromir appears to have pilfered elmer fudd's golden viking armor from that bugs bunny opera episode . he looks ridiculous . *despite the similarity to tolkien's illustration , the balrog is howl inducing and the least-threatening villain in the entire film . it looks like someone wearing pink bedroom slippers , and it's barely taller than gandalf . \" purists \" may prefer this balrog , but i'll take jackson's version any day . *the battle scenes are awkward and embarrassing . almost none of the characters display any level of competency with their armaments . i'm not asking for action-packed scenes like those in jackson's film , but they are supposed to be fighting . *treebeard makes a very short appearance , and i was sorry he bothered to show up at all . watch the film , you'll see what i mean . alright , now for the good parts of the film . *some of the voice acting is pretty good . it isn't that aragorn sounds bad , he just looks kind of like the jolly green giant . *galadriel is somewhat interesting in this portrayal ; like tom bombadil , she seems immune to the ring's powers of temptation , and her voice actress isn't horrible either . *boromir's death isn't as heart wrenching as in jackson's portrayal of the same scene , but it's still appropriately dramatic ( and more true to his death in the book , though i don't believe jackson made a mistake shooting it the way he did ) . *as my professor pointed out ( between whispered threats ) , the orcs ( mainly at helm's deep , if i'm correct ) resemble the war-ravaged corpses of soldiers , a political statement that works pretty well if you realize what's being attempted . *while this isn't really a positive point about the film , bakshi can't be blamed for the majority of the failures in this movie , or so i've been told--the project was on a tight budget , and late in its production he lost creative control to some of the higher-ups ( who i'm sure hadn't read the books ) . let me be clear : i respect bakshi for even attempting something of this magnitude . i simply have a hard time believing he was happy with the final product . overall , i cannot in any way recommend this blasphemous adaptation of tolkien's classic trilogy even for laughs , unless you've already read the books and have your own visualizations of the characters , places and events . i'm sure somebody , somewhere , will pick a copy of this up in confusion ; if you do , keep an open mind and glean what good you can from it .»\n", + "LEAST (57989, -0.22353392839431763): «saw this movie on re-run just once , when i was about 13 , in 1980 . it completely matched my teenaged fantasies of sweet , gentle , interesting — and let's face it — hot — \" older \" guys . just ordered it from cd universe about a month ago , and have given it about four whirls in the two weeks since . as somebody mentioned — i'm haunted by it . as somebody else mentioned — i think it's part of a midlife crisis as well ! being 39 and realizing how much has changed since those simpler '70s times when girls of 13 actually did take buses and go to malls together and had a lot more freedom away from the confines of modern suburbia makes me sad for my daughter — who is nearly 13 herself . thirteen back then was in many ways a lot more grown up . the film is definitely '70s but not in a super-dated cheesy way , in fact the outfits denise miller as jessie wears could be current now ! you know what they say , everything that goes around . . . although the short-short jogging shorts worn by rex with the to-the-knees sweat socks probably won't make a comeback . the subject matter is handled in a very sensitive way and the characters are treated with a lot of respect . it's not the most chatty movie going — i often wished for more to be said between jessie and michael that would cement why he was also attracted to her . but the acting is solid , the movie is sweet and atmospheric , and the fringe characters give great performances . mary beth manning as jessie's friend caroline is a total hoot — i think we all had friends like her . maia danziger as the relentless flirt with michael gives a wiggy , stoned-out performance that just makes you laugh — because we also all knew girls that acted like that . denise miller knocked her performance out of the ballpark with a very down-to-earth quality likely credited to her uknown status and being new to the industry . and i think not a little of the credit for the film's theatre-grade quality comes from the very capable , brilliant hands of the story's authors , carole and the late bruce hart , who also wrote for sesame street . they really cared about the message of the movie , which was not an overt in-your-face thing , while at the same time understanding how eager many girls are to grow up at that age . one thing that made me love the film then as much as now is not taking the cliché , easy , tied-with-a-bow but sort of let-down ending . in fact it's probably the end that has caused so many women to return to viewing the movie in their later years . re-watching sooner or later has me absolutely sick with nostalgia for those simpler times , and has triggered a ridiculous and sudden obsession with catching up with rex smith — whom while i enjoyed his albums sooner or later and forever when i was young , i never plastered his posters on my walls as i did some of my other faves . in the past week , i've put his music on my ipod , read fan sites , found interviews ( and marveled in just how brilliant he really is — the man has a fascinating way of thinking ) , watched clips on youtube — what am i , 13 ? i guess that's the biggest appeal of this movie . remembering what it was like to be 13 and the whole world was ahead of you .»\n", "\n" ] } @@ -804,31 +732,23 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "(Somewhat, in terms of reviewer tone, movie genre, etc... the MOST cosine-similar docs usually seem more like the TARGET than the MEDIAN or LEAST.)" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Do the word vectors show useful similarities?" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -837,83 +757,83 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 13, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "most similar words for 'comedy/drama' (38 occurences)\n" + "most similar words for 'gymnast' (36 occurences)\n" ] }, { "data": { "text/html": [ - "
Doc2Vec(dm/c,d100,n5,w5,mc2,t8)Doc2Vec(dbow,d100,n5,mc2,t8)Doc2Vec(dm/m,d100,n5,w10,mc2,t8)
[('comedy', 0.7255545258522034),
\n", - "('thriller', 0.6946465969085693),
\n", - "('drama', 0.6763534545898438),
\n", - "('romance', 0.6251884698867798),
\n", - "('dramedy', 0.6217159032821655),
\n", - "('melodrama', 0.6156137585639954),
\n", - "('adventure', 0.6091135740280151),
\n", - "('farce', 0.6034293174743652),
\n", - "('chiller', 0.5948368906974792),
\n", - "('romantic-comedy', 0.5876704454421997),
\n", - "('fantasy', 0.5863304138183594),
\n", - "('mystery/comedy', 0.577541708946228),
\n", - "('whodunit', 0.572147011756897),
\n", - "('biopic', 0.5679721832275391),
\n", - "('thriller/drama', 0.5630226731300354),
\n", - "('sitcom', 0.5574496984481812),
\n", - "('slash-fest', 0.5573585033416748),
\n", - "('mystery', 0.5542301535606384),
\n", - "('potboiler', 0.5519827604293823),
\n", - "('mockumentary', 0.5490710139274597)]
[('1000%', 0.42290645837783813),
\n", - "(\"gymnast's\", 0.4180164337158203),
\n", - "('hollywoodland', 0.3898555636405945),
\n", - "('cultures', 0.3857914209365845),
\n", - "('hooda', 0.3851744532585144),
\n", - "('cites', 0.38047513365745544),
\n", - "(\"78's\", 0.3792475461959839),
\n", - "(\"dormael's\", 0.3775535225868225),
\n", - "('jokester', 0.3725704252719879),
\n", - "('impelled', 0.36853262782096863),
\n", - "('lia', 0.3684236407279968),
\n", - "('snivelling', 0.3683513104915619),
\n", - "('astral', 0.36715900897979736),
\n", - "('euro-exploitation', 0.35853487253189087),
\n", - "(\"serra's\", 0.3578598201274872),
\n", - "('down-on-their-luck', 0.3576606214046478),
\n", - "('rowles', 0.3567575514316559),
\n", - "('romantica', 0.3549702763557434),
\n", - "('bonham-carter', 0.354231059551239),
\n", - "('1877', 0.3541453182697296)]
[('comedy-drama', 0.6274900436401367),
\n", - "('comedy', 0.5986765623092651),
\n", - "('thriller', 0.5765297412872314),
\n", - "('road-movie', 0.5615973472595215),
\n", - "('dramedy', 0.5580120086669922),
\n", - "('time-killer', 0.5497636795043945),
\n", - "('potboiler', 0.5456510782241821),
\n", - "('comedy/', 0.5439876317977905),
\n", - "('actioner', 0.5423712134361267),
\n", - "('diversion', 0.541743278503418),
\n", - "('romcom', 0.5402226448059082),
\n", - "('rom-com', 0.5358527302742004),
\n", - "('drama', 0.5320745706558228),
\n", - "('chiller', 0.5229591727256775),
\n", - "('romp', 0.5228806734085083),
\n", - "('horror/comedy', 0.5219299793243408),
\n", - "('weeper', 0.5195824503898621),
\n", - "('mockumentary', 0.5149033069610596),
\n", - "('camp-fest', 0.5122634768486023),
\n", - "('mystery/comedy', 0.5020694732666016)]
" + "
Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)
[('scientist', 0.530441164970398),
\n", + "('psychotherapist', 0.527083694934845),
\n", + "('parapsychologist', 0.5239906907081604),
\n", + "('cringer', 0.5199892520904541),
\n", + "('samir', 0.5048707127571106),
\n", + "('reporter', 0.49532145261764526),
\n", + "('swimmer', 0.4937909245491028),
\n", + "('thrill-seeker', 0.4905340373516083),
\n", + "('chiara', 0.48281964659690857),
\n", + "('psychiatrist', 0.4788440763950348),
\n", + "('nerd', 0.4779984951019287),
\n", + "('surgeon', 0.47712844610214233),
\n", + "('jock', 0.4741038382053375),
\n", + "('geek', 0.4714686870574951),
\n", + "('mumu', 0.47104766964912415),
\n", + "('painter', 0.4689804017543793),
\n", + "('cheater', 0.4655175805091858),
\n", + "('hypnotist', 0.4645438492298126),
\n", + "('whizz', 0.46407681703567505),
\n", + "('cryptozoologist', 0.4627385437488556)]
[('bang-bang', 0.4289792478084564),
\n", + "('master', 0.41190674901008606),
\n", + "('greenleaf', 0.38207903504371643),
\n", + "('122', 0.3811250925064087),
\n", + "('fingernails', 0.3794997036457062),
\n", + "('cardboard-cutout', 0.3740081787109375),
\n", + "(\"album'\", 0.3706256151199341),
\n", + "('sex-starved', 0.3696949779987335),
\n", + "('creme-de-la-creme', 0.36426788568496704),
\n", + "('destroyed', 0.3638569116592407),
\n", + "('imminent', 0.3612757921218872),
\n", + "('cruisers', 0.3568859398365021),
\n", + "(\"emo's\", 0.35605981945991516),
\n", + "('lavransdatter', 0.3534432649612427),
\n", + "(\"'video'\", 0.3508487641811371),
\n", + "('garris', 0.3507363796234131),
\n", + "('romanzo', 0.3495352268218994),
\n", + "('tombes', 0.3494585454463959),
\n", + "('story-writers', 0.3461073637008667),
\n", + "('georgette', 0.34602558612823486)]
[('ex-marine', 0.5273298621177673),
\n", + "('koichi', 0.5020822882652283),
\n", + "('dorkish', 0.49750325083732605),
\n", + "('fenyö', 0.4765225946903229),
\n", + "('castleville', 0.46756264567375183),
\n", + "('smoorenburg', 0.46484801173210144),
\n", + "('chimp', 0.46456438302993774),
\n", + "('swimmer', 0.46236276626586914),
\n", + "('falcone', 0.4614230990409851),
\n", + "('yak', 0.45991501212120056),
\n", + "('gms', 0.4542686939239502),
\n", + "('iván', 0.4503802955150604),
\n", + "('spidy', 0.4494086503982544),
\n", + "('arnie', 0.44659116864204407),
\n", + "('hobo', 0.4465593695640564),
\n", + "('evelyne', 0.4455353617668152),
\n", + "('pandey', 0.4452363848686218),
\n", + "('hector', 0.4442984461784363),
\n", + "('baboon', 0.44382452964782715),
\n", + "('miao', 0.4437481164932251)]
" + ], + "text/plain": [ + "" ] }, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -940,10 +860,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Do the DBOW words look meaningless? That's because the gensim DBOW model doesn't train word vectors – they remain at their random initialized values – unless you ask with the `dbow_words=1` initialization parameter. Concurrent word-training slows DBOW mode significantly, and offers little improvement (and sometimes a little worsening) of the error rate on this IMDB sentiment-prediction task. \n", "\n", @@ -952,30 +869,23 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "### Are the word vectors from this dataset any good at analogies?" ] }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 15, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec(dm/c,d100,n5,w5,mc2,t8): 28.70% correct (2873 of 10012)\n", - "Doc2Vec(dbow,d100,n5,mc2,t8): 0.01% correct (1 of 10012)\n", - "Doc2Vec(dm/m,d100,n5,w10,mc2,t8): 27.24% correct (2727 of 10012)\n" + "Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4): 31.50% correct (3154 of 10012)\n", + "Doc2Vec(dbow,d100,n5,mc2,s0.001,t4): 0.00% correct (0 of 10012)\n", + "Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4): 32.24% correct (3228 of 10012)\n" ] } ], @@ -992,20 +902,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Even though this is a tiny, domain-specific dataset, it shows some meager capability on the general word analogies – at least for the DM/concat and DM/mean models which actually train word vectors. (The untrained random-initialized words of the DBOW model of course fail miserably.)" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Slop" ] @@ -1014,9 +918,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1025,10 +927,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "To mix the Google dataset (if locally available) into the word tests..." ] @@ -1037,9 +936,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1051,10 +948,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "To get copious logging output from above steps..." ] @@ -1063,9 +957,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1077,10 +969,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "To auto-reload python code while developing..." ] @@ -1089,9 +978,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1120,5 +1007,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index cc229415ec..1265edc2b7 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -374,7 +374,7 @@ def testTrainWarning(self, l): model.alpha += 0.05 warning = "Effective 'alpha' higher than previous training cycles" self.assertTrue(warning in str(l)) - + def testLoadOnClassError(self): """Test if exception is raised when loading doc2vec model on instance""" self.assertRaises(AttributeError, load_on_instance) @@ -408,7 +408,7 @@ def __getitem__(self, token): def infer_vector(self, document, alpha=0.1, min_alpha=0.0001, steps=5): return np.concatenate([model.infer_vector(document, alpha, min_alpha, steps) for model in self.models]) - def train(self, ignored): + def train(self, *ignore_args, **ignore_kwargs): pass # train subcomponents individually