diff --git a/CHANGELOG.md b/CHANGELOG.md index 8817d0e29c..b658cb84b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ See the [method documentation](https://github.com/RaRe-Technologies/gensim/blob/ * Explicit epochs and corpus size in word2vec train(). (@gojomo, @robotcator, [#1139](https://github.com/RaRe-Technologies/gensim/pull/1139), [#1237](https://github.com/RaRe-Technologies/gensim/pull/1237)) New features: - +* Add modified save_word2vec_format for Doc2Vec, to save document vectors. (@parulsethi,[#1256](https://github.com/RaRe-Technologies/gensim/pull/1256)) * Add output word prediction in word2vec. Only for negative sampling scheme. See [ipynb]( https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/word2vec.ipynb) (@chinmayapancholi13,[#1209](https://github.com/RaRe-Technologies/gensim/pull/1209)) * scikit_learn wrapper for LSI Model in Gensim (@chinmayapancholi13,[#1244](https://github.com/RaRe-Technologies/gensim/pull/1244)) * Add the 'keep_tokens' parameter to 'filter_extremes'. (@toliwa,[#1210](https://github.com/RaRe-Technologies/gensim/pull/1210)) diff --git a/docs/notebooks/Tensorboard.png b/docs/notebooks/Tensorboard.png new file mode 100644 index 0000000000..651a23e689 Binary files /dev/null and b/docs/notebooks/Tensorboard.png differ diff --git a/docs/notebooks/Tensorboard_doc2vec.ipynb b/docs/notebooks/Tensorboard_doc2vec.ipynb new file mode 100644 index 0000000000..aa12646f32 --- /dev/null +++ b/docs/notebooks/Tensorboard_doc2vec.ipynb @@ -0,0 +1,884 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualizing Doc2Vec with TensorBoard\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "In this tutorial, I will explain how to visualize Doc2Vec Embeddings aka [Paragraph Vectors]() via TensorBoard. It is a data visualization framework for visualizing and inspecting the TensorFlow runs and graphs. 
We will use a built-in Tensorboard visualizer called *Embedding Projector* in this tutorial. It lets you interactively visualize and analyze high-dimensional data like embeddings.\n", + "\n", + "For this tutorial, a transformed MovieLens dataset[1] was used from this [repository](https://github.com/RaRe-Technologies/movie-plots-by-genre) and the movie titles were added afterwards. You can download the prepared csv from [here](https://github.com/parulsethi/DocViz/blob/master/movie_plots.csv). The input documents for training are the synopsis of movies, on which Doc2Vec model is trained. \n", + "\n", + "The visualizations will be a scatterplot as seen in the above image, where each datapoint is labelled by the movie title and colored by its corresponding genre. You can also visit this [Projector link](http://projector.tensorflow.org/?config=https://raw.githubusercontent.com/parulsethi/DocViz/master/movie_plot_config.json) which is configured with my embeddings for the above mentioned dataset. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define a Function to Read and Preprocess Text" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MovieIDTitlesPlotsGenres
01Toy Story (1995)A little boy named Andy loves to be in his roo...animation
12Jumanji (1995)When two kids find and play a magical board ga...fantasy
23Grumpier Old Men (1995)Things don't seem to change much in Wabasha Co...comedy
36Heat (1995)Hunters and their prey--Neil and his professio...action
47Sabrina (1995)An ugly duckling having undergone a remarkable...romance
59Sudden Death (1995)Some terrorists kidnap the Vice President of t...action
610GoldenEye (1995)James Bond teams up with the lone survivor of ...action
715Cutthroat Island (1995)Morgan Adams and her slave, William Shaw, are ...action
817Sense and Sensibility (1995)When Mr. Dashwood dies, he must leave the bulk...romance
918Four Rooms (1995)This movie features the collaborative director...comedy
1019Ace Ventura: When Nature Calls (1995)Ace Ventura, emerging from self-imposed exile ...comedy
1129City of Lost Children, The (Cité des enfants p...Krank (Daniel Emilfork), who cannot dream, kid...sci-fi
1232Twelve Monkeys (a.k.a. 12 Monkeys) (1995)In a future world devastated by disease, a con...sci-fi
1334Babe (1995)Farmer Hoggett wins a runt piglet at a local f...fantasy
1439Clueless (1995)A rich high school student tries to boost a ne...romance
1544Mortal Kombat (1995)Based on the popular video game of the same na...action
1648Pocahontas (1995)Capt. John Smith leads a rag-tag band of Engli...animation
1750Usual Suspects, The (1995)Following a truck hijack in New York, five con...comedy
1857Home for the Holidays (1995)After losing her job, making out with her soon...comedy
1969Friday (1995)Two homies, Smokey and Craig, smoke a dope dea...comedy
2070From Dusk Till Dawn (1996)Two criminals and their hostages unknowingly s...action
2176Screamers (1995)(SIRIUS 6B, Year 2078) On a distant mining pla...sci-fi
2282Antonia's Line (Antonia) (1995)In an anonymous Dutch village, a sturdy, stron...fantasy
2388Black Sheep (1996)Comedy about the prospective Washington State ...comedy
2495Broken Arrow (1996)\"Broken Arrow\" is the term used to describe a ...action
25104Happy Gilmore (1996)A rejected hockey player puts his skills to th...comedy
26105Bridges of Madison County, The (1995)Photographer Robert Kincaid wanders into the l...romance
27110Braveheart (1995)When his secret bride is executed for assaulti...action
28141Birdcage, The (1996)Armand Goldman owns a popular drag nightclub i...comedy
29145Bad Boys (1995)Marcus Burnett is a hen-pecked family man. Mik...action
...............
1813122902Fantastic Four (2015)FANTASTIC FOUR, a contemporary re-imagining of...sci-fi
1814127098Louis C.K.: Live at The Comedy Store (2015)Comedian Louis C.K. performs live at the Comed...comedy
1815127158Tig (2015)An intimate, mixed media documentary that foll...comedy
1816127202Me and Earl and the Dying Girl (2015)Seventeen-year-old Greg has managed to become ...comedy
1817129354Focus (2015)In the midst of veteran con man Nicky's latest...action
1818129428The Second Best Exotic Marigold Hotel (2015)The Second Best Exotic Marigold Hotel is the e...comedy
1819129937Run All Night (2015)Professional Brooklyn hitman Jimmy Conlon is m...action
1820130490Insurgent (2015)One choice can transform you-or it can destroy...sci-fi
1821130520Home (2015)An alien on the run from his own people makes ...animation
1822130634Furious 7 (2015)Dominic and his crew thought they'd left the c...action
1823131013Get Hard (2015)Kevin Hart plays the role of Darnell--a family...comedy
1824132046Tomorrowland (2015)Bound by a shared destiny, a bright, optimisti...sci-fi
1825132480The Age of Adaline (2015)A young woman, born at the turn of the 20th ce...romance
1826132488Lovesick (2014)Lovesick is the comic tale of Charlie Darby (M...fantasy
1827132796San Andreas (2015)In San Andreas, California is experiencing a s...action
1828132961Far from the Madding Crowd (2015)In Victorian England, the independent and head...romance
1829133195Hitman: Agent 47 (2015)An assassin teams up with a woman to help her ...action
1830133645Carol (2015)In an adaptation of Patricia Highsmith's semin...romance
1831134130The Martian (2015)During a manned mission to Mars, Astronaut Mar...sci-fi
1832134368Spy (2015)A desk-bound CIA analyst volunteers to go unde...comedy
1833134783Entourage (2015)Movie star Vincent Chase, together with his bo...comedy
1834134853Inside Out (2015)After young Riley is uprooted from her Midwest...comedy
1835135518Self/less (2015)A dying real estate mogul transfers his consci...sci-fi
1836135861Ted 2 (2015)Months after John's divorce, Ted and Tami-Lynn...comedy
1837135887Minions (2015)Ever since the dawn of time, the Minions have ...comedy
1838136016The Good Dinosaur (2015)In a world where dinosaurs and humans live sid...animation
1839139855Anomalisa (2015)Michael Stone, an author that specializes in c...animation
1840142997Hotel Transylvania 2 (2015)The Drac pack is back for an all-new monster c...animation
1841145935Peanuts Movie, The (2015)Charlie Brown, Lucy, Snoopy, and the whole gan...animation
1842149406Kung Fu Panda 3 (2016)Continuing his \"legendary adventures of awesom...comedy
\n", + "

1843 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " MovieID Titles \\\n", + "0 1 Toy Story (1995) \n", + "1 2 Jumanji (1995) \n", + "2 3 Grumpier Old Men (1995) \n", + "3 6 Heat (1995) \n", + "4 7 Sabrina (1995) \n", + "5 9 Sudden Death (1995) \n", + "6 10 GoldenEye (1995) \n", + "7 15 Cutthroat Island (1995) \n", + "8 17 Sense and Sensibility (1995) \n", + "9 18 Four Rooms (1995) \n", + "10 19 Ace Ventura: When Nature Calls (1995) \n", + "11 29 City of Lost Children, The (Cité des enfants p... \n", + "12 32 Twelve Monkeys (a.k.a. 12 Monkeys) (1995) \n", + "13 34 Babe (1995) \n", + "14 39 Clueless (1995) \n", + "15 44 Mortal Kombat (1995) \n", + "16 48 Pocahontas (1995) \n", + "17 50 Usual Suspects, The (1995) \n", + "18 57 Home for the Holidays (1995) \n", + "19 69 Friday (1995) \n", + "20 70 From Dusk Till Dawn (1996) \n", + "21 76 Screamers (1995) \n", + "22 82 Antonia's Line (Antonia) (1995) \n", + "23 88 Black Sheep (1996) \n", + "24 95 Broken Arrow (1996) \n", + "25 104 Happy Gilmore (1996) \n", + "26 105 Bridges of Madison County, The (1995) \n", + "27 110 Braveheart (1995) \n", + "28 141 Birdcage, The (1996) \n", + "29 145 Bad Boys (1995) \n", + "... ... ... 
\n", + "1813 122902 Fantastic Four (2015) \n", + "1814 127098 Louis C.K.: Live at The Comedy Store (2015) \n", + "1815 127158 Tig (2015) \n", + "1816 127202 Me and Earl and the Dying Girl (2015) \n", + "1817 129354 Focus (2015) \n", + "1818 129428 The Second Best Exotic Marigold Hotel (2015) \n", + "1819 129937 Run All Night (2015) \n", + "1820 130490 Insurgent (2015) \n", + "1821 130520 Home (2015) \n", + "1822 130634 Furious 7 (2015) \n", + "1823 131013 Get Hard (2015) \n", + "1824 132046 Tomorrowland (2015) \n", + "1825 132480 The Age of Adaline (2015) \n", + "1826 132488 Lovesick (2014) \n", + "1827 132796 San Andreas (2015) \n", + "1828 132961 Far from the Madding Crowd (2015) \n", + "1829 133195 Hitman: Agent 47 (2015) \n", + "1830 133645 Carol (2015) \n", + "1831 134130 The Martian (2015) \n", + "1832 134368 Spy (2015) \n", + "1833 134783 Entourage (2015) \n", + "1834 134853 Inside Out (2015) \n", + "1835 135518 Self/less (2015) \n", + "1836 135861 Ted 2 (2015) \n", + "1837 135887 Minions (2015) \n", + "1838 136016 The Good Dinosaur (2015) \n", + "1839 139855 Anomalisa (2015) \n", + "1840 142997 Hotel Transylvania 2 (2015) \n", + "1841 145935 Peanuts Movie, The (2015) \n", + "1842 149406 Kung Fu Panda 3 (2016) \n", + "\n", + " Plots Genres \n", + "0 A little boy named Andy loves to be in his roo... animation \n", + "1 When two kids find and play a magical board ga... fantasy \n", + "2 Things don't seem to change much in Wabasha Co... comedy \n", + "3 Hunters and their prey--Neil and his professio... action \n", + "4 An ugly duckling having undergone a remarkable... romance \n", + "5 Some terrorists kidnap the Vice President of t... action \n", + "6 James Bond teams up with the lone survivor of ... action \n", + "7 Morgan Adams and her slave, William Shaw, are ... action \n", + "8 When Mr. Dashwood dies, he must leave the bulk... romance \n", + "9 This movie features the collaborative director... 
comedy \n", + "10 Ace Ventura, emerging from self-imposed exile ... comedy \n", + "11 Krank (Daniel Emilfork), who cannot dream, kid... sci-fi \n", + "12 In a future world devastated by disease, a con... sci-fi \n", + "13 Farmer Hoggett wins a runt piglet at a local f... fantasy \n", + "14 A rich high school student tries to boost a ne... romance \n", + "15 Based on the popular video game of the same na... action \n", + "16 Capt. John Smith leads a rag-tag band of Engli... animation \n", + "17 Following a truck hijack in New York, five con... comedy \n", + "18 After losing her job, making out with her soon... comedy \n", + "19 Two homies, Smokey and Craig, smoke a dope dea... comedy \n", + "20 Two criminals and their hostages unknowingly s... action \n", + "21 (SIRIUS 6B, Year 2078) On a distant mining pla... sci-fi \n", + "22 In an anonymous Dutch village, a sturdy, stron... fantasy \n", + "23 Comedy about the prospective Washington State ... comedy \n", + "24 \"Broken Arrow\" is the term used to describe a ... action \n", + "25 A rejected hockey player puts his skills to th... comedy \n", + "26 Photographer Robert Kincaid wanders into the l... romance \n", + "27 When his secret bride is executed for assaulti... action \n", + "28 Armand Goldman owns a popular drag nightclub i... comedy \n", + "29 Marcus Burnett is a hen-pecked family man. Mik... action \n", + "... ... ... \n", + "1813 FANTASTIC FOUR, a contemporary re-imagining of... sci-fi \n", + "1814 Comedian Louis C.K. performs live at the Comed... comedy \n", + "1815 An intimate, mixed media documentary that foll... comedy \n", + "1816 Seventeen-year-old Greg has managed to become ... comedy \n", + "1817 In the midst of veteran con man Nicky's latest... action \n", + "1818 The Second Best Exotic Marigold Hotel is the e... comedy \n", + "1819 Professional Brooklyn hitman Jimmy Conlon is m... action \n", + "1820 One choice can transform you-or it can destroy... 
sci-fi \n", + "1821 An alien on the run from his own people makes ... animation \n", + "1822 Dominic and his crew thought they'd left the c... action \n", + "1823 Kevin Hart plays the role of Darnell--a family... comedy \n", + "1824 Bound by a shared destiny, a bright, optimisti... sci-fi \n", + "1825 A young woman, born at the turn of the 20th ce... romance \n", + "1826 Lovesick is the comic tale of Charlie Darby (M... fantasy \n", + "1827 In San Andreas, California is experiencing a s... action \n", + "1828 In Victorian England, the independent and head... romance \n", + "1829 An assassin teams up with a woman to help her ... action \n", + "1830 In an adaptation of Patricia Highsmith's semin... romance \n", + "1831 During a manned mission to Mars, Astronaut Mar... sci-fi \n", + "1832 A desk-bound CIA analyst volunteers to go unde... comedy \n", + "1833 Movie star Vincent Chase, together with his bo... comedy \n", + "1834 After young Riley is uprooted from her Midwest... comedy \n", + "1835 A dying real estate mogul transfers his consci... sci-fi \n", + "1836 Months after John's divorce, Ted and Tami-Lynn... comedy \n", + "1837 Ever since the dawn of time, the Minions have ... comedy \n", + "1838 In a world where dinosaurs and humans live sid... animation \n", + "1839 Michael Stone, an author that specializes in c... animation \n", + "1840 The Drac pack is back for an all-new monster c... animation \n", + "1841 Charlie Brown, Lucy, Snoopy, and the whole gan... animation \n", + "1842 Continuing his \"legendary adventures of awesom... 
comedy \n", + "\n", + "[1843 rows x 4 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import gensim\n", + "import pandas as pd\n", + "import smart_open\n", + "import random\n", + "\n", + "# read data\n", + "dataframe = pd.read_csv('movie_plots.csv')\n", + "dataframe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, we define a function to read the training documents, pre-process each document using a simple gensim pre-processing tool (i.e., tokenize text into individual words, remove punctuation, set to lowercase, etc), and return a list of words. Also, to train the model, we'll need to associate a tag/number with each document of the training corpus. In our case, the tag is simply the zero-based line number." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def read_corpus(documents):\n", + " for i, plot in enumerate(documents):\n", + " yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(plot, max_len=30), [i])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "train_corpus = list(read_corpus(dataframe.Plots))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the training corpus." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[TaggedDocument(words=[u'little', u'boy', u'named', u'andy', u'loves', u'to', u'be', u'in', u'his', u'room', u'playing', u'with', u'his', u'toys', u'especially', u'his', u'doll', u'named', u'woody', u'but', u'what', u'do', u'the', u'toys', u'do', u'when', u'andy', u'is', u'not', u'with', u'them', u'they', u'come', u'to', u'life', u'woody', u'believes', u'that', u'he', u'has', u'life', u'as', u'toy', u'good', u'however', u'he', u'must', u'worry', u'about', u'andy', u'family', u'moving', u'and', u'what', u'woody', u'does', u'not', u'know', u'is', u'about', u'andy', u'birthday', u'party', u'woody', u'does', u'not', u'realize', u'that', u'andy', u'mother', u'gave', u'him', u'an', u'action', u'figure', u'known', u'as', u'buzz', u'lightyear', u'who', u'does', u'not', u'believe', u'that', u'he', u'is', u'toy', u'and', u'quickly', u'becomes', u'andy', u'new', u'favorite', u'toy', u'woody', u'who', u'is', u'now', u'consumed', u'with', u'jealousy', u'tries', u'to', u'get', u'rid', u'of', u'buzz', u'then', u'both', u'woody', u'and', u'buzz', u'are', u'now', u'lost', u'they', u'must', u'find', u'way', u'to', u'get', u'back', u'to', u'andy', u'before', u'he', u'moves', u'without', u'them', u'but', u'they', u'will', u'have', u'to', u'pass', u'through', u'ruthless', u'toy', u'killer', u'sid', u'phillips'], tags=[0]),\n", + " TaggedDocument(words=[u'when', u'two', u'kids', u'find', u'and', u'play', u'magical', u'board', u'game', u'they', u'release', u'man', u'trapped', u'for', u'decades', u'in', u'it', u'and', u'host', u'of', u'dangers', u'that', u'can', u'only', u'be', u'stopped', u'by', u'finishing', u'the', u'game'], tags=[1])]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_corpus[:2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": 
[ + "# Training the Doc2Vec Model\n", + "We'll instantiate a Doc2Vec model with a vector size of 50 and iterate over the training corpus 55 times. We set the minimum word count to 2 in order to discard words with very few occurrences. Model accuracy can be improved by increasing the number of iterations but this generally increases the training time. Small datasets with short documents, like this one, can benefit from more training passes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)\n", + "model.build_vocab(train_corpus)\n", + "model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we'll save the document embedding vectors per doctag." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "model.save_word2vec_format('doc_tensor.w2v', doctag_vec=True, word_vec=False) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare the Input files for Tensorboard" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tensorboard takes two Input files. One containing the embedding vectors and the other containing relevant metadata. We'll use a gensim script to directly convert the embedding file saved in word2vec format above to the tsv format required in Tensorboard." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%run -m gensim.scripts.word2vec2tensor -i doc_tensor.w2v -o movie_plot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The script above generates two files, `movie_plot_tensor.tsv` which contain the embedding vectors and `movie_plot_metadata.tsv` containing doctags. But, these doctags are simply the unique index values and hence are not really useful to interpret what the document was while visualizing. So, we will overwrite `movie_plot_metadata.tsv` to have a custom metadata file with two columns. The first column will be for the movie titles and the second for their corresponding genres." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "with open('movie_plot_metadata.tsv','w') as w:\n", + " w.write('Titles\\tGenres\\n')\n", + " for i,j in zip(dataframe.Titles, dataframe.Genres):\n", + " w.write(\"%s\\t%s\\n\" % (i,j))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "Now you can go to http://projector.tensorflow.org/ and upload the two files by clicking on *Load data* in the left panel.\n", + "\n", + "For demo purposes I have uploaded the Doc2Vec embeddings generated from the model trained above [here](https://github.com/parulsethi/DocViz). You can access the Embedding projector configured with these uploaded embeddings at this [link](http://projector.tensorflow.org/?config=https://raw.githubusercontent.com/parulsethi/DocViz/master/movie_plot_config.json)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# Using Tensorboard" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the visualization purpose, the multi-dimensional embeddings that we get from the Doc2Vec model above need to be downsized to 2 or 3 dimensions, so that we basically end up with a new 2d or 3d embedding which tries to preserve information from the original multi-dimensional embedding. As these vectors are reduced to a much smaller dimension, the exact cosine/euclidean distances between them are not preserved, but rather relative, and hence as you’ll see below the nearest similarity results may change.\n", + "\n", + "TensorBoard has two popular dimensionality reduction methods for visualizing the embeddings and also provides a custom method based on text searches:\n", + "\n", + "- **Principal Component Analysis**: PCA aims at exploring the global structure in data, and could end up losing the local similarities between neighbours. It maximizes the total variance in the lower dimensional subspace and hence, often preserves the larger pairwise distances better than the smaller ones. See an intuition behind it in this nicely explained [answer](https://stats.stackexchange.com/questions/176672/what-is-meant-by-pca-preserving-only-large-pairwise-distances) on stackexchange.\n", + "\n", + "\n", + "- **T-SNE**: The idea of T-SNE is to place the local neighbours close to each other, and almost completely ignoring the global structure. It is useful for exploring local neighborhoods and finding local clusters. But the global trends are not represented accurately and the separation between different groups is often not preserved (see the t-sne plots of our data below which testify the same).\n", + "\n", + "\n", + "- **Custom Projections**: This is a custom method based on the text searches you define for different directions. 
It could be useful for finding meaningful directions in the vector space, for example, female to male, currency to country etc.\n", + "\n", + "You can refer to this [doc](https://www.tensorflow.org/get_started/embedding_viz) for instructions on how to use and navigate through different panels available in TensorBoard." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize using PCA\n", + "\n", + "The Embedding Projector computes the top 10 principal components. The menu at the left panel lets you project those components onto any combination of two or three. \n", + "\n", + "The above plot was made using the first two principal components with total variance covered being 36.5%.\n", + "\n", + "### Findings\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Visualize using T-SNE\n", + "\n", + "Data is visualized by animating through every iteration of the t-sne algorithm. The t-sne menu at the left lets you adjust the value of its two hyperparameters. The first one is **Perplexity**, which is basically a measure of information. It may be viewed as a knob that sets the number of effective nearest neighbors[2]. The second one is **learning rate** that defines how quickly an algorithm learns on encountering new examples/data points.\n", + "\n", + "\n", + "\n", + "The above plot was generated with perplexity 8, learning rate 10 and iteration 500. Though the results could vary on successive runs, and you may not get the exact plot as above with same hyperparameter settings. But some small clusters will start forming as above, with different orientations.\n", + "\n", + "### Findings\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "\n", + "# References\n", + " 1. https://grouplens.org/datasets/movielens/\n", + " 2. 
https://lvdmaaten.github.io/tsne/\n", + " \n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/notebooks/pca.png b/docs/notebooks/pca.png new file mode 100644 index 0000000000..6a90939056 Binary files /dev/null and b/docs/notebooks/pca.png differ diff --git a/docs/notebooks/tsne.png b/docs/notebooks/tsne.png new file mode 100644 index 0000000000..51f49018bc Binary files /dev/null and b/docs/notebooks/tsne.png differ diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index a166d17687..9c1a325528 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -61,6 +61,7 @@ from gensim.utils import call_on_class_only from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc from gensim.models.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg +from gensim.models.keyedvectors import KeyedVectors from six.moves import xrange, zip from six import string_types, integer_types, itervalues @@ -808,6 +809,44 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen if self.docvecs and hasattr(self.docvecs, 'doctag_syn0_lockf'): del self.docvecs.doctag_syn0_lockf + def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): + """ + Store the input-hidden weight matrix. 
+ + `fname` is the file used to save the vectors in + `doctag_vec` is an optional boolean indicating whether to store document vectors + `word_vec` is an optional boolean indicating whether to store word vectors + (if both doctag_vec and word_vec are True, then both vectors are stored in the same file) + `prefix` to uniquely identify doctags from word vocab, and avoid collision + in case of repeated string in doctag and word vocab + `fvocab` is an optional file used to save the vocabulary + `binary` is an optional boolean indicating whether the data is to be saved + in binary word2vec format (default: False) + + """ + total_vec = len(self.wv.vocab) + len(self.docvecs) + # save word vectors + if word_vec: + if not doctag_vec: + total_vec = len(self.wv.vocab) + KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec) + # save document vectors + if doctag_vec: + with utils.smart_open(fname, 'ab') as fout: + if not word_vec: + total_vec = len(self.docvecs) + logger.info("storing %sx%s projection weights into %s" % (total_vec, self.vector_size, fname)) + fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size))) + # store as in input order + for i in range(len(self.docvecs)): + doctag = prefix + str(self.docvecs.index_to_doctag(i)) + row = self.docvecs.doctag_syn0[i] + if binary: + fout.write(utils.to_utf8(doctag) + b" " + row.tostring()) + else: + fout.write(utils.to_utf8("%s %s\n" % (doctag, ' '.join("%f" % val for val in row)))) + + class TaggedBrownCorpus(object): """Iterate over documents from the Brown corpus (part of NLTK data), yielding each document out as a TaggedDocument object.""" diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 51918855a0..4d187bd7dd 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -118,7 +118,7 @@ def save(self, *args, **kwargs): kwargs['ignore'] = kwargs.get('ignore', ['syn0norm']) super(KeyedVectors, self).save(*args, **kwargs) - def 
save_word2vec_format(self, fname, fvocab=None, binary=False): + def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): """ Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility. @@ -127,18 +127,22 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False): `fvocab` is an optional file used to save the vocabulary `binary` is an optional boolean indicating whether the data is to be saved in binary word2vec format (default: False) + `total_vec` is an optional parameter to explicitly specify total no. of vectors + (in case word vectors are appended with document vectors afterwards) """ + if total_vec is None: + total_vec = len(self.vocab) vector_size = self.syn0.shape[1] if fvocab is not None: logger.info("storing vocabulary in %s" % (fvocab)) with utils.smart_open(fvocab, 'wb') as vout: for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count))) - logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), vector_size, fname)) + logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname)) assert (len(self.vocab), vector_size) == self.syn0.shape with utils.smart_open(fname, 'wb') as fout: - fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape)) + fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size))) # store in sorted order: most frequent words at the top for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): row = self.syn0[vocab.index] diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 1cc32f0095..cfb9220ca5 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -23,7 +23,7 @@ import numpy as np from gensim import utils, matutils -from gensim.models import doc2vec +from gensim.models import doc2vec, keyedvectors module_path = os.path.dirname(__file__) # needed 
def testPersistenceWord2VecFormat(self):
    """Test storing the model's vectors in word2vec format.

    Saves the document and/or word vectors in binary word2vec format, loads the
    file back through KeyedVectors, and checks that the number of loaded
    entries matches the number of vectors that were requested.
    """
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    word_count = len(model.wv.vocab)
    doctag_count = len(model.docvecs)
    # (file suffix, doctag_vec, word_vec, expected number of stored vectors)
    cases = [
        ('dw', True, True, word_count + doctag_count),  # document and word embeddings
        ('d', True, False, doctag_count),  # document embeddings only
        ('w', False, True, word_count),  # word embeddings only
    ]
    for suffix, doctag_vec, word_vec, expected in cases:
        fname = os.path.join(tempfile.gettempdir(), 'gensim_doc2vec.' + suffix)
        # drop leftovers from earlier runs so only freshly written data is checked
        if os.path.exists(fname):
            os.remove(fname)
        try:
            model.save_word2vec_format(fname, doctag_vec=doctag_vec, word_vec=word_vec, binary=True)
            loaded = keyedvectors.KeyedVectors.load_word2vec_format(fname, binary=True)
            self.assertEqual(expected, len(loaded.vocab))
        finally:
            # do not leave fixed-name files behind in the shared temp directory
            if os.path.exists(fname):
                os.remove(fname)