diff --git a/.circleci/config.yml b/.circleci/config.yml
index d2125123c3..fd4dc7f12f 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -30,7 +30,7 @@ jobs:
name: Build documentation
command: |
source venv/bin/activate
- tox -e docs -vv
+ tox -e compile,docs -vv
- store_artifacts:
path: docs/src/_build
diff --git a/.gitignore b/.gitignore
index aef8db9736..c3ba120f37 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@
*.o
*.so
*.pyc
+*.pyo
+*.pyd
# Packages #
############
diff --git a/.travis.yml b/.travis.yml
index 3cbccc0b0a..e8df82ceec 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,7 +13,10 @@ language: python
matrix:
include:
- python: '2.7'
- env: TOXENV="flake8"
+ env: TOXENV="flake8,flake8-docs"
+
+ - python: '3.6'
+ env: TOXENV="flake8,flake8-docs"
- python: '2.7'
env: TOXENV="py27-linux"
@@ -24,5 +27,13 @@ matrix:
- python: '3.6'
env: TOXENV="py36-linux"
+ - python: '3.7'
+ env:
+ - TOXENV="py37-linux"
+ - BOTO_CONFIG="/dev/null"
+ dist: xenial
+ sudo: true
+
+
install: pip install tox
script: tox -vv
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7dd69eb6de..3e51b0f8dd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,107 @@
Changes
===========
+## 3.6.0, 2018-09-20
+
+### :star2: New features
+* File-based training for `*2Vec` models (__[@persiyanov](https://github.com/persiyanov)__, [#2127](https://github.com/RaRe-Technologies/gensim/pull/2127) & [#2078](https://github.com/RaRe-Technologies/gensim/pull/2078) & [#2048](https://github.com/RaRe-Technologies/gensim/pull/2048))
+
+ New training mode for `*2Vec` models (word2vec, doc2vec, fasttext) that allows model training to scale linearly with the number of cores (full GIL elimination). The result of our Google Summer of Code 2018 project by Dmitry Persiyanov.
+
+ **Benchmark**
+ - Dataset: `full English Wikipedia`
+ - Cloud: `GCE`
+ - CPU: `Intel(R) Xeon(R) CPU @ 2.30GHz 32 cores`
+ - BLAS: `MKL`
+
+
+ | Model | Queue-based version [sec] | File-based version [sec] | speed up | Accuracy (queue-based) | Accuracy (file-based) |
+ |-------|------------|--------------------|----------|----------------|-----------------------|
+ | Word2Vec | 9230 | **2437** | **3.79x** | 0.754 (± 0.003) | 0.750 (± 0.001) |
+ | Doc2Vec | 18264 | **2889** | **6.32x** | 0.721 (± 0.002) | 0.683 (± 0.003) |
+ | FastText | 16361 | **10625** | **1.54x** | 0.642 (± 0.002) | 0.660 (± 0.001) |
+
+ Usage:
+
+ ```python
+ import gensim.downloader as api
+ from multiprocessing import cpu_count
+ from gensim.utils import save_as_line_sentence
+ from gensim.test.utils import get_tmpfile
+ from gensim.models import Word2Vec, Doc2Vec, FastText
+
+
+ # Convert any corpus to the needed format: 1 document per line, words delimited by " "
+ corpus = api.load("text8")
+ corpus_fname = get_tmpfile("text8-file-sentence.txt")
+ save_as_line_sentence(corpus, corpus_fname)
+
+ # Choose num of cores that you want to use (let's use all, models scale linearly now!)
+ num_cores = cpu_count()
+
+ # Train models using all cores
+ w2v_model = Word2Vec(corpus_file=corpus_fname, workers=num_cores)
+ d2v_model = Doc2Vec(corpus_file=corpus_fname, workers=num_cores)
+ ft_model = FastText(corpus_file=corpus_fname, workers=num_cores)
+
+ ```
+ [Read notebook tutorial with full description.](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Any2Vec_Filebased.ipynb)
+
+
+### :+1: Improvements
+
+* Add scikit-learn wrapper for `FastText` (__[@mcemilg](https://github.com/mcemilg)__, [#2178](https://github.com/RaRe-Technologies/gensim/pull/2178))
+* Add multiprocessing support for `BM25` (__[@Shiki-H](https://github.com/Shiki-H)__, [#2146](https://github.com/RaRe-Technologies/gensim/pull/2146))
+* Add `name_only` option for the downloader API, illustrated below (__[@aneesh-joshi](https://github.com/aneesh-joshi)__, [#2143](https://github.com/RaRe-Technologies/gensim/pull/2143))
+* Make `word2vec2tensor` script compatible with `python3` (__[@vsocrates](https://github.com/vsocrates)__, [#2147](https://github.com/RaRe-Technologies/gensim/pull/2147))
+* Add custom filter for `Wikicorpus` (__[@mattilyra](https://github.com/mattilyra)__, [#2089](https://github.com/RaRe-Technologies/gensim/pull/2089))
+* Make `similarity_matrix` support non-contiguous dictionaries (__[@Witiko](https://github.com/Witiko)__, [#2047](https://github.com/RaRe-Technologies/gensim/pull/2047))
+
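+A minimal sketch of the new `name_only` downloader option above (assuming it is exposed as a keyword argument of `gensim.downloader.info`; the exact shape of the returned structure may differ):
+
+```python
+import gensim.downloader as api
+
+# List only the names of the available corpora and models, without the full metadata.
+info = api.info(name_only=True)
+print(info["corpora"])
+print(info["models"])
+```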
+
+### :red_circle: Bug fixes
+
+* Fix memory consumption in `AuthorTopicModel` (__[@philipphager](https://github.com/philipphager)__, [#2122](https://github.com/RaRe-Technologies/gensim/pull/2122))
+* Correctly process empty documents in `AuthorTopicModel` (__[@probinso](https://github.com/probinso)__, [#2133](https://github.com/RaRe-Technologies/gensim/pull/2133))
+* Fix ZeroDivisionError `keywords` issue with short input (__[@LShostenko](https://github.com/LShostenko)__, [#2154](https://github.com/RaRe-Technologies/gensim/pull/2154))
+* Fix `min_count` handling in phrases detection using `npmi_scorer` (__[@lopusz](https://github.com/lopusz)__, [#2072](https://github.com/RaRe-Technologies/gensim/pull/2072))
+* Remove duplicate count from `Phraser` log message (__[@robguinness](https://github.com/robguinness)__, [#2151](https://github.com/RaRe-Technologies/gensim/pull/2151))
+* Replace `np.integer` -> `np.int` in `AuthorTopicModel` (__[@menshikh-iv](https://github.com/menshikh-iv)__, [#2145](https://github.com/RaRe-Technologies/gensim/pull/2145))
+
+
+### :books: Tutorial and doc improvements
+
+* Update docstring with new analogy evaluation method (__[@akutuzov](https://github.com/akutuzov)__, [#2130](https://github.com/RaRe-Technologies/gensim/pull/2130))
+* Improve `prune_at` parameter description for `gensim.corpora.Dictionary` (__[@yxonic](https://github.com/yxonic)__, [#2128](https://github.com/RaRe-Technologies/gensim/pull/2128))
+* Fix `default` -> `auto` prior parameter in documentation for lda-related models (__[@Laubeee](https://github.com/Laubeee)__, [#2156](https://github.com/RaRe-Technologies/gensim/pull/2156))
+* Use heading instead of bold style in `gensim.models.translation_matrix` (__[@nzw0301](https://github.com/nzw0301)__, [#2164](https://github.com/RaRe-Technologies/gensim/pull/2164))
+* Fix quote of vocabulary from `gensim.models.Word2Vec` (__[@nzw0301](https://github.com/nzw0301)__, [#2161](https://github.com/RaRe-Technologies/gensim/pull/2161))
+* Replace deprecated parameters with new in docstring of `gensim.models.Doc2Vec` (__[@xuhdev](https://github.com/xuhdev)__, [#2165](https://github.com/RaRe-Technologies/gensim/pull/2165))
+* Fix formula in Mallet documentation (__[@Laubeee](https://github.com/Laubeee)__, [#2186](https://github.com/RaRe-Technologies/gensim/pull/2186))
+* Fix minor semantic issue in docs for `Phrases` (__[@RunHorst](https://github.com/RunHorst)__, [#2148](https://github.com/RaRe-Technologies/gensim/pull/2148))
+* Fix typo in documentation (__[@KenjiOhtsuka](https://github.com/KenjiOhtsuka)__, [#2157](https://github.com/RaRe-Technologies/gensim/pull/2157))
+* Additional documentation fixes (__[@piskvorky](https://github.com/piskvorky)__, [#2121](https://github.com/RaRe-Technologies/gensim/pull/2121))
+
+### :warning: Deprecations (will be removed in the next major release)
+
+* Remove
+ - `gensim.models.wrappers.fasttext` (obsoleted by the new native `gensim.models.fasttext` implementation)
+ - `gensim.examples`
+ - `gensim.nosy`
+ - `gensim.scripts.word2vec_standalone`
+ - `gensim.scripts.make_wiki_lemma`
+ - `gensim.scripts.make_wiki_online`
+ - `gensim.scripts.make_wiki_online_lemma`
+ - `gensim.scripts.make_wiki_online_nodebug`
+ - `gensim.scripts.make_wiki` (all of these obsoleted by the new native `gensim.scripts.segment_wiki` implementation)
+ - "deprecated" functions and attributes
+
+* Move
+ - `gensim.scripts.make_wikicorpus` ➡ `gensim.scripts.make_wiki.py`
+ - `gensim.summarization` ➡ `gensim.models.summarization`
+ - `gensim.topic_coherence` ➡ `gensim.models._coherence`
+ - `gensim.utils` ➡ `gensim.utils.utils` (old imports will continue to work)
+ - `gensim.parsing.*` ➡ `gensim.utils.text_utils`
+
+
## 3.5.0, 2018-07-06
This release comprises a glorious 38 pull requests from 28 contributors. Most of the effort went into improving the documentation—hence the release code name "Docs 💬"!
@@ -202,7 +304,7 @@ Apart from the **massive overhaul of all Gensim documentation** (including docst
- `gensim.parsing.*` ➡ `gensim.utils.text_utils`
-## 3.3.0, 2018-01-02
+## 3.3.0, 2018-02-02
:star2: New features:
* Re-designed all "*2vec" implementations (__[@manneshiva](https://github.com/manneshiva)__, [#1777](https://github.com/RaRe-Technologies/gensim/pull/1777))
diff --git a/MANIFEST.in b/MANIFEST.in
index 9bfc31660f..da4b2ee47e 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,14 +4,29 @@ include CHANGELOG.md
include COPYING
include COPYING.LESSER
include ez_setup.py
+
include gensim/models/voidptr.h
+include gensim/models/fast_line_sentence.h
+
include gensim/models/word2vec_inner.c
include gensim/models/word2vec_inner.pyx
include gensim/models/word2vec_inner.pxd
+include gensim/models/word2vec_corpusfile.cpp
+include gensim/models/word2vec_corpusfile.pyx
+include gensim/models/word2vec_corpusfile.pxd
+
include gensim/models/doc2vec_inner.c
include gensim/models/doc2vec_inner.pyx
+include gensim/models/doc2vec_inner.pxd
+include gensim/models/doc2vec_corpusfile.cpp
+include gensim/models/doc2vec_corpusfile.pyx
+
include gensim/models/fasttext_inner.c
include gensim/models/fasttext_inner.pyx
+include gensim/models/fasttext_inner.pxd
+include gensim/models/fasttext_corpusfile.cpp
+include gensim/models/fasttext_corpusfile.pyx
+
include gensim/models/_utils_any2vec.c
include gensim/models/_utils_any2vec.pyx
include gensim/corpora/_mmreader.c
diff --git a/README.md b/README.md
index d2b9e865f5..78c2209f42 100644
--- a/README.md
+++ b/README.md
@@ -119,29 +119,23 @@ Documentation
Adopters
--------
-
-
-| Name | Logo | URL | Description |
-|----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| RaRe Technologies | ![rare](docs/src/readme_images/rare.png) | [rare-technologies.com](http://rare-technologies.com) | Machine learning & NLP consulting and training. Creators and maintainers of Gensim. |
-| Mindseye | ![mindseye](docs/src/readme_images/mindseye.png) | [mindseye.com](http://www.mindseyesolutions.com/) | Similarities in legal documents |
-| Talentpair | ![talent-pair](docs/src/readme_images/talent-pair.png) | [talentpair.com](http://talentpair.com) | Data science driving high-touch recruiting |
-| Tailwind | ![tailwind](docs/src/readme_images/tailwind.png)| [Tailwindapp.com](https://www.tailwindapp.com/)| Post interesting and relevant content to Pinterest |
-| Issuu | ![issuu](docs/src/readme_images/issuu.png) | [Issuu.com](https://issuu.com/)| Gensim’s LDA module lies at the very core of the analysis we perform on each uploaded publication to figure out what it’s all about.
-| Sports Authority | ![sports-authority](docs/src/readme_images/sports-authority.png) | [sportsauthority.com](https://en.wikipedia.org/wiki/Sports_Authority)| Text mining of customer surveys and social media sources |
-| Search Metrics | ![search-metrics](docs/src/readme_images/search-metrics.png) | [searchmetrics.com](http://www.searchmetrics.com/)| Gensim word2vec used for entity disambiguation in Search Engine Optimisation
-| Cisco Security | ![cisco](docs/src/readme_images/cisco.png) | [cisco.com](http://www.cisco.com/c/en/us/products/security/index.html)| Large-scale fraud detection
-| 12K Research | ![12k](docs/src/readme_images/12k.png)| [12k.co](https://12k.co/)| Document similarity analysis on media articles
-| National Institutes of Health | ![nih](docs/src/readme_images/nih.png) | [github/NIHOPA](https://github.com/NIHOPA/pipeline_word2vec)| Processing grants and publications with word2vec
-| Codeq LLC | ![codeq](docs/src/readme_images/codeq.png) | [codeq.com](https://codeq.com)| Document classification with word2vec
-| Mass Cognition | ![mass-cognition](docs/src/readme_images/mass-cognition.png) | [masscognition.com](http://www.masscognition.com/) | Topic analysis service for consumer text data and general text data |
-| Stillwater Supercomputing | ![stillwater](docs/src/readme_images/stillwater.png) | [stillwater-sc.com](http://www.stillwater-sc.com/) | Document comprehension and association with word2vec |
-| Channel 4 | ![channel4](docs/src/readme_images/channel4.png) | [channel4.com](http://www.channel4.com/) | Recommendation engine |
-| Amazon | ![amazon](docs/src/readme_images/amazon.png) | [amazon.com](http://www.amazon.com/) | Document similarity|
-| SiteGround Hosting | ![siteground](docs/src/readme_images/siteground.png) | [siteground.com](https://www.siteground.com/) | An ensemble search engine which uses different embeddings models and similarities, including word2vec, WMD, and LDA. |
-| Juju | ![juju](docs/src/readme_images/juju.png) | [www.juju.com](http://www.juju.com/) | Provide non-obvious related job suggestions. |
-| NLPub | ![nlpub](docs/src/readme_images/nlpub.png) | [nlpub.org](https://nlpub.org/) | Distributional semantic models including word2vec. |
-|Capital One | ![capitalone](docs/src/readme_images/capitalone.png) | [www.capitalone.com](https://www.capitalone.com/) | Topic modeling for customer complaints exploration. |
+| Company | Logo | Industry | Use of Gensim |
+|---------|------|----------|---------------|
+| [RARE Technologies](http://rare-technologies.com) | ![rare](docs/src/readme_images/rare.png) | ML & NLP consulting | Creators of Gensim – this is us! |
+| [Amazon](http://www.amazon.com/) | ![amazon](docs/src/readme_images/amazon.png) | Retail | Document similarity. |
+| [National Institutes of Health](https://github.com/NIHOPA/pipeline_word2vec) | ![nih](docs/src/readme_images/nih.png) | Health | Processing grants and publications with word2vec. |
+| [Cisco Security](http://www.cisco.com/c/en/us/products/security/index.html) | ![cisco](docs/src/readme_images/cisco.png) | Security | Large-scale fraud detection. |
+| [Mindseye](http://www.mindseyesolutions.com/) | ![mindseye](docs/src/readme_images/mindseye.png) | Legal | Similarities in legal documents. |
+| [Channel 4](http://www.channel4.com/) | ![channel4](docs/src/readme_images/channel4.png) | Media | Recommendation engine. |
+| [Talentpair](http://talentpair.com) | ![talent-pair](docs/src/readme_images/talent-pair.png) | HR | Candidate matching in high-touch recruiting. |
+| [Juju](http://www.juju.com/) | ![juju](docs/src/readme_images/juju.png) | HR | Provide non-obvious related job suggestions. |
+| [Tailwind](https://www.tailwindapp.com/) | ![tailwind](docs/src/readme_images/tailwind.png) | Media | Post interesting and relevant content to Pinterest. |
+| [Issuu](https://issuu.com/) | ![issuu](docs/src/readme_images/issuu.png) | Media | Gensim's LDA module lies at the very core of the analysis we perform on each uploaded publication to figure out what it's all about. |
+| [Search Metrics](http://www.searchmetrics.com/) | ![search-metrics](docs/src/readme_images/search-metrics.png) | Content Marketing | Gensim word2vec used for entity disambiguation in Search Engine Optimisation. |
+| [12K Research](https://12k.co/) | ![12k](docs/src/readme_images/12k.png)| Media | Document similarity analysis on media articles. |
+| [Stillwater Supercomputing](http://www.stillwater-sc.com/) | ![stillwater](docs/src/readme_images/stillwater.png) | Hardware | Document comprehension and association with word2vec. |
+| [SiteGround](https://www.siteground.com/) | ![siteground](docs/src/readme_images/siteground.png) | Web hosting | An ensemble search engine which uses different embeddings models and similarities, including word2vec, WMD, and LDA. |
+| [Capital One](https://www.capitalone.com/) | ![capitalone](docs/src/readme_images/capitalone.png) | Finance | Topic modeling for customer complaints exploration. |
-------
diff --git a/appveyor.yml b/appveyor.yml
index 04da45cd43..c9bbf02931 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -28,6 +28,11 @@ environment:
PYTHON_ARCH: "64"
TOXENV: "py36-win"
+ - PYTHON: "C:\\Python37-x64"
+ PYTHON_VERSION: "3.7.0"
+ PYTHON_ARCH: "64"
+ TOXENV: "py37-win"
+
init:
- "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%"
- "ECHO \"%APPVEYOR_SCHEDULED_BUILD%\""
diff --git a/docs/notebooks/Any2Vec_Filebased.ipynb b/docs/notebooks/Any2Vec_Filebased.ipynb
new file mode 100644
index 0000000000..0ad4c2a282
--- /dev/null
+++ b/docs/notebooks/Any2Vec_Filebased.ipynb
@@ -0,0 +1,550 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# *2Vec File-based Training: API Tutorial\n",
+ "\n",
+ "This tutorial introduces a new file-based training mode for **`gensim.models.{Word2Vec, FastText, Doc2Vec}`** which leads to (much) faster training on machines with many cores. Below we demonstrate how to use this new mode, with Python examples."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## In this tutorial\n",
+ "\n",
+    "1. We will show how to use the new training mode on Word2Vec, FastText and Doc2Vec.\n",
+    "2. We will evaluate the performance of file-based training on the English Wikipedia and compare it to the existing queue-based training.\n",
+    "3. We will show that model quality (analogy accuracy on `questions-words.txt`) is almost the same for both modes."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Motivation\n",
+ "\n",
+ "The original implementation of Word2Vec training in Gensim is already super fast (covered in [this blog series](https://rare-technologies.com/word2vec-in-python-part-two-optimizing/), see also [benchmarks against other implementations in Tensorflow, DL4J, and C](https://rare-technologies.com/machine-learning-hardware-benchmarks/)) and flexible, allowing you to train on arbitrary Python streams. We had to jump through [some serious hoops](https://www.youtube.com/watch?v=vU4TlwZzTfU) to make it so, avoiding the Global Interpreter Lock (the dreaded GIL, the main bottleneck for any serious high performance computation in Python).\n",
+ "\n",
+ "The end result worked great for modest machines (< 8 cores), but for higher-end servers, the GIL reared its ugly head again. Simply managing the input stream iterators and worker queues, which has to be done in Python holding the GIL, was becoming the bottleneck. Simply put, the Python implementation didn't scale linearly with cores, as the original C implementation by Tomáš Mikolov did."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![scaling of word2vec file-based training](word2vec_file_scaling.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We decided to change that. After [much](https://github.com/RaRe-Technologies/gensim/pull/2127) [experimentation](https://github.com/RaRe-Technologies/gensim/pull/2048#issuecomment-401494412) and [benchmarking](https://persiyanov.github.io/jekyll/update/2018/05/28/gsoc-first-weeks.html), including some pretty [hardcore outlandish ideas](https://github.com/RaRe-Technologies/gensim/pull/2127#issuecomment-405937741), we figured there's no way around the GIL limitations—not at the level of fine-tuned performance needed here. Remember, we're talking >500k words (training instances) per second, using highly optimized C code. Way past the naive \"vectorize with NumPy arrays\" territory.\n",
+ "\n",
+    "So we decided to introduce a new code path, which has *less flexibility* in favour of *more performance*. We call this code path **`file-based training`**, and it's realized by passing a new `corpus_file` parameter to training. The existing `sentences` parameter (queue-based training) is still available, and you can continue using it without any change: there's **full backward compatibility**."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How it works\n",
+ "\n",
+ "\n",
+ "\n",
+ "| *code path* | *input parameter* | *advantages* | *disadvantages*\n",
+ "| :-------- | :-------- | :--------- | :----------- |\n",
+ "| queue-based training (existing) | `sentences` (Python iterable) | Input can be generated dynamically from any storage, or even on-the-fly. | Scaling plateaus after 8 cores. |\n",
+ "| file-based training (new) | `corpus_file` (file on disk) | Scales linearly with CPU cores. | Training corpus must be serialized to disk in a specific format. |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "When you specify `corpus_file`, the model will read and process different portions of the file with different workers. The entire bulk of the work is done outside of the GIL, using no Python structures at all. The workers update the same weight matrix, but otherwise there's no communication; each worker munches on its data portion completely independently. This is the same approach the original C tool uses.\n",
+    "\n",
+    "Training with `corpus_file` yields a **significant performance boost**: for example, in the experiment below, training is 3.7x faster with 32 workers compared to training with the `sentences` argument. It even outperforms the original Word2Vec C tool in terms of words/sec processing speed on high-core machines.\n",
+    "\n",
+    "The limitation of this approach is that the `corpus_file` argument accepts a path to your corpus file, which must be stored on disk in a specific format. The format is simply the well-known [gensim.models.word2vec.LineSentence](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence): one sentence per line, with words separated by spaces."
+ ]
+ },
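+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For illustration only (a made-up toy snippet, not part of any benchmark corpus), a file in this format is just plain text, one sentence per line:\n",
+    "\n",
+    "```\n",
+    "human interface computer\n",
+    "survey user computer system response time\n",
+    "graph minors trees\n",
+    "```"
+   ]
+  },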
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How to use it\n",
+ "\n",
+ "You only need to:\n",
+ "\n",
+ "1. Save your corpus in the LineSentence format to disk (you may use [gensim.utils.save_as_line_sentence(your_corpus, your_corpus_file)](https://radimrehurek.com/gensim/utils.html#gensim.utils.save_as_line_sentence) for convenience).\n",
+ "2. Change `sentences=your_corpus` argument to `corpus_file=your_corpus_file` in `Word2Vec.__init__`, `Word2Vec.build_vocab`, `Word2Vec.train` calls.\n",
+ "\n",
+ "\n",
+ "A short Word2Vec example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1\n"
+ ]
+ }
+ ],
+ "source": [
+ "import gensim\n",
+ "import gensim.downloader as api\n",
+ "from gensim.utils import save_as_line_sentence\n",
+ "from gensim.models.word2vec import Word2Vec\n",
+ "\n",
+ "print(gensim.models.word2vec.CORPUSFILE_VERSION) # must be >= 0, i.e. optimized compiled version\n",
+ "\n",
+ "corpus = api.load(\"text8\")\n",
+ "save_as_line_sentence(corpus, \"my_corpus.txt\")\n",
+ "\n",
+ "model = Word2Vec(corpus_file=\"my_corpus.txt\", iter=5, size=300, workers=14)"
+ ]
+ },
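+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The same `corpus_file` parameter is also accepted by the explicit `build_vocab` and `train` calls. A minimal sketch of that flow (assuming the total word count required by `train` is available after `build_vocab`, e.g. via the `corpus_total_words` attribute):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Word2Vec(size=300, workers=14)\n",
+    "\n",
+    "# Build the vocabulary and then train, reading the corpus from disk in both steps.\n",
+    "model.build_vocab(corpus_file=\"my_corpus.txt\")\n",
+    "model.train(corpus_file=\"my_corpus.txt\", total_words=model.corpus_total_words, epochs=5)"
+   ]
+  },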
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "### Let's prepare the full Wikipedia dataset as a training corpus\n",
+    "\n",
+    "We load the Wikipedia dump from `gensim-data`, perform text preprocessing with Gensim functions, and finally save the processed corpus in LineSentence format."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CORPUS_FILE = 'wiki-en-20171001.txt'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import itertools\n",
+ "from gensim.parsing.preprocessing import preprocess_string\n",
+ "\n",
+ "def processed_corpus():\n",
+ " raw_corpus = api.load('wiki-english-20171001')\n",
+ " for article in raw_corpus:\n",
+ " # concatenate all section titles and texts of each Wikipedia article into a single \"sentence\"\n",
+ " doc = '\\n'.join(itertools.chain.from_iterable(zip(article['section_titles'], article['section_texts'])))\n",
+ " yield preprocess_string(doc)\n",
+ "\n",
+ "# serialize the preprocessed corpus into a single file on disk, using memory-efficient streaming\n",
+ "save_as_line_sentence(processed_corpus(), CORPUS_FILE)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Word2Vec\n",
+ "\n",
+ "We train two models:\n",
+ "* With `sentences` argument\n",
+ "* With `corpus_file` argument\n",
+ "\n",
+ "\n",
+    "Then, we compare the timings and accuracy on `questions-words.txt`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from gensim.models.word2vec import LineSentence\n",
+ "import time\n",
+ "\n",
+ "start_time = time.time()\n",
+ "model_sent = Word2Vec(sentences=LineSentence(CORPUS_FILE), iter=5, size=300, workers=32)\n",
+ "sent_time = time.time() - start_time\n",
+ "\n",
+ "start_time = time.time()\n",
+ "model_corp_file = Word2Vec(corpus_file=CORPUS_FILE, iter=5, size=300, workers=32)\n",
+ "file_time = time.time() - start_time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training model with `sentences` took 9494.237 seconds\n",
+ "Training model with `corpus_file` took 2566.170 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Training model with `sentences` took {:.3f} seconds\".format(sent_time))\n",
+ "print(\"Training model with `corpus_file` took {:.3f} seconds\".format(file_time))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "**Training with `corpus_file` was 3.7x faster!**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, let's compare the accuracies:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from gensim.test.utils import datapath"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/persiyanov/gensim/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+ " if np.issubdtype(vec.dtype, np.int):\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Word analogy accuracy with `sentences`: 75.4%\n",
+ "Word analogy accuracy with `corpus_file`: 74.8%\n"
+ ]
+ }
+ ],
+ "source": [
+ "model_sent_accuracy = model_sent.wv.evaluate_word_analogies(datapath('questions-words.txt'))[0]\n",
+ "print(\"Word analogy accuracy with `sentences`: {:.1f}%\".format(100.0 * model_sent_accuracy))\n",
+ "\n",
+ "model_corp_file_accuracy = model_corp_file.wv.evaluate_word_analogies(datapath('questions-words.txt'))[0]\n",
+ "print(\"Word analogy accuracy with `corpus_file`: {:.1f}%\".format(100.0 * model_corp_file_accuracy))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The accuracies are approximately the same."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## FastText"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Short example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gensim.downloader as api\n",
+ "from gensim.utils import save_as_line_sentence\n",
+ "from gensim.models.fasttext import FastText\n",
+ "\n",
+ "corpus = api.load(\"text8\")\n",
+ "save_as_line_sentence(corpus, \"my_corpus.txt\")\n",
+ "\n",
+ "model = FastText(corpus_file=\"my_corpus.txt\", iter=5, size=300, workers=14)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Let's compare the timings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from gensim.models.word2vec import LineSentence\n",
+ "from gensim.models.fasttext import FastText\n",
+ "import time\n",
+ "\n",
+ "start_time = time.time()\n",
+ "model_corp_file = FastText(corpus_file=CORPUS_FILE, iter=5, size=300, workers=32)\n",
+ "file_time = time.time() - start_time\n",
+ "\n",
+ "start_time = time.time()\n",
+ "model_sent = FastText(sentences=LineSentence(CORPUS_FILE), iter=5, size=300, workers=32)\n",
+ "sent_time = time.time() - start_time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training model with `sentences` took 17963.283 seconds\n",
+ "Training model with `corpus_file` took 10725.931 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Training model with `sentences` took {:.3f} seconds\".format(sent_time))\n",
+ "print(\"Training model with `corpus_file` took {:.3f} seconds\".format(file_time))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**We see a 1.67x performance boost!**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Now, accuracies:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/persiyanov/gensim/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+ " if np.issubdtype(vec.dtype, np.int):\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Word analogy accuracy with `sentences`: 64.2%\n",
+ "Word analogy accuracy with `corpus_file`: 66.2%\n"
+ ]
+ }
+ ],
+ "source": [
+ "from gensim.test.utils import datapath\n",
+ "\n",
+ "model_sent_accuracy = model_sent.wv.evaluate_word_analogies(datapath('questions-words.txt'))[0]\n",
+ "print(\"Word analogy accuracy with `sentences`: {:.1f}%\".format(100.0 * model_sent_accuracy))\n",
+ "\n",
+ "model_corp_file_accuracy = model_corp_file.wv.evaluate_word_analogies(datapath('questions-words.txt'))[0]\n",
+ "print(\"Word analogy accuracy with `corpus_file`: {:.1f}%\".format(100.0 * model_corp_file_accuracy))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## Doc2Vec"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Short example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gensim.downloader as api\n",
+ "from gensim.utils import save_as_line_sentence\n",
+ "from gensim.models.doc2vec import Doc2Vec\n",
+ "\n",
+ "corpus = api.load(\"text8\")\n",
+ "save_as_line_sentence(corpus, \"my_corpus.txt\")\n",
+ "\n",
+ "model = Doc2Vec(corpus_file=\"my_corpus.txt\", epochs=5, vector_size=300, workers=14)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Let's compare the timings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument\n",
+ "import time\n",
+ "\n",
+ "start_time = time.time()\n",
+ "model_corp_file = Doc2Vec(corpus_file=CORPUS_FILE, epochs=5, vector_size=300, workers=32)\n",
+ "file_time = time.time() - start_time\n",
+ "\n",
+ "start_time = time.time()\n",
+ "model_sent = Doc2Vec(documents=TaggedLineDocument(CORPUS_FILE), epochs=5, vector_size=300, workers=32)\n",
+ "sent_time = time.time() - start_time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training model with `sentences` took 20427.949 seconds\n",
+ "Training model with `corpus_file` took 3085.256 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Training model with `sentences` took {:.3f} seconds\".format(sent_time))\n",
+ "print(\"Training model with `corpus_file` took {:.3f} seconds\".format(file_time))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**A 6.6x speedup!**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Accuracies:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/persiyanov/gensim/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+ " if np.issubdtype(vec.dtype, np.int):\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Word analogy accuracy with `sentences`: 71.7%\n",
+ "Word analogy accuracy with `corpus_file`: 67.8%\n"
+ ]
+ }
+ ],
+ "source": [
+ "from gensim.test.utils import datapath\n",
+ "\n",
+ "model_sent_accuracy = model_sent.wv.evaluate_word_analogies(datapath('questions-words.txt'))[0]\n",
+ "print(\"Word analogy accuracy with `sentences`: {:.1f}%\".format(100.0 * model_sent_accuracy))\n",
+ "\n",
+ "model_corp_file_accuracy = model_corp_file.wv.evaluate_word_analogies(datapath('questions-words.txt'))[0]\n",
+ "print(\"Word analogy accuracy with `corpus_file`: {:.1f}%\".format(100.0 * model_corp_file_accuracy))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## TL;DR: Conclusion\n",
+ "\n",
+    "If your training corpus already lives on disk, you lose nothing by switching to the new `corpus_file` training mode. Training will be much faster.\n",
+    "\n",
+    "If your corpus is generated dynamically, you can either serialize it to disk first with `gensim.utils.save_as_line_sentence` (and then use the fast `corpus_file` mode), or, if that's not possible, continue using the existing `sentences` training mode.\n",
+ "\n",
+ "------\n",
+ "\n",
+ "This new code branch was created by [@persiyanov](https://github.com/persiyanov) as a Google Summer of Code 2018 project in the [RARE Student Incubator](https://rare-technologies.com/incubator/).\n",
+ "\n",
+ "Questions, comments? Use our Gensim [mailing list](https://groups.google.com/forum/#!forum/gensim) and [twitter](https://twitter.com/gensim_py). Happy training!"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/notebooks/FastText_Tutorial.ipynb b/docs/notebooks/FastText_Tutorial.ipynb
index f547009215..bc964b2829 100644
--- a/docs/notebooks/FastText_Tutorial.ipynb
+++ b/docs/notebooks/FastText_Tutorial.ipynb
@@ -134,7 +134,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the folllowing parameters from the original word2vec - \n",
+ "Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the following parameters from the original word2vec - \n",
" - model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)\n",
" - size: Size of embeddings to be learnt (Default 100)\n",
" - alpha: Initial learning rate (Default 0.025)\n",
diff --git a/docs/notebooks/Poincare Evaluation.ipynb b/docs/notebooks/Poincare Evaluation.ipynb
index 0d3f8bb851..d2dd4bfac5 100644
--- a/docs/notebooks/Poincare Evaluation.ipynb
+++ b/docs/notebooks/Poincare Evaluation.ipynb
@@ -1706,7 +1706,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "1. The model can be investigated further to understand why it doesn't produce results as good as the paper. It is possible that this might be due to training details not present in the paper, or due to us incorrectly interpreting some ambiguous parts of the paper. We have not been able to clarify all such ambiguitities in communication with the authors.\n",
+ "1. The model can be investigated further to understand why it doesn't produce results as good as the paper. It is possible that this might be due to training details not present in the paper, or due to us incorrectly interpreting some ambiguous parts of the paper. We have not been able to clarify all such ambiguities in communication with the authors.\n",
"2. Optimizing the training process further - with a model size of 50 dimensions and a dataset with ~700k relations and ~80k nodes, the Gensim implementation takes around 45 seconds to complete an epoch (~15k relations per second), whereas the open source C++ implementation takes around 1/6th the time (~95k relations per second).\n",
"3. Implementing the variant of the model mentioned in the paper for symmetric graphs and evaluating on the scientific collaboration datasets described earlier in the report."
]
diff --git a/docs/notebooks/Tensorboard_visualizations.ipynb b/docs/notebooks/Tensorboard_visualizations.ipynb
index a2d88e9619..f65083b938 100644
--- a/docs/notebooks/Tensorboard_visualizations.ipynb
+++ b/docs/notebooks/Tensorboard_visualizations.ipynb
@@ -844,7 +844,7 @@
"- **T-SNE**: The idea of T-SNE is to place the local neighbours close to each other, and almost completely ignoring the global structure. It is useful for exploring local neighborhoods and finding local clusters. But the global trends are not represented accurately and the separation between different groups is often not preserved (see the t-sne plots of our data below which testify the same).\n",
"\n",
"\n",
- "- **Custom Projections**: This is a custom bethod based on the text searches you define for different directions. It could be useful for finding meaningful directions in the vector space, for example, female to male, currency to country etc.\n",
+ "- **Custom Projections**: This is a custom method based on the text searches you define for different directions. It could be useful for finding meaningful directions in the vector space, for example, female to male, currency to country etc.\n",
"\n",
"You can refer to this [doc](https://www.tensorflow.org/get_started/embedding_viz) for instructions on how to use and navigate through different panels available in TensorBoard."
]
@@ -1112,9 +1112,10 @@
"\n",
"The above plot was generated with perplexity 11, learning rate 10 and iteration 1100. Though the results could vary on successive runs, and you may not get the exact plot as above even with same hyperparameter settings. But some small clusters will start forming as above, with different orientations.\n",
"\n",
- "I named some clusters above based on the genre of it's movies and also using the `show_topic()` to see relevant terms of the topic which was most prevelant in a cluster. Most of the clusters had doocumets belonging dominantly to a single topic. For ex. The cluster with movies belonging primarily to topic 0 could be named Fantasy/Romance based on terms displayed below for topic 0. You can play with the visualization yourself on this [link](http://projector.tensorflow.org/?config=https://raw.githubusercontent.com/parulsethi/LdaProjector/master/doc_lda_config.json) and try to conclude a label for clusters based on movies it has and their dominant topic. You can see the top 5 topics of every point by hovering over it.\n",
+    "I named some clusters above based on the genre of their movies, and also by using `show_topic()` to see the relevant terms of the topic that was most prevalent in a cluster. Most of the clusters had documents belonging dominantly to a single topic. For example, the cluster with movies belonging primarily to topic 0 could be named Fantasy/Romance, based on the terms displayed below for topic 0. You can play with the visualization yourself via this [link](http://projector.tensorflow.org/?config=https://raw.githubusercontent.com/parulsethi/LdaProjector/master/doc_lda_config.json) and try to infer a label for each cluster based on the movies it contains and its dominant topic. You can see the top 5 topics of every point by hovering over it.\n",
"\n",
- "Now, we can notice that their are more than 10 clusters in the above image, whereas we trained our model for `num_topics=10`. It's because their are few clusters, which has documents belonging to more than one topic with an approximately close topic probability values."
+    "Now, notice that there are more than 10 clusters in the above image, even though we trained our model with `num_topics=10`. That's because a few clusters contain documents belonging to more than one topic, with approximately equal topic probability values."
]
},
{
diff --git a/docs/notebooks/Topics_and_Transformations.ipynb b/docs/notebooks/Topics_and_Transformations.ipynb
index 5a8ec7f985..b8b2ff129f 100644
--- a/docs/notebooks/Topics_and_Transformations.ipynb
+++ b/docs/notebooks/Topics_and_Transformations.ipynb
@@ -199,7 +199,7 @@
"In this particular case, we are transforming the same corpus that we used for training, but this is only incidental. Once the transformation model has been initialized, it can be used on any vectors (provided they come from the same vector space, of course), even if they were not used in the training corpus at all. This is achieved by a process called folding-in for LSA, by topic inference for LDA etc.\n",
"\n",
"> Note: \n",
- "> Calling model[corpus] only creates a wrapper around the old corpus document stream – actual conversions are done on-the-fly, during document iteration. We cannot convert the entire corpus at the time of calling corpus_transformed = model[corpus], because that would mean storing the result in main memory, and that contradicts gensim’s objective of memory-indepedence. If you will be iterating over the transformed corpus_transformed multiple times, and the transformation is costly, serialize the resulting corpus to disk first and continue using that.\n",
+ "> Calling model[corpus] only creates a wrapper around the old corpus document stream – actual conversions are done on-the-fly, during document iteration. We cannot convert the entire corpus at the time of calling corpus_transformed = model[corpus], because that would mean storing the result in main memory, and that contradicts gensim’s objective of memory-independence. If you will be iterating over the transformed corpus_transformed multiple times, and the transformation is costly, serialize the resulting corpus to disk first and continue using that.\n",
"\n",
"Transformations can also be serialized, one on top of another, in a sort of chain:"
]
@@ -332,7 +332,7 @@
"metadata": {},
"source": [
"### [Latent Semantic Indexing, LSI (or sometimes LSA)](http://en.wikipedia.org/wiki/Latent_semantic_indexing) \n",
- "LSI transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into a latent space of a lower dimensionality. For the toy corpus above we used only 2 latent dimensions, but on real corpora, target dimensionality of 200–500 is recommended as a “golden standard” [1]."
+ "LSI transforms documents from either bag-of-words or (preferably) TfIdf-weighted space into a latent space of a lower dimensionality. For the toy corpus above we used only 2 latent dimensions, but on real corpora, target dimensionality of 200–500 is recommended as a “golden standard” [1]."
]
},
{
diff --git a/docs/notebooks/WMD_tutorial.ipynb b/docs/notebooks/WMD_tutorial.ipynb
index 3a529f471e..8f627c37ce 100644
--- a/docs/notebooks/WMD_tutorial.ipynb
+++ b/docs/notebooks/WMD_tutorial.ipynb
@@ -14,7 +14,7 @@
"\n",
"WMD is a method that allows us to assess the \"distance\" between two documents in a meaningful way, even when they have no words in common. It uses [word2vec](http://rare-technologies.com/word2vec-tutorial/) [4] vector embeddings of words. It been shown to outperform many of the state-of-the-art methods in *k*-nearest neighbors classification [3].\n",
"\n",
- "WMD is illustrated below for two very similar sentences (illustration taken from [Vlad Niculae's blog](http://vene.ro/blog/word-movers-distance-in-python.html)). The sentences have no words in common, but by matching the relevant words, WMD is able to accurately measure the (dis)similarity between the two sentences. The method also uses the bag-of-words representation of the documents (simply put, the word's frequencies in the documents), noted as $d$ in the figure below. The intution behind the method is that we find the minimum \"traveling distance\" between documents, in other words the most efficient way to \"move\" the distribution of document 1 to the distribution of document 2.\n",
+ "WMD is illustrated below for two very similar sentences (illustration taken from [Vlad Niculae's blog](http://vene.ro/blog/word-movers-distance-in-python.html)). The sentences have no words in common, but by matching the relevant words, WMD is able to accurately measure the (dis)similarity between the two sentences. The method also uses the bag-of-words representation of the documents (simply put, the word's frequencies in the documents), noted as $d$ in the figure below. The intuition behind the method is that we find the minimum \"traveling distance\" between documents, in other words the most efficient way to \"move\" the distribution of document 1 to the distribution of document 2.\n",
"\n",
"\n",
"\n",
@@ -639,4 +639,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/docs/notebooks/Wordrank_comparisons.ipynb b/docs/notebooks/Wordrank_comparisons.ipynb
index 26bac2e880..a3ab167cc1 100644
--- a/docs/notebooks/Wordrank_comparisons.ipynb
+++ b/docs/notebooks/Wordrank_comparisons.ipynb
@@ -1071,7 +1071,7 @@
" # sort analogies according to their mean frequences \n",
" copy_mean_freq = sorted(copy_mean_freq.items(), key=lambda x: x[1][1])\n",
" # prepare analogies buckets according to given size\n",
- " for centre_p in xrange(bucket_size//2, len(copy_mean_freq), bucket_size):\n",
+ " for centre_p in range(bucket_size//2, len(copy_mean_freq), bucket_size):\n",
" bucket = copy_mean_freq[centre_p-bucket_size//2:centre_p+bucket_size//2]\n",
" b_acc = 0\n",
" # calculate current bucket accuracy with b_acc count\n",
@@ -1174,7 +1174,7 @@
"source": [
"This shows the results for text8(17 million tokens). Following points can be observed in this case-\n",
"\n",
- "1. For Semantic analogies, all the models perform comparitively poor on rare words and also when the word frequency is high towards the end.\n",
+    "1. For Semantic analogies, all the models perform comparatively poorly on rare words, and also when the word frequency is high towards the end.\n",
"2. For Syntactic Analogies, FastText performance is fairly well on rare words but then falls steeply at highly frequent words.\n",
"3. WordRank and Word2Vec perform very similar with low accuracy for rare and highly frequent words in Syntactic Analogies.\n",
"4. FastText is again better in total analogies case due to the same reason described previously. Here the total no. of Semantic analogies is 7416 and Syntactic Analogies is 10411.\n",
diff --git a/docs/notebooks/translation_matrix.ipynb b/docs/notebooks/translation_matrix.ipynb
index 2a82098752..5439005a8c 100644
--- a/docs/notebooks/translation_matrix.ipynb
+++ b/docs/notebooks/translation_matrix.ipynb
@@ -417,7 +417,7 @@
"duration = []\n",
"sizeofword = []\n",
"\n",
- "for idx in xrange(0, test_case):\n",
+ "for idx in range(0, test_case):\n",
" sub_pair = word_pair[: (idx + 1) * step]\n",
"\n",
" startTime = time.time()\n",
@@ -1450,7 +1450,7 @@
"small_train_docs = train_docs[:15000]\n",
"# train for small corpus\n",
"model1.build_vocab(small_train_docs)\n",
- "for epoch in xrange(50):\n",
+ "for epoch in range(50):\n",
" shuffle(small_train_docs)\n",
" model1.train(small_train_docs, total_examples=len(small_train_docs), epochs=1)\n",
"model.save(\"small_doc_15000_iter50.bin\")\n",
@@ -1458,7 +1458,7 @@
"large_train_docs = train_docs + test_docs\n",
"# train for large corpus\n",
"model2.build_vocab(large_train_docs)\n",
- "for epoch in xrange(50):\n",
+ "for epoch in range(50):\n",
" shuffle(large_train_docs)\n",
" model2.train(large_train_docs, total_examples=len(train_docs), epochs=1)\n",
"# save the model\n",
diff --git a/docs/notebooks/word2vec_file_scaling.png b/docs/notebooks/word2vec_file_scaling.png
new file mode 100644
index 0000000000..e4f5311736
Binary files /dev/null and b/docs/notebooks/word2vec_file_scaling.png differ
diff --git a/docs/src/_index.rst.unused b/docs/src/_index.rst.unused
new file mode 100644
index 0000000000..71390c1060
--- /dev/null
+++ b/docs/src/_index.rst.unused
@@ -0,0 +1,100 @@
+
+:github_url: https://github.com/RaRe-Technologies/gensim
+
+Gensim documentation
+===================================
+
+============
+Introduction
+============
+
+Gensim is a free Python library designed to automatically extract semantic
+topics from documents, as efficiently (computer-wise) and painlessly (human-wise) as possible.
+
+Gensim is designed to process raw, unstructured digital texts ("plain text").
+
+The algorithms in Gensim, such as **Word2Vec**, **FastText**, **Latent Semantic Analysis**, **Latent Dirichlet Allocation** and **Random Projections**, discover semantic structure of documents by examining statistical co-occurrence patterns within a corpus of training documents. These algorithms are **unsupervised**, which means no human input is necessary -- you only need a corpus of plain text documents.
+
+Once these statistical patterns are found, any plain text documents can be succinctly
+expressed in the new, semantic representation and queried for topical similarity
+against other documents, words or phrases.
+
+.. note::
+ If the previous paragraphs left you confused, you can read more about the `Vector
+ Space Model `_ and `unsupervised
+ document analysis `_ on Wikipedia.
+
+
+.. _design:
+
+Features
+--------
+
+* **Memory independence** -- there is no need for the whole training corpus to
+ reside fully in RAM at any one time (can process large, web-scale corpora).
+* **Memory sharing** -- trained models can be persisted to disk and loaded back via mmap. Multiple processes can share the same data, cutting down the RAM footprint (see the example below).
+* Efficient implementations for several popular vector space algorithms,
+ including Word2Vec, Doc2Vec, FastText, TF-IDF, Latent Semantic Analysis (LSI, LSA),
+ Latent Dirichlet Allocation (LDA) or Random Projection.
+* I/O wrappers and readers from several popular data formats.
+* Fast similarity queries for documents in their semantic representation.
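+
+A minimal sketch of the save/load-with-mmap workflow from the features above (assuming an already trained model, here called ``lsi``; any Gensim model supporting ``save``/``load`` works the same way):
+
+.. sourcecode:: pycon
+
+    >>> import gensim
+    >>> lsi.save('/tmp/model.lsi')  # persist a trained model to disk
+    >>> lsi = gensim.models.LsiModel.load('/tmp/model.lsi', mmap='r')  # load it back, memory-mapping its large arrays read-only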
+
+The **principal design objectives** behind Gensim are:
+
+1. Straightforward interfaces and low API learning curve for developers. Good for prototyping.
+2. Memory independence with respect to the size of the input corpus; all intermediate
+ steps and algorithms operate in a streaming fashion, accessing one document
+ at a time.
+
+.. seealso::
+
+ We built a high performance server for NLP, document analysis, indexing, search and clustering: https://scaletext.ai.
+ ScaleText is a commercial product, available both on-prem or as SaaS.
+ Reach out at info@scaletext.com if you need an industry-grade tool with professional support.
+
+.. _availability:
+
+Availability
+------------
+
+Gensim is licensed under the OSI-approved `GNU LGPLv2.1 license `_ and can be downloaded either from its `github repository `_ or from the `Python Package Index `_.
+
+.. seealso::
+
+ See the :doc:`install ` page for more info on Gensim deployment.
+
+
+.. toctree::
+ :glob:
+ :maxdepth: 1
+ :caption: Getting started
+
+ install
+ intro
+ support
+ about
+ license
+ citing
+
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Tutorials
+
+ tutorial
+ tut1
+ tut2
+ tut3
+
+
+.. toctree::
+ :maxdepth: 1
+ :caption: API Reference
+
+ apiref
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
diff --git a/docs/src/_license.rst.unused b/docs/src/_license.rst.unused
new file mode 100644
index 0000000000..d85983aa44
--- /dev/null
+++ b/docs/src/_license.rst.unused
@@ -0,0 +1,26 @@
+:orphan:
+
+.. _license:
+
+Licensing
+---------
+
+Gensim is licensed under the OSI-approved `GNU LGPLv2.1 license `_.
+
+This means that it's free for both personal and commercial use, but if you make any
+modification to Gensim that you distribute to other people, you have to disclose
+the source code of these modifications.
+
+Apart from that, you are free to redistribute Gensim in any way you like, though you're
+not allowed to modify its license (doh!).
+
+My intent here is to **get more help and community involvement** with the development of Gensim.
+The legalese is therefore less important to me than your input and contributions.
+
+`Contact me `_ if LGPL doesn't fit your bill but you'd like the LGPL restrictions lifted.
+
+.. seealso::
+
+ We built a high performance server for NLP, document analysis, indexing, search and clustering: https://scaletext.ai.
+ ScaleText is a commercial product, available both on-prem or as SaaS.
+ Reach out at info@scaletext.com if you need an industry-grade tool with professional support.
diff --git a/docs/src/changes_080.rst b/docs/src/changes_080.rst
index b038ccb930..2786d4b71a 100644
--- a/docs/src/changes_080.rst
+++ b/docs/src/changes_080.rst
@@ -23,10 +23,12 @@ That's not as tragic as it sounds, gensim was almost there anyway. The changes a
If you stored a model that is affected by this to disk, you'll need to rename its attributes manually:
->>> lsa = gensim.models.LsiModel.load('/some/path') # load old <0.8.0 model
->>> lsa.num_terms, lsa.num_topics = lsa.numTerms, lsa.numTopics # rename attributes
->>> del lsa.numTerms, lsa.numTopics # clean up old attributes (optional)
->>> lsa.save('/some/path') # save again to disk, as 0.8.0 compatible
+.. sourcecode:: pycon
+
+ >>> lsa = gensim.models.LsiModel.load('/some/path') # load old <0.8.0 model
+ >>> lsa.num_terms, lsa.num_topics = lsa.numTerms, lsa.numTopics # rename attributes
+ >>> del lsa.numTerms, lsa.numTopics # clean up old attributes (optional)
+ >>> lsa.save('/some/path') # save again to disk, as 0.8.0 compatible
Only attributes (variables) need to be renamed; method names (functions) are not affected, due to the way `pickle` works.
@@ -41,9 +43,11 @@ and can be processed independently. In addition, documents can now be added to a
There is also a new way to query the similarity indexes:
->>> index = MatrixSimilarity(corpus) # create an index
->>> sims = index[document] # get cosine similarity of query "document" against every document in the index
->>> sims = index[chunk_of_documents] # new syntax!
+.. sourcecode:: pycon
+
+ >>> index = MatrixSimilarity(corpus) # create an index
+ >>> sims = index[document] # get cosine similarity of query "document" against every document in the index
+ >>> sims = index[chunk_of_documents] # new syntax!
Advantage of the last line (querying multiple documents at the same time) is faster execution.
@@ -69,7 +73,7 @@ Other changes (that you're unlikely to notice unless you look)
----------------------------------------------------------------------
* Improved efficiency of ``lsi[corpus]`` transformations (documents are chunked internally for better performance).
-* Large matrices (numpy/scipy.sparse, in `LsiModel`, `Similarity` etc.) are now mmapped to/from disk when doing `save/load`. The `cPickle` approach used previously was too `buggy `_ and `slow `_.
+* Large matrices (numpy/scipy.sparse, in `LsiModel`, `Similarity` etc.) are now `mmapped `_ to/from disk when doing `save/load`. The `cPickle` approach used previously was too `buggy `_ and `slow `_.
* Renamed `chunks` parameter to `chunksize` (i.e. `LsiModel(corpus, num_topics=100, chunksize=20000)`). This better reflects its purpose: size of a chunk=number of documents to be processed at once.
* Also improved memory efficiency of LSI and LDA model generation (again).
* Removed SciPy 0.6 from the list of supported SciPi versions (need >=0.7 now).
diff --git a/docs/src/conf.py b/docs/src/conf.py
index d05558c540..da7d0a1994 100644
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@@ -17,7 +17,7 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.append(os.path.abspath('.'))
+sys.path.insert(0, os.path.abspath('../..'))
# -- General configuration -----------------------------------------------------
@@ -55,9 +55,9 @@
# built documents.
#
# The short X.Y version.
-version = '3.5'
+version = '3.6'
# The full version, including alpha/beta/rc tags.
-release = '3.5.0'
+release = '3.6.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
diff --git a/docs/src/dist_lda.rst b/docs/src/dist_lda.rst
index baf2d28aba..a8d0cb9816 100644
--- a/docs/src/dist_lda.rst
+++ b/docs/src/dist_lda.rst
@@ -19,7 +19,9 @@ Running LDA
____________
Run LDA like you normally would, but turn on the `distributed=True` constructor
-parameter::
+parameter
+
+.. sourcecode:: pycon
>>> # extract 100 LDA topics, using default parameters
>>> lda = LdaModel(corpus=mm, id2word=id2word, num_topics=100, distributed=True)
@@ -34,7 +36,9 @@ In distributed mode with four workers (Linux, Xeons of 2Ghz, 4GB RAM
with `ATLAS `_), the wallclock time taken drops to 3h20m.
To run standard batch LDA (no online updates of mini-batches) instead, you would similarly
-call::
+call
+
+.. sourcecode:: pycon
>>> lda = LdaModel(corpus=mm, id2word=id2token, num_topics=100, update_every=0, passes=20, distributed=True)
using distributed version with 4 workers
@@ -50,7 +54,7 @@ and then, some two days later::
iteration 19, dispatching documents up to #3199665/3199665
reached the end of input; now waiting for all remaining jobs to finish
-::
+.. sourcecode:: pycon
>>> lda.print_topics(20)
topic #0: 0.007*disease + 0.006*medical + 0.005*treatment + 0.005*cells + 0.005*cell + 0.005*cancer + 0.005*health + 0.005*blood + 0.004*patients + 0.004*drug
diff --git a/docs/src/dist_lsi.rst b/docs/src/dist_lsi.rst
index 15dfb41f9c..45c79cb222 100644
--- a/docs/src/dist_lsi.rst
+++ b/docs/src/dist_lsi.rst
@@ -58,16 +58,20 @@ ____________
So let's test our setup and run one computation of distributed LSA. Open a Python
shell on one of the five machines (again, this can be done on any computer
in the same `broadcast domain `_,
-our choice is incidental) and try::
+our choice is incidental) and try:
- >>> from gensim import corpora, models, utils
+.. sourcecode:: pycon
+
+ >>> from gensim import corpora, models, utils
>>> import logging
+ >>>
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
-
- >>> corpus = corpora.MmCorpus('/tmp/deerwester.mm') # load a corpus of nine documents, from the Tutorials
+ >>>
+ >>> corpus = corpora.MmCorpus('/tmp/deerwester.mm') # load a corpus of nine documents, from the Tutorials
>>> id2word = corpora.Dictionary.load('/tmp/deerwester.dict')
-
- >>> lsi = models.LsiModel(corpus, id2word=id2word, num_topics=200, chunksize=1, distributed=True) # run distributed LSA on nine documents
+ >>>
+ >>> # run distributed LSA on nine documents
+ >>> lsi = models.LsiModel(corpus, id2word=id2word, num_topics=200, chunksize=1, distributed=True)
This uses the corpus and feature-token mapping created in the :doc:`tut1` tutorial.
If you look at the log in your Python session, you should see a line similar to::
@@ -76,7 +80,9 @@ If you look at the log in your Python session, you should see a line similar to:
which means all went well. You can also check the logs coming from your worker and dispatcher
processes --- this is especially helpful in case of problems.
-To check the LSA results, let's print the first two latent topics::
+To check the LSA results, let's print the first two latent topics:
+
+.. sourcecode:: pycon
>>> lsi.print_topics(num_topics=2, num_words=5)
topic #0(3.341): 0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response"
@@ -86,13 +92,15 @@ Success! But a corpus of nine documents is no challenge for our powerful cluster
In fact, we had to lower the job size (`chunksize` parameter above) to a single document
at a time, otherwise all documents would be processed by a single worker all at once.
-So let's run LSA on **one million documents** instead::
+So let's run LSA on **one million documents** instead
+
+.. sourcecode:: pycon
>>> # inflate the corpus to 1M documents, by repeating its documents over&over
>>> corpus1m = utils.RepeatCorpus(corpus, 1000000)
>>> # run distributed LSA on 1 million documents
>>> lsi1m = models.LsiModel(corpus1m, id2word=id2word, num_topics=200, chunksize=10000, distributed=True)
-
+ >>>
>>> lsi1m.print_topics(num_topics=2, num_words=5)
topic #0(1113.628): 0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response"
topic #1(847.233): 0.623*"graph" + 0.490*"trees" + 0.451*"minors" + 0.274*"survey" + -0.167*"system"
@@ -118,25 +126,31 @@ Distributed LSA on Wikipedia
++++++++++++++++++++++++++++++
First, download and prepare the Wikipedia corpus as per :doc:`wiki`, then load
-the corpus iterator with::
+the corpus iterator with
- >>> import logging, gensim
- >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+.. sourcecode:: pycon
+
+ >>> import logging
+ >>> import gensim
+ >>>
+ >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+ >>>
>>> # load id->word mapping (the dictionary)
>>> id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
>>> # load corpus iterator
>>> mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
>>> # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2') # use this if you compressed the TFIDF output
-
+ >>>
>>> print(mm)
MmCorpus(3199665 documents, 100000 features, 495547400 non-zero entries)
-Now we're ready to run distributed LSA on the English Wikipedia::
+Now we're ready to run distributed LSA on the English Wikipedia:
+
+.. sourcecode:: pycon
>>> # extract 400 LSI topics, using a cluster of nodes
>>> lsi = gensim.models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=400, chunksize=20000, distributed=True)
-
+ >>>
>>> # print the most contributing words (both positively and negatively) for each of the first ten topics
>>> lsi.print_topics(10)
2010-11-03 16:08:27,602 : INFO : topic #0(200.990): -0.475*"delete" + -0.383*"deletion" + -0.275*"debate" + -0.223*"comments" + -0.220*"edits" + -0.213*"modify" + -0.208*"appropriate" + -0.194*"subsequent" + -0.155*"wp" + -0.117*"notability"
diff --git a/docs/src/intro.rst b/docs/src/intro.rst
index bcb60efa27..b686a23a49 100644
--- a/docs/src/intro.rst
+++ b/docs/src/intro.rst
@@ -30,7 +30,7 @@ Features
* **Memory independence** -- there is no need for the whole training corpus to
reside fully in RAM at any one time (can process large, web-scale corpora).
-* **Memory sharing** -- trained models can be persisted to disk and loaded back via mmap. Multiple processes can share the same data, cutting down RAM footprint.
+* **Memory sharing** -- trained models can be persisted to disk and loaded back via `mmap `_. Multiple processes can share the same data, cutting down RAM footprint.
* Efficient implementations for several popular vector space algorithms,
including :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.doc2vec.Doc2Vec`, :class:`~gensim.models.fasttext.FastText`,
TF-IDF, Latent Semantic Analysis (LSI, LSA, see :class:`~gensim.models.lsimodel.LsiModel`),
diff --git a/docs/src/simserver.rst b/docs/src/simserver.rst
index 1b0d2b4396..49b26ab5d4 100644
--- a/docs/src/simserver.rst
+++ b/docs/src/simserver.rst
@@ -20,20 +20,20 @@ Conceptually, a service that lets you :
2. index arbitrary documents using this semantic model
3. query the index for similar documents (the query can be either an id of a document already in the index, or an arbitrary text)
-
->>> from simserver import SessionServer
->>> server = SessionServer('/tmp/my_server') # resume server (or create a new one)
-
->>> server.train(training_corpus, method='lsi') # create a semantic model
->>> server.index(some_documents) # convert plain text to semantic representation and index it
->>> server.find_similar(query) # convert query to semantic representation and compare against index
->>> ...
->>> server.index(more_documents) # add to index: incremental indexing works
->>> server.find_similar(query)
->>> ...
->>> server.delete(ids_to_delete) # incremental deleting also works
->>> server.find_similar(query)
->>> ...
+ .. sourcecode:: pycon
+
+ >>> from simserver import SessionServer
+ >>> server = SessionServer('/tmp/my_server') # resume server (or create a new one)
+ >>>
+ >>> server.train(training_corpus, method='lsi') # create a semantic model
+ >>> server.index(some_documents) # convert plain text to semantic representation and index it
+ >>> server.find_similar(query) # convert query to semantic representation and compare against index
+ >>>
+ >>> server.index(more_documents) # add to index: incremental indexing works
+ >>> server.find_similar(query)
+ >>>
+ >>> server.delete(ids_to_delete) # incremental deleting also works
+ >>> server.find_similar(query)
.. note::
"Semantic" here refers to semantics of the crude, statistical type --
@@ -89,19 +89,23 @@ version 4.8 as of this writing)::
$ sudo easy_install Pyro4
.. note::
- Don't forget to initialize logging to see logging messages::
+ Don't forget to initialize logging to see logging messages:
+
+ .. sourcecode:: pycon
- >>> import logging
- >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+ >>> import logging
+ >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
What is a document?
-------------------
-In case of text documents, the service expects::
+In case of text documents, the service expects:
->>> document = {'id': 'some_unique_string',
->>> 'tokens': ['content', 'of', 'the', 'document', '...'],
->>> 'other_fields_are_allowed_but_ignored': None}
+.. sourcecode:: pycon
+
+ >>> document = {'id': 'some_unique_string',
+ >>> 'tokens': ['content', 'of', 'the', 'document', '...'],
+ >>> 'other_fields_are_allowed_but_ignored': None}
This format was chosen because it coincides with plain JSON and is therefore easy to serialize and send over the wire, in almost any language.
All strings involved must be utf8-encoded.
@@ -113,23 +117,29 @@ What is a corpus?
A sequence of documents. Anything that supports the `for document in corpus: ...`
iterator protocol. Generators are ok. Plain lists are also ok (but consume more memory).
->>> from gensim import utils
->>> texts = ["Human machine interface for lab abc computer applications",
->>> "A survey of user opinion of computer system response time",
->>> "The EPS user interface management system",
->>> "System and human system engineering testing of EPS",
->>> "Relation of user perceived response time to error measurement",
->>> "The generation of random binary unordered trees",
->>> "The intersection graph of paths in trees",
->>> "Graph minors IV Widths of trees and well quasi ordering",
->>> "Graph minors A survey"]
->>> corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)}
->>> for num, text in enumerate(texts)]
+.. sourcecode:: pycon
+
+ >>> from gensim import utils
+ >>>
+ >>> texts = ["Human machine interface for lab abc computer applications",
+ >>> "A survey of user opinion of computer system response time",
+ >>> "The EPS user interface management system",
+ >>> "System and human system engineering testing of EPS",
+ >>> "Relation of user perceived response time to error measurement",
+ >>> "The generation of random binary unordered trees",
+ >>> "The intersection graph of paths in trees",
+ >>> "Graph minors IV Widths of trees and well quasi ordering",
+ >>> "Graph minors A survey"]
+ >>>
+ >>> corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)}
+ >>> for num, text in enumerate(texts)]
Since corpora are allowed to be arbitrarily large, it is
recommended that the client splits them into smaller chunks before uploading them to the server:
->>> utils.upload_chunked(server, corpus, chunksize=1000) # send 1k docs at a time
+.. sourcecode:: pycon
+
+ >>> utils.upload_chunked(server, corpus, chunksize=1000) # send 1k docs at a time
Wait, upload what, where?
-------------------------
@@ -141,11 +151,13 @@ option, not a necessity.
Document similarity can also act as a long-running service, a daemon process on a separate machine. In that
case, I'll call the service object a *server*.
-But let's start with a local object. Open your `favourite shell `_ and::
+But let's start with a local object. Open your `favourite shell `_ and
->>> from gensim import utils
->>> from simserver import SessionServer
->>> service = SessionServer('/tmp/my_server/') # or wherever
+.. sourcecode:: pycon
+
+ >>> from simserver import SessionServer
+ >>>
+ >>> service = SessionServer('/tmp/my_server/') # or wherever
That initialized a new service, located in `/tmp/my_server` (you need write access rights to that directory).
@@ -162,14 +174,18 @@ Model training
We can start indexing right away:
->>> service.index(corpus)
-AttributeError: must initialize model for /tmp/my_server/b before indexing documents
+.. sourcecode:: pycon
+
+ >>> service.index(corpus)
+ AttributeError: must initialize model for /tmp/my_server/b before indexing documents
Oops, we cannot. The service indexes documents in a semantic representation, which
is different to the plain text we give it. We must teach the service how to convert
-between plain text and semantics first::
+between plain text and semantics first:
->>> service.train(corpus, method='lsi')
+.. sourcecode:: pycon
+
+ >>> service.train(corpus, method='lsi')
That was easy. The `method='lsi'` parameter meant that we trained a model for
`Latent Semantic Indexing `_
@@ -188,19 +204,25 @@ on a corpus that is:
Indexing documents
------------------
->>> service.index(corpus) # index the same documents that we trained on...
+.. sourcecode:: pycon
+
+ >>> service.index(corpus) # index the same documents that we trained on...
Indexing can happen over any documents, but I'm too lazy to create another example corpus, so we index the same 9 docs used for training.
-Delete documents with::
+Delete documents with:
- >>> service.delete(['doc_5', 'doc_8']) # supply a list of document ids to be removed from the index
+.. sourcecode:: pycon
+
+ >>> service.delete(['doc_5', 'doc_8']) # supply a list of document ids to be removed from the index
When you pass documents that have the same id as some already indexed document,
the indexed document is overwritten by the new input (=only the latest counts;
-document ids are always unique per service)::
+document ids are always unique per service):
+
+.. sourcecode:: pycon
- >>> service.index(corpus[:3]) # overall index size unchanged (just 3 docs overwritten)
+ >>> service.index(corpus[:3]) # overall index size unchanged (just 3 docs overwritten)
The index/delete/overwrite calls can be arbitrarily interspersed with queries.
You don't have to index **all** documents first to start querying; indexing can be incremental.
@@ -212,26 +234,26 @@ There are two types of queries:
1. by id:
- .. code-block:: python
-
- >>> print(service.find_similar('doc_0'))
- [('doc_0', 1.0, None), ('doc_2', 0.30426699, None), ('doc_1', 0.25648531, None), ('doc_3', 0.25480536, None)]
+ .. sourcecode:: pycon
- >>> print(service.find_similar('doc_5')) # we deleted doc_5 and doc_8, remember?
- ValueError: document 'doc_5' not in index
+ >>> print(service.find_similar('doc_0'))
+ [('doc_0', 1.0, None), ('doc_2', 0.30426699, None), ('doc_1', 0.25648531, None), ('doc_3', 0.25480536, None)]
+ >>>
+ >>> print(service.find_similar('doc_5')) # we deleted doc_5 and doc_8, remember?
+ ValueError: document 'doc_5' not in index
- In the resulting 3-tuples, `doc_n` is the document id we supplied during indexing,
- `0.30426699` is the similarity of `doc_n` to the query, but what's up with that `None`, you ask?
- Well, you can associate each document with a "payload", during indexing.
- This payload object (anything pickle-able) is later returned during querying.
- If you don't specify `doc['payload']` during indexing, queries simply return `None` in the result tuple, as in our example here.
+ In the resulting 3-tuples, `doc_n` is the document id we supplied during indexing,
+ `0.30426699` is the similarity of `doc_n` to the query, but what's up with that `None`, you ask?
+ Well, you can associate each document with a "payload", during indexing.
+ This payload object (anything pickle-able) is later returned during querying.
+ If you don't specify `doc['payload']` during indexing, queries simply return `None` in the result tuple, as in our example here (a short payload sketch follows this list).
2. or by document (using `document['tokens']`; id is ignored in this case):
- .. code-block:: python
+ .. sourcecode:: pycon
- >>> doc = {'tokens': utils.simple_preprocess('Graph and minors and humans and trees.')}
- >>> print(service.find_similar(doc, min_score=0.4, max_results=50))
+ >>> doc = {'tokens': utils.simple_preprocess('Graph and minors and humans and trees.')}
+ >>> print(service.find_similar(doc, min_score=0.4, max_results=50))
[('doc_7', 0.93350589, None), ('doc_3', 0.42718196, None)]
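+For illustration, here is one possible way to attach a payload during indexing. The title string
+below is invented for this sketch; any pickle-able object would work:
+
+.. sourcecode:: pycon
+
+ >>> doc_with_payload = {'id': 'doc_0',
+ >>>                     'tokens': utils.simple_preprocess('Human machine interface for lab abc computer applications'),
+ >>>                     'payload': {'title': 'Document #0'}}  # hypothetical payload, just for this sketch
+ >>> service.index([doc_with_payload])  # re-index doc_0, this time with a payload attached
+ >>> print(service.find_similar('doc_0'))  # the payload is now returned in place of None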
Remote access
@@ -250,20 +272,23 @@ included with simserver, run it with::
You can just `ctrl+c` to terminate the server, but leave it running for now.
-Now open your Python shell again, in another terminal window or possibly on another machine, and::
+Now open your Python shell again, in another terminal window or possibly on another machine, and
->>> import Pyro4
->>> service = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
+.. sourcecode:: pycon
+
+ >>> import Pyro4
+ >>> service = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
Now `service` is only a proxy object: every call is physically executed wherever
you ran the `run_server.py` script, which can be a totally different computer
-(within a network broadcast domain), but you don't even know::
+(within a network broadcast domain), but you don't even know:
->>> print(service.status())
->>> service.train(corpus)
->>> service.index(other_corpus)
->>> service.find_similar(query)
->>> ...
+.. sourcecode:: pycon
+
+ >>> print(service.status())
+ >>> service.train(corpus)
+ >>> service.index(other_corpus)
+ >>> service.find_similar(query)
It is worth mentioning that Irmen, the author of Pyro, also released
`Pyrolite `_ recently. That is a package
@@ -300,7 +325,9 @@ with how the session went), it can be rolled back. It also means other clients c
continue querying the original index during index updates.
The mechanism is hidden from users by default through auto-committing (it was already happening
-in the examples above too), but auto-committing can be turned off explicitly::
+in the examples above too), but auto-committing can be turned off explicitly
+
+.. sourcecode:: pycon
>>> service.set_autosession(False)
>>> service.train(corpus)
@@ -309,19 +336,22 @@ in the examples above too), but auto-committing can be turned off explicitly::
>>> service.train(corpus)
>>> service.index(corpus)
>>> service.delete(doc_ids)
- >>> ...
None of these changes are visible to other clients, yet. Also, other clients'
calls to index/train/etc will block until this session is committed/rolled back---there
cannot be two open sessions at the same time.
-To end a session::
+To end a session
+
+.. sourcecode:: pycon
+
+ >>> service.rollback() # discard all changes since open_session()
- >>> service.rollback() # discard all changes since open_session()
+or
-or::
+.. sourcecode:: pycon
- >>> service.commit() # make changes public; now other clients can see changes/acquire the modification lock
+ >>> service.commit() # make changes public; now other clients can see changes/acquire the modification lock
Other stuff
diff --git a/docs/src/tut1.rst b/docs/src/tut1.rst
index 4d6f80b375..992858ffad 100644
--- a/docs/src/tut1.rst
+++ b/docs/src/tut1.rst
@@ -20,17 +20,17 @@ From Strings to Vectors
This time, let's start from documents represented as strings:
->>> from gensim import corpora
->>>
->>> documents = ["Human machine interface for lab abc computer applications",
->>> "A survey of user opinion of computer system response time",
->>> "The EPS user interface management system",
->>> "System and human system engineering testing of EPS",
->>> "Relation of user perceived response time to error measurement",
->>> "The generation of random binary unordered trees",
->>> "The intersection graph of paths in trees",
->>> "Graph minors IV Widths of trees and well quasi ordering",
->>> "Graph minors A survey"]
+.. sourcecode:: pycon
+
+ >>> from gensim import corpora
+ >>>
+ >>> documents = ["Human machine interface for lab abc computer applications",
+ >>> "A survey of user opinion of computer system response time",
+ >>> "The EPS user interface management system",
+ >>> "System and human system engineering testing of EPS",
+ >>> "Relation of user perceived response time to error measurement",
+ >>> "The generation of random binary unordered trees",
+ >>> "The intersection graph of paths in trees",
+ >>> "Graph minors IV Widths of trees and well quasi ordering",
+ >>> "Graph minors A survey"]
This is a tiny corpus of nine documents, each consisting of only a single sentence.
@@ -38,32 +38,35 @@ This is a tiny corpus of nine documents, each consisting of only a single senten
First, let's tokenize the documents, remove common words (using a toy stoplist)
as well as words that only appear once in the corpus:
->>> # remove common words and tokenize
->>> stoplist = set('for a of the and to in'.split())
->>> texts = [[word for word in document.lower().split() if word not in stoplist]
->>> for document in documents]
->>>
->>> # remove words that appear only once
->>> from collections import defaultdict
->>> frequency = defaultdict(int)
->>> for text in texts:
->>> for token in text:
->>> frequency[token] += 1
->>>
->>> texts = [[token for token in text if frequency[token] > 1]
->>> for text in texts]
->>>
->>> from pprint import pprint # pretty-printer
->>> pprint(texts)
-[['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+.. sourcecode:: pycon
+
+ >>> from pprint import pprint # pretty-printer
+ >>> from collections import defaultdict
+ >>>
+ >>> # remove common words and tokenize
+ >>> stoplist = set('for a of the and to in'.split())
+ >>> texts = [[word for word in document.lower().split() if word not in stoplist]
+ >>> for document in documents]
+ >>>
+ >>> # remove words that appear only once
+ >>> frequency = defaultdict(int)
+ >>> for text in texts:
+ >>> for token in text:
+ >>> frequency[token] += 1
+ >>>
+ >>> texts = [[token for token in text if frequency[token] > 1]
+ >>> for text in texts]
+ >>>
+ >>> pprint(texts)
+ [['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']]
Your way of processing the documents will likely vary; here, I only split on whitespace
to tokenize, followed by lowercasing each word. In fact, I use this particular
@@ -98,16 +101,20 @@ and relevant statistics. In the end, we see there are twelve distinct words in t
processed corpus, which means each document will be represented by twelve numbers (i.e., by a 12-D vector).
To see the mapping between words and their ids:
->>> print(dictionary.token2id)
-{'minors': 11, 'graph': 10, 'system': 5, 'trees': 9, 'eps': 8, 'computer': 0,
-'survey': 4, 'user': 7, 'human': 1, 'time': 6, 'interface': 2, 'response': 3}
+.. sourcecode:: pycon
+
+ >>> print(dictionary.token2id)
+ {'minors': 11, 'graph': 10, 'system': 5, 'trees': 9, 'eps': 8, 'computer': 0,
+ 'survey': 4, 'user': 7, 'human': 1, 'time': 6, 'interface': 2, 'response': 3}
To actually convert tokenized documents to vectors:
->>> new_doc = "Human computer interaction"
->>> new_vec = dictionary.doc2bow(new_doc.lower().split())
->>> print(new_vec) # the word "interaction" does not appear in the dictionary and is ignored
-[(0, 1), (1, 1)]
+.. sourcecode:: pycon
+
+ >>> new_doc = "Human computer interaction"
+ >>> new_vec = dictionary.doc2bow(new_doc.lower().split())
+ >>> print(new_vec) # the word "interaction" does not appear in the dictionary and is ignored
+ [(0, 1), (1, 1)]
The function :func:`doc2bow` simply counts the number of occurrences of
each distinct word, converts the word to its integer word id
@@ -115,6 +122,8 @@ and returns the result as a sparse vector. The sparse vector ``[(0, 1), (1, 1)]`
therefore reads: in the document `"Human computer interaction"`, the words `computer`
(id 0) and `human` (id 1) appear once; the other ten dictionary words appear (implicitly) zero times.
+.. sourcecode:: pycon
+
>>> corpus = [dictionary.doc2bow(text) for text in texts]
>>> corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use
>>> print(corpus)
@@ -140,13 +149,15 @@ Note that `corpus` above resides fully in memory, as a plain Python list.
In this simple example, it doesn't matter much, but just to make things clear,
let's assume there are millions of documents in the corpus. Storing all of them in RAM won't do.
Instead, let's assume the documents are stored in a file on disk, one document per line. Gensim
-only requires that a corpus must be able to return one document vector at a time::
+only requires that a corpus must be able to return one document vector at a time:
+
+.. sourcecode:: pycon
->>> class MyCorpus(object):
->>> def __iter__(self):
->>> for line in open('mycorpus.txt'):
->>> # assume there's one document per line, tokens separated by whitespace
->>> yield dictionary.doc2bow(line.lower().split())
+ >>> class MyCorpus(object):
+ >>> def __iter__(self):
+ >>> for line in open('mycorpus.txt'):
+ >>> # assume there's one document per line, tokens separated by whitespace
+ >>> yield dictionary.doc2bow(line.lower().split())
Download the sample `mycorpus.txt file here <./mycorpus.txt>`_. The assumption that
each document occupies one line in a single file is not important; you can mold
@@ -155,13 +166,17 @@ Walking directories, parsing XML, accessing network...
Just parse your input to retrieve a clean list of tokens in each document,
then convert the tokens via a dictionary to their ids and yield the resulting sparse vector inside `__iter__`.
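+As a purely illustrative sketch (the directory name is made up, and `dictionary` is the one built
+earlier in this tutorial), the same idea works for a folder holding one plain-text document per file:
+
+.. sourcecode:: pycon
+
+ >>> import os
+ >>>
+ >>> class MyDirCorpus(object):
+ >>>     def __init__(self, top_dir):
+ >>>         self.top_dir = top_dir
+ >>>
+ >>>     def __iter__(self):
+ >>>         for fname in sorted(os.listdir(self.top_dir)):
+ >>>             # assume each file holds one document, tokens separated by whitespace
+ >>>             with open(os.path.join(self.top_dir, fname)) as fin:
+ >>>                 yield dictionary.doc2bow(fin.read().lower().split())
+ >>>
+ >>> corpus_from_dir = MyDirCorpus('/tmp/my_text_files')  # hypothetical directory, streamed lazily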
->>> corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory!
->>> print(corpus_memory_friendly)
-<__main__.MyCorpus object at 0x10d5690>
+.. sourcecode:: pycon
+
+ >>> corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory!
+ >>> print(corpus_memory_friendly)
+ <__main__.MyCorpus object at 0x10d5690>
Corpus is now an object. We didn't define any way to print it, so `print` just outputs address
of the object in memory. Not very useful. To see the constituent vectors, let's
-iterate over the corpus and print each document vector (one at a time)::
+iterate over the corpus and print each document vector (one at a time):
+
+.. sourcecode:: pycon
>>> for vector in corpus_memory_friendly: # load one vector into memory at a time
... print(vector)
@@ -179,7 +194,9 @@ Although the output is the same as for the plain Python list, the corpus is now
more memory friendly, because at most one vector resides in RAM at a time. Your
corpus can now be as large as you want.
-Similarly, to construct the dictionary without loading all texts into memory::
+Similarly, to construct the dictionary without loading all texts into memory:
+
+.. sourcecode:: pycon
>>> from six import iteritems
>>> # collect statistics about all tokens
@@ -215,49 +232,63 @@ a time, without the whole corpus being read into main memory at once.
One of the more notable file formats is the `Matrix Market format `_.
To save a corpus in the Matrix Market format:
->>> # create a toy corpus of 2 documents, as a plain Python list
->>> corpus = [[(1, 0.5)], []] # make one document empty, for the heck of it
->>>
->>> corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
+.. sourcecode:: pycon
+
+ >>> # create a toy corpus of 2 documents, as a plain Python list
+ >>> corpus = [[(1, 0.5)], []] # make one document empty, for the heck of it
+ >>>
+ >>> corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
Other formats include `Joachim's SVMlight format `_,
`Blei's LDA-C format `_ and
`GibbsLDA++ format `_.
->>> corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
->>> corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
->>> corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
+.. sourcecode:: pycon
+
+ >>> corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
+ >>> corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
+ >>> corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
Conversely, to load a corpus iterator from a Matrix Market file:
->>> corpus = corpora.MmCorpus('/tmp/corpus.mm')
+.. sourcecode:: pycon
+
+ >>> corpus = corpora.MmCorpus('/tmp/corpus.mm')
Corpus objects are streams, so typically you won't be able to print them directly:
->>> print(corpus)
-MmCorpus(2 documents, 2 features, 1 non-zero entries)
+.. sourcecode:: pycon
+
+ >>> print(corpus)
+ MmCorpus(2 documents, 2 features, 1 non-zero entries)
Instead, to view the contents of a corpus:
->>> # one way of printing a corpus: load it entirely into memory
->>> print(list(corpus)) # calling list() will convert any sequence to a plain Python list
-[[(1, 0.5)], []]
+.. sourcecode:: pycon
+
+ >>> # one way of printing a corpus: load it entirely into memory
+ >>> print(list(corpus)) # calling list() will convert any sequence to a plain Python list
+ [[(1, 0.5)], []]
or
->>> # another way of doing it: print one document at a time, making use of the streaming interface
->>> for doc in corpus:
-... print(doc)
-[(1, 0.5)]
-[]
+.. sourcecode:: pycon
+
+ >>> # another way of doing it: print one document at a time, making use of the streaming interface
+ >>> for doc in corpus:
+ ... print(doc)
+ [(1, 0.5)]
+ []
The second way is obviously more memory-friendly, but for testing and development
purposes, nothing beats the simplicity of calling ``list(corpus)``.
To save the same Matrix Market document stream in Blei's LDA-C format,
->>> corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
+.. sourcecode:: pycon
+
+ >>> corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
In this way, `gensim` can also be used as a memory-efficient **I/O format conversion tool**:
just load a document stream using one format and immediately save it in another format.
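+For example, a minimal sketch that converts the Matrix Market file created above into SVMlight
+format, streaming one document at a time:
+
+.. sourcecode:: pycon
+
+ >>> corpus = corpora.MmCorpus('/tmp/corpus.mm')  # load a document stream in one format...
+ >>> corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)  # ...and save it in another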
@@ -268,20 +299,24 @@ Compatibility with NumPy and SciPy
----------------------------------
Gensim also contains `efficient utility functions `_
-to help converting from/to numpy matrices::
+to help converting from/to numpy matrices
+
+.. sourcecode:: pycon
+
+ >>> import gensim
+ >>> import numpy as np
+ >>> numpy_matrix = np.random.randint(10, size=[5, 2]) # random matrix as an example
+ >>> corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
+ >>> numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)
->>> import gensim
->>> import numpy as np
->>> numpy_matrix = np.random.randint(10, size=[5,2]) # random matrix as an example
->>> corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
->>> numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)
+and from/to `scipy.sparse` matrices
-and from/to `scipy.sparse` matrices::
+.. sourcecode:: pycon
->>> import scipy.sparse
->>> scipy_sparse_matrix = scipy.sparse.random(5,2) # random sparse matrix as example
->>> corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
->>> scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)
+ >>> import scipy.sparse
+ >>> scipy_sparse_matrix = scipy.sparse.random(5, 2) # random sparse matrix as example
+ >>> corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
+ >>> scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)
-------------
diff --git a/docs/src/tut2.rst b/docs/src/tut2.rst
index 130bba7375..24db8ae092 100644
--- a/docs/src/tut2.rst
+++ b/docs/src/tut2.rst
@@ -6,8 +6,10 @@ Topics and Transformations
Don't forget to set
->>> import logging
->>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+.. sourcecode:: pycon
+
+ >>> import logging
+ >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
if you want to see logging events.
@@ -17,15 +19,18 @@ Transformation interface
In the previous tutorial on :doc:`tut1`, we created a corpus of documents represented
as a stream of vectors. To continue, let's fire up gensim and use that corpus:
->>> from gensim import corpora, models, similarities
->>> if (os.path.exists("/tmp/deerwester.dict")):
->>> dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
->>> corpus = corpora.MmCorpus('/tmp/deerwester.mm')
->>> print("Used files generated from first tutorial")
->>> else:
->>> print("Please run first tutorial to generate data set")
+.. sourcecode:: pycon
+
+ >>> import os
+ >>>
+ >>> from gensim import corpora, models
+ >>>
+ >>> if (os.path.exists("/tmp/deerwester.dict")):
+ >>> dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
+ >>> corpus = corpora.MmCorpus('/tmp/deerwester.mm')
+ >>> print("Used files generated from first tutorial")
+ >>> else:
+ >>> print("Please run first tutorial to generate data set")
-MmCorpus(9 documents, 12 features, 28 non-zero entries)
+ MmCorpus(9 documents, 12 features, 28 non-zero entries)
In this tutorial, I will show how to transform documents from one vector representation
into another. This process serves two goals:
@@ -43,7 +48,9 @@ Creating a transformation
The transformations are standard Python objects, typically initialized by means of
a :dfn:`training corpus`:
->>> tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
+.. sourcecode:: pycon
+
+ >>> tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
We used our old corpus from tutorial 1 to initialize (train) the transformation model. Different
transformations may require different initialization parameters; in case of TfIdf, the
@@ -69,24 +76,28 @@ From now on, ``tfidf`` is treated as a read-only object that can be used to conv
any vector from the old representation (bag-of-words integer counts) to the new representation
(TfIdf real-valued weights):
->>> doc_bow = [(0, 1), (1, 1)]
->>> print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors
-[(0, 0.70710678), (1, 0.70710678)]
+.. sourcecode:: pycon
+
+ >>> doc_bow = [(0, 1), (1, 1)]
+ >>> print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors
+ [(0, 0.70710678), (1, 0.70710678)]
Or to apply a transformation to a whole corpus:
->>> corpus_tfidf = tfidf[corpus]
->>> for doc in corpus_tfidf:
-... print(doc)
-[(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
-[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548)]
-[(2, 0.5710059809418182), (5, 0.41707573620227772), (7, 0.41707573620227772), (8, 0.5710059809418182)]
-[(1, 0.49182558987264147), (5, 0.71848116070837686), (8, 0.49182558987264147)]
-[(3, 0.62825804686700459), (6, 0.62825804686700459), (7, 0.45889394536615247)]
-[(9, 1.0)]
-[(9, 0.70710678118654746), (10, 0.70710678118654746)]
-[(9, 0.50804290089167492), (10, 0.50804290089167492), (11, 0.69554641952003704)]
-[(4, 0.62825804686700459), (10, 0.45889394536615247), (11, 0.62825804686700459)]
+.. sourcecode:: pycon
+
+ >>> corpus_tfidf = tfidf[corpus]
+ >>> for doc in corpus_tfidf:
+ ... print(doc)
+ [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
+ [(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548)]
+ [(2, 0.5710059809418182), (5, 0.41707573620227772), (7, 0.41707573620227772), (8, 0.5710059809418182)]
+ [(1, 0.49182558987264147), (5, 0.71848116070837686), (8, 0.49182558987264147)]
+ [(3, 0.62825804686700459), (6, 0.62825804686700459), (7, 0.45889394536615247)]
+ [(9, 1.0)]
+ [(9, 0.70710678118654746), (10, 0.70710678118654746)]
+ [(9, 0.50804290089167492), (10, 0.50804290089167492), (11, 0.69554641952003704)]
+ [(4, 0.62825804686700459), (10, 0.45889394536615247), (11, 0.62825804686700459)]
In this particular case, we are transforming the same corpus that we used
for training, but this is only incidental. Once the transformation model has been initialized,
@@ -105,13 +116,17 @@ folding-in for LSA, by topic inference for LDA etc.
Transformations can also be serialized, one on top of another, in a sort of chain:
->>> lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
->>> corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
+.. sourcecode:: pycon
+
+ >>> lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
+ >>> corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
Here we transformed our Tf-Idf corpus via `Latent Semantic Indexing `_
into a latent 2-D space (2-D because we set ``num_topics=2``). Now you're probably wondering: what do these two latent
dimensions stand for? Let's inspect with :func:`models.LsiModel.print_topics`:
+.. sourcecode:: pycon
+
>>> lsi.print_topics(2)
topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"response" + -0.060*"time" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"
@@ -125,23 +140,27 @@ second topic practically concerns itself with all the other words. As expected,
the first five documents are more strongly related to the second topic while the
remaining four documents to the first topic:
->>> for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
-... print(doc)
-[(0, -0.066), (1, 0.520)] # "Human machine interface for lab abc computer applications"
-[(0, -0.197), (1, 0.761)] # "A survey of user opinion of computer system response time"
-[(0, -0.090), (1, 0.724)] # "The EPS user interface management system"
-[(0, -0.076), (1, 0.632)] # "System and human system engineering testing of EPS"
-[(0, -0.102), (1, 0.574)] # "Relation of user perceived response time to error measurement"
-[(0, -0.703), (1, -0.161)] # "The generation of random binary unordered trees"
-[(0, -0.877), (1, -0.168)] # "The intersection graph of paths in trees"
-[(0, -0.910), (1, -0.141)] # "Graph minors IV Widths of trees and well quasi ordering"
-[(0, -0.617), (1, 0.054)] # "Graph minors A survey"
+.. sourcecode:: pycon
+
+ >>> for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
+ ... print(doc)
+ [(0, -0.066), (1, 0.520)] # "Human machine interface for lab abc computer applications"
+ [(0, -0.197), (1, 0.761)] # "A survey of user opinion of computer system response time"
+ [(0, -0.090), (1, 0.724)] # "The EPS user interface management system"
+ [(0, -0.076), (1, 0.632)] # "System and human system engineering testing of EPS"
+ [(0, -0.102), (1, 0.574)] # "Relation of user perceived response time to error measurement"
+ [(0, -0.703), (1, -0.161)] # "The generation of random binary unordered trees"
+ [(0, -0.877), (1, -0.168)] # "The intersection graph of paths in trees"
+ [(0, -0.910), (1, -0.141)] # "Graph minors IV Widths of trees and well quasi ordering"
+ [(0, -0.617), (1, 0.054)] # "Graph minors A survey"
Model persistency is achieved with the :func:`save` and :func:`load` functions:
->>> lsi.save('/tmp/model.lsi') # same for tfidf, lda, ...
->>> lsi = models.LsiModel.load('/tmp/model.lsi')
+.. sourcecode:: pycon
+
+ >>> lsi.save('/tmp/model.lsi') # same for tfidf, lda, ...
+ >>> lsi = models.LsiModel.load('/tmp/model.lsi')
The next question might be: just how exactly similar are those documents to each other?
@@ -165,7 +184,9 @@ Gensim implements several popular Vector Space Model algorithms:
the number of dimensions intact. It can also optionally normalize the resulting
vectors to (Euclidean) unit length.
- >>> model = models.TfidfModel(corpus, normalize=True)
+ .. sourcecode:: pycon
+
+ >>> model = models.TfidfModel(corpus, normalize=True)
* `Latent Semantic Indexing, LSI (or sometimes LSA) `_
transforms documents from either bag-of-words or (preferably) TfIdf-weighted space into
@@ -173,7 +194,9 @@ Gensim implements several popular Vector Space Model algorithms:
2 latent dimensions, but on real corpora, target dimensionality of 200--500 is recommended
as a "golden standard" [1]_.
- >>> model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
+ .. sourcecode:: pycon
+
+ >>> model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
LSI training is unique in that we can continue "training" at any point, simply
by providing more training documents. This is done by incremental updates to
@@ -181,12 +204,13 @@ Gensim implements several popular Vector Space Model algorithms:
input document stream may even be infinite -- just keep feeding LSI new documents
as they arrive, while using the computed transformation model as read-only in the meanwhile!
- >>> model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus
- >>> lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model
- >>> ...
- >>> model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents
- >>> lsi_vec = model[tfidf_vec]
- >>> ...
+ .. sourcecode:: pycon
+
+ >>> model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus
+ >>> lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model
+ >>>
+ >>> model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents
+ >>> lsi_vec = model[tfidf_vec]
See the :mod:`gensim.models.lsimodel` documentation for details on how to make
LSI gradually "forget" old observations in infinite streams. If you want to get dirty,
@@ -205,7 +229,9 @@ Gensim implements several popular Vector Space Model algorithms:
CPU-friendly) approach to approximating TfIdf distances between documents, by throwing in a little randomness.
Recommended target dimensionality is again in the hundreds/thousands, depending on your dataset.
- >>> model = models.RpModel(tfidf_corpus, num_topics=500)
+ .. sourcecode:: pycon
+
+ >>> model = models.RpModel(tfidf_corpus, num_topics=500)
* `Latent Dirichlet Allocation, LDA `_
is yet another transformation from bag-of-words counts into a topic space of lower
@@ -214,7 +240,9 @@ Gensim implements several popular Vector Space Model algorithms:
just like with LSA, inferred automatically from a training corpus. Documents
are in turn interpreted as a (soft) mixture of these topics (again, just like with LSA).
- >>> model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
+ .. sourcecode:: pycon
+
+ >>> model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
`gensim` uses a fast implementation of online LDA parameter estimation based on [2]_,
modified to run in :doc:`distributed mode ` on a cluster of computers.
@@ -222,7 +250,9 @@ Gensim implements several popular Vector Space Model algorithms:
* `Hierarchical Dirichlet Process, HDP `_
is a non-parametric bayesian method (note the missing number of requested topics):
- >>> model = models.HdpModel(corpus, id2word=dictionary)
+ .. sourcecode:: pycon
+
+ >>> model = models.HdpModel(corpus, id2word=dictionary)
`gensim` uses a fast, online implementation based on [3]_.
The HDP model is a new addition to `gensim`, and still rough around its academic edges -- use with care.
diff --git a/docs/src/tut3.rst b/docs/src/tut3.rst
index f017edfc37..d9b28220cc 100644
--- a/docs/src/tut3.rst
+++ b/docs/src/tut3.rst
@@ -6,8 +6,10 @@ Similarity Queries
Don't forget to set
->>> import logging
->>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+.. sourcecode:: pycon
+
+ >>> import logging
+ >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
if you want to see logging events.
@@ -25,16 +27,21 @@ previous examples (which really originally comes from Deerwester et al.'s
`"Indexing by Latent Semantic Analysis" `_
seminal 1990 article):
->>> from gensim import corpora, models, similarities
->>> dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
->>> corpus = corpora.MmCorpus('/tmp/deerwester.mm') # comes from the first tutorial, "From strings to vectors"
->>> print(corpus)
-MmCorpus(9 documents, 12 features, 28 non-zero entries)
+.. sourcecode:: pycon
+
+ >>> from gensim import corpora
+ >>> dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
+ >>> corpus = corpora.MmCorpus('/tmp/deerwester.mm') # comes from the first tutorial, "From strings to vectors"
+ >>> print(corpus)
+ MmCorpus(9 documents, 12 features, 28 non-zero entries)
To follow Deerwester's example, we first use this tiny corpus to define a 2-dimensional
LSI space:
->>> lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
+.. sourcecode:: pycon
+
+ >>> from gensim import models
+ >>> lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
Now suppose a user typed in the query `"Human computer interaction"`. We would
like to sort our nine corpus documents in decreasing order of relevance to this query.
@@ -42,11 +49,13 @@ Unlike modern search engines, here we only concentrate on a single aspect of pos
similarities---on apparent semantic relatedness of their texts (words). No hyperlinks,
no random-walk static ranks, just a semantic extension over the boolean keyword match:
->>> doc = "Human computer interaction"
->>> vec_bow = dictionary.doc2bow(doc.lower().split())
->>> vec_lsi = lsi[vec_bow] # convert the query to LSI space
->>> print(vec_lsi)
-[(0, -0.461821), (1, 0.070028)]
+.. sourcecode:: pycon
+
+ >>> doc = "Human computer interaction"
+ >>> vec_bow = dictionary.doc2bow(doc.lower().split())
+ >>> vec_lsi = lsi[vec_bow] # convert the query to LSI space
+ >>> print(vec_lsi)
+ [(0, -0.461821), (1, 0.070028)]
In addition, we will be considering `cosine similarity `_
to determine the similarity of two vectors. Cosine similarity is a standard measure
@@ -62,7 +71,9 @@ to compare against subsequent queries. In our case, they are the same nine docum
used for training LSI, converted to 2-D LSA space. But that's only incidental, we
might also be indexing a different corpus altogether.
->>> index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it
+.. sourcecode:: pycon
+
+ >>> from gensim import similarities
+ >>>
+ >>> index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it
.. warning::
The class :class:`similarities.MatrixSimilarity` is only appropriate when the whole
@@ -76,8 +87,10 @@ might also be indexing a different corpus altogether.
Index persistency is handled via the standard :func:`save` and :func:`load` functions:
->>> index.save('/tmp/deerwester.index')
->>> index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
+.. sourcecode:: pycon
+
+ >>> index.save('/tmp/deerwester.index')
+ >>> index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
This is true for all similarity indexing classes (:class:`similarities.Similarity`,
:class:`similarities.MatrixSimilarity` and :class:`similarities.SparseMatrixSimilarity`).
@@ -90,10 +103,12 @@ Performing queries
To obtain similarities of our query document against the nine indexed documents:
->>> sims = index[vec_lsi] # perform a similarity query against the corpus
->>> print(list(enumerate(sims))) # print (document_number, document_similarity) 2-tuples
-[(0, 0.99809301), (1, 0.93748635), (2, 0.99844527), (3, 0.9865886), (4, 0.90755945),
-(5, -0.12416792), (6, -0.1063926), (7, -0.098794639), (8, 0.05004178)]
+.. sourcecode:: pycon
+
+ >>> sims = index[vec_lsi] # perform a similarity query against the corpus
+ >>> print(list(enumerate(sims))) # print (document_number, document_similarity) 2-tuples
+ [(0, 0.99809301), (1, 0.93748635), (2, 0.99844527), (3, 0.9865886), (4, 0.90755945),
+ (5, -0.12416792), (6, -0.1063926), (7, -0.098794639), (8, 0.05004178)]
Cosine measure returns similarities in the range `<-1, 1>` (the greater, the more similar),
so that the first document has a score of 0.99809301 etc.
@@ -101,17 +116,19 @@ so that the first document has a score of 0.99809301 etc.
With some standard Python magic we sort these similarities into descending
order, and obtain the final answer to the query `"Human computer interaction"`:
->>> sims = sorted(enumerate(sims), key=lambda item: -item[1])
->>> print(sims) # print sorted (document number, similarity score) 2-tuples
-[(2, 0.99844527), # The EPS user interface management system
-(0, 0.99809301), # Human machine interface for lab abc computer applications
-(3, 0.9865886), # System and human system engineering testing of EPS
-(1, 0.93748635), # A survey of user opinion of computer system response time
-(4, 0.90755945), # Relation of user perceived response time to error measurement
-(8, 0.050041795), # Graph minors A survey
-(7, -0.098794639), # Graph minors IV Widths of trees and well quasi ordering
-(6, -0.1063926), # The intersection graph of paths in trees
-(5, -0.12416792)] # The generation of random binary unordered trees
+.. sourcecode:: pycon
+
+ >>> sims = sorted(enumerate(sims), key=lambda item: -item[1])
+ >>> print(sims) # print sorted (document number, similarity score) 2-tuples
+ [(2, 0.99844527), # The EPS user interface management system
+ (0, 0.99809301), # Human machine interface for lab abc computer applications
+ (3, 0.9865886), # System and human system engineering testing of EPS
+ (1, 0.93748635), # A survey of user opinion of computer system response time
+ (4, 0.90755945), # Relation of user perceived response time to error measurement
+ (8, 0.050041795), # Graph minors A survey
+ (7, -0.098794639), # Graph minors IV Widths of trees and well quasi ordering
+ (6, -0.1063926), # The intersection graph of paths in trees
+ (5, -0.12416792)] # The generation of random binary unordered trees
(I added the original documents in their "string form" to the output comments, to
improve clarity.)
@@ -145,5 +162,5 @@ That doesn't mean it's perfect though:
`user stories and general questions `_.
Gensim has no ambition to become an all-encompassing framework, across all NLP (or even Machine Learning) subfields.
-Its mission is to help NLP practicioners try out popular topic modelling algorithms
+Its mission is to help NLP practitioners try out popular topic modelling algorithms
on large datasets easily, and to facilitate prototyping of new algorithms for researchers.
diff --git a/docs/src/tutorial.rst b/docs/src/tutorial.rst
index e8ec9b8912..3ec9631153 100644
--- a/docs/src/tutorial.rst
+++ b/docs/src/tutorial.rst
@@ -38,17 +38,17 @@ Quick Example
First, let's import gensim and create a small corpus of nine documents and twelve features [1]_:
->>> from gensim import corpora, models, similarities
->>>
->>> corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
->>> [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
->>> [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
->>> [(0, 1.0), (4, 2.0), (7, 1.0)],
->>> [(3, 1.0), (5, 1.0), (6, 1.0)],
->>> [(9, 1.0)],
->>> [(9, 1.0), (10, 1.0)],
->>> [(9, 1.0), (10, 1.0), (11, 1.0)],
->>> [(8, 1.0), (10, 1.0), (11, 1.0)]]
+.. sourcecode:: pycon
+
+ >>> corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
+ >>> [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
+ >>> [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
+ >>> [(0, 1.0), (4, 2.0), (7, 1.0)],
+ >>> [(3, 1.0), (5, 1.0), (6, 1.0)],
+ >>> [(9, 1.0)],
+ >>> [(9, 1.0), (10, 1.0)],
+ >>> [(9, 1.0), (10, 1.0), (11, 1.0)],
+ >>> [(8, 1.0), (10, 1.0), (11, 1.0)]]
In `gensim` a :dfn:`corpus` is simply an object which, when iterated over, returns its documents represented
as sparse vectors. In this case we're using a list of lists of tuples. If you're not familiar with the `vector space model `_, we'll bridge the gap between **raw strings**, **corpora** and **sparse vectors** in the next tutorial on :doc:`tut1`.
@@ -67,13 +67,19 @@ has major impact on the quality of any subsequent applications.
Next, let's initialize a :dfn:`transformation`:
->>> tfidf = models.TfidfModel(corpus)
+.. sourcecode:: pycon
+
+ >>> from gensim import models
+ >>>
+ >>> tfidf = models.TfidfModel(corpus)
A transformation is used to convert documents from one vector representation into another:
->>> vec = [(0, 1), (4, 1)]
->>> print(tfidf[vec])
-[(0, 0.8075244), (4, 0.5898342)]
+.. sourcecode:: pycon
+
+ >>> vec = [(0, 1), (4, 1)]
+ >>> print(tfidf[vec])
+ [(0, 0.8075244), (4, 0.5898342)]
Here, we used `Tf-Idf `_, a simple
transformation which takes documents represented as bag-of-words counts and applies
@@ -84,13 +90,19 @@ Transformations are covered in detail in the tutorial on :doc:`tut2`.
To transform the whole corpus via TfIdf and index it, in preparation for similarity queries:
->>> index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
+.. sourcecode:: pycon
+
+ >>> from gensim import similarities
+ >>>
+ >>> index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
and to query the similarity of our query vector ``vec`` against every document in the corpus:
->>> sims = index[tfidf[vec]]
->>> print(list(enumerate(sims)))
-[(0, 0.4662244), (1, 0.19139354), (2, 0.24600551), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]
+.. sourcecode:: pycon
+
+ >>> sims = index[tfidf[vec]]
+ >>> print(list(enumerate(sims)))
+ [(0, 0.4662244), (1, 0.19139354), (2, 0.24600551), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]
How to read this output? Document number zero (the first document) has a similarity score of 0.466=46.6\%,
the second document has a similarity score of 19.1\% etc.
diff --git a/docs/src/wiki.rst b/docs/src/wiki.rst
index 2992cf8401..bc148729d4 100644
--- a/docs/src/wiki.rst
+++ b/docs/src/wiki.rst
@@ -36,17 +36,21 @@ Preparing the corpus
Latent Semantic Analysis
--------------------------
-First let's load the corpus iterator and dictionary, created in the second step above::
+First let's load the corpus iterator and dictionary, created in the second step above
- >>> import logging, gensim
- >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+.. sourcecode:: pycon
+
+ >>> import logging
+ >>> import gensim
+ >>>
+ >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+ >>>
>>> # load id->word mapping (the dictionary), one of the results of step 2 above
>>> id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
>>> # load corpus iterator
>>> mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
>>> # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2') # use this if you compressed the TFIDF output (recommended)
-
+ >>>
>>> print(mm)
MmCorpus(3931787 documents, 100000 features, 756379027 non-zero entries)
@@ -54,11 +58,13 @@ We see that our corpus contains 3.9M documents, 100K features (distinct
tokens) and 0.76G non-zero entries in the sparse TF-IDF matrix. The Wikipedia corpus
contains about 2.24 billion tokens in total.
-Now we're ready to compute LSA of the English Wikipedia::
+Now we're ready to compute LSA of the English Wikipedia:
+
+.. sourcecode:: pycon
>>> # extract 400 LSI topics; use the default one-pass algorithm
>>> lsi = gensim.models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=400)
-
+ >>>
>>> # print the most contributing words (both positively and negatively) for each of the first ten topics
>>> lsi.print_topics(10)
topic #0(332.762): 0.425*"utc" + 0.299*"talk" + 0.293*"page" + 0.226*"article" + 0.224*"delete" + 0.216*"discussion" + 0.205*"deletion" + 0.198*"should" + 0.146*"debate" + 0.132*"be"
@@ -91,17 +97,21 @@ or where the cost of storing/iterating over the corpus multiple times is too hig
Latent Dirichlet Allocation
----------------------------
-As with Latent Semantic Analysis above, first load the corpus iterator and dictionary::
+As with Latent Semantic Analysis above, first load the corpus iterator and dictionary
- >>> import logging, gensim
- >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+.. sourcecode:: pycon
+ >>> import logging
+ >>> import gensim
+ >>>
+ >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+ >>>
>>> # load id->word mapping (the dictionary), one of the results of step 2 above
>>> id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
>>> # load corpus iterator
>>> mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
>>> # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2') # use this if you compressed the TFIDF output
-
+ >>>
>>> print(mm)
MmCorpus(3931787 documents, 100000 features, 756379027 non-zero entries)
@@ -114,15 +124,19 @@ over the smaller chunks (subcorpora) are pretty good in themselves, so that the
model estimation converges faster. As a result, we will perhaps only need a single full
pass over the corpus: if the corpus has 3 million articles, and we update once after
every 10,000 articles, this means we will have done 300 updates in one pass, quite likely
-enough to have a very accurate topics estimate::
+enough to have a very accurate estimate of the topics:
+
+.. sourcecode:: pycon
>>> # extract 100 LDA topics, using 1 pass and updating once every 1 chunk (10,000 documents)
- >>> lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)
+ >>> lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, passes=1)
using serial LDA version on this node
running online LDA training, 100 topics, 1 passes over the supplied corpus of 3931787 documents, updating model once every 10000 documents
...
-Unlike LSA, the topics coming from LDA are easier to interpret::
+Unlike LSA, the topics coming from LDA are easier to interpret:
+
+.. sourcecode:: pycon
>>> # print the most contributing words for 20 randomly selected topics
>>> lda.print_topics(20)
@@ -164,7 +178,9 @@ In short, be careful if using LDA to incrementally add new documents to the mode
over time. **Batch usage of LDA**, where the entire training corpus is either known beforehand or does
not exhibit topic drift, **is ok and not affected**.
-To run batch LDA (not online), train `LdaModel` with::
+To run batch LDA (not online), train `LdaModel` with:
+
+.. sourcecode:: pycon
>>> # extract 100 LDA topics, using 20 full passes, no online updates
>>> lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=0, passes=20)
@@ -172,6 +188,8 @@ To run batch LDA (not online), train `LdaModel` with::
As usual, a trained model can be used to transform new, unseen documents (plain bag-of-words count vectors)
into LDA topic distributions:
+.. sourcecode:: pycon
+
>>> doc_lda = lda[doc_bow]
--------------------
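A short sketch of where ``doc_bow`` comes from (not part of the patch; it assumes the ``id2word`` dictionary and the trained ``lda`` model from the steps above):

.. sourcecode:: pycon

    >>> # turn a new, unseen document into a bag-of-words vector using the loaded dictionary
    >>> doc_bow = id2word.doc2bow("human computer interaction and graph minors".lower().split())
    >>> doc_lda = lda[doc_bow]  # list of (topic_id, probability) pairs
    >>> top_topic = max(doc_lda, key=lambda pair: pair[1])
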
diff --git a/ez_setup.py b/ez_setup.py
deleted file mode 100644
index 4251063fc0..0000000000
--- a/ez_setup.py
+++ /dev/null
@@ -1,405 +0,0 @@
-#!python
-"""Bootstrap setuptools installation
-
-If you want to use setuptools in your package's setup.py, just include this
-file in the same directory with it, and add this to the top of your setup.py::
-
- from ez_setup import use_setuptools
- use_setuptools()
-
-If you want to require a specific version of setuptools, set a download
-mirror, or use an alternate download directory, you can do so by supplying
-the appropriate options to ``use_setuptools()``.
-
-This file can also be run as a script to install or upgrade setuptools.
-"""
-import os
-import shutil
-import sys
-import tempfile
-import tarfile
-import optparse
-import subprocess
-import platform
-
-from distutils import log
-
-try:
- from site import USER_SITE
-except ImportError:
- USER_SITE = None
-
-DEFAULT_VERSION = "1.3.2"
-DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/"
-
-
-def _python_cmd(*args):
- args = (sys.executable,) + args
- return subprocess.call(args) == 0
-
-
-def _check_call_py24(cmd, *args, **kwargs):
- res = subprocess.call(cmd, *args, **kwargs)
-
- class CalledProcessError(Exception):
- pass
- if not res == 0:
- msg = "Command '%s' return non-zero exit status %d" % (cmd, res)
- raise CalledProcessError(msg)
-
-
-vars(subprocess).setdefault('check_call', _check_call_py24)
-
-
-def _install(tarball, install_args=()):
- # extracting the tarball
- tmpdir = tempfile.mkdtemp()
- log.warn('Extracting in %s', tmpdir)
- old_wd = os.getcwd()
- try:
- os.chdir(tmpdir)
- tar = tarfile.open(tarball)
- _extractall(tar)
- tar.close()
-
- # going in the directory
- subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
- os.chdir(subdir)
- log.warn('Now working in %s', subdir)
-
- # installing
- log.warn('Installing Setuptools')
- if not _python_cmd('setup.py', 'install', *install_args):
- log.warn('Something went wrong during the installation.')
- log.warn('See the error message above.')
- # exitcode will be 2
- return 2
- finally:
- os.chdir(old_wd)
- shutil.rmtree(tmpdir)
-
-
-def _build_egg(egg, tarball, to_dir):
- # extracting the tarball
- tmpdir = tempfile.mkdtemp()
- log.warn('Extracting in %s', tmpdir)
- old_wd = os.getcwd()
- try:
- os.chdir(tmpdir)
- tar = tarfile.open(tarball)
- _extractall(tar)
- tar.close()
-
- # going in the directory
- subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
- os.chdir(subdir)
- log.warn('Now working in %s', subdir)
-
- # building an egg
- log.warn('Building a Setuptools egg in %s', to_dir)
- _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir)
-
- finally:
- os.chdir(old_wd)
- shutil.rmtree(tmpdir)
- # returning the result
- log.warn(egg)
- if not os.path.exists(egg):
- raise IOError('Could not build the egg.')
-
-
-def _do_download(version, download_base, to_dir, download_delay):
- egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg'
- % (version, sys.version_info[0], sys.version_info[1]))
- if not os.path.exists(egg):
- tarball = download_setuptools(version, download_base,
- to_dir, download_delay)
- _build_egg(egg, tarball, to_dir)
- sys.path.insert(0, egg)
-
- # Remove previously-imported pkg_resources if present (see
- # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details).
- if 'pkg_resources' in sys.modules:
- del sys.modules['pkg_resources']
-
- import setuptools
- setuptools.bootstrap_install_from = egg
-
-
-def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
- to_dir=os.curdir, download_delay=15):
- # making sure we use the absolute path
- to_dir = os.path.abspath(to_dir)
- was_imported = 'pkg_resources' in sys.modules or \
- 'setuptools' in sys.modules
- try:
- import pkg_resources
- except ImportError:
- return _do_download(version, download_base, to_dir, download_delay)
- try:
- pkg_resources.require("setuptools>=" + version)
- return
- except pkg_resources.VersionConflict:
- e = sys.exc_info()[1]
- if was_imported:
- sys.stderr.write(
- "The required version of setuptools (>=%s) is not available,\n"
- "and can't be installed while this script is running. Please\n"
- "install a more recent version first, using\n"
- "'easy_install -U setuptools'."
- "\n\n(Currently using %r)\n" % (version, e.args[0]))
- sys.exit(2)
- else:
- del pkg_resources, sys.modules['pkg_resources'] # reload ok
- return _do_download(version, download_base, to_dir,
- download_delay)
- except pkg_resources.DistributionNotFound:
- return _do_download(version, download_base, to_dir,
- download_delay)
-
-
-def _clean_check(cmd, target):
- """
- Run the command to download target. If the command fails, clean up before
- re-raising the error.
- """
- try:
- subprocess.check_call(cmd)
- except subprocess.CalledProcessError:
- if os.access(target, os.F_OK):
- os.unlink(target)
- raise
-
-
-def download_file_powershell(url, target):
- """
- Download the file at url to target using Powershell (which will validate
- trust). Raise an exception if the command cannot complete.
- """
- target = os.path.abspath(target)
- cmd = [
- 'powershell',
- '-Command',
- "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" % vars(),
- ]
- _clean_check(cmd, target)
-
-
-def has_powershell():
- if platform.system() != 'Windows':
- return False
- cmd = ['powershell', '-Command', 'echo test']
- devnull = open(os.path.devnull, 'wb')
- try:
- try:
- subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
- except Exception:
- return False
- finally:
- devnull.close()
- return True
-
-
-download_file_powershell.viable = has_powershell
-
-
-def download_file_curl(url, target):
- cmd = ['curl', url, '--silent', '--output', target]
- _clean_check(cmd, target)
-
-
-def has_curl():
- cmd = ['curl', '--version']
- devnull = open(os.path.devnull, 'wb')
- try:
- try:
- subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
- except Exception:
- return False
- finally:
- devnull.close()
- return True
-
-
-download_file_curl.viable = has_curl
-
-
-def download_file_wget(url, target):
- cmd = ['wget', url, '--quiet', '--output-document', target]
- _clean_check(cmd, target)
-
-
-def has_wget():
- cmd = ['wget', '--version']
- devnull = open(os.path.devnull, 'wb')
- try:
- try:
- subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
- except Exception:
- return False
- finally:
- devnull.close()
- return True
-
-
-download_file_wget.viable = has_wget
-
-
-def download_file_insecure(url, target):
- """
- Use Python to download the file, even though it cannot authenticate the
- connection.
- """
- try:
- from urllib.request import urlopen
- except ImportError:
- from urllib2 import urlopen
- src = dst = None
- try:
- src = urlopen(url)
- # Read/write all in one block, so we don't create a corrupt file
- # if the download is interrupted.
- data = src.read()
- dst = open(target, "wb")
- dst.write(data)
- finally:
- if src:
- src.close()
- if dst:
- dst.close()
-
-
-download_file_insecure.viable = lambda: True
-
-
-def get_best_downloader():
- downloaders = [
- download_file_powershell,
- download_file_curl,
- download_file_wget,
- download_file_insecure,
- ]
-
- for dl in downloaders:
- if dl.viable():
- return dl
-
-
-def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
- to_dir=os.curdir, delay=15,
- downloader_factory=get_best_downloader):
- """Download setuptools from a specified location and return its filename
-
- `version` should be a valid setuptools version number that is available
- as an egg for download under the `download_base` URL (which should end
- with a '/'). `to_dir` is the directory where the egg will be downloaded.
- `delay` is the number of seconds to pause before an actual download
- attempt.
-
- ``downloader_factory`` should be a function taking no arguments and
- returning a function for downloading a URL to a target.
- """
- # making sure we use the absolute path
- to_dir = os.path.abspath(to_dir)
- tgz_name = "setuptools-%s.tar.gz" % version
- url = download_base + tgz_name
- saveto = os.path.join(to_dir, tgz_name)
- if not os.path.exists(saveto): # Avoid repeated downloads
- log.warn("Downloading %s", url)
- downloader = downloader_factory()
- downloader(url, saveto)
- return os.path.realpath(saveto)
-
-
-def _extractall(self, path=".", members=None):
- """Extract all members from the archive to the current working
- directory and set owner, modification time and permissions on
- directories afterwards. `path' specifies a different directory
- to extract to. `members' is optional and must be a subset of the
- list returned by getmembers().
- """
- import copy
- import operator
- from tarfile import ExtractError
- directories = []
-
- if members is None:
- members = self
-
- for tarinfo in members:
- if tarinfo.isdir():
- # Extract directories with a safe mode.
- directories.append(tarinfo)
- tarinfo = copy.copy(tarinfo)
- tarinfo.mode = 448 # decimal for oct 0700
- self.extract(tarinfo, path)
-
- # Reverse sort directories.
- if sys.version_info < (2, 4):
- def sorter(dir1, dir2):
- return cmp(dir1.name, dir2.name) # noqa:F821
- directories.sort(sorter)
- directories.reverse()
- else:
- directories.sort(key=operator.attrgetter('name'), reverse=True)
-
- # Set correct owner, mtime and filemode on directories.
- for tarinfo in directories:
- dirpath = os.path.join(path, tarinfo.name)
- try:
- self.chown(tarinfo, dirpath)
- self.utime(tarinfo, dirpath)
- self.chmod(tarinfo, dirpath)
- except ExtractError:
- e = sys.exc_info()[1]
- if self.errorlevel > 1:
- raise
- else:
- self._dbg(1, "tarfile: %s" % e)
-
-
-def _build_install_args(options):
- """
- Build the arguments to 'python setup.py install' on the setuptools package
- """
- install_args = []
- if options.user_install:
- if sys.version_info < (2, 6):
- log.warn("--user requires Python 2.6 or later")
- raise SystemExit(1)
- install_args.append('--user')
- return install_args
-
-
-def _parse_args():
- """
- Parse the command line for options
- """
- parser = optparse.OptionParser()
- parser.add_option(
- '--user', dest='user_install', action='store_true', default=False,
- help='install in user site package (requires Python 2.6 or later)')
- parser.add_option(
- '--download-base', dest='download_base', metavar="URL",
- default=DEFAULT_URL,
- help='alternative URL from where to download the setuptools package')
- parser.add_option(
- '--insecure', dest='downloader_factory', action='store_const',
- const=lambda: download_file_insecure, default=get_best_downloader,
- help='Use internal, non-validating downloader'
- )
- options, args = parser.parse_args()
- # positional arguments are ignored
- return options
-
-
-def main(version=DEFAULT_VERSION):
- """Install or upgrade setuptools and EasyInstall"""
- options = _parse_args()
- tarball = download_setuptools(download_base=options.download_base,
- downloader_factory=options.downloader_factory)
- return _install(tarball, _build_install_args(options))
-
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/gensim/__init__.py b/gensim/__init__.py
index f70ef0f412..280af83834 100644
--- a/gensim/__init__.py
+++ b/gensim/__init__.py
@@ -5,16 +5,9 @@
from gensim import parsing, corpora, matutils, interfaces, models, similarities, summarization, utils # noqa:F401
import logging
-__version__ = '3.5.0'
-
-
-class NullHandler(logging.Handler):
- """For python versions <= 2.6; same as `logging.NullHandler` in 2.7."""
-
- def emit(self, record):
- pass
+__version__ = '3.6.0'
logger = logging.getLogger('gensim')
if len(logger.handlers) == 0: # To ensure reload() doesn't add another one
- logger.addHandler(NullHandler())
+ logger.addHandler(logging.NullHandler())
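The switch to the stdlib ``logging.NullHandler`` keeps gensim silent by default while leaving logging configuration to the application; a minimal usage sketch (assumed, not part of the patch):

.. sourcecode:: pycon

    >>> import logging
    >>>
    >>> # gensim only attaches a NullHandler, so nothing is printed until the
    >>> # application configures logging itself
    >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
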
diff --git a/gensim/_matutils.c b/gensim/_matutils.c
index 90afbdcd72..384881f47f 100644
--- a/gensim/_matutils.c
+++ b/gensim/_matutils.c
@@ -1,4 +1,4 @@
-/* Generated by Cython 0.28.3 */
+/* Generated by Cython 0.28.4 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
@@ -7,7 +7,7 @@
#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000)
#error Cython requires Python 2.6+ or Python 3.3+.
#else
-#define CYTHON_ABI "0_28_3"
+#define CYTHON_ABI "0_28_4"
#define CYTHON_FUTURE_DIVISION 1
#include <stddef.h>
#ifndef offsetof
@@ -24968,7 +24968,7 @@ static CYTHON_INLINE int __Pyx_dict_iter_next(
/* SaveResetException */
#if CYTHON_FAST_THREAD_STATE
static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {
- #if PY_VERSION_HEX >= 0x030700A2
+ #if PY_VERSION_HEX >= 0x030700A3
*type = tstate->exc_state.exc_type;
*value = tstate->exc_state.exc_value;
*tb = tstate->exc_state.exc_traceback;
@@ -24983,7 +24983,7 @@ static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject *
}
static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) {
PyObject *tmp_type, *tmp_value, *tmp_tb;
- #if PY_VERSION_HEX >= 0x030700A2
+ #if PY_VERSION_HEX >= 0x030700A3
tmp_type = tstate->exc_state.exc_type;
tmp_value = tstate->exc_state.exc_value;
tmp_tb = tstate->exc_state.exc_traceback;
@@ -25067,7 +25067,7 @@ static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb)
*value = local_value;
*tb = local_tb;
#if CYTHON_FAST_THREAD_STATE
- #if PY_VERSION_HEX >= 0x030700A2
+ #if PY_VERSION_HEX >= 0x030700A3
tmp_type = tstate->exc_state.exc_type;
tmp_value = tstate->exc_state.exc_value;
tmp_tb = tstate->exc_state.exc_traceback;
@@ -25368,7 +25368,7 @@ static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *o, PyObject *n, PyObject
#if CYTHON_FAST_THREAD_STATE
static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {
PyObject *tmp_type, *tmp_value, *tmp_tb;
- #if PY_VERSION_HEX >= 0x030700A2
+ #if PY_VERSION_HEX >= 0x030700A3
tmp_type = tstate->exc_state.exc_type;
tmp_value = tstate->exc_state.exc_value;
tmp_tb = tstate->exc_state.exc_traceback;
@@ -25519,14 +25519,42 @@ static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err,
return res;
}
#endif
+static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) {
+ Py_ssize_t i, n;
+ assert(PyExceptionClass_Check(exc_type));
+ n = PyTuple_GET_SIZE(tuple);
+#if PY_MAJOR_VERSION >= 3
+    for (i=0; i<n; i++) {
#ifndef offsetof
@@ -7865,7 +7865,7 @@ static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject
/* SaveResetException */
#if CYTHON_FAST_THREAD_STATE
static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {
- #if PY_VERSION_HEX >= 0x030700A2
+ #if PY_VERSION_HEX >= 0x030700A3
*type = tstate->exc_state.exc_type;
*value = tstate->exc_state.exc_value;
*tb = tstate->exc_state.exc_traceback;
@@ -7880,7 +7880,7 @@ static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject *
}
static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) {
PyObject *tmp_type, *tmp_value, *tmp_tb;
- #if PY_VERSION_HEX >= 0x030700A2
+ #if PY_VERSION_HEX >= 0x030700A3
tmp_type = tstate->exc_state.exc_type;
tmp_value = tstate->exc_state.exc_value;
tmp_tb = tstate->exc_state.exc_traceback;
@@ -8146,7 +8146,7 @@ static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb)
*value = local_value;
*tb = local_tb;
#if CYTHON_FAST_THREAD_STATE
- #if PY_VERSION_HEX >= 0x030700A2
+ #if PY_VERSION_HEX >= 0x030700A3
tmp_type = tstate->exc_state.exc_type;
tmp_value = tstate->exc_state.exc_value;
tmp_tb = tstate->exc_state.exc_traceback;
@@ -9524,14 +9524,42 @@ static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err,
return res;
}
#endif
+static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) {
+ Py_ssize_t i, n;
+ assert(PyExceptionClass_Check(exc_type));
+ n = PyTuple_GET_SIZE(tuple);
+#if PY_MAJOR_VERSION >= 3
+    for (i=0; i<n; i++) {
-    #if PY_VERSION_HEX >= 0x030700A2
+ #if PY_VERSION_HEX >= 0x030700A3
tmp_type = tstate->exc_state.exc_type;
tmp_value = tstate->exc_state.exc_value;
tmp_tb = tstate->exc_state.exc_traceback;
diff --git a/gensim/corpora/_mmreader.pyx b/gensim/corpora/_mmreader.pyx
index f4844127a3..36cf11a1b9 100644
--- a/gensim/corpora/_mmreader.pyx
+++ b/gensim/corpora/_mmreader.pyx
@@ -8,7 +8,7 @@ from __future__ import with_statement
from gensim import utils
from six import string_types
-from six.moves import xrange
+from six.moves import range
import logging
cimport cython
@@ -148,7 +148,7 @@ cdef class MmReader(object):
# return implicit (empty) documents between previous id and new id
# too, to keep consistent document numbering and corpus length
- for previd in xrange(previd + 1, docid):
+ for previd in range(previd + 1, docid):
yield previd, []
# from now on start adding fields to a new document, with a new id
@@ -163,7 +163,7 @@ cdef class MmReader(object):
# return empty documents between the last explicit document and the number
# of documents as specified in the header
- for previd in xrange(previd + 1, self.num_docs):
+ for previd in range(previd + 1, self.num_docs):
yield previd, []
def docbyoffset(self, offset):
diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py
index b0e5094ac0..701831b1b1 100644
--- a/gensim/corpora/bleicorpus.py
+++ b/gensim/corpora/bleicorpus.py
@@ -14,7 +14,7 @@
from gensim import utils
from gensim.corpora import IndexedCorpus
-from six.moves import xrange
+from six.moves import range
logger = logging.getLogger(__name__)
@@ -143,8 +143,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
logger.info("no word id mapping provided; initializing from corpus")
id2word = utils.dict_from_corpus(corpus)
num_terms = len(id2word)
+ elif id2word:
+ num_terms = 1 + max(id2word)
else:
- num_terms = 1 + max([-1] + id2word.keys())
+ num_terms = 0
logger.info("storing corpus in Blei's LDA-C format into %s", fname)
with utils.smart_open(fname, 'wb') as fout:
@@ -159,7 +161,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
fname_vocab = utils.smart_extension(fname, '.vocab')
logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
with utils.smart_open(fname_vocab, 'wb') as fout:
- for featureid in xrange(num_terms):
+ for featureid in range(num_terms):
fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
return offsets
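The ``num_terms`` change above avoids concatenating ``id2word.keys()`` onto a list, which raises ``TypeError`` on Python 3 (``dict.keys()`` is a view there); a small sketch of the new branch in isolation:

.. sourcecode:: pycon

    >>> # new logic: highest word id + 1 when a mapping is given, 0 for an empty one
    >>> id2word = {0: 'human', 1: 'interface', 5: 'computer'}
    >>> 1 + max(id2word) if id2word else 0   # highest word id is 5, so 6 terms
    6
    >>> empty = {}
    >>> 1 + max(empty) if empty else 0       # empty mapping no longer raises
    0
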
diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
index 84e2ed9945..c08d4e31b8 100644
--- a/gensim/corpora/dictionary.py
+++ b/gensim/corpora/dictionary.py
@@ -16,8 +16,7 @@
from gensim import utils
from six import PY3, iteritems, iterkeys, itervalues, string_types
-from six.moves import xrange
-from six.moves import zip as izip
+from six.moves import zip, range
if sys.version_info[0] >= 3:
unicode = str
@@ -56,17 +55,21 @@ def __init__(self, documents=None, prune_at=2000000):
documents : iterable of iterable of str, optional
Documents to be used to initialize the mapping and collect corpus statistics.
prune_at : int, optional
- Dictionary will keep no more than `prune_at` words in its mapping, to limit its RAM footprint.
+ Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
+            footprint; exact pruning is not guaranteed.
+ Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>>
- >>> texts = [['human', 'interface', 'computer']]
- >>> dct = Dictionary(texts) # initialize a Dictionary
- >>> dct.add_documents([["cat", "say", "meow"], ["dog"]]) # add more document (extend the vocabulary)
- >>> dct.doc2bow(["dog", "computer", "non_existent_word"])
- [(0, 1), (6, 1)]
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> texts = [['human', 'interface', 'computer']]
+ >>> dct = Dictionary(texts) # initialize a Dictionary
+            >>> dct.add_documents([["cat", "say", "meow"], ["dog"]])  # add more documents (extend the vocabulary)
+ >>> dct.doc2bow(["dog", "computer", "non_existent_word"])
+ [(0, 1), (6, 1)]
"""
self.token2id = {}
@@ -172,19 +175,23 @@ def add_documents(self, documents, prune_at=2000000):
documents : iterable of iterable of str
Input corpus. All tokens should be already **tokenized and normalized**.
prune_at : int, optional
- Dictionary will keep no more than `prune_at` words in its mapping, to limit its RAM footprint.
+ Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
+            footprint; exact pruning is not guaranteed.
+ Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>>
- >>> corpus = ["máma mele maso".split(), "ema má máma".split()]
- >>> dct = Dictionary(corpus)
- >>> len(dct)
- 5
- >>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])
- >>> len(dct)
- 10
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> corpus = ["máma mele maso".split(), "ema má máma".split()]
+ >>> dct = Dictionary(corpus)
+ >>> len(dct)
+ 5
+ >>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])
+ >>> len(dct)
+ 10
"""
for docno, document in enumerate(documents):
@@ -224,12 +231,15 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
- >>> dct.doc2bow(["this", "is", "máma"])
- [(2, 1)]
- >>> dct.doc2bow(["this", "is", "máma"], return_missing=True)
- ([(2, 1)], {u'this': 1, u'is': 1})
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
+ >>> dct.doc2bow(["this", "is", "máma"])
+ [(2, 1)]
+ >>> dct.doc2bow(["this", "is", "máma"], return_missing=True)
+ ([(2, 1)], {u'this': 1, u'is': 1})
"""
if isinstance(document, string_types):
@@ -284,12 +294,14 @@ def doc2idx(self, document, unknown_word_index=-1):
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>>
- >>> corpus = [["a", "a", "b"], ["a", "c"]]
- >>> dct = Dictionary(corpus)
- >>> dct.doc2idx(["a", "a", "c", "not_in_dictionary", "c"])
- [0, 0, 2, -1, 2]
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> corpus = [["a", "a", "b"], ["a", "c"]]
+ >>> dct = Dictionary(corpus)
+ >>> dct.doc2idx(["a", "a", "c", "not_in_dictionary", "c"])
+ [0, 0, 2, -1, 2]
"""
if isinstance(document, string_types):
@@ -327,32 +339,36 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>>
- >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
- >>> dct = Dictionary(corpus)
- >>> len(dct)
- 5
- >>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
- >>> len(dct)
- 1
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+ >>> dct = Dictionary(corpus)
+ >>> len(dct)
+ 5
+ >>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
+ >>> len(dct)
+ 1
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
# determine which tokens to keep
if keep_tokens:
- keep_ids = [self.token2id[v] for v in keep_tokens if v in self.token2id]
- good_ids = (
+ keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id}
+ good_ids = [
v for v in itervalues(self.token2id)
if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids
- )
+ ]
+ good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True)
else:
- good_ids = (
+ good_ids = [
v for v in itervalues(self.token2id)
if no_below <= self.dfs.get(v, 0) <= no_above_abs
- )
- good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
+ ]
+ good_ids.sort(key=self.dfs.get, reverse=True)
if keep_n is not None:
good_ids = good_ids[:keep_n]
bad_words = [(self[idx], self.dfs.get(idx, 0)) for idx in set(self).difference(good_ids)]
@@ -376,15 +392,18 @@ def filter_n_most_frequent(self, remove_n):
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>>
- >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
- >>> dct = Dictionary(corpus)
- >>> len(dct)
- 5
- >>> dct.filter_n_most_frequent(2)
- >>> len(dct)
- 3
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+ >>> dct = Dictionary(corpus)
+ >>> len(dct)
+ 5
+ >>> dct.filter_n_most_frequent(2)
+ >>> len(dct)
+ 3
"""
# determine which tokens to keep
@@ -412,20 +431,23 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>>
- >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
- >>> dct = Dictionary(corpus)
- >>> 'ema' in dct.token2id
- True
- >>> dct.filter_tokens(bad_ids=[dct.token2id['ema']])
- >>> 'ema' in dct.token2id
- False
- >>> len(dct)
- 4
- >>> dct.filter_tokens(good_ids=[dct.token2id['maso']])
- >>> len(dct)
- 1
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+ >>> dct = Dictionary(corpus)
+ >>> 'ema' in dct.token2id
+ True
+ >>> dct.filter_tokens(bad_ids=[dct.token2id['ema']])
+ >>> 'ema' in dct.token2id
+ False
+ >>> len(dct)
+ 4
+ >>> dct.filter_tokens(good_ids=[dct.token2id['maso']])
+ >>> len(dct)
+ 1
"""
if bad_ids is not None:
@@ -443,7 +465,7 @@ def compactify(self):
logger.debug("rebuilding dictionary, shrinking gaps")
# build mapping from old id -> new id
- idmap = dict(izip(sorted(itervalues(self.token2id)), xrange(len(self.token2id))))
+ idmap = dict(zip(sorted(itervalues(self.token2id)), range(len(self.token2id))))
# reassign mappings to new ids
self.token2id = {token: idmap[tokenid] for token, tokenid in iteritems(self.token2id)}
@@ -482,17 +504,19 @@ def save_as_text(self, fname, sort_by_word=True):
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>> from gensim.test.utils import get_tmpfile
- >>>
- >>> tmp_fname = get_tmpfile("dictionary")
- >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
- >>>
- >>> dct = Dictionary(corpus)
- >>> dct.save_as_text(tmp_fname)
- >>>
- >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
- >>> assert dct.token2id == loaded_dct.token2id
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>> from gensim.test.utils import get_tmpfile
+ >>>
+ >>> tmp_fname = get_tmpfile("dictionary")
+ >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+ >>>
+ >>> dct = Dictionary(corpus)
+ >>> dct.save_as_text(tmp_fname)
+ >>>
+ >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
+ >>> assert dct.token2id == loaded_dct.token2id
"""
logger.info("saving dictionary mapping to %s", fname)
@@ -532,15 +556,18 @@ def merge_with(self, other):
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>>
- >>> corpus_1, corpus_2 = [["a", "b", "c"]], [["a", "f", "f"]]
- >>> dct_1, dct_2 = Dictionary(corpus_1), Dictionary(corpus_2)
- >>> dct_1.doc2bow(corpus_2[0])
- [(0, 1)]
- >>> transformer = dct_1.merge_with(dct_2)
- >>> dct_1.doc2bow(corpus_2[0])
- [(0, 1), (3, 2)]
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> corpus_1, corpus_2 = [["a", "b", "c"]], [["a", "f", "f"]]
+ >>> dct_1, dct_2 = Dictionary(corpus_1), Dictionary(corpus_2)
+ >>> dct_1.doc2bow(corpus_2[0])
+ [(0, 1)]
+ >>> transformer = dct_1.merge_with(dct_2)
+ >>> dct_1.doc2bow(corpus_2[0])
+ [(0, 1), (3, 2)]
"""
old2new = {}
@@ -585,17 +612,20 @@ def load_from_text(fname):
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>> from gensim.test.utils import get_tmpfile
- >>>
- >>> tmp_fname = get_tmpfile("dictionary")
- >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
- >>>
- >>> dct = Dictionary(corpus)
- >>> dct.save_as_text(tmp_fname)
- >>>
- >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
- >>> assert dct.token2id == loaded_dct.token2id
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>> from gensim.test.utils import get_tmpfile
+ >>>
+ >>> tmp_fname = get_tmpfile("dictionary")
+ >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+ >>>
+ >>> dct = Dictionary(corpus)
+ >>> dct.save_as_text(tmp_fname)
+ >>>
+ >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
+ >>> assert dct.token2id == loaded_dct.token2id
"""
result = Dictionary()
@@ -647,12 +677,15 @@ def from_corpus(corpus, id2word=None):
Examples
--------
- >>> from gensim.corpora import Dictionary
- >>>
- >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
- >>> dct = Dictionary.from_corpus(corpus)
- >>> len(dct)
- 3
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
+ >>> dct = Dictionary.from_corpus(corpus)
+ >>> len(dct)
+ 3
"""
result = Dictionary()
@@ -669,7 +702,7 @@ def from_corpus(corpus, id2word=None):
if id2word is None:
# make sure length(result) == get_max_id(corpus) + 1
- result.token2id = {unicode(i): i for i in xrange(max_id + 1)}
+ result.token2id = {unicode(i): i for i in range(max_id + 1)}
else:
# id=>word mapping given: simply copy it
result.token2id = {utils.to_unicode(token): idx for idx, token in iteritems(id2word)}
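The reworked ``filter_extremes`` above ranks ``keep_tokens`` as if they appeared in every document, so they survive the ``keep_n`` cut; a small usage sketch (toy corpus, not taken from the patch):

.. sourcecode:: pycon

    >>> from gensim.corpora import Dictionary
    >>>
    >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
    >>> dct = Dictionary(corpus)
    >>> dct.filter_extremes(no_below=1, no_above=1.0, keep_n=2, keep_tokens=['ema'])
    >>> 'ema' in dct.token2id  # kept despite occurring in only one document
    True
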
diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py
index 85922d16c7..433f61aa42 100644
--- a/gensim/corpora/hashdictionary.py
+++ b/gensim/corpora/hashdictionary.py
@@ -50,13 +50,15 @@ class HashDictionary(utils.SaveLoad, dict):
Examples
--------
- >>> from gensim.corpora import HashDictionary
- >>>
- >>> dct = HashDictionary(debug=False) # needs no training corpus!
- >>>
- >>> texts = [['human', 'interface', 'computer']]
- >>> dct.doc2bow(texts[0])
- [(10608, 1), (12466, 1), (31002, 1)]
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import HashDictionary
+ >>>
+ >>> dct = HashDictionary(debug=False) # needs no training corpus!
+ >>>
+ >>> texts = [['human', 'interface', 'computer']]
+ >>> dct.doc2bow(texts[0])
+ [(10608, 1), (12466, 1), (31002, 1)]
"""
def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True):
@@ -172,16 +174,18 @@ def add_documents(self, documents):
Examples
--------
- >>> from gensim.corpora import HashDictionary
- >>>
- >>> dct = HashDictionary(debug=True) # needs no training corpus!
- >>>
- >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
- >>> "sparta" in dct.token2id
- False
- >>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])
- >>> "sparta" in dct.token2id
- True
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import HashDictionary
+ >>>
+ >>> dct = HashDictionary(debug=True) # needs no training corpus!
+ >>>
+ >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+ >>> "sparta" in dct.token2id
+ False
+ >>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])
+ >>> "sparta" in dct.token2id
+ True
"""
for docno, document in enumerate(documents):
@@ -222,11 +226,13 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
Examples
--------
- >>> from gensim.corpora import HashDictionary
- >>>
- >>> dct = HashDictionary()
- >>> dct.doc2bow(["this", "is", "máma"])
- [(1721, 1), (5280, 1), (22493, 1)]
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import HashDictionary
+ >>>
+ >>> dct = HashDictionary()
+ >>> dct.doc2bow(["this", "is", "máma"])
+ [(1721, 1), (5280, 1), (22493, 1)]
"""
result = {}
@@ -297,7 +303,7 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
tokenid: {token for token in tokens if token in self.dfs_debug}
for tokenid, tokens in iteritems(self.id2token)
}
- self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, set())}
+ self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, False)}
# for word->document frequency
logger.info(
@@ -325,12 +331,15 @@ def save_as_text(self, fname):
Examples
--------
- >>> from gensim.corpora import HashDictionary
- >>> from gensim.test.utils import get_tmpfile
- >>>
- >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
- >>> data = HashDictionary(corpus)
- >>> data.save_as_text(get_tmpfile("dictionary_in_text_format"))
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import HashDictionary
+ >>> from gensim.test.utils import get_tmpfile
+ >>>
+ >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+ >>> data = HashDictionary(corpus)
+ >>> data.save_as_text(get_tmpfile("dictionary_in_text_format"))
"""
logger.info("saving %s mapping to %s" % (self, fname))
diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py
index c4e58cb95a..5b6f8a42f2 100644
--- a/gensim/corpora/indexedcorpus.py
+++ b/gensim/corpora/indexedcorpus.py
@@ -23,18 +23,22 @@ class IndexedCorpus(interfaces.CorpusABC):
While the standard corpus interface in gensim allows iterating over corpus,
we'll show it with :class:`~gensim.corpora.mmcorpus.MmCorpus`.
- >>> from gensim.corpora import MmCorpus
- >>> from gensim.test.utils import datapath
- >>>
- >>> corpus = MmCorpus(datapath('testcorpus.mm'))
- >>> for doc in corpus:
- ... pass
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import MmCorpus
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> corpus = MmCorpus(datapath('testcorpus.mm'))
+ >>> for doc in corpus:
+ ... pass
:class:`~gensim.corpora.indexedcorpus.IndexedCorpus` allows accessing the documents with index
in :math:`{O}(1)` look-up time.
- >>> document_index = 3
- >>> doc = corpus[document_index]
+ .. sourcecode:: pycon
+
+ >>> document_index = 3
+ >>> doc = corpus[document_index]
Notes
-----
@@ -89,16 +93,19 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
Examples
--------
- >>> from gensim.corpora import MmCorpus
- >>> from gensim.test.utils import get_tmpfile
- >>>
- >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
- >>> output_fname = get_tmpfile("test.mm")
- >>>
- >>> MmCorpus.serialize(output_fname, corpus)
- >>> mm = MmCorpus(output_fname) # `mm` document stream now has random access
- >>> print(mm[1]) # retrieve document no. 42, etc.
- [(1, 0.1)]
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import MmCorpus
+ >>> from gensim.test.utils import get_tmpfile
+ >>>
+ >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
+ >>> output_fname = get_tmpfile("test.mm")
+ >>>
+ >>> MmCorpus.serialize(output_fname, corpus)
+ >>> mm = MmCorpus(output_fname) # `mm` document stream now has random access
+ >>> print(mm[1]) # retrieve document no. 42, etc.
+ [(1, 0.1)]
"""
if getattr(corpus, 'fname', None) == fname:
diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py
index 277df249e5..2944aafd27 100644
--- a/gensim/corpora/lowcorpus.py
+++ b/gensim/corpora/lowcorpus.py
@@ -14,7 +14,7 @@
from gensim import utils
from gensim.corpora import IndexedCorpus
from six import iterkeys
-from six.moves import xrange, zip as izip
+from six.moves import zip, range
logger = logging.getLogger(__name__)
@@ -60,20 +60,22 @@ class LowCorpus(IndexedCorpus):
Examples
--------
- >>> from gensim.test.utils import datapath, get_tmpfile, common_texts
- >>> from gensim.corpora import LowCorpus
- >>> from gensim.corpora import Dictionary
- >>>
- >>> # Prepare needed data
- >>> dictionary = Dictionary(common_texts)
- >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts]
- >>>
- >>> # Write corpus in GibbsLda++ format to disk
- >>> output_fname = get_tmpfile("corpus.low")
- >>> LowCorpus.serialize(output_fname, corpus, dictionary)
- >>>
- >>> # Read corpus
- >>> loaded_corpus = LowCorpus(output_fname)
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import get_tmpfile, common_texts
+ >>> from gensim.corpora import LowCorpus
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> # Prepare needed data
+ >>> dictionary = Dictionary(common_texts)
+ >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts]
+ >>>
+ >>> # Write corpus in GibbsLda++ format to disk
+ >>> output_fname = get_tmpfile("corpus.low")
+ >>> LowCorpus.serialize(output_fname, corpus, dictionary)
+ >>>
+ >>> # Read corpus
+ >>> loaded_corpus = LowCorpus(output_fname)
"""
def __init__(self, fname, id2word=None, line2words=split_on_space):
@@ -107,7 +109,7 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
all_terms.update(word for word, wordCnt in doc)
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
# build a mapping of word id(int) -> word (string)
- self.id2word = dict(izip(xrange(len(all_terms)), all_terms))
+ self.id2word = dict(zip(range(len(all_terms)), all_terms))
else:
logger.info("using provided word mapping (%i ids)", len(id2word))
self.id2word = id2word
@@ -263,14 +265,17 @@ def docbyoffset(self, offset):
Examples
--------
- >>> from gensim.test.utils import datapath
- >>> from gensim.corpora import LowCorpus
- >>>
- >>> data = LowCorpus(datapath("testcorpus.low"))
- >>> data.docbyoffset(1) # end of first line
- []
- >>> data.docbyoffset(2) # start of second line
- [(0, 1), (3, 1), (4, 1)]
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.corpora import LowCorpus
+ >>>
+ >>> data = LowCorpus(datapath("testcorpus.low"))
+ >>> data.docbyoffset(1) # end of first line
+ []
+ >>> data.docbyoffset(2) # start of second line
+ [(0, 1), (3, 1), (4, 1)]
"""
with utils.smart_open(self.fname) as f:
diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py
index 37d7fc0d9d..db28b9e632 100644
--- a/gensim/corpora/malletcorpus.py
+++ b/gensim/corpora/malletcorpus.py
@@ -36,20 +36,22 @@ class MalletCorpus(LowCorpus):
Examples
--------
- >>> from gensim.test.utils import datapath, get_tmpfile, common_texts
- >>> from gensim.corpora import MalletCorpus
- >>> from gensim.corpora import Dictionary
- >>>
- >>> # Prepare needed data
- >>> dictionary = Dictionary(common_texts)
- >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts]
- >>>
- >>> # Write corpus in Mallet format to disk
- >>> output_fname = get_tmpfile("corpus.mallet")
- >>> MalletCorpus.serialize(output_fname, corpus, dictionary)
- >>>
- >>> # Read corpus
- >>> loaded_corpus = MalletCorpus(output_fname)
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import get_tmpfile, common_texts
+ >>> from gensim.corpora import MalletCorpus
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> # Prepare needed data
+ >>> dictionary = Dictionary(common_texts)
+ >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts]
+ >>>
+ >>> # Write corpus in Mallet format to disk
+ >>> output_fname = get_tmpfile("corpus.mallet")
+ >>> MalletCorpus.serialize(output_fname, corpus, dictionary)
+ >>>
+ >>> # Read corpus
+ >>> loaded_corpus = MalletCorpus(output_fname)
"""
def __init__(self, fname, id2word=None, metadata=False):
@@ -113,12 +115,15 @@ def line2doc(self, line):
Examples
--------
- >>> from gensim.test.utils import datapath
- >>> from gensim.corpora import MalletCorpus
- >>>
- >>> corpus = MalletCorpus(datapath("testcorpus.mallet"))
- >>> corpus.line2doc("en computer human interface")
- [(3, 1), (4, 1)]
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.corpora import MalletCorpus
+ >>>
+ >>> corpus = MalletCorpus(datapath("testcorpus.mallet"))
+ >>> corpus.line2doc("en computer human interface")
+ [(3, 1), (4, 1)]
"""
splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word]
@@ -214,14 +219,16 @@ def docbyoffset(self, offset):
Examples
--------
- >>> from gensim.test.utils import datapath
- >>> from gensim.corpora import MalletCorpus
- >>>
- >>> data = MalletCorpus(datapath("testcorpus.mallet"))
- >>> data.docbyoffset(1) # end of first line
- [(3, 1), (4, 1)]
- >>> data.docbyoffset(4) # start of second line
- [(4, 1)]
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.corpora import MalletCorpus
+ >>>
+ >>> data = MalletCorpus(datapath("testcorpus.mallet"))
+ >>> data.docbyoffset(1) # end of first line
+ [(3, 1), (4, 1)]
+ >>> data.docbyoffset(4) # start of second line
+ [(4, 1)]
"""
with utils.smart_open(self.fname) as f:
diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py
index 92048bb67d..3650f75e11 100644
--- a/gensim/corpora/mmcorpus.py
+++ b/gensim/corpora/mmcorpus.py
@@ -41,13 +41,14 @@ class MmCorpus(matutils.MmReader, IndexedCorpus):
Example
--------
- >>> from gensim.corpora.mmcorpus import MmCorpus
- >>> from gensim.test.utils import datapath
- >>> import gensim.downloader as api
- >>>
- >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm'))
- >>> for document in corpus:
- ... pass
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora.mmcorpus import MmCorpus
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm'))
+ >>> for document in corpus:
+ ... pass
"""
def __init__(self, fname):
@@ -107,14 +108,15 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
Example
-------
- >>> from gensim.corpora.mmcorpus import MmCorpus
- >>> from gensim.test.utils import datapath
- >>> import gensim.downloader as api
- >>>
- >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm'))
- >>>
- >>> MmCorpus.save_corpus("random", corpus) # Do not do it, use `serialize` instead.
- [97, 121, 169, 201, 225, 249, 258, 276, 303]
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora.mmcorpus import MmCorpus
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm'))
+ >>>
+ >>> MmCorpus.save_corpus("random", corpus) # Do not do it, use `serialize` instead.
+ [97, 121, 169, 201, 225, 249, 258, 276, 303]
"""
logger.info("storing corpus in Matrix Market format to %s", fname)
diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py
index 049e22f226..754cc7bbf4 100644
--- a/gensim/corpora/sharded_corpus.py
+++ b/gensim/corpora/sharded_corpus.py
@@ -26,7 +26,7 @@
import scipy.sparse as sparse
import time
-from six.moves import xrange
+from six.moves import range
import gensim
from gensim.corpora import IndexedCorpus
@@ -67,9 +67,11 @@ class ShardedCorpus(IndexedCorpus):
supply the dimension of your data to the corpus. (The dimension of word
frequency vectors will typically be the size of the vocabulary, etc.)
- >>> corpus = gensim.utils.mock_data()
- >>> output_prefix = 'mydata.shdat'
- >>> ShardedCorpus.serialize(output_prefix, corpus, dim=1000)
+ .. sourcecode:: pycon
+
+ >>> corpus = gensim.utils.mock_data()
+ >>> output_prefix = 'mydata.shdat'
+ >>> ShardedCorpus.serialize(output_prefix, corpus, dim=1000)
The `output_prefix` tells the ShardedCorpus where to put the data.
Shards are saved as `output_prefix.0`, `output_prefix.1`, etc.
@@ -88,15 +90,19 @@ class ShardedCorpus(IndexedCorpus):
To retrieve data, you can load the corpus and use it like a list:
- >>> sh_corpus = ShardedCorpus.load(output_prefix)
- >>> batch = sh_corpus[100:150]
+ .. sourcecode:: pycon
+
+ >>> sh_corpus = ShardedCorpus.load(output_prefix)
+ >>> batch = sh_corpus[100:150]
This will retrieve a numpy 2-dimensional array of 50 rows and 1000
columns (1000 was the dimension of the data we supplied to the corpus).
To retrieve gensim-style sparse vectors, set the `gensim` property:
- >>> sh_corpus.gensim = True
- >>> batch = sh_corpus[100:150]
+ .. sourcecode:: pycon
+
+ >>> sh_corpus.gensim = True
+ >>> batch = sh_corpus[100:150]
The batch now will be a generator of gensim vectors.
@@ -105,8 +111,10 @@ class ShardedCorpus(IndexedCorpus):
`ShardedCorpus.serialize()`, you can just initialize and use the corpus
right away:
- >>> corpus = ShardedCorpus(output_prefix, corpus, dim=1000)
- >>> batch = corpus[100:150]
+ .. sourcecode:: pycon
+
+ >>> corpus = ShardedCorpus(output_prefix, corpus, dim=1000)
+ >>> batch = corpus[100:150]
ShardedCorpus also supports working with scipy sparse matrices, both
during retrieval and during serialization. If you want to serialize your
@@ -117,15 +125,17 @@ class ShardedCorpus(IndexedCorpus):
will retrieve numpy ndarrays even if it was serialized into sparse
matrices.
- >>> sparse_prefix = 'mydata.sparse.shdat'
- >>> ShardedCorpus.serialize(sparse_prefix, corpus, dim=1000, sparse_serialization=True)
- >>> sparse_corpus = ShardedCorpus.load(sparse_prefix)
- >>> batch = sparse_corpus[100:150]
- >>> type(batch)
-    <class 'numpy.ndarray'>
- >>> sparse_corpus.sparse_retrieval = True
- >>> batch = sparse_corpus[100:150]
-    <class 'scipy.sparse.csr.csr_matrix'>
+ .. sourcecode:: pycon
+
+ >>> sparse_prefix = 'mydata.sparse.shdat'
+ >>> ShardedCorpus.serialize(sparse_prefix, corpus, dim=1000, sparse_serialization=True)
+ >>> sparse_corpus = ShardedCorpus.load(sparse_prefix)
+ >>> batch = sparse_corpus[100:150]
+ >>> type(batch)
+        <class 'numpy.ndarray'>
+ >>> sparse_corpus.sparse_retrieval = True
+ >>> batch = sparse_corpus[100:150]
+        <class 'scipy.sparse.csr.csr_matrix'>
While you *can* touch the `sparse_retrieval` attribute during the life
of a ShardedCorpus object, you should definitely not touch `
@@ -422,7 +432,7 @@ def resize_shards(self, shardsize):
new_shard_names = []
new_offsets = [0]
- for new_shard_idx in xrange(n_new_shards):
+ for new_shard_idx in range(n_new_shards):
new_start = shardsize * new_shard_idx
new_stop = new_start + shardsize
@@ -451,7 +461,7 @@ def resize_shards(self, shardsize):
# Move old shard files out, new ones in. Complicated due to possibility
# of exceptions.
- old_shard_names = [self._shard_name(n) for n in xrange(self.n_shards)]
+ old_shard_names = [self._shard_name(n) for n in range(self.n_shards)]
try:
for old_shard_n, old_shard_name in enumerate(old_shard_names):
os.remove(old_shard_name)
@@ -634,7 +644,7 @@ def __getitem__(self, offset):
s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop)
# First and last get special treatment, these are in between
- for shard_n in xrange(first_shard + 1, last_shard):
+ for shard_n in range(first_shard + 1, last_shard):
self.load_shard(shard_n)
result_start = result_stop
@@ -725,7 +735,7 @@ def row_sparse2gensim(row_idx, csr_matrix):
g_row = [(col_idx, csr_matrix[row_idx, col_idx]) for col_idx in indices]
return g_row
- output = (row_sparse2gensim(i, result) for i in xrange(result.shape[0]))
+ output = (row_sparse2gensim(i, result) for i in range(result.shape[0]))
return output
@@ -735,7 +745,7 @@ def _getitem_dense2gensim(self, result):
output = gensim.matutils.full2sparse(result)
else:
output = (gensim.matutils.full2sparse(result[i])
- for i in xrange(result.shape[0]))
+ for i in range(result.shape[0]))
return output
# Overriding the IndexedCorpus and other corpus superclass methods
@@ -744,7 +754,7 @@ def __iter__(self):
Yield dataset items one by one (generator).
"""
- for i in xrange(len(self)):
+ for i in range(len(self)):
yield self[i]
def save(self, *args, **kwargs):
@@ -756,13 +766,12 @@ def save(self, *args, **kwargs):
"""
# Can we save to a different file than output_prefix? Well, why not?
if len(args) == 0:
- args = tuple([self.output_prefix])
+ args = (self.output_prefix,)
attrs_to_ignore = ['current_shard', 'current_shard_n', 'current_offset']
- if 'ignore' not in kwargs:
- kwargs['ignore'] = frozenset(attrs_to_ignore)
- else:
- kwargs['ignore'] = frozenset([v for v in kwargs['ignore']] + attrs_to_ignore)
+ if 'ignore' in kwargs:
+ attrs_to_ignore.extend(kwargs['ignore'])
+ kwargs['ignore'] = frozenset(attrs_to_ignore)
super(ShardedCorpus, self).save(*args, **kwargs)
@classmethod
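The ``save`` tweak above merges a caller-supplied ``ignore`` with the attributes ShardedCorpus always skips, instead of overwriting them; the merge logic in isolation (``norm_`` is a hypothetical caller attribute):

.. sourcecode:: pycon

    >>> attrs_to_ignore = ['current_shard', 'current_shard_n', 'current_offset']
    >>> caller_ignore = ['norm_']              # hypothetical extra attribute from the caller
    >>> attrs_to_ignore.extend(caller_ignore)  # merged, not overwritten
    >>> sorted(frozenset(attrs_to_ignore))
    ['current_offset', 'current_shard', 'current_shard_n', 'norm_']
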
diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
index cd3d0d26e4..e5616fe9d7 100644
--- a/gensim/corpora/textcorpus.py
+++ b/gensim/corpora/textcorpus.py
@@ -216,26 +216,29 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter
Examples
--------
- >>> from gensim.corpora.textcorpus import TextCorpus
- >>> from gensim.test.utils import datapath
- >>> from gensim import utils
- >>>
- >>>
- >>> class CorpusMiislita(TextCorpus):
- ... stopwords = set('for a of the and to in on'.split())
- ...
- ... def get_texts(self):
- ... for doc in self.getstream():
- ... yield [word for word in utils.to_unicode(doc).lower().split() if word not in self.stopwords]
- ...
- ... def __len__(self):
- ... self.length = sum(1 for _ in self.get_texts())
- ... return self.length
- >>>
- >>> corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))
- >>> len(corpus)
- 250
- >>> document = next(iter(corpus.get_texts()))
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora.textcorpus import TextCorpus
+ >>> from gensim.test.utils import datapath
+ >>> from gensim import utils
+ >>>
+ >>>
+ >>> class CorpusMiislita(TextCorpus):
+ ... stopwords = set('for a of the and to in on'.split())
+ ...
+ ... def get_texts(self):
+ ... for doc in self.getstream():
+ ... yield [word for word in utils.to_unicode(doc).lower().split() if word not in self.stopwords]
+ ...
+ ... def __len__(self):
+ ... self.length = sum(1 for _ in self.get_texts())
+ ... return self.length
+ >>>
+ >>>
+ >>> corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))
+ >>> len(corpus)
+ 250
+ >>> document = next(iter(corpus.get_texts()))
"""
self.input = input
diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py
index 5aa1c456ba..9831c7bba3 100644
--- a/gensim/corpora/ucicorpus.py
+++ b/gensim/corpora/ucicorpus.py
@@ -17,7 +17,7 @@
from gensim.corpora import IndexedCorpus
from gensim.matutils import MmReader
from gensim.matutils import MmWriter
-from six.moves import xrange
+from six.moves import range
logger = logging.getLogger(__name__)
@@ -171,12 +171,14 @@ def __init__(self, fname, fname_vocab=None):
Examples
--------
- >>> from gensim.corpora import UciCorpus
- >>> from gensim.test.utils import datapath
- >>>
- >>> corpus = UciCorpus(datapath('testcorpus.uci'))
- >>> for document in corpus:
- ... pass
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import UciCorpus
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> corpus = UciCorpus(datapath('testcorpus.uci'))
+ >>> for document in corpus:
+ ... pass
"""
IndexedCorpus.__init__(self, fname)
@@ -214,10 +216,13 @@ def create_dictionary(self):
Examples
--------
- >>> from gensim.corpora.ucicorpus import UciCorpus
- >>> from gensim.test.utils import datapath
- >>> ucc = UciCorpus(datapath('testcorpus.uci'))
- >>> dictionary = ucc.create_dictionary()
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora.ucicorpus import UciCorpus
+ >>> from gensim.test.utils import datapath
+ >>> ucc = UciCorpus(datapath('testcorpus.uci'))
+ >>> dictionary = ucc.create_dictionary()
"""
dictionary = Dictionary()
@@ -273,14 +278,16 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False)
logger.info("no word id mapping provided; initializing from corpus")
id2word = utils.dict_from_corpus(corpus)
num_terms = len(id2word)
+ elif id2word:
+ num_terms = 1 + max(id2word)
else:
- num_terms = 1 + max([-1] + list(id2word))
+ num_terms = 0
# write out vocabulary
fname_vocab = utils.smart_extension(fname, '.vocab')
logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
with utils.smart_open(fname_vocab, 'wb') as fout:
- for featureid in xrange(num_terms):
+ for featureid in range(num_terms):
fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)
diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index f204a6d834..b7ad94083d 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -81,7 +81,7 @@
"""Capture interlinks text and article linked"""
RE_P17 = re.compile(
r'(\n.{0,4}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=)|(scope=))(.*))|'
- '(^.{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))',
+ r'(^.{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))',
re.UNICODE
)
"""Table markup"""
@@ -90,7 +90,7 @@
'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
'Special', 'Talk'
]
-"""`MediaWiki namespaces `_ that ought to be ignored."""
+"""MediaWiki namespaces that ought to be ignored."""
def filter_example(elem, text, *args, **kwargs):
@@ -126,12 +126,16 @@ def filter_example(elem, text, *args, **kwargs):
pageid_path : str
XPath expression for page id.
- Example:
- ------
- >>> import gensim.corpora
- >>> filter_func = gensim.corpora.wikicorpus.filter_example
- >>> dewiki = gensim.corpora.WikiCorpus('./dewiki-20180520-pages-articles-multistream.xml.bz2',
- filter_articles=filter_func)
+ Example
+ -------
+ .. sourcecode:: pycon
+
+ >>> import gensim.corpora
+ >>> filter_func = gensim.corpora.wikicorpus.filter_example
+ >>> dewiki = gensim.corpora.WikiCorpus(
+ ... './dewiki-20180520-pages-articles-multistream.xml.bz2',
+ ... filter_articles=filter_func)
+
"""
# Filter German wikipedia dump for articles that are marked either as
# Lesenswert (featured) or Exzellent (excellent) by wikipedia editors.
@@ -139,8 +143,8 @@ def filter_example(elem, text, *args, **kwargs):
# regex is in the function call so that we do not pollute the wikicorpus
    # namespace. Do not do this in production, as this function is called for
# every element in the wiki dump
- _regex_de_excellent = re.compile('.*\{\{(Exzellent.*?)\}\}[\s]*', flags=re.DOTALL)
- _regex_de_featured = re.compile('.*\{\{(Lesenswert.*?)\}\}[\s]*', flags=re.DOTALL)
+ _regex_de_excellent = re.compile(r'.*\{\{(Exzellent.*?)\}\}[\s]*', flags=re.DOTALL)
+ _regex_de_featured = re.compile(r'.*\{\{(Lesenswert.*?)\}\}[\s]*', flags=re.DOTALL)
if text is None:
return False
@@ -285,10 +289,10 @@ def remove_template(s):
# Find the start and end position of each template by finding the opening
# '{{' and closing '}}'
n_open, n_close = 0, 0
- starts, ends = [], []
+ starts, ends = [], [-1]
in_template = False
prev_c = None
- for i, c in enumerate(iter(s)):
+ for i, c in enumerate(s):
if not in_template:
if c == '{' and c == prev_c:
starts.append(i - 1)
@@ -306,7 +310,8 @@ def remove_template(s):
prev_c = c
# Remove all the templates
- return ''.join([s[end + 1:start] for start, end in zip(starts + [None], [-1] + ends)])
+ starts.append(None)
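+ # `ends` was seeded with -1 and `starts` is capped with None, so zip(ends, starts) pairs each
+ # closing '}}' with the next opening '{{'; the slices between those pairs are the text outside templates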
+ return ''.join(s[end + 1:start] for end, start in zip(ends, starts))
def remove_file(s):
@@ -557,14 +562,16 @@ class WikiCorpus(TextCorpus):
Examples
--------
- >>> from gensim.test.utils import datapath, get_tmpfile
- >>> from gensim.corpora import WikiCorpus, MmCorpus
- >>>
- >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
- >>> corpus_path = get_tmpfile("wiki-corpus.mm")
- >>>
- >>> wiki = WikiCorpus(path_to_wiki_dump) # create word->word_id mapping, ~8h on full wiki
- >>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import datapath, get_tmpfile
+ >>> from gensim.corpora import WikiCorpus, MmCorpus
+ >>>
+ >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
+ >>> corpus_path = get_tmpfile("wiki-corpus.mm")
+ >>>
+ >>> wiki = WikiCorpus(path_to_wiki_dump) # create word->word_id mapping, ~8h on full wiki
+ >>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping
"""
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
@@ -643,13 +650,15 @@ def get_texts(self):
Examples
--------
- >>> from gensim.test.utils import datapath
- >>> from gensim.corpora import WikiCorpus
- >>>
- >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
- >>>
- >>> for vec in WikiCorpus(path_to_wiki_dump):
- ... pass
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.corpora import WikiCorpus
+ >>>
+ >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
+ >>>
+ >>> for vec in WikiCorpus(path_to_wiki_dump):
+ ... pass
Yields
------
diff --git a/gensim/downloader.py b/gensim/downloader.py
index f3672ead01..300cbc2f61 100644
--- a/gensim/downloader.py
+++ b/gensim/downloader.py
@@ -3,32 +3,37 @@
Give information about available models/datasets:
->>> import gensim.downloader as api
->>>
->>> api.info() # return dict with info about available models/datasets
->>> api.info("text8") # return dict with info about "text8" dataset
+.. sourcecode:: pycon
+
+ >>> import gensim.downloader as api
+ >>>
+ >>> api.info() # return dict with info about available models/datasets
+ >>> api.info("text8") # return dict with info about "text8" dataset
Model example:
+.. sourcecode:: pycon
->>> import gensim.downloader as api
->>>
->>> model = api.load("glove-twitter-25") # load glove vectors
->>> model.most_similar("cat") # show words that similar to word 'cat'
+ >>> import gensim.downloader as api
+ >>>
+ >>> model = api.load("glove-twitter-25") # load glove vectors
+ >>> model.most_similar("cat") # show words similar to 'cat'
Dataset example:
+.. sourcecode:: pycon
->>> import gensim.downloader as api
->>> from gensim.models import Word2Vec
->>>
->>> dataset = api.load("text8") # load dataset as iterable
->>> model = Word2Vec(dataset) # train w2v model
+ >>> import gensim.downloader as api
+ >>> from gensim.models import Word2Vec
+ >>>
+ >>> dataset = api.load("text8") # load dataset as iterable
+ >>> model = Word2Vec(dataset) # train w2v model
Also, this API available via CLI::
python -m gensim.downloader --info # same as api.info(dataname)
+ python -m gensim.downloader --info name # same as api.info(name_only=True)
python -m gensim.downloader --download # same as api.load(dataname, return_path=True)
"""
@@ -54,7 +59,7 @@
user_dir = os.path.expanduser('~')
base_dir = os.path.join(user_dir, 'gensim-data')
-logger = logging.getLogger('gensim.api')
+logger = logging.getLogger(__name__)
DATA_LIST_URL = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list.json"
DOWNLOAD_BASE_URL = "https://github.com/RaRe-Technologies/gensim-data/releases/download"
@@ -154,7 +159,7 @@ def _calculate_md5_checksum(fname):
return hash_md5.hexdigest()
-def info(name=None, show_only_latest=True):
+def info(name=None, show_only_latest=True, name_only=False):
"""Provide the information related to model/dataset.
Parameters
@@ -164,6 +169,8 @@ def info(name=None, show_only_latest=True):
show_only_latest : bool, optional
If storage contains different versions for one data/model, this flag allow to hide outdated versions.
Affects only if `name` is None.
+ name_only : bool, optional
+ If True, return only the names of available models and corpora (see the last example below).
Returns
-------
@@ -179,15 +186,17 @@ def info(name=None, show_only_latest=True):
Examples
--------
- >>> import gensim.downloader as api
- >>> api.info("text8") # retrieve information about text8 dataset
- {u'checksum': u'68799af40b6bda07dfa47a32612e5364',
- u'description': u'Cleaned small sample from wikipedia',
- u'file_name': u'text8.gz',
- u'parts': 1,
- u'source': u'http://mattmahoney.net/dc/text8.zip'}
- >>>
- >>> api.info() # retrieve information about all available datasets and models
+ .. sourcecode:: pycon
+
+ >>> import gensim.downloader as api
+ >>> api.info("text8") # retrieve information about text8 dataset
+ {u'checksum': u'68799af40b6bda07dfa47a32612e5364',
+ u'description': u'Cleaned small sample from wikipedia',
+ u'file_name': u'text8.gz',
+ u'parts': 1,
+ u'source': u'http://mattmahoney.net/dc/text8.zip'}
+ >>>
+ >>> api.info() # retrieve information about all available datasets and models
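+ >>>
+ >>> # the new `name_only` flag returns just the names, keyed by "corpora" and "models"
+ >>> api.info(name_only=True)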
"""
information = json.loads(urlopen(DATA_LIST_URL).read().decode("utf-8"))
@@ -205,6 +214,9 @@ def info(name=None, show_only_latest=True):
if not show_only_latest:
return information
+ if name_only:
+ return {"corpora": list(information['corpora'].keys()), "models": list(information['models'])}
+
return {
"corpora": {name: data for (name, data) in information['corpora'].items() if data.get("latest", True)},
"models": {name: data for (name, data) in information['models'].items() if data.get("latest", True)}
@@ -382,23 +394,30 @@ def load(name, return_path=False):
--------
Model example:
- >>> import gensim.downloader as api
- >>>
- >>> model = api.load("glove-twitter-25") # load glove vectors
- >>> model.most_similar("cat") # show words that similar to word 'cat'
+ .. sourcecode:: pycon
+
+ >>> import gensim.downloader as api
+ >>>
+ >>> model = api.load("glove-twitter-25") # load glove vectors
+ >>> model.most_similar("cat") # show words similar to 'cat'
Dataset example:
- >>> import gensim.downloader as api
- >>>
- >>> wiki = api.load("wiki-en") # load extracted Wikipedia dump, around 6 Gb
- >>> for article in wiki: # iterate over all wiki script
- >>> ...
+ .. sourcecode:: pycon
- Download only example
- >>> import gensim.downloader as api
- >>>
- >>> print(api.load("wiki-en", return_path=True)) # output: /home/user/gensim-data/wiki-en/wiki-en.gz
+ >>> import gensim.downloader as api
+ >>>
+ >>> wiki = api.load("wiki-en") # load extracted Wikipedia dump, around 6 Gb
+ >>> for article in wiki: # iterate over all wiki articles
+ ... pass
+
+ Download only example:
+
+ .. sourcecode:: pycon
+
+ >>> import gensim.downloader as api
+ >>>
+ >>> print(api.load("wiki-en", return_path=True)) # output: /home/user/gensim-data/wiki-en/wiki-en.gz
"""
_create_base_dir()
@@ -424,7 +443,7 @@ def load(name, return_path=False):
)
parser = argparse.ArgumentParser(
description="Gensim console API",
- usage="python -m gensim.api.downloader [-h] [-d data_name | -i data_name | -c]"
+ usage="python -m gensim.downloader [-h] [-d data_name | -i data_name]"
)
group = parser.add_mutually_exclusive_group()
@@ -444,5 +463,8 @@ def load(name, return_path=False):
data_path = load(args.download[0], return_path=True)
logger.info("Data has been installed and data path is %s", data_path)
elif args.info is not None:
- output = info() if (args.info == full_information) else info(name=args.info)
- print(json.dumps(output, indent=4))
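+ # the literal value "name" (i.e. `--info name` on the CLI) requests the names-only listing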
+ if args.info == 'name':
+ print(json.dumps(info(name_only=True), indent=4))
+ else:
+ output = info() if (args.info == full_information) else info(name=args.info)
+ print(json.dumps(output, indent=4))
diff --git a/gensim/interfaces.py b/gensim/interfaces.py
index 327dc9c960..3fd266eb62 100644
--- a/gensim/interfaces.py
+++ b/gensim/interfaces.py
@@ -19,7 +19,7 @@
import logging
from gensim import utils, matutils
-from six.moves import xrange
+from six.moves import range
logger = logging.getLogger(__name__)
@@ -30,38 +30,44 @@ class CorpusABC(utils.SaveLoad):
Corpus is simply an iterable object, where each iteration step yields one document:
- >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class
- >>> from gensim.test.utils import datapath
- >>>
- >>> corpus = MmCorpus(datapath("testcorpus.mm"))
- >>> for doc in corpus:
- ... pass # do something with the doc...
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import MmCorpus # this is an inheritor of the CorpusABC class
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> corpus = MmCorpus(datapath("testcorpus.mm"))
+ >>> for doc in corpus:
+ ... pass # do something with the doc...
A document represented in bag-of-word (BoW) format, i.e. list of (attr_id, attr_value),
like ``[(1, 0.2), (4, 0.6), ...]``.
- >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class
- >>> from gensim.test.utils import datapath
- >>>
- >>> corpus = MmCorpus(datapath("testcorpus.mm"))
- >>> doc = next(iter(corpus))
- >>> print(doc)
- [(0, 1.0), (1, 1.0), (2, 1.0)]
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import MmCorpus # this is an inheritor of the CorpusABC class
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> corpus = MmCorpus(datapath("testcorpus.mm"))
+ >>> doc = next(iter(corpus))
+ >>> print(doc)
+ [(0, 1.0), (1, 1.0), (2, 1.0)]
Remember, that save/load methods save only corpus class (not corpus as data itself),
for save/load functionality, please use this pattern :
- >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class
- >>> from gensim.test.utils import datapath, get_tmpfile
- >>>
- >>> corpus = MmCorpus(datapath("testcorpus.mm"))
- >>> tmp_path = get_tmpfile("temp_corpus.mm")
- >>>
- >>> MmCorpus.serialize(tmp_path, corpus) # serialize corpus to disk in MmCorpus format
- >>> # MmCorpus.save_corpus(tmp_path, corpus) # this variant also possible, but if serialize availbe - call it.
- >>> loaded_corpus = MmCorpus(tmp_path) # load corpus through constructor
- >>> for (doc_1, doc_2) in zip(corpus, loaded_corpus):
- ... assert doc_1 == doc_2 # check that corpuses exactly same
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import MmCorpus # this is an inheritor of the CorpusABC class
+ >>> from gensim.test.utils import datapath, get_tmpfile
+ >>>
+ >>> corpus = MmCorpus(datapath("testcorpus.mm"))
+ >>> tmp_path = get_tmpfile("temp_corpus.mm")
+ >>>
+ >>> MmCorpus.serialize(tmp_path, corpus) # serialize corpus to disk in MmCorpus format
+ >>> # MmCorpus.save_corpus(tmp_path, corpus) # this variant is also possible, but prefer serialize() when it's available
+ >>> loaded_corpus = MmCorpus(tmp_path) # load corpus through constructor
+ >>> for (doc_1, doc_2) in zip(corpus, loaded_corpus):
+ ... assert doc_1 == doc_2 # check that the corpora are exactly the same
See Also
@@ -209,12 +215,14 @@ class TransformationABC(utils.SaveLoad):
A 'transformation' is any object which accepts document in BoW format via the `__getitem__` (notation `[]`)
and returns another sparse document in its stead:
- >>> from gensim.models import LsiModel
- >>> from gensim.test.utils import common_dictionary, common_corpus
- >>>
- >>> model = LsiModel(common_corpus, id2word=common_dictionary)
- >>> bow_vector = model[common_corpus[0]] # model applied through __getitem__ on one document from corpus.
- >>> bow_corpus = model[common_corpus] # also, we can apply model on the full corpus
+ .. sourcecode:: pycon
+
+ >>> from gensim.models import LsiModel
+ >>> from gensim.test.utils import common_dictionary, common_corpus
+ >>>
+ >>> model = LsiModel(common_corpus, id2word=common_dictionary)
+ >>> bow_vector = model[common_corpus[0]] # model applied through __getitem__ on one document from corpus.
+ >>> bow_corpus = model[common_corpus] # we can also apply the model to the full corpus
"""
def __getitem__(self, vec):
@@ -256,11 +264,13 @@ class SimilarityABC(utils.SaveLoad):
Examples
--------
- >>> from gensim.similarities import MatrixSimilarity
- >>> from gensim.test.utils import common_dictionary, common_corpus
- >>>
- >>> index = MatrixSimilarity(common_corpus)
- >>> similarities = index.get_similarities(common_corpus[1]) # get similarities between query and corpus
+ .. sourcecode:: pycon
+
+ >>> from gensim.similarities import MatrixSimilarity
+ >>> from gensim.test.utils import common_corpus
+ >>>
+ >>> index = MatrixSimilarity(common_corpus)
+ >>> similarities = index.get_similarities(common_corpus[1]) # get similarities between query and corpus
Notes
-----
@@ -375,7 +385,7 @@ def __iter__(self):
# assumes `self.corpus` holds the index as a 2-d numpy array.
# this is true for MatrixSimilarity and SparseMatrixSimilarity, but
# may not be true for other (future) classes..?
- for chunk_start in xrange(0, self.index.shape[0], self.chunksize):
+ for chunk_start in range(0, self.index.shape[0], self.chunksize):
# scipy.sparse doesn't allow slicing beyond real size of the matrix
# (unlike numpy). so, clip the end of the chunk explicitly to make
# scipy.sparse happy
diff --git a/gensim/matutils.py b/gensim/matutils.py
index 5b01426cce..979b99f6d5 100644
--- a/gensim/matutils.py
+++ b/gensim/matutils.py
@@ -25,7 +25,7 @@
from scipy.special import psi # gamma function utils
from six import iteritems, itervalues, string_types
-from six.moves import xrange, zip as izip
+from six.moves import zip, range
logger = logging.getLogger(__name__)
@@ -153,8 +153,8 @@ def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=
for docno, doc in enumerate(corpus):
if printprogress and docno % printprogress == 0:
logger.info("PROGRESS: at document #%i", docno)
- indices.extend([feature_id for feature_id, _ in doc])
- data.extend([feature_weight for _, feature_weight in doc])
+ indices.extend(feature_id for feature_id, _ in doc)
+ data.extend(feature_weight for _, feature_weight in doc)
num_nnz += len(doc)
indptr.append(num_nnz)
if num_terms is None:
@@ -587,7 +587,7 @@ def __iter__(self):
Document in BoW format.
"""
- for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]):
+ for indprev, indnow in zip(self.sparse.indptr, self.sparse.indptr[1:]):
yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))
def __len__(self):
@@ -716,7 +716,7 @@ def unitvec(vec, norm='l2', return_norm=False):
if norm == 'l2':
veclen = np.sqrt(np.sum(vec.data ** 2))
if veclen > 0.0:
- if np.issubdtype(vec.dtype, np.int):
+ if np.issubdtype(vec.dtype, np.integer):
vec = vec.astype(np.float)
vec /= veclen
if return_norm:
@@ -735,7 +735,7 @@ def unitvec(vec, norm='l2', return_norm=False):
if norm == 'l2':
veclen = blas_nrm2(vec)
if veclen > 0.0:
- if np.issubdtype(vec.dtype, np.int):
+ if np.issubdtype(vec.dtype, np.integer):
vec = vec.astype(np.float)
if return_norm:
return blas_scal(1.0 / veclen, vec).astype(vec.dtype), veclen
@@ -856,8 +856,8 @@ def softcossim(vec1, vec2, similarity_matrix):
vec2 = dict(vec2)
word_indices = sorted(set(chain(vec1, vec2)))
dtype = similarity_matrix.dtype
- vec1 = np.array([vec1[i] if i in vec1 else 0 for i in word_indices], dtype=dtype)
- vec2 = np.array([vec2[i] if i in vec2 else 0 for i in word_indices], dtype=dtype)
+ vec1 = np.fromiter((vec1[i] if i in vec1 else 0 for i in word_indices), dtype=dtype, count=len(word_indices))
+ vec2 = np.fromiter((vec2[i] if i in vec2 else 0 for i in word_indices), dtype=dtype, count=len(word_indices))
dense_matrix = similarity_matrix[[[i] for i in word_indices], word_indices].todense()
vec1len = vec1.T.dot(dense_matrix).dot(vec1)[0, 0]
vec2len = vec2.T.dot(dense_matrix).dot(vec2)[0, 0]
@@ -1522,7 +1522,7 @@ def __iter__(self):
# return implicit (empty) documents between previous id and new id
# too, to keep consistent document numbering and corpus length
- for previd in xrange(previd + 1, docid):
+ for previd in range(previd + 1, docid):
yield previd, []
# from now on start adding fields to a new document, with a new id
@@ -1537,7 +1537,7 @@ def __iter__(self):
# return empty documents between the last explicit document and the number
# of documents as specified in the header
- for previd in xrange(previd + 1, self.num_docs):
+ for previd in range(previd + 1, self.num_docs):
yield previd, []
def docbyoffset(self, offset):
diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py
index b1ef3ff367..a0ee690550 100644
--- a/gensim/models/__init__.py
+++ b/gensim/models/__init__.py
@@ -37,12 +37,14 @@ class VocabTransform(interfaces.TransformationABC):
`VocabTransform[corpus]` returns the same vectors but with the new ids.
Old features that have no counterpart in the new ids are discarded. This
- can be used to filter vocabulary of a corpus "online"::
+ can be used to filter vocabulary of a corpus "online":
- >>> old2new = {oldid: newid for newid, oldid in enumerate(ids_you_want_to_keep)}
- >>> vt = VocabTransform(old2new)
- >>> for vec_with_new_ids in vt[corpus_with_old_ids]:
- >>> ...
+ .. sourcecode:: pycon
+
+ >>> old2new = {oldid: newid for newid, oldid in enumerate(ids_you_want_to_keep)}
+ >>> vt = VocabTransform(old2new)
+ >>> for vec_with_new_ids in vt[corpus_with_old_ids]:
+ ... pass
"""
diff --git a/gensim/models/_utils_any2vec.c b/gensim/models/_utils_any2vec.c
index 2fd18bbbc7..dfaf7e8125 100644
--- a/gensim/models/_utils_any2vec.c
+++ b/gensim/models/_utils_any2vec.c
@@ -1,4 +1,4 @@
-/* Generated by Cython 0.28.3 */
+/* Generated by Cython 0.28.4 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
@@ -7,7 +7,7 @@
#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000)
#error Cython requires Python 2.6+ or Python 3.3+.
#else
-#define CYTHON_ABI "0_28_3"
+#define CYTHON_ABI "0_28_4"
#define CYTHON_FUTURE_DIVISION 0
#include <stddef.h>
#ifndef offsetof
@@ -3405,14 +3405,42 @@ static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err,
return res;
}
#endif
+static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) {
+ Py_ssize_t i, n;
+ assert(PyExceptionClass_Check(exc_type));
+ n = PyTuple_GET_SIZE(tuple);
+#if PY_MAJOR_VERSION >= 3
+ for (i=0; i<n; i++) {
->>> from gensim.models import AuthorTopicModel
->>> from gensim.corpora import mmcorpus
->>> from gensim.test.utils import common_dictionary, datapath, temporary_file
-
->>> author2doc = {
-... 'john': [0, 1, 2, 3, 4, 5, 6],
-... 'jane': [2, 3, 4, 5, 6, 7, 8],
-... 'jack': [0, 2, 4, 6, 8]
-... }
->>>
->>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
->>>
->>> with temporary_file("serialized") as s_path:
-... model = AuthorTopicModel(
-... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
-... serialized=True, serialization_path=s_path
-... )
-...
-... model.update(corpus, author2doc) # update the author-topic model with additional documents
->>>
->>> # construct vectors for authors
->>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
+
+.. sourcecode:: pycon
+
+ >>> from gensim.models import AuthorTopicModel
+ >>> from gensim.corpora import mmcorpus
+ >>> from gensim.test.utils import common_dictionary, datapath, temporary_file
+
+ >>> author2doc = {
+ ... 'john': [0, 1, 2, 3, 4, 5, 6],
+ ... 'jane': [2, 3, 4, 5, 6, 7, 8],
+ ... 'jack': [0, 2, 4, 6, 8]
+ ... }
+ >>>
+ >>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
+ >>>
+ >>> with temporary_file("serialized") as s_path:
+ ... model = AuthorTopicModel(
+ ... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
+ ... serialized=True, serialization_path=s_path
+ ... )
+ ...
+ ... model.update(corpus, author2doc) # update the author-topic model with additional documents
+ >>>
+ >>> # construct vectors for authors
+ >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
"""
# TODO: this class inherits LdaModel and overwrites some methods. There is some code
@@ -64,11 +67,11 @@
from gensim import utils
from gensim.models import LdaModel
from gensim.models.ldamodel import LdaState
-from gensim.matutils import dirichlet_expectation
+from gensim.matutils import dirichlet_expectation, mean_absolute_difference
from gensim.corpora import MmCorpus
from itertools import chain
from scipy.special import gammaln # gamma function utils
-from six.moves import xrange
+from six.moves import range
import six
logger = logging.getLogger(__name__)
@@ -373,14 +376,14 @@ def extend_corpus(self, corpus):
self.corpus.extend(corpus)
def compute_phinorm(self, expElogthetad, expElogbetad):
- """Efficiently computes the normalizing factor in phi.
+ r"""Efficiently computes the normalizing factor in phi.
Parameters
----------
expElogthetad: numpy.ndarray
Value of variational distribution :math:`q(\theta|\gamma)`.
expElogbetad: numpy.ndarray
- Value of variational distribution :math:`q(\\beta|\lambda)`.
+ Value of variational distribution :math:`q(\beta|\lambda)`.
Returns
-------
@@ -461,10 +464,11 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
ids = [int(idx) for idx, _ in doc]
else:
ids = [idx for idx, _ in doc]
- cts = np.array([cnt for _, cnt in doc])
+ ids = np.array(ids, dtype=np.int)
+ cts = np.fromiter((cnt for _, cnt in doc), dtype=np.int, count=len(doc))
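+ # np.fromiter with an explicit count writes straight into a preallocated array (no intermediate Python list)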
# Get all authors in current document, and convert the author names to integer IDs.
- authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
+ authors_d = np.fromiter((self.author2id[a] for a in self.doc2author[doc_no]), dtype=np.int)
gammad = self.state.gamma[authors_d, :] # gamma of document d before update.
tilde_gamma = gammad.copy() # gamma that will be updated.
@@ -478,7 +482,7 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
phinorm = self.compute_phinorm(expElogthetad, expElogbetad)
# Iterate between gamma and phi until convergence
- for _ in xrange(self.iterations):
+ for _ in range(self.iterations):
lastgamma = tilde_gamma.copy()
# Update gamma.
@@ -501,7 +505,7 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
# Check for convergence.
# Criterion is mean change in "local" gamma.
- meanchange_gamma = np.mean(abs(tilde_gamma - lastgamma))
+ meanchange_gamma = mean_absolute_difference(tilde_gamma.ravel(), lastgamma.ravel())
gamma_condition = meanchange_gamma < self.gamma_threshold
if gamma_condition:
converged += 1
@@ -695,7 +699,7 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None,
# Just keep training on the already available data.
# Assumes self.update() has been called before with input documents and corresponding authors.
assert self.total_docs > 0, 'update() was called with no documents to train on.'
- train_corpus_idx = [d for d in xrange(self.total_docs)]
+ train_corpus_idx = [d for d in range(self.total_docs)]
num_input_authors = len(self.author2doc)
else:
if doc2author is None and author2doc is None:
@@ -812,7 +816,7 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None,
def rho():
return pow(offset + pass_ + (self.num_updates / chunksize), -decay)
- for pass_ in xrange(passes):
+ for pass_ in range(passes):
if self.dispatcher:
logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
@@ -884,7 +888,7 @@ def rho():
del other
def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
- """Estimate the variational bound of documents from `corpus`.
+ r"""Estimate the variational bound of documents from `corpus`.
:math:`\mathbb{E_{q}}[\log p(corpus)] - \mathbb{E_{q}}[\log q(corpus)]`
@@ -906,7 +910,7 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None,
Assigns the value for document index.
subsample_ratio : float, optional
Used for calculation of word score for estimation of variational bound.
- author2doc : dict of (str, list of int), optinal
+ author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of documents that the author
contributes to.
doc2author : dict of (int, list of str), optional
@@ -972,9 +976,9 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None,
else:
doc_no = d
# Get all authors in current document, and convert the author names to integer IDs.
- authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
- ids = np.array([id for id, _ in doc]) # Word IDs in doc.
- cts = np.array([cnt for _, cnt in doc]) # Word counts.
+ authors_d = np.fromiter((self.author2id[a] for a in self.doc2author[doc_no]), dtype=np.int)
+ ids = np.fromiter((id for id, _ in doc), dtype=np.int, count=len(doc)) # Word IDs in doc.
+ cts = np.fromiter((cnt for _, cnt in doc), dtype=np.int, count=len(doc)) # Word counts.
if d % self.chunksize == 0:
logger.debug("bound: at document #%i in chunk", d)
@@ -1090,7 +1094,7 @@ def rollback_new_author_chages():
gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics))
self.state.gamma = np.vstack([self.state.gamma, gamma_new])
- # Should not record the sstats, as we are goint to delete the new author after calculated.
+ # Should not record the sstats, as we are going to delete the new author after calculated.
try:
gammat, _ = self.inference(
corpus, self.author2doc, self.doc2author, rho(),
@@ -1119,28 +1123,30 @@ def get_author_topics(self, author_name, minimum_probability=None):
Example
-------
- >>> from gensim.models import AuthorTopicModel
- >>> from gensim.corpora import mmcorpus
- >>> from gensim.test.utils import common_dictionary, datapath, temporary_file
-
- >>> author2doc = {
- ... 'john': [0, 1, 2, 3, 4, 5, 6],
- ... 'jane': [2, 3, 4, 5, 6, 7, 8],
- ... 'jack': [0, 2, 4, 6, 8]
- ... }
- >>>
- >>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
- >>>
- >>> with temporary_file("serialized") as s_path:
- ... model = AuthorTopicModel(
- ... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
- ... serialized=True, serialization_path=s_path
- ... )
- ...
- ... model.update(corpus, author2doc) # update the author-topic model with additional documents
- >>>
- >>> # construct vectors for authors
- >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
+ .. sourcecode:: pycon
+
+ >>> from gensim.models import AuthorTopicModel
+ >>> from gensim.corpora import mmcorpus
+ >>> from gensim.test.utils import common_dictionary, datapath, temporary_file
+
+ >>> author2doc = {
+ ... 'john': [0, 1, 2, 3, 4, 5, 6],
+ ... 'jane': [2, 3, 4, 5, 6, 7, 8],
+ ... 'jack': [0, 2, 4, 6, 8]
+ ... }
+ >>>
+ >>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
+ >>>
+ >>> with temporary_file("serialized") as s_path:
+ ... model = AuthorTopicModel(
+ ... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
+ ... serialized=True, serialization_path=s_path
+ ... )
+ ...
+ ... model.update(corpus, author2doc) # update the author-topic model with additional documents
+ >>>
+ >>> # construct vectors for authors
+ >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
"""
author_id = self.author2id[author_name]
diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index 8f7123e1d7..48461b23e4 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -5,7 +5,7 @@
# Copyright (C) 2018 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-"""This module contains base classes required for implementing \*2vec algorithms.
+r"""This module contains base classes required for implementing \*2vec algorithms.
The class hierarchy is designed to facilitate adding more concrete implementations for creating embeddings.
In the most general case, the purpose of this class is to transform an arbitrary representation to a numerical vector
@@ -36,14 +36,16 @@
import logging
from timeit import default_timer
import threading
-from six.moves import xrange
-from six import itervalues
+from six.moves import range
+from six import itervalues, string_types
from gensim import matutils
from numpy import float32 as REAL, ones, random, dtype, zeros
from types import GeneratorType
from gensim.utils import deprecated
import warnings
-import itertools
+import os
+import copy
+
try:
from queue import Queue
@@ -54,7 +56,7 @@
class BaseAny2VecModel(utils.SaveLoad):
- """Base class for training, using and evaluating \*2vec model.
+ r"""Base class for training, using and evaluating \*2vec model.
Contains implementation for multi-threaded training. The purpose of this class is to provide a
reference interface for concrete embedding implementations, whether the input space is a corpus
@@ -123,6 +125,10 @@ def _clear_post_train(self):
"""Resets certain properties of the model post training. eg. `keyedvectors.vectors_norm`."""
raise NotImplementedError()
+ def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch,
+ total_examples=None, total_words=None, **kwargs):
+ raise NotImplementedError()
+
def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
"""Train a single batch. Return 2-tuple `(effective word count, total word count)`."""
raise NotImplementedError()
@@ -131,10 +137,45 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N
"""Check that the training parameters provided make sense. e.g. raise error if `epochs` not provided."""
raise NotImplementedError()
- def _check_input_data_sanity(self, data_iterable=None, data_iterables=None):
- """Check that only one argument is not None."""
- if not ((data_iterable is not None) ^ (data_iterables is not None)):
- raise ValueError("You must provide only one of singlestream or multistream arguments.")
+ def _check_input_data_sanity(self, data_iterable=None, corpus_file=None):
+ """Check that only one argument is None."""
+ if not (data_iterable is None) ^ (corpus_file is None):
+ raise ValueError("You must provide only one of singlestream or corpus_file arguments.")
+
+ def _worker_loop_corpusfile(self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0,
+ total_examples=None, total_words=None, **kwargs):
+ """Train the model on a `corpus_file` in LineSentence format.
+
+ This function will be called in parallel by multiple workers (threads or processes) to make
+ optimal use of multicore machines.
+
+ Parameters
+ ----------
+ corpus_file : str
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ thread_id : int
+ Thread index starting from 0 to `number of workers - 1`.
+ offset : int
+ Offset (in bytes) in the `corpus_file` for particular worker.
+ cython_vocab : :class:`~gensim.models.word2vec_inner.CythonVocab`
+ Copy of the vocabulary in order to access it without GIL.
+ progress_queue : Queue of (int, int, int)
+ A queue of progress reports. Each report is represented as a tuple of these 3 elements:
+ * Size of data chunk processed, for example number of sentences in the corpus chunk.
+ * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
+ * Total word count used in training.
+ **kwargs : object
+ Additional keyword parameters for the specific model inheriting from this class.
+
+ """
+ thread_private_mem = self._get_thread_working_mem()
+
+ examples, tally, raw_tally = self._do_train_epoch(
+ corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch,
+ total_examples=total_examples, total_words=total_words, **kwargs)
+
+ progress_queue.put((examples, tally, raw_tally))
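+ # a trailing None tells the epoch progress logger that this worker has finished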
+ progress_queue.put(None)
def _worker_loop(self, job_queue, progress_queue):
"""Train the model, lifting batches of data from the queue.
@@ -243,7 +284,7 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No
)
# give the workers heads up that they can finish -- no more work!
- for _ in xrange(self.workers):
+ for _ in range(self.workers):
job_queue.put(None)
logger.debug("job loop exiting, total %i jobs", job_no)
@@ -252,14 +293,14 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot
raise NotImplementedError()
def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
- trained_word_count, elapsed):
+ trained_word_count, elapsed, is_corpus_file_mode):
raise NotImplementedError()
def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally):
raise NotImplementedError()
- def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None,
- report_delay=1.0):
+ def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None,
+ total_words=None, report_delay=1.0, is_corpus_file_mode=None):
"""Get the progress report for a single training epoch.
Parameters
@@ -284,6 +325,8 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
words in a corpus. Used to log progress.
report_delay : float, optional
Number of seconds between two consecutive progress report messages in the logger.
+ is_corpus_file_mode : bool, optional
+ Whether training is file-based (corpus_file argument) or not.
Returns
-------
@@ -324,20 +367,81 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
elapsed = default_timer() - start
self._log_epoch_end(
cur_epoch, example_count, total_examples, raw_word_count, total_words,
- trained_word_count, elapsed)
+ trained_word_count, elapsed, is_corpus_file_mode)
self.total_train_time += elapsed
return trained_word_count, raw_word_count, job_tally
- def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, total_examples=None,
- total_words=None, queue_factor=2, report_delay=1.0):
+ def _train_epoch_corpusfile(self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, **kwargs):
+ """Train the model for a single epoch.
+
+ Parameters
+ ----------
+ corpus_file : str
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ cur_epoch : int, optional
+ The current training epoch, needed to compute the training parameters for each job.
+ For example in many implementations the learning rate would be dropping with the number of epochs.
+ total_examples : int, optional
+ Count of objects in the input corpus. In the usual case this would correspond to the number of sentences
+ in the corpus, used to log progress.
+ total_words : int
+ Count of raw words in the input corpus, used to log progress. Must be provided in order to seek
+ in `corpus_file`.
+ **kwargs : object
+ Additional keyword parameters for the specific model inheriting from this class.
+
+ Returns
+ -------
+ (int, int, int)
+ The training report for this epoch consisting of three elements:
+ * Size of data chunk processed, for example number of sentences in the corpus chunk.
+ * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
+ * Total word count used in training.
+
+ """
+ if not total_words:
+ raise ValueError("total_words must be provided alongside corpus_file argument.")
+
+ from gensim.models.word2vec_corpusfile import CythonVocab
+ from gensim.models.fasttext import FastText
+ cython_vocab = CythonVocab(self.wv, hs=self.hs, fasttext=isinstance(self, FastText))
+
+ progress_queue = Queue()
+
+ corpus_file_size = os.path.getsize(corpus_file)
+
+ thread_kwargs = copy.copy(kwargs)
+ thread_kwargs['cur_epoch'] = cur_epoch
+ thread_kwargs['total_examples'] = total_examples
+ thread_kwargs['total_words'] = total_words
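+ # one thread per worker; each starts reading corpus_file at its own byte offset (an equal share per thread)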
+ workers = [
+ threading.Thread(
+ target=self._worker_loop_corpusfile,
+ args=(
+ corpus_file, thread_id, corpus_file_size / self.workers * thread_id, cython_vocab, progress_queue
+ ),
+ kwargs=thread_kwargs
+ ) for thread_id in range(self.workers)
+ ]
+
+ for thread in workers:
+ thread.daemon = True
+ thread.start()
+
+ trained_word_count, raw_word_count, job_tally = self._log_epoch_progress(
+ progress_queue=progress_queue, job_queue=None, cur_epoch=cur_epoch,
+ total_examples=total_examples, total_words=total_words, is_corpus_file_mode=True)
+
+ return trained_word_count, raw_word_count, job_tally
+
+ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None,
+ queue_factor=2, report_delay=1.0):
"""Train the model for a single epoch.
Parameters
----------
data_iterable : iterable of list of object
The input corpus. This will be split in chunks and these chunks will be pushed to the queue.
- data_iterables : iterable of iterables of list of object
- The iterable of input streams like `data_iterable`. Use this parameter in multistream mode.
cur_epoch : int, optional
The current training epoch, needed to compute the training parameters for each job.
For example in many implementations the learning rate would be dropping with the number of epochs.
@@ -361,7 +465,6 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
* Total word count used in training.
"""
- self._check_input_data_sanity(data_iterable, data_iterables)
job_queue = Queue(maxsize=queue_factor * self.workers)
progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
@@ -369,12 +472,9 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
threading.Thread(
target=self._worker_loop,
args=(job_queue, progress_queue,))
- for _ in xrange(self.workers)
+ for _ in range(self.workers)
]
- # Chain all input streams into one, because multistream training is not supported yet.
- if data_iterables is not None:
- data_iterable = itertools.chain(*data_iterables)
workers.append(threading.Thread(
target=self._job_producer,
args=(data_iterable, job_queue),
@@ -386,11 +486,11 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
trained_word_count, raw_word_count, job_tally = self._log_epoch_progress(
progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
- report_delay=report_delay)
+ report_delay=report_delay, is_corpus_file_mode=False)
return trained_word_count, raw_word_count, job_tally
- def train(self, data_iterable=None, data_iterables=None, epochs=None, total_examples=None,
+ def train(self, data_iterable=None, corpus_file=None, epochs=None, total_examples=None,
total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
"""Train the model for multiple epochs using multiple workers.
@@ -398,6 +498,9 @@ def train(self, data_iterable=None, data_iterables=None, epochs=None, total_exam
----------
data_iterable : iterable of list of object
The input corpus. This will be split in chunks and these chunks will be pushed to the queue.
+ corpus_file : str, optional
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ If you use this argument instead of `data_iterable`, you must also provide the `total_words` argument.
epochs : int, optional
Number of epochs (training iterations over the whole input) of training.
total_examples : int, optional
@@ -444,10 +547,14 @@ def train(self, data_iterable=None, data_iterables=None, epochs=None, total_exam
for callback in self.callbacks:
callback.on_epoch_begin(self)
- trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
- data_iterable=data_iterable, data_iterables=data_iterables, cur_epoch=cur_epoch,
- total_examples=total_examples, total_words=total_words, queue_factor=queue_factor,
- report_delay=report_delay)
+ if data_iterable is not None:
+ trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
+ data_iterable, cur_epoch=cur_epoch, total_examples=total_examples,
+ total_words=total_words, queue_factor=queue_factor, report_delay=report_delay)
+ else:
+ trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch_corpusfile(
+ corpus_file, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, **kwargs)
+
trained_word_count += trained_word_count_epoch
raw_word_count += raw_word_count_epoch
job_tally += job_tally_epoch
@@ -538,7 +645,7 @@ def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
def _set_train_params(self, **kwargs):
raise NotImplementedError()
- def __init__(self, sentences=None, input_streams=None, workers=3, vector_size=100, epochs=5, callbacks=(),
+ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100, epochs=5, callbacks=(),
batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5,
ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
"""
@@ -550,6 +657,10 @@ def __init__(self, sentences=None, input_streams=None, workers=3, vector_size=10
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` for such examples.
+ corpus_file : str, optional
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ You may use this argument instead of `sentences` to get a performance boost. Only one of the `sentences`
+ or `corpus_file` arguments needs to be passed (or none of them).
workers : int, optional
Number of working threads, used for multiprocessing.
vector_size : int, optional
@@ -622,6 +733,7 @@ def __init__(self, sentences=None, input_streams=None, workers=3, vector_size=10
self.running_training_loss = 0
self.min_alpha_yet_reached = float(alpha)
self.corpus_count = 0
+ self.corpus_total_words = 0
super(BaseWordEmbeddingsModel, self).__init__(
workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words)
@@ -637,20 +749,18 @@ def __init__(self, sentences=None, input_streams=None, workers=3, vector_size=10
self.neg_labels = zeros(self.negative + 1)
self.neg_labels[0] = 1.
- if sentences is not None or input_streams is not None:
- self._check_input_data_sanity(data_iterable=sentences, data_iterables=input_streams)
- if input_streams is not None:
- if not isinstance(input_streams, (tuple, list)):
- raise TypeError("You must pass tuple or list as the input_streams argument.")
- if any(isinstance(stream, GeneratorType) for stream in input_streams):
- raise TypeError("You can't pass a generator as any of input streams. Try an iterator.")
+ if sentences is not None or corpus_file is not None:
+ self._check_input_data_sanity(data_iterable=sentences, corpus_file=corpus_file)
+ if corpus_file is not None and not isinstance(corpus_file, string_types):
+ raise TypeError("You must pass string as the corpus_file argument.")
elif isinstance(sentences, GeneratorType):
raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
- self.build_vocab(sentences=sentences, input_streams=input_streams, trim_rule=trim_rule)
+ self.build_vocab(sentences=sentences, corpus_file=corpus_file, trim_rule=trim_rule)
self.train(
- sentences=sentences, input_streams=input_streams, total_examples=self.corpus_count, epochs=self.epochs,
- start_alpha=self.alpha, end_alpha=self.min_alpha, compute_loss=compute_loss)
+ sentences=sentences, corpus_file=corpus_file, total_examples=self.corpus_count,
+ total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha,
+ end_alpha=self.min_alpha, compute_loss=compute_loss)
else:
if trim_rule is not None:
logger.warning(
@@ -783,7 +893,7 @@ def __str__(self):
self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha
)
- def build_vocab(self, sentences=None, input_streams=None, workers=None, update=False, progress_per=10000,
+ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000,
keep_raw_vocab=False, trim_rule=None, **kwargs):
"""Build vocabulary from a sequence of sentences (can be a once-only generator stream).
@@ -794,12 +904,10 @@ def build_vocab(self, sentences=None, input_streams=None, workers=None, update=F
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` module for such examples.
- input_streams : list or tuple of iterable of iterables
- The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible
- to process streams in parallel, using `workers` parameter.
- workers : int
- Used if `input_streams` is passed. Determines how many processes to use for vocab building.
- Actual number of workers is determined by `min(len(input_streams), workers)`.
+ corpus_file : str, optional
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ You may use this argument instead of `sentences` to get a performance boost. Only one of the `sentences`
+ or `corpus_file` arguments needs to be passed (not both of them).
update : bool
If true, the new words in `sentences` will be added to model's vocab.
progress_per : int, optional
@@ -824,11 +932,10 @@ def build_vocab(self, sentences=None, input_streams=None, workers=None, update=F
Key word arguments propagated to `self.vocabulary.prepare_vocab`
"""
- workers = workers or self.workers
total_words, corpus_count = self.vocabulary.scan_vocab(
- sentences=sentences, input_streams=input_streams, progress_per=progress_per, trim_rule=trim_rule,
- workers=workers)
+ sentences=sentences, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule)
self.corpus_count = corpus_count
+ self.corpus_total_words = total_words
report_values = self.vocabulary.prepare_vocab(
self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab,
trim_rule=trim_rule, **kwargs)
@@ -916,9 +1023,9 @@ def estimate_memory(self, vocab_size=None, report=None):
)
return report
- def train(self, sentences=None, input_streams=None, total_examples=None, total_words=None,
+ def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None,
epochs=None, start_alpha=None, end_alpha=None, word_count=0,
- queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()):
+ queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), **kwargs):
"""Train the model. If the hyper-parameters are passed, they override the ones set in the constructor.
Parameters
@@ -928,6 +1035,10 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` module for such examples.
+ corpus_file : str, optional
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ You may use this argument instead of `sentences` to get a performance boost. Only one of the `sentences`
+ or `corpus_file` arguments needs to be passed (not both of them).
total_examples : int, optional
Count of sentences.
total_words : int, optional
@@ -949,6 +1060,8 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w
:attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`.
callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks that need to be executed/run at specific stages during training.
+ **kwargs : object
+ Additional key word parameters for the specific model inheriting from this class.
Returns
-------
@@ -962,9 +1075,10 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w
self.compute_loss = compute_loss
self.running_training_loss = 0.0
return super(BaseWordEmbeddingsModel, self).train(
- data_iterable=sentences, data_iterables=input_streams, total_examples=total_examples,
+ data_iterable=sentences, corpus_file=corpus_file, total_examples=total_examples,
total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
- queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)
+ queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks,
+ **kwargs)
def _get_job_params(self, cur_epoch):
"""Get the learning rate used in the current epoch.
@@ -1136,6 +1250,8 @@ def load(cls, *args, **kwargs):
model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary
if not hasattr(model, 'corpus_count'):
model.corpus_count = None
+ if not hasattr(model, 'corpus_total_words'):
+ model.corpus_total_words = None
if not hasattr(model.trainables, 'vectors_lockf') and hasattr(model.wv, 'vectors'):
model.trainables.vectors_lockf = ones(len(model.wv.vectors), dtype=REAL)
if not hasattr(model, 'random'):
@@ -1175,24 +1291,29 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot
elapsed : int
Elapsed time since the beginning of training in seconds.
+ Notes
+ -----
+ If you train the model via the `corpus_file` argument, there is no job queue, so the reported job_queue size
+ will always be equal to -1.
+
"""
if total_examples:
# examples-based progress %
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
- utils.qsize(job_queue), utils.qsize(progress_queue)
+ -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue)
)
else:
# words-based progress %
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
- utils.qsize(job_queue), utils.qsize(progress_queue)
+ -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue)
)
def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
- trained_word_count, elapsed):
+ trained_word_count, elapsed, is_corpus_file_mode):
"""Callback used to log the end of a training epoch.
Parameters
@@ -1212,6 +1333,8 @@ def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_coun
the sentence length).
elapsed : int
Elapsed time since the beginning of training in seconds.
+ is_corpus_file_mode : bool
+ Whether training is file-based (corpus_file argument) or not.
Warnings
--------
@@ -1223,6 +1346,10 @@ def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_coun
cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed
)
+ # don't warn if training in file-based mode, because it's expected behavior
+ if is_corpus_file_mode:
+ return
+
# check that the input corpus hasn't changed during iteration
if total_examples and total_examples != example_count:
logger.warning(
diff --git a/gensim/models/basemodel.py b/gensim/models/basemodel.py
index 371b5b7010..04422f8199 100644
--- a/gensim/models/basemodel.py
+++ b/gensim/models/basemodel.py
@@ -15,7 +15,7 @@ def print_topic(self, topicno, topn=10):
String representation of topic, like '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ... '.
"""
- return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)])
+ return ' + '.join('%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn))
def print_topics(self, num_topics=20, num_words=10):
"""Get the most significant topics (alias for `show_topics()` method).
diff --git a/gensim/models/callbacks.py b/gensim/models/callbacks.py
index 9935fdb3b4..c54efd88d8 100644
--- a/gensim/models/callbacks.py
+++ b/gensim/models/callbacks.py
@@ -22,63 +22,67 @@
To implement a Callback, inherit from this base class and override one or more of its methods.
Create a callback to save the training model after each epoch
-
->>> from gensim.test.utils import common_corpus, common_texts, get_tmpfile
->>> from gensim.models.callbacks import CallbackAny2Vec
->>> from gensim.models import Word2Vec
->>>
->>> class EpochSaver(CallbackAny2Vec):
-... '''Callback to save model after each epoch.'''
-...
-... def __init__(self, path_prefix):
-... self.path_prefix = path_prefix
-... self.epoch = 0
-...
-... def on_epoch_end(self, model):
-... output_path = get_tmpfile('{}_epoch{}.model'.format(self.path_prefix, self.epoch))
-... model.save(output_path)
-... self.epoch += 1
-...
-
-Create a callback to print progress information to the console
-
->>> class EpochLogger(CallbackAny2Vec):
-... '''Callback to log information about training'''
-...
-... def __init__(self):
-... self.epoch = 0
-...
-... def on_epoch_begin(self, model):
-... print("Epoch #{} start".format(self.epoch))
-...
-... def on_epoch_end(self, model):
-... print("Epoch #{} end".format(self.epoch))
-... self.epoch += 1
-...
->>>
->>> epoch_logger = EpochLogger()
->>>
->>> w2v_model = Word2Vec(common_texts, iter=5, size=10, min_count=0, seed=42, callbacks=[epoch_logger])
-Epoch #0 start
-Epoch #0 end
-Epoch #1 start
-Epoch #1 end
-Epoch #2 start
-Epoch #2 end
-Epoch #3 start
-Epoch #3 end
-Epoch #4 start
-Epoch #4 end
-
-Create and bind a callback to a topic model. This callback will log the perplexity metric in real time
-
->>> from gensim.models.callbacks import PerplexityMetric
->>> from gensim.models.ldamodel import LdaModel
->>> from gensim.test.utils import common_corpus, common_dictionary
->>>
->>> # Log the perplexity score at the end of each epoch.
->>> perplexity_logger = PerplexityMetric(corpus=common_corpus, logger='shell')
->>> lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=5, callbacks=[perplexity_logger])
+.. sourcecode:: pycon
+
+ >>> from gensim.test.utils import get_tmpfile
+ >>> from gensim.models.callbacks import CallbackAny2Vec
+ >>>
+ >>>
+ >>> class EpochSaver(CallbackAny2Vec):
+ ... '''Callback to save model after each epoch.'''
+ ...
+ ... def __init__(self, path_prefix):
+ ... self.path_prefix = path_prefix
+ ... self.epoch = 0
+ ...
+ ... def on_epoch_end(self, model):
+ ... output_path = get_tmpfile('{}_epoch{}.model'.format(self.path_prefix, self.epoch))
+ ... model.save(output_path)
+ ... self.epoch += 1
+ ...
+
+Create a callback to print progress information to the console:
+
+.. sourcecode:: pycon
+
+ >>> class EpochLogger(CallbackAny2Vec):
+ ... '''Callback to log information about training'''
+ ...
+ ... def __init__(self):
+ ... self.epoch = 0
+ ...
+ ... def on_epoch_begin(self, model):
+ ... print("Epoch #{} start".format(self.epoch))
+ ...
+ ... def on_epoch_end(self, model):
+ ... print("Epoch #{} end".format(self.epoch))
+ ... self.epoch += 1
+ ...
+ >>>
+ >>> from gensim.models import Word2Vec
+ >>> from gensim.test.utils import common_texts
+ >>>
+ >>> epoch_logger = EpochLogger()
+ >>> w2v_model = Word2Vec(common_texts, iter=5, size=10, min_count=0, seed=42, callbacks=[epoch_logger])
+ Epoch #0 start
+ Epoch #0 end
+ Epoch #1 start
+ Epoch #1 end
+ Epoch #2 start
+ Epoch #2 end
+ Epoch #3 start
+ Epoch #3 end
+ Epoch #4 start
+ Epoch #4 end
+
+Create and bind a callback to a topic model. This callback will log the perplexity metric in real time:
+
+.. sourcecode:: pycon
+
+ >>> from gensim.models.callbacks import PerplexityMetric
+ >>> from gensim.models.ldamodel import LdaModel
+ >>> from gensim.test.utils import common_corpus, common_dictionary
+ >>>
+ >>> # Log the perplexity score at the end of each epoch.
+ >>> perplexity_logger = PerplexityMetric(corpus=common_corpus, logger='shell')
+ >>> lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=5, callbacks=[perplexity_logger])
"""
diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index b7e27fc474..9633a2e62f 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -94,26 +94,30 @@ class CoherenceModel(interfaces.TransformationABC):
One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly provided
if the model does not contain a dictionary already
- >>> from gensim.test.utils import common_corpus, common_dictionary
- >>> from gensim.models.ldamodel import LdaModel
- >>> from gensim.models.coherencemodel import CoherenceModel
- >>>
- >>> model = LdaModel(common_corpus, 5, common_dictionary)
- >>>
- >>> cm = CoherenceModel(model=model, corpus=common_corpus, coherence='u_mass')
- >>> coherence = cm.get_coherence() # get coherence value
-
- Another way of using this feature is through providing tokenized topics such as
-
- >>> from gensim.test.utils import common_corpus, common_dictionary
- >>> from gensim.models.coherencemodel import CoherenceModel
- >>> topics = [
- ... ['human', 'computer', 'system', 'interface'],
- ... ['graph', 'minors', 'trees', 'eps']
- ... ]
- >>>
- >>> cm = CoherenceModel(topics=topics, corpus=common_corpus, dictionary=common_dictionary, coherence='u_mass')
- >>> coherence = cm.get_coherence() # get coherence value
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import common_corpus, common_dictionary
+ >>> from gensim.models.ldamodel import LdaModel
+ >>> from gensim.models.coherencemodel import CoherenceModel
+ >>>
+ >>> model = LdaModel(common_corpus, 5, common_dictionary)
+ >>>
+ >>> cm = CoherenceModel(model=model, corpus=common_corpus, coherence='u_mass')
+ >>> coherence = cm.get_coherence() # get coherence value
+
+ Another way of using this feature is through providing tokenized topics such as:
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import common_corpus, common_dictionary
+ >>> from gensim.models.coherencemodel import CoherenceModel
+ >>> topics = [
+ ... ['human', 'computer', 'system', 'interface'],
+ ... ['graph', 'minors', 'trees', 'eps']
+ ... ]
+ >>>
+ >>> cm = CoherenceModel(topics=topics, corpus=common_corpus, dictionary=common_dictionary, coherence='u_mass')
+ >>> coherence = cm.get_coherence() # get coherence value
"""
def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None,
@@ -233,14 +237,16 @@ def for_models(cls, models, dictionary, topn=20, **kwargs):
Example
-------
- >>> from gensim.test.utils import common_corpus, common_dictionary
- >>> from gensim.models.ldamodel import LdaModel
- >>> from gensim.models.coherencemodel import CoherenceModel
- >>>
- >>> m1 = LdaModel(common_corpus, 3, common_dictionary)
- >>> m2 = LdaModel(common_corpus, 5, common_dictionary)
- >>>
- >>> cm = CoherenceModel.for_models([m1, m2], common_dictionary, corpus=common_corpus, coherence='u_mass')
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import common_corpus, common_dictionary
+ >>> from gensim.models.ldamodel import LdaModel
+ >>> from gensim.models.coherencemodel import CoherenceModel
+ >>>
+ >>> m1 = LdaModel(common_corpus, 3, common_dictionary)
+ >>> m2 = LdaModel(common_corpus, 5, common_dictionary)
+ >>>
+ >>> cm = CoherenceModel.for_models([m1, m2], common_dictionary, corpus=common_corpus, coherence='u_mass')
"""
topics = [cls.top_topics_as_word_lists(model, dictionary, topn) for model in models]
kwargs['dictionary'] = dictionary
@@ -438,7 +444,7 @@ def _ensure_elements_are_ids(self, topic):
try:
return np.array([self.dictionary.token2id[token] for token in topic])
except KeyError: # might be a list of token ids already, but let's verify all in dict
- topic = [self.dictionary.id2token[_id] for _id in topic]
+ topic = (self.dictionary.id2token[_id] for _id in topic)
return np.array([self.dictionary.token2id[token] for token in topic])
def _update_accumulator(self, new_topics):
@@ -454,9 +460,9 @@ def _relevant_ids_will_differ(self, new_topics):
return not self._accumulator.relevant_ids.issuperset(new_set)
def _topics_differ(self, new_topics):
- return (new_topics is not None and
- self._topics is not None and
- not np.array_equal(new_topics, self._topics))
+ return (new_topics is not None
+ and self._topics is not None
+ and not np.array_equal(new_topics, self._topics))
def _get_topics(self):
"""Internal helper function to return topics from a trained topic model."""
diff --git a/gensim/models/deprecated/doc2vec.py b/gensim/models/deprecated/doc2vec.py
index 33d442904a..76e4a7e2d4 100644
--- a/gensim/models/deprecated/doc2vec.py
+++ b/gensim/models/deprecated/doc2vec.py
@@ -21,16 +21,22 @@
Initialize a model with e.g.::
->>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4)
+.. sourcecode:: pycon
+
+ >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4)
Persist a model to disk with::
->>> model.save(fname)
->>> model = Doc2Vec.load(fname) # you can continue training with the loaded model!
+.. sourcecode:: pycon
+
+ >>> model.save(fname)
+ >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model!
If you're finished training a model (=no more updates, only querying), you can do
- >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True):
+.. sourcecode:: pycon
+
+ >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
to trim unneeded model memory = use (much) less RAM.
@@ -75,7 +81,7 @@
from gensim.models.deprecated.old_saveload import SaveLoad
from gensim import matutils # utility fnc for pickling, common scipy operations etc
-from six.moves import xrange, zip
+from six.moves import zip, range
from six import string_types, integer_types
logger = logging.getLogger(__name__)
@@ -91,7 +97,7 @@ def load_old_doc2vec(*args, **kwargs):
'dm_tag_count': old_model.dm_tag_count,
'docvecs_mapfile': old_model.__dict__.get('docvecs_mapfile', None),
'comment': old_model.__dict__.get('comment', None),
- 'size': old_model.vector_size,
+ 'vector_size': old_model.vector_size,
'alpha': old_model.alpha,
'window': old_model.window,
'min_count': old_model.min_count,
@@ -104,7 +110,7 @@ def load_old_doc2vec(*args, **kwargs):
'negative': old_model.negative,
'cbow_mean': old_model.cbow_mean,
'hashfxn': old_model.hashfxn,
- 'iter': old_model.iter,
+ 'epochs': old_model.iter,
'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1),
'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH),
'compute_loss': old_model.__dict__.get('compute_loss', None)
@@ -153,6 +159,7 @@ def load_old_doc2vec(*args, **kwargs):
new_model.train_count = old_model.__dict__.get('train_count', None)
new_model.corpus_count = old_model.__dict__.get('corpus_count', None)
+ new_model.corpus_total_words = old_model.__dict__.get('corpus_total_words', None)
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0)
new_model.total_train_time = old_model.__dict__.get('total_train_time', None)
new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha)
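
The two hunks above translate old attribute names into the current keyword names when converting a model saved with an older gensim (`size` -> `vector_size`, `iter` -> `epochs`) and backfill the new `corpus_total_words` attribute. Below is a purely illustrative sketch of that keyword translation; the helper and the mapping dict are hypothetical names invented for this example.

```python
# Hypothetical illustration of the old -> new keyword renames performed above.
OLD_TO_NEW_KWARGS = {
    'size': 'vector_size',
    'iter': 'epochs',
}


def translate_old_kwargs(old_kwargs):
    """Return a copy of `old_kwargs` with deprecated names mapped to the current ones."""
    return {OLD_TO_NEW_KWARGS.get(key, key): value for key, value in old_kwargs.items()}


print(translate_old_kwargs({'size': 100, 'iter': 20, 'window': 8}))
# -> {'vector_size': 100, 'epochs': 20, 'window': 8}
```
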
@@ -235,8 +242,8 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf
- word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code
@@ -291,8 +298,8 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf
- word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
doctag_len = len(doctag_indexes)
if doctag_len != model.dm_tag_count:
return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?)
@@ -358,11 +365,13 @@ class DocvecsArray(SaveLoad):
As the 'docvecs' property of a Doc2Vec model, allows access and
comparison of document vectors.
- >>> docvec = d2v_model.docvecs[99]
- >>> docvec = d2v_model.docvecs['SENT_99'] # if string tag used in training
- >>> sims = d2v_model.docvecs.most_similar(99)
- >>> sims = d2v_model.docvecs.most_similar('SENT_99')
- >>> sims = d2v_model.docvecs.most_similar(docvec)
+ .. sourcecode:: pycon
+
+ >>> docvec = d2v_model.docvecs[99]
+ >>> docvec = d2v_model.docvecs['SENT_99'] # if string tag used in training
+ >>> sims = d2v_model.docvecs.most_similar(99)
+ >>> sims = d2v_model.docvecs.most_similar('SENT_99')
+ >>> sims = d2v_model.docvecs.most_similar(docvec)
If only plain int tags are presented during training, the dict (of
string tag -> index) and list (of index -> string tag) stay empty,
@@ -481,7 +490,7 @@ def reset_weights(self, model):
self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL)
self.doctag_syn0_lockf = ones((length,), dtype=REAL) # zeros suppress learning
- for i in xrange(length):
+ for i in range(length):
# construct deterministic seed from index AND model seed
seed = "%d %s" % (model.seed, self.index_to_doctag(i))
self.doctag_syn0[i] = model.seeded_vector(seed)
@@ -501,7 +510,7 @@ def init_sims(self, replace=False):
if getattr(self, 'doctag_syn0norm', None) is None or replace:
logger.info("precomputing L2-norms of doc weight vectors")
if replace:
- for i in xrange(self.doctag_syn0.shape[0]):
+ for i in range(self.doctag_syn0.shape[0]):
self.doctag_syn0[i, :] /= sqrt((self.doctag_syn0[i, :] ** 2).sum(-1))
self.doctag_syn0norm = self.doctag_syn0
else:
diff --git a/gensim/models/deprecated/fasttext.py b/gensim/models/deprecated/fasttext.py
index 594c310b9a..836c66d4ca 100644
--- a/gensim/models/deprecated/fasttext.py
+++ b/gensim/models/deprecated/fasttext.py
@@ -107,6 +107,7 @@ def load_old_fasttext(*args, **kwargs):
new_model.train_count = old_model.train_count
new_model.corpus_count = old_model.corpus_count
+ new_model.corpus_total_words = old_model.corpus_total_words
new_model.running_training_loss = old_model.running_training_loss
new_model.total_train_time = old_model.total_train_time
new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
@@ -147,8 +148,8 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window)
start = max(0, pos - model.window + reduced_window)
@@ -210,8 +211,8 @@ def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
# now go over all words from the (reduced) window, predicting each one in turn
@@ -327,13 +328,14 @@ def __init__(
--------
Initialize and train a `FastText` model
- >>> from gensim.models import FastText
- >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
- >>>
- >>> model = FastText(sentences, min_count=1)
- >>> say_vector = model['say'] # get vector for word
- >>> of_vector = model['of'] # get vector for out-of-vocab word
+ .. sourcecode:: pycon
+
+ >>> from gensim.models import FastText
+ >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+ >>>
+ >>> model = FastText(sentences, min_count=1)
+ >>> say_vector = model['say'] # get vector for word
+ >>> of_vector = model['of'] # get vector for out-of-vocab word
"""
# fastText specific params
@@ -386,15 +388,17 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_
-------
Train a model and update vocab for online training
- >>> from gensim.models import FastText
- >>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
- >>> sentences_2 = [["dude", "say", "wazzup!"]]
- >>>
- >>> model = FastText(min_count=1)
- >>> model.build_vocab(sentences_1)
- >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
- >>> model.build_vocab(sentences_2, update=True)
- >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)
+ .. sourcecode:: pycon
+
+ >>> from gensim.models import FastText
+ >>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+ >>> sentences_2 = [["dude", "say", "wazzup!"]]
+ >>>
+ >>> model = FastText(min_count=1)
+ >>> model.build_vocab(sentences_1)
+ >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
+ >>> model.build_vocab(sentences_2, update=True)
+ >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)
"""
if update:
@@ -584,12 +588,15 @@ def train(self, sentences, total_examples=None, total_words=None,
Examples
--------
- >>> from gensim.models import FastText
- >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
- >>>
- >>> model = FastText(min_count=1)
- >>> model.build_vocab(sentences)
- >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.models import FastText
+ >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+ >>>
+ >>> model = FastText(min_count=1)
+ >>> model.build_vocab(sentences)
+ >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
"""
self.neg_labels = []
@@ -623,11 +630,13 @@ def __getitem__(self, word):
Example
-------
- >>> from gensim.models import FastText
- >>> from gensim.test.utils import datapath
- >>>
- >>> trained_model = FastText.load_fasttext_format(datapath('lee_fasttext'))
- >>> meow_vector = trained_model['hello'] # get vector for word
+ .. sourcecode:: pycon
+
+ >>> from gensim.models import FastText
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> trained_model = FastText.load_fasttext_format(datapath('lee_fasttext'))
+ >>> meow_vector = trained_model['hello'] # get vector for word
"""
return self.word_vec(word)
@@ -665,11 +674,13 @@ def word_vec(self, word, use_norm=False):
Example
-------
- >>> from gensim.models import FastText
- >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
- >>>
- >>> model = FastText(sentences, min_count=1)
- >>> meow_vector = model.word_vec('meow') # get vector for word
+ .. sourcecode:: pycon
+
+ >>> from gensim.models import FastText
+ >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+ >>>
+ >>> model = FastText(sentences, min_count=1)
+ >>> meow_vector = model.word_vec('meow') # get vector for word
"""
return FastTextKeyedVectors.word_vec(self.wv, word, use_norm=use_norm)
diff --git a/gensim/models/deprecated/fasttext_wrapper.py b/gensim/models/deprecated/fasttext_wrapper.py
index 930f2c1308..727db0e1e0 100644
--- a/gensim/models/deprecated/fasttext_wrapper.py
+++ b/gensim/models/deprecated/fasttext_wrapper.py
@@ -23,10 +23,11 @@
`Word2Vec` for that.
Example:
+.. sourcecode:: pycon
->>> from gensim.models.wrappers import FastText
->>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8')
->>> print model['forests'] # prints vector for given out-of-vocabulary word
+ >>> from gensim.models.wrappers import FastText
+ >>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8')
+ >>> print(model['forests'])  # prints vector for given out-of-vocabulary word
.. [1] https://github.com/facebookresearch/fastText#enriching-word-vectors-with-subword-information
@@ -89,10 +90,12 @@ def word_vec(self, word, use_norm=False):
The word can be out-of-vocabulary as long as ngrams for the word are present.
For words with all ngrams absent, a KeyError is raised.
- Example::
+ Example:
- >>> trained_model['office']
- array([ -1.40128313e-02, ...])
+ .. sourcecode:: pycon
+
+ >>> trained_model['office']
+ array([ -1.40128313e-02, ...])
"""
if word in self.vocab:
diff --git a/gensim/models/deprecated/keyedvectors.py b/gensim/models/deprecated/keyedvectors.py
index d86b0f3837..5ead121e48 100644
--- a/gensim/models/deprecated/keyedvectors.py
+++ b/gensim/models/deprecated/keyedvectors.py
@@ -16,46 +16,58 @@
The word vectors are considered read-only in this class.
-Initialize the vectors by training e.g. Word2Vec::
+Initialize the vectors by training e.g. Word2Vec:
->>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
->>> word_vectors = model.wv
+.. sourcecode:: pycon
-Persist the word vectors to disk with::
+ >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
+ >>> word_vectors = model.wv
->>> word_vectors.save(fname)
->>> word_vectors = KeyedVectors.load(fname)
+Persist the word vectors to disk with:
+
+.. sourcecode:: pycon
+
+ >>> word_vectors.save(fname)
+ >>> word_vectors = KeyedVectors.load(fname)
The vectors can also be instantiated from an existing file on disk
-in the original Google's word2vec C format as a KeyedVectors instance::
+in the original Google's word2vec C format as a KeyedVectors instance:
+
+.. sourcecode:: pycon
- >>> from gensim.models.keyedvectors import KeyedVectors
- >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format
- >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format
+ >>> from gensim.models.keyedvectors import KeyedVectors
+ >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format
+ >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format
You can perform various syntactic/semantic NLP word tasks with the vectors. Some of them
-are already built-in::
+are already built-in:
+
+.. sourcecode:: pycon
+
+ >>> word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
+ [('queen', 0.50882536), ...]
+
+ >>> word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
+ [('queen', 0.71382287), ...]
- >>> word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
- [('queen', 0.50882536), ...]
+ >>> word_vectors.doesnt_match("breakfast cereal dinner lunch".split())
+ 'cereal'
- >>> word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
- [('queen', 0.71382287), ...]
+ >>> word_vectors.similarity('woman', 'man')
+ 0.73723527
- >>> word_vectors.doesnt_match("breakfast cereal dinner lunch".split())
- 'cereal'
+Correlation with human opinion on word similarity:
- >>> word_vectors.similarity('woman', 'man')
- 0.73723527
+.. sourcecode:: pycon
-Correlation with human opinion on word similarity::
+ >>> word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv'))
+ 0.51, 0.62, 0.13
- >>> word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv'))
- 0.51, 0.62, 0.13
+And on analogies:
-And on analogies::
+.. sourcecode:: pycon
- >>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
+ >>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
and so on.
@@ -74,7 +86,7 @@
try:
from pyemd import emd
PYEMD_EXT = True
-except ImportError:
+except (ImportError, ValueError):
PYEMD_EXT = False
from numpy import dot, zeros, dtype, float32 as REAL,\
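
The hunk above widens the `pyemd` import guard to also catch `ValueError`. A minimal sketch of the pattern follows, assuming the motivation is that a pyemd build compiled against an incompatible numpy can fail with `ValueError` rather than `ImportError` at import time; the `require_pyemd` helper is a made-up name for illustration.

```python
# Sketch of the optional-dependency guard (not gensim's exact code).
try:
    from pyemd import emd  # noqa: F401 -- only needed for Word Mover's Distance
    PYEMD_EXT = True
except (ImportError, ValueError):
    # ImportError: pyemd is not installed.
    # ValueError: assumed case of a binary-incompatible pyemd/numpy combination.
    PYEMD_EXT = False


def require_pyemd():
    """Fail early with a clear message when WMD is requested without a usable pyemd."""
    if not PYEMD_EXT:
        raise ImportError("Please install pyemd Python package to compute WMD.")
```
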
@@ -86,7 +98,7 @@
from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.corpora.dictionary import Dictionary
from six import string_types, iteritems
-from six.moves import xrange
+from six.moves import range
from scipy import stats
@@ -227,7 +239,7 @@ def add_word(word, weights):
if binary:
binary_len = dtype(REAL).itemsize * vector_size
- for _ in xrange(vocab_size):
+ for _ in range(vocab_size):
# mixed text and binary: read text first, then binary
word = []
while True:
@@ -242,7 +254,7 @@ def add_word(word, weights):
weights = fromstring(fin.read(binary_len), dtype=REAL)
add_word(word, weights)
else:
- for line_no in xrange(vocab_size):
+ for line_no in range(vocab_size):
line = fin.readline()
if line == b'':
raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
@@ -292,10 +304,12 @@ def word_vec(self, word):
Accept a single word as input.
Returns the word's representations in vector space, as a 1D numpy array.
- Example::
+ Example:
- >>> trained_model.word_vec('office')
- array([ -1.40128313e-02, ...])
+ .. sourcecode:: pycon
+
+ >>> trained_model.word_vec('office')
+ array([ -1.40128313e-02, ...])
"""
if word in self.vocab:
@@ -316,15 +330,17 @@ def __getitem__(self, words):
2d numpy array: #words x #vector_size. Matrix rows are in the same order
as in input.
- Example::
+ Example:
+
+ .. sourcecode:: pycon
- >>> trained_model['office']
- array([ -1.40128313e-02, ...])
+ >>> trained_model['office']
+ array([ -1.40128313e-02, ...])
- >>> trained_model[['office', 'products']]
- array([ -1.40128313e-02, ...]
- [ -1.70425311e-03, ...]
- ...)
+ >>> trained_model[['office', 'products']]
+ array([ -1.40128313e-02, ...]
+ [ -1.70425311e-03, ...]
+ ...)
"""
if isinstance(words, string_types):
@@ -349,13 +365,15 @@ def most_similar_to_given(self, w1, word_list):
Raises:
KeyError: If w1 or any word in word_list is not in the vocabulary
- Example::
+ Example:
+
+ .. sourcecode:: pycon
- >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
- 'sound'
+ >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
+ 'sound'
- >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
- 'animal'
+ >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
+ 'animal'
"""
return word_list[argmax([self.similarity(w1, word) for word in word_list])]
@@ -379,8 +397,10 @@ def words_closer_than(self, w1, w2):
Examples
--------
- >>> model.words_closer_than('carnivore.n.01', 'mammal.n.01')
- ['dog.n.01', 'canine.n.02']
+ .. sourcecode:: pycon
+
+ >>> model.words_closer_than('carnivore.n.01', 'mammal.n.01')
+ ['dog.n.01', 'canine.n.02']
"""
all_distances = self.distances(w1)
@@ -408,8 +428,10 @@ def rank(self, w1, w2):
Examples
--------
- >>> model.rank('mammal.n.01', 'carnivore.n.01')
- 3
+ .. sourcecode:: pycon
+
+ >>> model.rank('mammal.n.01', 'carnivore.n.01')
+ 3
"""
return len(self.words_closer_than(w1, w2)) + 1
@@ -441,10 +463,12 @@ def word_vec(self, word, use_norm=False):
If `use_norm` is True, returns the normalized word vector.
- Example::
+ Example:
+
+ .. sourcecode:: pycon
- >>> trained_model['office']
- array([ -1.40128313e-02, ...])
+ >>> trained_model['office']
+ array([ -1.40128313e-02, ...])
"""
if word in self.vocab:
@@ -475,10 +499,12 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non
only check the first 10000 word vectors in the vocabulary order. (This may be
meaningful if you've sorted the vocabulary by descending frequency.)
- Example::
+ Example:
+
+ .. sourcecode:: pycon
- >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
- [('queen', 0.50882536), ...]
+ >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
+ [('queen', 0.50882536), ...]
"""
if positive is None:
@@ -538,10 +564,12 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None):
only check the first 10000 word vectors in the vocabulary order. (This may be
meaningful if you've sorted the vocabulary by descending frequency.)
- Example::
+ Example:
- >>> trained_model.similar_by_word('graph')
- [('user', 0.9999163150787354), ...]
+ .. sourcecode:: pycon
+
+ >>> trained_model.similar_by_word('graph')
+ [('user', 0.9999163150787354), ...]
"""
return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
@@ -580,6 +608,9 @@ def wmdistance(self, document1, document2):
This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler).
Example:
+
+ .. sourcecode:: pycon
+
>>> # Train word2vec model.
>>> model = Word2Vec(sentences)
@@ -671,10 +702,12 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
respectively – a potentially sensible but untested extension of the method. (With
a single positive example, rankings will be the same as in the default most_similar.)
- Example::
+ Example:
- >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
- [(u'iraq', 0.8488819003105164), ...]
+ .. sourcecode:: pycon
+
+ >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
+ [(u'iraq', 0.8488819003105164), ...]
.. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.
@@ -810,13 +843,15 @@ def distance(self, w1, w2):
"""
Compute cosine distance between two words.
- Example::
+ Example:
- >>> trained_model.distance('woman', 'man')
- 0.34
+ .. sourcecode:: pycon
- >>> trained_model.distance('woman', 'woman')
- 0.0
+ >>> trained_model.distance('woman', 'man')
+ 0.34
+
+ >>> trained_model.distance('woman', 'woman')
+ 0.0
"""
return 1 - self.similarity(w1, w2)
@@ -825,13 +860,15 @@ def similarity(self, w1, w2):
"""
Compute cosine similarity between two words.
- Example::
+ Example:
+
+ .. sourcecode:: pycon
- >>> trained_model.similarity('woman', 'man')
- 0.73723527
+ >>> trained_model.similarity('woman', 'man')
+ 0.73723527
- >>> trained_model.similarity('woman', 'woman')
- 1.0
+ >>> trained_model.similarity('woman', 'woman')
+ 1.0
"""
return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))
@@ -840,16 +877,18 @@ def n_similarity(self, ws1, ws2):
"""
Compute cosine similarity between two sets of words.
- Example::
+ Example:
+
+ .. sourcecode:: pycon
- >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
- 0.61540466561049689
+ >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
+ 0.61540466561049689
- >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
- 1.0000000000000004
+ >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
+ 1.0000000000000004
- >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
- True
+ >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
+ True
"""
if not(len(ws1) and len(ws2)):
@@ -1045,7 +1084,7 @@ def init_sims(self, replace=False):
if getattr(self, 'syn0norm', None) is None or replace:
logger.info("precomputing L2-norms of word weight vectors")
if replace:
- for i in xrange(self.syn0.shape[0]):
+ for i in range(self.syn0.shape[0]):
self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
self.syn0norm = self.syn0
else:
diff --git a/gensim/models/deprecated/old_saveload.py b/gensim/models/deprecated/old_saveload.py
index 44f4a5619d..c609dd5532 100644
--- a/gensim/models/deprecated/old_saveload.py
+++ b/gensim/models/deprecated/old_saveload.py
@@ -108,8 +108,8 @@ def _load_specials(self, fname, mmap, compress, subname):
"""
def mmap_error(obj, filename):
return IOError(
- 'Cannot mmap compressed object %s in file %s. ' % (obj, filename) +
- 'Use `load(fname, mmap=None)` or uncompress files manually.'
+ 'Cannot mmap compressed object %s in file %s. ' % (obj, filename)
+ + 'Use `load(fname, mmap=None)` or uncompress files manually.'
)
for attrib in getattr(self, '__recursive_saveloads', []):
diff --git a/gensim/models/deprecated/word2vec.py b/gensim/models/deprecated/word2vec.py
index 5ac913dbb9..b8b04d4c10 100644
--- a/gensim/models/deprecated/word2vec.py
+++ b/gensim/models/deprecated/word2vec.py
@@ -27,26 +27,34 @@
**Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training**
(70x speedup compared to plain NumPy implementation [3]_).
-Initialize a model with e.g.::
+Initialize a model with e.g.:
+
+.. sourcecode:: pycon
>>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
-Persist a model to disk with::
+Persist a model to disk with:
+
+.. sourcecode:: pycon
>>> model.save(fname)
>>> model = Word2Vec.load(fname) # you can continue training with the loaded model!
The word vectors are stored in a KeyedVectors instance in model.wv.
-This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec::
+This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:
+
+.. sourcecode:: pycon
- >>> model.wv['computer'] # numpy vector of a word
- array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32)
+ >>> model.wv['computer'] # numpy vector of a word
+ array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32)
The word vectors can also be instantiated from an existing file on disk in the word2vec C format
as a KeyedVectors instance::
NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights,
- vocabulary frequency and the binary tree is missing::
+ vocabulary frequency and the binary tree are missing:
+
+ .. sourcecode:: pycon
>>> from gensim.models.keyedvectors import KeyedVectors
>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format
@@ -54,42 +62,51 @@
You can perform various NLP word tasks with the model. Some of them
-are already built-in::
+are already built-in:
+
+.. sourcecode:: pycon
+
+ >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
+ [('queen', 0.50882536), ...]
- >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
- [('queen', 0.50882536), ...]
+ >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
+ [('queen', 0.71382287), ...]
- >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
- [('queen', 0.71382287), ...]
+ >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split())
+ 'cereal'
+ >>> model.wv.similarity('woman', 'man')
+ 0.73723527
- >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split())
- 'cereal'
+Probability of a text under the model:
- >>> model.wv.similarity('woman', 'man')
- 0.73723527
+.. sourcecode:: pycon
-Probability of a text under the model::
+ >>> model.score(["The fox jumped over a lazy dog".split()])
+ 0.2158356
- >>> model.score(["The fox jumped over a lazy dog".split()])
- 0.2158356
+Correlation with human opinion on word similarity:
-Correlation with human opinion on word similarity::
+.. sourcecode:: pycon
- >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv'))
- 0.51, 0.62, 0.13
+ >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv'))
+ 0.51, 0.62, 0.13
-And on analogies::
+And on analogies:
- >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
+.. sourcecode:: pycon
+
+ >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
and so on.
If you're finished training a model (i.e. no more updates, only querying),
then switch to the :mod:`gensim.models.KeyedVectors` instance in wv
- >>> word_vectors = model.wv
- >>> del model
+.. sourcecode:: pycon
+
+ >>> word_vectors = model.wv
+ >>> del model
to trim unneeded model memory = use much less RAM.
@@ -97,6 +114,8 @@
detect phrases longer than one word. Using phrases, you can learn a word2vec model
where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`:
+.. sourcecode:: pycon
+
>>> bigram_transformer = gensim.models.Phrases(sentences)
>>> model = Word2Vec(bigram_transformer[sentences], size=100, ...)
@@ -138,7 +157,7 @@
from gensim import utils
from gensim import matutils # utility fnc for pickling, common scipy operations etc
from six import iteritems, itervalues, string_types
-from six.moves import xrange
+from six.moves import range
from types import GeneratorType
logger = logging.getLogger(__name__)
@@ -191,6 +210,7 @@ def load_old_word2vec(*args, **kwargs):
new_model.train_count = old_model.__dict__.get('train_count', None)
new_model.corpus_count = old_model.__dict__.get('corpus_count', None)
+ new_model.corpus_total_words = old_model.__dict__.get('corpus_total_words', None)
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0)
new_model.total_train_time = old_model.__dict__.get('total_train_time', None)
new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha)
@@ -212,8 +232,8 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
@@ -243,8 +263,8 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
start = max(0, pos - model.window + reduced_window)
@@ -638,10 +658,10 @@ def make_cum_table(self, power=0.75, domain=2**31 - 1):
self.cum_table = zeros(vocab_size, dtype=uint32)
# compute sum of all power (Z in paper)
train_words_pow = 0.0
- for word_index in xrange(vocab_size):
+ for word_index in range(vocab_size):
train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power
cumulative = 0.0
- for word_index in xrange(vocab_size):
+ for word_index in range(vocab_size):
cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power
self.cum_table[word_index] = round(cumulative / train_words_pow * domain)
if len(self.cum_table) > 0:
@@ -658,7 +678,7 @@ def create_binary_tree(self):
# build the huffman tree
heap = list(itervalues(self.wv.vocab))
heapq.heapify(heap)
- for i in xrange(len(self.wv.vocab) - 1):
+ for i in range(len(self.wv.vocab) - 1):
min1, min2 = heapq.heappop(heap), heapq.heappop(heap)
heapq.heappush(
heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2)
@@ -718,9 +738,13 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
Examples
--------
- >>> from gensim.models.word2vec import Word2Vec
- >>> model= Word2Vec()
- >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.models.word2vec import Word2Vec
+ >>> model = Word2Vec()
+ >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
+
"""
logger.info("Processing provided word frequencies")
# Instead of scanning text, this will assign provided word frequencies dictionary(word_freq)
@@ -1111,7 +1135,7 @@ def job_producer():
)
# give the workers heads up that they can finish -- no more work!
- for _ in xrange(self.workers):
+ for _ in range(self.workers):
job_queue.put(None)
logger.debug("job loop exiting, total %i jobs", job_no)
@@ -1119,7 +1143,7 @@ def job_producer():
job_queue = Queue(maxsize=queue_factor * self.workers)
progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
- workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
+ workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)]
unfinished_worker_count = len(workers)
workers.append(threading.Thread(target=job_producer))
@@ -1256,7 +1280,7 @@ def worker_loop():
job_queue = Queue(maxsize=queue_factor * self.workers)
progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
- workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
+ workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)]
for thread in workers:
thread.daemon = True # make interrupting the process with ctrl+c easier
thread.start()
@@ -1283,7 +1307,7 @@ def worker_loop():
job_queue.put(items)
except StopIteration:
logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1)
- for _ in xrange(self.workers):
+ for _ in range(self.workers):
job_queue.put(None) # give the workers heads up that they can finish -- no more work!
push_done = True
try:
@@ -1330,7 +1354,7 @@ def update_weights(self):
newsyn0 = empty((gained_vocab, self.vector_size), dtype=REAL)
# randomize the remaining words
- for i in xrange(len(self.wv.syn0), len(self.wv.vocab)):
+ for i in range(len(self.wv.syn0), len(self.wv.vocab)):
# construct deterministic seed from word AND seed argument
newsyn0[i - len(self.wv.syn0)] = self.seeded_vector(self.wv.index2word[i] + str(self.seed))
@@ -1357,7 +1381,7 @@ def reset_weights(self):
logger.info("resetting layer weights")
self.wv.syn0 = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
# randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
- for i in xrange(len(self.wv.vocab)):
+ for i in range(len(self.wv.vocab)):
# construct deterministic seed from word AND seed argument
self.wv.syn0[i] = self.seeded_vector(self.wv.index2word[i] + str(self.seed))
if self.hs:
@@ -1397,7 +1421,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut
# TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)?
if binary:
binary_len = dtype(REAL).itemsize * vector_size
- for _ in xrange(vocab_size):
+ for _ in range(vocab_size):
# mixed text and binary: read text first, then binary
word = []
while True:
@@ -1622,6 +1646,8 @@ def load(cls, *args, **kwargs):
model.make_cum_table() # rebuild cum_table from vocabulary
if not hasattr(model, 'corpus_count'):
model.corpus_count = None
+ if not hasattr(model, 'corpus_total_words'):
+ model.corpus_total_words = None
for v in model.wv.vocab.values():
if hasattr(v, 'sample_int'):
break # already 0.12.0+ style int probabilities
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 57638e71bc..e0af132c23 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -20,43 +20,50 @@
`_.
**Make sure you have a C compiler before installing Gensim, to use the optimized doc2vec routines** (70x speedup
-compared to plain NumPy implementation `_).
+compared to plain NumPy implementation, https://rare-technologies.com/parallelizing-word2vec-in-python/).
-Examples
---------
+Usage examples
+==============
-Initialize & train a model
+Initialize & train a model:
->>> from gensim.test.utils import common_texts
->>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument
->>>
->>> documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
->>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
+.. sourcecode:: pycon
-Persist a model to disk
+ >>> from gensim.test.utils import common_texts
+ >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+ >>>
+ >>> documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
+ >>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
->>> from gensim.test.utils import get_tmpfile
->>>
->>> fname = get_tmpfile("my_doc2vec_model")
->>>
->>> model.save(fname)
->>> model = Doc2Vec.load(fname) # you can continue training with the loaded model!
+Persist a model to disk:
-If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do
+.. sourcecode:: pycon
->>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
+ >>> from gensim.test.utils import get_tmpfile
+ >>>
+ >>> fname = get_tmpfile("my_doc2vec_model")
+ >>>
+ >>> model.save(fname)
+ >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model!
-Infer vector for new document
+If you're finished training a model (= no more updates, only querying) and want to reduce memory usage, you can do:
->>> vector = model.infer_vector(["system", "response"])
+.. sourcecode:: pycon
+
+ >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
+
+Infer vector for a new document:
+
+.. sourcecode:: pycon
+
+ >>> vector = model.infer_vector(["system", "response"])
"""
import logging
import os
import warnings
-import multiprocessing
try:
from queue import Queue
@@ -65,7 +72,6 @@
from collections import namedtuple, defaultdict
from timeit import default_timer
-from functools import reduce
from numpy import zeros, float32 as REAL, empty, ones, \
memmap as np_memmap, vstack, integer, dtype, sum as np_sum, add as np_add, repeat as np_repeat, concatenate
@@ -75,12 +81,12 @@
from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.models.word2vec import Word2VecKeyedVectors, Word2VecVocab, Word2VecTrainables, train_cbow_pair,\
train_sg_pair, train_batch_sg
-from six.moves import xrange
-from six import string_types, integer_types, itervalues, iteritems
+from six.moves import range
+from six import string_types, integer_types, itervalues
from gensim.models.base_any2vec import BaseWordEmbeddingsModel
from gensim.models.keyedvectors import Doc2VecKeyedVectors
from types import GeneratorType
-from gensim.utils import deprecated
+from gensim.utils import deprecated, smart_open
logger = logging.getLogger(__name__)
@@ -221,8 +227,8 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf
- word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code
@@ -308,8 +314,8 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf
- word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
doctag_len = len(doctag_indexes)
if doctag_len != model.dm_tag_count:
return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?)
@@ -347,6 +353,36 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
return len(padded_document_indexes) - pre_pad_count - post_pad_count
+try:
+ from gensim.models.doc2vec_corpusfile import (
+ d2v_train_epoch_dbow,
+ d2v_train_epoch_dm_concat,
+ d2v_train_epoch_dm,
+ CORPUSFILE_VERSION
+ )
+except ImportError:
+ # corpusfile doc2vec is not supported
+ CORPUSFILE_VERSION = -1
+
+ def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
+ _expected_words, work, _neu1, docvecs_count, word_vectors=None, word_locks=None,
+ train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ doctag_vectors=None, doctag_locks=None):
+ raise NotImplementedError("Training with corpus_file argument is not supported.")
+
+ def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch,
+ _expected_examples, _expected_words, work, _neu1, docvecs_count, word_vectors=None,
+ word_locks=None, learn_doctags=True, learn_words=True, learn_hidden=True,
+ doctag_vectors=None, doctag_locks=None):
+ raise NotImplementedError("Training with corpus_file argument is not supported.")
+
+ def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
+ _expected_words, work, _neu1, docvecs_count, word_vectors=None, word_locks=None,
+ learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None,
+ doctag_locks=None):
+ raise NotImplementedError("Training with corpus_file argument is not supported.")
+
+
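
The pure-Python fallback above sets `CORPUSFILE_VERSION = -1` and replaces the compiled routines with stubs that raise `NotImplementedError`. A hedged sketch of how calling code could detect whether file-based training is actually available; the corpus path is a placeholder.

```python
# Sketch: feature-detect the compiled corpus_file routines before relying on them.
from gensim.models import doc2vec

if doc2vec.CORPUSFILE_VERSION > -1:
    # Compiled extension present: the fast corpus_file code path can be used.
    model = doc2vec.Doc2Vec(corpus_file="my_corpus.txt", vector_size=50, epochs=5)
else:
    # Fallback stubs only: passing corpus_file would raise NotImplementedError,
    # so stay on the iterable-based `documents` argument instead.
    model = None
```
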
class TaggedDocument(namedtuple('TaggedDocument', 'words tags')):
"""Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`.
@@ -437,7 +473,7 @@ class Doc2Vec(BaseWordEmbeddingsModel):
includes not only the word vectors of each word in the context, but also the paragraph vector.
"""
- def __init__(self, documents=None, input_streams=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0,
+ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0,
dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(),
**kwargs):
"""
@@ -448,13 +484,15 @@ def __init__(self, documents=None, input_streams=None, dm_mean=None, dm=1, dbow_
Input corpus, can be simply a list of elements, but for larger corpora, consider an iterable that streams
the documents directly from disk/network. If you don't supply `documents`, the model is
left uninitialized -- use if you plan to initialize it in some other way.
- input_streams : list or tuple of iterable of iterables
- The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible
- to process streams in parallel, using `workers` parameter.
+ corpus_file : str, optional
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ You may use this argument instead of `documents` to get a performance boost. Only one of `documents` or
+ `corpus_file` needs to be passed (or neither of them). Documents' tags are assigned automatically
+ and are equal to the line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
dm : {1,0}, optional
Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
Otherwise, `distributed bag of words` (PV-DBOW) is employed.
- size : int, optional
+ vector_size : int, optional
Dimensionality of the feature vectors.
window : int, optional
The maximum distance between the current and predicted word within a sentence.
@@ -480,7 +518,7 @@ def __init__(self, documents=None, input_streams=None, dm_mean=None, dm=1, dbow_
useful range is (0, 1e-5).
workers : int, optional
Use these many worker threads to train the model (=faster training with multicore machines).
- iter : int, optional
+ epochs : int, optional
Number of iterations (epochs) over the corpus.
hs : {1,0}, optional
If 1, hierarchical softmax will be used for model training.
@@ -572,23 +610,18 @@ def __init__(self, documents=None, input_streams=None, dm_mean=None, dm=1, dbow_
self.docvecs = docvecs or Doc2VecKeyedVectors(self.vector_size, docvecs_mapfile)
self.comment = comment
- if documents is not None or input_streams is not None:
- self._check_input_data_sanity(data_iterable=documents, data_iterables=input_streams)
- if input_streams is not None:
- if not isinstance(input_streams, (tuple, list)):
- raise TypeError("You must pass tuple or list as the input_streams argument.")
- if any(isinstance(stream, GeneratorType) for stream in input_streams):
- raise TypeError("You can't pass a generator as any of input streams. Try an iterator.")
- if any(isinstance(stream, TaggedLineDocument) for stream in input_streams):
- warnings.warn("Using TaggedLineDocument in multistream mode can lead to incorrect results "
- "because of tags collision.")
+
+ if documents is not None or corpus_file is not None:
+ self._check_input_data_sanity(data_iterable=documents, corpus_file=corpus_file)
+ if corpus_file is not None and not isinstance(corpus_file, string_types):
+ raise TypeError("You must pass string as the corpus_file argument.")
elif isinstance(documents, GeneratorType):
raise TypeError("You can't pass a generator as the documents argument. Try an iterator.")
- self.build_vocab(documents=documents, input_streams=input_streams,
- trim_rule=trim_rule, workers=self.workers)
+ self.build_vocab(documents=documents, corpus_file=corpus_file, trim_rule=trim_rule)
self.train(
- documents=documents, input_streams=input_streams, total_examples=self.corpus_count, epochs=self.epochs,
- start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks)
+ documents=documents, corpus_file=corpus_file, total_examples=self.corpus_count,
+ total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha,
+ end_alpha=self.min_alpha, callbacks=callbacks)
@property
def dm(self):
@@ -636,6 +669,33 @@ def reset_from(self, other_model):
self.docvecs.offset2doctag = other_model.docvecs.offset2doctag
self.trainables.reset_weights(self.hs, self.negative, self.wv, self.docvecs)
+ def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch,
+ total_examples=None, total_words=None, offsets=None, start_doctags=None, **kwargs):
+ work, neu1 = thread_private_mem
+ doctag_vectors = self.docvecs.vectors_docs
+ doctag_locks = self.trainables.vectors_docs_lockf
+
+ offset = offsets[thread_id]
+ start_doctag = start_doctags[thread_id]
+
+ if self.sg:
+ examples, tally, raw_tally = d2v_train_epoch_dbow(
+ self, corpus_file, offset, start_doctag, cython_vocab, cur_epoch,
+ total_examples, total_words, work, neu1, self.docvecs.count,
+ doctag_vectors=doctag_vectors, doctag_locks=doctag_locks, train_words=self.dbow_words)
+ elif self.dm_concat:
+ examples, tally, raw_tally = d2v_train_epoch_dm_concat(
+ self, corpus_file, offset, start_doctag, cython_vocab, cur_epoch,
+ total_examples, total_words, work, neu1, self.docvecs.count,
+ doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ else:
+ examples, tally, raw_tally = d2v_train_epoch_dm(
+ self, corpus_file, offset, start_doctag, cython_vocab, cur_epoch,
+ total_examples, total_words, work, neu1, self.docvecs.count,
+ doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+
+ return examples, tally, raw_tally
+
def _do_train_job(self, job, alpha, inits):
"""Train model using `job` data.
@@ -677,7 +737,7 @@ def _do_train_job(self, job, alpha, inits):
)
return tally, self._raw_word_count(job)
- def train(self, documents=None, input_streams=None, total_examples=None, total_words=None,
+ def train(self, documents=None, corpus_file=None, total_examples=None, total_words=None,
epochs=None, start_alpha=None, end_alpha=None,
word_count=0, queue_factor=2, report_delay=1.0, callbacks=()):
"""Update the model's neural weights.
@@ -695,13 +755,15 @@ def train(self, documents=None, input_streams=None, total_examples=None, total_w
Parameters
----------
- documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`
+ documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
Can be simply a list of elements, but for larger corpora, consider an iterable that streams
the documents directly from disk/network. If you don't supply `documents`, the model is
left uninitialized -- use if you plan to initialize it in some other way.
- input_streams : list or tuple of iterable of iterables
- The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible
- to process streams in parallel, using `workers` parameter.
+ corpus_file : str, optional
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ You may use this argument instead of `documents` to get a performance boost. Only one of `documents` or
+ `corpus_file` needs to be passed (not both of them). Documents' tags are assigned automatically
+ and are equal to the line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
total_examples : int, optional
Count of sentences.
total_words : int, optional
@@ -730,10 +792,61 @@ def train(self, documents=None, input_streams=None, total_examples=None, total_w
List of callbacks that need to be executed/run at specific stages during training.
"""
+ kwargs = {}
+ if corpus_file is not None:
+ # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file)
+ offsets, start_doctags = self._get_offsets_and_start_doctags_for_corpusfile(corpus_file, self.workers)
+ kwargs['offsets'] = offsets
+ kwargs['start_doctags'] = start_doctags
+
super(Doc2Vec, self).train(
- sentences=documents, input_streams=input_streams, total_examples=total_examples, total_words=total_words,
+ sentences=documents, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words,
epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
- queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)
+ queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks, **kwargs)
+
+ @classmethod
+ def _get_offsets_and_start_doctags_for_corpusfile(cls, corpus_file, workers):
+ """Get offset and initial document tag in a corpus_file for each worker.
+
+ First, approximate offsets are calculated from the corpus_file size and the number of workers.
+ Then, for each approximate offset, we find the largest offset that points to the beginning of a line
+ and is not greater than the approximate offset.
+
+ Parameters
+ ----------
+ corpus_file : str
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ workers : int
+ Number of workers.
+
+ Returns
+ -------
+ list of int, list of int
+ Lists of byte offsets and of starting document tags, each with one entry per worker.
+ """
+ corpus_file_size = os.path.getsize(corpus_file)
+ approx_offsets = [int(corpus_file_size // workers * i) for i in range(workers)]
+ offsets = []
+ start_doctags = []
+
+ with smart_open(corpus_file, mode='rb') as fin:
+ curr_offset_idx = 0
+ prev_filepos = 0
+
+ for line_no, line in enumerate(fin):
+ if curr_offset_idx == len(approx_offsets):
+ break
+
+ curr_filepos = prev_filepos + len(line)
+ while curr_offset_idx != len(approx_offsets) and approx_offsets[curr_offset_idx] < curr_filepos:
+ offsets.append(prev_filepos)
+ start_doctags.append(line_no)
+
+ curr_offset_idx += 1
+
+ prev_filepos = curr_filepos
+
+ return offsets, start_doctags
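
To make the splitting logic above concrete, here is a self-contained sketch that re-runs the same loop over a tiny throwaway file (the file contents and worker count are invented; exact byte offsets depend on line lengths and platform newline handling).

```python
import os
import tempfile


def offsets_and_start_doctags(corpus_file, workers):
    """Re-implementation of the per-worker offset scan above, for illustration only."""
    corpus_file_size = os.path.getsize(corpus_file)
    approx_offsets = [int(corpus_file_size // workers * i) for i in range(workers)]
    offsets, start_doctags = [], []
    with open(corpus_file, mode='rb') as fin:
        curr_offset_idx = 0
        prev_filepos = 0
        for line_no, line in enumerate(fin):
            if curr_offset_idx == len(approx_offsets):
                break
            curr_filepos = prev_filepos + len(line)
            while curr_offset_idx != len(approx_offsets) and approx_offsets[curr_offset_idx] < curr_filepos:
                # Snap the approximate offset back to the start of the line containing it.
                offsets.append(prev_filepos)
                start_doctags.append(line_no)
                curr_offset_idx += 1
            prev_filepos = curr_filepos
    return offsets, start_doctags


with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as fout:
    fout.write("first document\nsecond document\nthird document\nfourth document\n")

print(offsets_and_start_doctags(fout.name, workers=2))
# e.g. ([0, 31], [0, 2]) -- each worker starts at a line boundary, with that line number as its doctag.
```
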
def _raw_word_count(self, job):
"""Get the number of words in a given job.
@@ -930,7 +1043,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
fvocab : str, optional
Optional file path used to save the vocabulary.
binary : bool, optional
- If True, the data wil be saved in binary word2vec format, otherwise - will be saved in plain text.
+ If True, the data will be saved in binary word2vec format, otherwise - will be saved in plain text.
"""
total_vec = len(self.wv.vocab) + len(self.docvecs)
@@ -1016,19 +1129,21 @@ def estimate_memory(self, vocab_size=None, report=None):
report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize
return super(Doc2Vec, self).estimate_memory(vocab_size, report=report)
- def build_vocab(self, documents=None, input_streams=None, update=False, progress_per=10000, keep_raw_vocab=False,
- trim_rule=None, workers=None, **kwargs):
+ def build_vocab(self, documents=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False,
+ trim_rule=None, **kwargs):
"""Build vocabulary from a sequence of sentences (can be a once-only generator stream).
Parameters
----------
- documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`
+ documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
Can be simply a list of :class:`~gensim.models.doc2vec.TaggedDocument` elements, but for larger corpora,
consider an iterable that streams the documents directly from disk/network.
See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
- input_streams : list or tuple of iterable of iterables
- The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible
- to process streams in parallel, using `workers` parameter.
+ corpus_file : str, optional
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ You may use this argument instead of `documents` to get a performance boost. Only one of `documents` or
+ `corpus_file` needs to be passed (not both of them). Documents' tags are assigned automatically
+ and are equal to the line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
update : bool
If true, the new words in `sentences` will be added to model's vocab.
progress_per : int
@@ -1049,20 +1164,16 @@ def build_vocab(self, documents=None, input_streams=None, update=False, progress
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.
- workers : int
- Used if `input_streams` is passed. Determines how many processes to use for vocab building.
- Actual number of workers is determined by `min(len(input_streams), workers)`.
-
**kwargs
Additional key word arguments passed to the internal vocabulary construction.
"""
- workers = workers or self.workers
total_words, corpus_count = self.vocabulary.scan_vocab(
- documents=documents, input_streams=input_streams, docvecs=self.docvecs,
- progress_per=progress_per, trim_rule=trim_rule, workers=workers
+ documents=documents, corpus_file=corpus_file, docvecs=self.docvecs,
+ progress_per=progress_per, trim_rule=trim_rule
)
self.corpus_count = corpus_count
+ self.corpus_total_words = total_words
report_values = self.vocabulary.prepare_vocab(
self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
**kwargs)
@@ -1138,40 +1249,6 @@ def _note_doctag(key, document_length, docvecs):
docvecs.count = docvecs.max_rawint + 1 + len(docvecs.offset2doctag)
-def _scan_vocab_worker(stream, progress_queue, max_vocab_size, trim_rule):
- min_reduce = 1
- vocab = defaultdict(int)
- doclen2tags = defaultdict(list)
- checked_string_types = 0
- document_no = -1
- total_words = 0
- for document_no, document in enumerate(stream):
- if not checked_string_types:
- if isinstance(document.words, string_types):
- log_msg = "Each 'words' should be a list of words (usually unicode strings). " \
- "First 'words' here is instead plain %s." % type(document.words)
- progress_queue.put(log_msg)
-
- checked_string_types += 1
-
- document_length = len(document.words)
-
- for tag in document.tags:
- doclen2tags[document_length].append(tag)
-
- for word in document.words:
- vocab[word] += 1
- total_words += len(document.words)
-
- if max_vocab_size and len(vocab) > max_vocab_size:
- utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
- min_reduce += 1
-
- progress_queue.put((total_words, document_no + 1))
- progress_queue.put(None)
- return vocab, doclen2tags
-
-
class Doc2VecVocab(Word2VecVocab):
"""Vocabulary used by :class:`~gensim.models.doc2vec.Doc2Vec`.
@@ -1188,7 +1265,7 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
if there are more unique words than this, then prune the infrequent ones.
Every 10 million word types need about 1GB of RAM, set to `None` for no limit.
min_count : int
- Words with frequency lower than this limit will be discarded form the vocabulary.
+ Words with frequency lower than this limit will be discarded from the vocabulary.
sample : float, optional
The threshold for configuring which higher-frequency words are randomly downsampled,
useful range is (0, 1e-5).
@@ -1209,51 +1286,7 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent)
- def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule):
- manager = multiprocessing.Manager()
- progress_queue = manager.Queue()
-
- workers = min(workers, len(input_streams))
- logger.info("Scanning vocab in %i processes.", workers)
- pool = multiprocessing.Pool(processes=workers)
-
- worker_max_vocab_size = self.max_vocab_size // workers if self.max_vocab_size else None
- results = [
- pool.apply_async(_scan_vocab_worker,
- (stream, progress_queue, worker_max_vocab_size, trim_rule)
- ) for stream in input_streams
- ]
- pool.close()
-
- unfinished_tasks = len(results)
- total_words = 0
- total_documents = 0
- while unfinished_tasks > 0:
- report = progress_queue.get()
- if report is None:
- unfinished_tasks -= 1
- logger.info("scan vocab task finished, processed %i documents and %i words;"
- " awaiting finish of %i more tasks", total_documents, total_words, unfinished_tasks)
- elif isinstance(report, string_types):
- logger.warning(report)
- else:
- num_words, num_documents = report
- total_words += num_words
- total_documents += num_documents
-
- results = [res.get() for res in results] # pairs (vocab, doclen2tags)
- self.raw_vocab = reduce(utils.merge_counts, [r[0] for r in results])
- if self.max_vocab_size:
- utils.trim_vocab_by_freq(self.raw_vocab, self.max_vocab_size, trim_rule=trim_rule)
-
- # Update `docvecs` with document tags information.
- for (_, doclen2tags) in results:
- for document_length, tags in iteritems(doclen2tags):
- for tag in tags:
- _note_doctag(tag, document_length, docvecs)
- return total_words, total_documents
-
- def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule):
+ def _scan_vocab(self, documents, docvecs, progress_per, trim_rule):
document_no = -1
total_words = 0
min_reduce = 1
@@ -1295,14 +1328,17 @@ def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule):
self.raw_vocab = vocab
return total_words, corpus_count
- def scan_vocab(self, documents=None, input_streams=None, docvecs=None, progress_per=10000, workers=None,
- trim_rule=None):
+ def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_per=10000, trim_rule=None):
"""Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count.
Parameters
----------
- documents : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`
+ documents : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
The tagged documents used to create the vocabulary. Their tags can be either str tokens or ints (faster).
+ corpus_file : str, optional
+ Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+ You may use this argument instead of `documents` to get a performance boost. Only one of `documents` or
+ `corpus_file` arguments needs to be passed (not both of them).
docvecs : list of :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
The vector representations of the documents in our corpus. Each of them has a size == `vector_size`.
progress_per : int
@@ -1328,10 +1364,10 @@ def scan_vocab(self, documents=None, input_streams=None, docvecs=None, progress_
"""
logger.info("collecting all words and their counts")
- if input_streams is None:
- total_words, corpus_count = self._scan_vocab_singlestream(documents, docvecs, progress_per, trim_rule)
- else:
- total_words, corpus_count = self._scan_vocab_multistream(input_streams, docvecs, workers, trim_rule)
+ if corpus_file is not None:
+ documents = TaggedLineDocument(corpus_file)
+
+ total_words, corpus_count = self._scan_vocab(documents, docvecs, progress_per, trim_rule)
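+ # Illustration (hypothetical two-line corpus_file, comments only): each line becomes a
+ # document whose tag is its line number, exactly as TaggedLineDocument would yield it:
+ #
+ # >>> list(TaggedLineDocument(corpus_file))
+ # [TaggedDocument(words=['hello', 'world'], tags=[0]), TaggedDocument(words=['foo', 'bar'], tags=[1])]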
logger.info(
"collected %i word types and %i unique tags from a corpus of %i examples and %i words",
@@ -1417,7 +1453,7 @@ def reset_doc_weights(self, docvecs):
docvecs.vectors_docs = empty((length, docvecs.vector_size), dtype=REAL)
self.vectors_docs_lockf = ones((length,), dtype=REAL) # zeros suppress learning
- for i in xrange(length):
+ for i in range(length):
# construct deterministic seed from index AND model seed
seed = "%d %s" % (
self.seed, Doc2VecKeyedVectors._index_to_doctag(i, docvecs.offset2doctag, docvecs.max_rawint))
@@ -1486,11 +1522,13 @@ def __init__(self, source):
Examples
--------
- >>> from gensim.test.utils import datapath
- >>> from gensim.models.doc2vec import TaggedLineDocument
- >>>
- >>> for document in TaggedLineDocument(datapath("head500.noblanks.cor")):
- ... pass
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.models.doc2vec import TaggedLineDocument
+ >>>
+ >>> for document in TaggedLineDocument(datapath("head500.noblanks.cor")):
+ ... pass
"""
self.source = source
diff --git a/gensim/models/doc2vec_corpusfile.cpp b/gensim/models/doc2vec_corpusfile.cpp
new file mode 100644
index 0000000000..360b8d1db1
--- /dev/null
+++ b/gensim/models/doc2vec_corpusfile.cpp
@@ -0,0 +1,11515 @@
+/* Generated by Cython 0.28.4 */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#ifndef Py_PYTHON_H
+ #error Python headers needed to compile C extensions, please install development version of Python.
+#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000)
+ #error Cython requires Python 2.6+ or Python 3.3+.
+#else
+#define CYTHON_ABI "0_28_4"
+#define CYTHON_FUTURE_DIVISION 0
+#include <stddef.h>
+#ifndef offsetof
+ #define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
+#endif
+#if !defined(WIN32) && !defined(MS_WINDOWS)
+ #ifndef __stdcall
+ #define __stdcall
+ #endif
+ #ifndef __cdecl
+ #define __cdecl
+ #endif
+ #ifndef __fastcall
+ #define __fastcall
+ #endif
+#endif
+#ifndef DL_IMPORT
+ #define DL_IMPORT(t) t
+#endif
+#ifndef DL_EXPORT
+ #define DL_EXPORT(t) t
+#endif
+#define __PYX_COMMA ,
+#ifndef HAVE_LONG_LONG
+ #if PY_VERSION_HEX >= 0x02070000
+ #define HAVE_LONG_LONG
+ #endif
+#endif
+#ifndef PY_LONG_LONG
+ #define PY_LONG_LONG LONG_LONG
+#endif
+#ifndef Py_HUGE_VAL
+ #define Py_HUGE_VAL HUGE_VAL
+#endif
+#ifdef PYPY_VERSION
+ #define CYTHON_COMPILING_IN_PYPY 1
+ #define CYTHON_COMPILING_IN_PYSTON 0
+ #define CYTHON_COMPILING_IN_CPYTHON 0
+ #undef CYTHON_USE_TYPE_SLOTS
+ #define CYTHON_USE_TYPE_SLOTS 0
+ #undef CYTHON_USE_PYTYPE_LOOKUP
+ #define CYTHON_USE_PYTYPE_LOOKUP 0
+ #if PY_VERSION_HEX < 0x03050000
+ #undef CYTHON_USE_ASYNC_SLOTS
+ #define CYTHON_USE_ASYNC_SLOTS 0
+ #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+ #define CYTHON_USE_ASYNC_SLOTS 1
+ #endif
+ #undef CYTHON_USE_PYLIST_INTERNALS
+ #define CYTHON_USE_PYLIST_INTERNALS 0
+ #undef CYTHON_USE_UNICODE_INTERNALS
+ #define CYTHON_USE_UNICODE_INTERNALS 0
+ #undef CYTHON_USE_UNICODE_WRITER
+ #define CYTHON_USE_UNICODE_WRITER 0
+ #undef CYTHON_USE_PYLONG_INTERNALS
+ #define CYTHON_USE_PYLONG_INTERNALS 0
+ #undef CYTHON_AVOID_BORROWED_REFS
+ #define CYTHON_AVOID_BORROWED_REFS 1
+ #undef CYTHON_ASSUME_SAFE_MACROS
+ #define CYTHON_ASSUME_SAFE_MACROS 0
+ #undef CYTHON_UNPACK_METHODS
+ #define CYTHON_UNPACK_METHODS 0
+ #undef CYTHON_FAST_THREAD_STATE
+ #define CYTHON_FAST_THREAD_STATE 0
+ #undef CYTHON_FAST_PYCALL
+ #define CYTHON_FAST_PYCALL 0
+ #undef CYTHON_PEP489_MULTI_PHASE_INIT
+ #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+ #undef CYTHON_USE_TP_FINALIZE
+ #define CYTHON_USE_TP_FINALIZE 0
+#elif defined(PYSTON_VERSION)
+ #define CYTHON_COMPILING_IN_PYPY 0
+ #define CYTHON_COMPILING_IN_PYSTON 1
+ #define CYTHON_COMPILING_IN_CPYTHON 0
+ #ifndef CYTHON_USE_TYPE_SLOTS
+ #define CYTHON_USE_TYPE_SLOTS 1
+ #endif
+ #undef CYTHON_USE_PYTYPE_LOOKUP
+ #define CYTHON_USE_PYTYPE_LOOKUP 0
+ #undef CYTHON_USE_ASYNC_SLOTS
+ #define CYTHON_USE_ASYNC_SLOTS 0
+ #undef CYTHON_USE_PYLIST_INTERNALS
+ #define CYTHON_USE_PYLIST_INTERNALS 0
+ #ifndef CYTHON_USE_UNICODE_INTERNALS
+ #define CYTHON_USE_UNICODE_INTERNALS 1
+ #endif
+ #undef CYTHON_USE_UNICODE_WRITER
+ #define CYTHON_USE_UNICODE_WRITER 0
+ #undef CYTHON_USE_PYLONG_INTERNALS
+ #define CYTHON_USE_PYLONG_INTERNALS 0
+ #ifndef CYTHON_AVOID_BORROWED_REFS
+ #define CYTHON_AVOID_BORROWED_REFS 0
+ #endif
+ #ifndef CYTHON_ASSUME_SAFE_MACROS
+ #define CYTHON_ASSUME_SAFE_MACROS 1
+ #endif
+ #ifndef CYTHON_UNPACK_METHODS
+ #define CYTHON_UNPACK_METHODS 1
+ #endif
+ #undef CYTHON_FAST_THREAD_STATE
+ #define CYTHON_FAST_THREAD_STATE 0
+ #undef CYTHON_FAST_PYCALL
+ #define CYTHON_FAST_PYCALL 0
+ #undef CYTHON_PEP489_MULTI_PHASE_INIT
+ #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+ #undef CYTHON_USE_TP_FINALIZE
+ #define CYTHON_USE_TP_FINALIZE 0
+#else
+ #define CYTHON_COMPILING_IN_PYPY 0
+ #define CYTHON_COMPILING_IN_PYSTON 0
+ #define CYTHON_COMPILING_IN_CPYTHON 1
+ #ifndef CYTHON_USE_TYPE_SLOTS
+ #define CYTHON_USE_TYPE_SLOTS 1
+ #endif
+ #if PY_VERSION_HEX < 0x02070000
+ #undef CYTHON_USE_PYTYPE_LOOKUP
+ #define CYTHON_USE_PYTYPE_LOOKUP 0
+ #elif !defined(CYTHON_USE_PYTYPE_LOOKUP)
+ #define CYTHON_USE_PYTYPE_LOOKUP 1
+ #endif
+ #if PY_MAJOR_VERSION < 3
+ #undef CYTHON_USE_ASYNC_SLOTS
+ #define CYTHON_USE_ASYNC_SLOTS 0
+ #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+ #define CYTHON_USE_ASYNC_SLOTS 1
+ #endif
+ #if PY_VERSION_HEX < 0x02070000
+ #undef CYTHON_USE_PYLONG_INTERNALS
+ #define CYTHON_USE_PYLONG_INTERNALS 0
+ #elif !defined(CYTHON_USE_PYLONG_INTERNALS)
+ #define CYTHON_USE_PYLONG_INTERNALS 1
+ #endif
+ #ifndef CYTHON_USE_PYLIST_INTERNALS
+ #define CYTHON_USE_PYLIST_INTERNALS 1
+ #endif
+ #ifndef CYTHON_USE_UNICODE_INTERNALS
+ #define CYTHON_USE_UNICODE_INTERNALS 1
+ #endif
+ #if PY_VERSION_HEX < 0x030300F0
+ #undef CYTHON_USE_UNICODE_WRITER
+ #define CYTHON_USE_UNICODE_WRITER 0
+ #elif !defined(CYTHON_USE_UNICODE_WRITER)
+ #define CYTHON_USE_UNICODE_WRITER 1
+ #endif
+ #ifndef CYTHON_AVOID_BORROWED_REFS
+ #define CYTHON_AVOID_BORROWED_REFS 0
+ #endif
+ #ifndef CYTHON_ASSUME_SAFE_MACROS
+ #define CYTHON_ASSUME_SAFE_MACROS 1
+ #endif
+ #ifndef CYTHON_UNPACK_METHODS
+ #define CYTHON_UNPACK_METHODS 1
+ #endif
+ #ifndef CYTHON_FAST_THREAD_STATE
+ #define CYTHON_FAST_THREAD_STATE 1
+ #endif
+ #ifndef CYTHON_FAST_PYCALL
+ #define CYTHON_FAST_PYCALL 1
+ #endif
+ #ifndef CYTHON_PEP489_MULTI_PHASE_INIT
+ #define CYTHON_PEP489_MULTI_PHASE_INIT (0 && PY_VERSION_HEX >= 0x03050000)
+ #endif
+ #ifndef CYTHON_USE_TP_FINALIZE
+ #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1)
+ #endif
+#endif
+#if !defined(CYTHON_FAST_PYCCALL)
+#define CYTHON_FAST_PYCCALL (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1)
+#endif
+#if CYTHON_USE_PYLONG_INTERNALS
+ #include "longintrepr.h"
+ #undef SHIFT
+ #undef BASE
+ #undef MASK
+#endif
+#ifndef __has_attribute
+ #define __has_attribute(x) 0
+#endif
+#ifndef __has_cpp_attribute
+ #define __has_cpp_attribute(x) 0
+#endif
+#ifndef CYTHON_RESTRICT
+ #if defined(__GNUC__)
+ #define CYTHON_RESTRICT __restrict__
+ #elif defined(_MSC_VER) && _MSC_VER >= 1400
+ #define CYTHON_RESTRICT __restrict
+ #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ #define CYTHON_RESTRICT restrict
+ #else
+ #define CYTHON_RESTRICT
+ #endif
+#endif
+#ifndef CYTHON_UNUSED
+# if defined(__GNUC__)
+# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+# define CYTHON_UNUSED __attribute__ ((__unused__))
+# else
+# define CYTHON_UNUSED
+# endif
+# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER))
+# define CYTHON_UNUSED __attribute__ ((__unused__))
+# else
+# define CYTHON_UNUSED
+# endif
+#endif
+#ifndef CYTHON_MAYBE_UNUSED_VAR
+# if defined(__cplusplus)
+ template<class T> void CYTHON_MAYBE_UNUSED_VAR( const T& ) { }
+# else
+# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x)
+# endif
+#endif
+#ifndef CYTHON_NCP_UNUSED
+# if CYTHON_COMPILING_IN_CPYTHON
+# define CYTHON_NCP_UNUSED
+# else
+# define CYTHON_NCP_UNUSED CYTHON_UNUSED
+# endif
+#endif
+#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None)
+#ifdef _MSC_VER
+ #ifndef _MSC_STDINT_H_
+ #if _MSC_VER < 1300
+ typedef unsigned char uint8_t;
+ typedef unsigned int uint32_t;
+ #else
+ typedef unsigned __int8 uint8_t;
+ typedef unsigned __int32 uint32_t;
+ #endif
+ #endif
+#else
+ #include <stdint.h>
+#endif
+#ifndef CYTHON_FALLTHROUGH
+ #if defined(__cplusplus) && __cplusplus >= 201103L
+ #if __has_cpp_attribute(fallthrough)
+ #define CYTHON_FALLTHROUGH [[fallthrough]]
+ #elif __has_cpp_attribute(clang::fallthrough)
+ #define CYTHON_FALLTHROUGH [[clang::fallthrough]]
+ #elif __has_cpp_attribute(gnu::fallthrough)
+ #define CYTHON_FALLTHROUGH [[gnu::fallthrough]]
+ #endif
+ #endif
+ #ifndef CYTHON_FALLTHROUGH
+ #if __has_attribute(fallthrough)
+ #define CYTHON_FALLTHROUGH __attribute__((fallthrough))
+ #else
+ #define CYTHON_FALLTHROUGH
+ #endif
+ #endif
+ #if defined(__clang__ ) && defined(__apple_build_version__)
+ #if __apple_build_version__ < 7000000
+ #undef CYTHON_FALLTHROUGH
+ #define CYTHON_FALLTHROUGH
+ #endif
+ #endif
+#endif
+
+#ifndef __cplusplus
+ #error "Cython files generated with the C++ option must be compiled with a C++ compiler."
+#endif
+#ifndef CYTHON_INLINE
+ #if defined(__clang__)
+ #define CYTHON_INLINE __inline__ __attribute__ ((__unused__))
+ #else
+ #define CYTHON_INLINE inline
+ #endif
+#endif
+template<typename T>
+void __Pyx_call_destructor(T& x) {
+ x.~T();
+}
+template<typename T>
+class __Pyx_FakeReference {
+ public:
+ __Pyx_FakeReference() : ptr(NULL) { }
+ __Pyx_FakeReference(const T& ref) : ptr(const_cast<T*>(&ref)) { }
+ T *operator->() { return ptr; }
+ T *operator&() { return ptr; }
+ operator T&() { return *ptr; }
+ template<typename U> bool operator ==(U other) { return *ptr == other; }
+ template<typename U> bool operator !=(U other) { return *ptr != other; }
+ private:
+ T *ptr;
+};
+
+#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag)
+ #define Py_OptimizeFlag 0
+#endif
+#define __PYX_BUILD_PY_SSIZE_T "n"
+#define CYTHON_FORMAT_SSIZE_T "z"
+#if PY_MAJOR_VERSION < 3
+ #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"
+ #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+ PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+ #define __Pyx_DefaultClassType PyClass_Type
+#else
+ #define __Pyx_BUILTIN_MODULE_NAME "builtins"
+ #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+ PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+ #define __Pyx_DefaultClassType PyType_Type
+#endif
+#ifndef Py_TPFLAGS_CHECKTYPES
+ #define Py_TPFLAGS_CHECKTYPES 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_INDEX
+ #define Py_TPFLAGS_HAVE_INDEX 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_NEWBUFFER
+ #define Py_TPFLAGS_HAVE_NEWBUFFER 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_FINALIZE
+ #define Py_TPFLAGS_HAVE_FINALIZE 0
+#endif
+#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL)
+ #ifndef METH_FASTCALL
+ #define METH_FASTCALL 0x80
+ #endif
+ typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs);
+ typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args,
+ Py_ssize_t nargs, PyObject *kwnames);
+#else
+ #define __Pyx_PyCFunctionFast _PyCFunctionFast
+ #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords
+#endif
+#if CYTHON_FAST_PYCCALL
+#define __Pyx_PyFastCFunction_Check(func)\
+ ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS)))))
+#else
+#define __Pyx_PyFastCFunction_Check(func) 0
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc)
+ #define PyObject_Malloc(s) PyMem_Malloc(s)
+ #define PyObject_Free(p) PyMem_Free(p)
+ #define PyObject_Realloc(p) PyMem_Realloc(p)
+#endif
+#if CYTHON_COMPILING_IN_PYSTON
+ #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co)
+ #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno)
+#else
+ #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0)
+ #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno)
+#endif
+#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000
+ #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#elif PY_VERSION_HEX >= 0x03060000
+ #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet()
+#elif PY_VERSION_HEX >= 0x03000000
+ #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#else
+ #define __Pyx_PyThreadState_Current _PyThreadState_Current
+#endif
+#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT)
+#include "pythread.h"
+#define Py_tss_NEEDS_INIT 0
+typedef int Py_tss_t;
+static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) {
+ *key = PyThread_create_key();
+ return 0; // PyThread_create_key reports success always
+}
+static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) {
+ Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t));
+ *key = Py_tss_NEEDS_INIT;
+ return key;
+}
+static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) {
+ PyObject_Free(key);
+}
+static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) {
+ return *key != Py_tss_NEEDS_INIT;
+}
+static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) {
+ PyThread_delete_key(*key);
+ *key = Py_tss_NEEDS_INIT;
+}
+static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) {
+ return PyThread_set_key_value(*key, value);
+}
+static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) {
+ return PyThread_get_key_value(*key);
+}
+#endif // TSS (Thread Specific Storage) API
+#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized)
+#define __Pyx_PyDict_NewPresized(n) ((n <= 8) ? PyDict_New() : _PyDict_NewPresized(n))
+#else
+#define __Pyx_PyDict_NewPresized(n) PyDict_New()
+#endif
+#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION
+ #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y)
+ #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y)
+#else
+ #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y)
+ #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS
+#define __Pyx_PyDict_GetItemStr(dict, name) _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash)
+#else
+#define __Pyx_PyDict_GetItemStr(dict, name) PyDict_GetItem(dict, name)
+#endif
+#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND)
+ #define CYTHON_PEP393_ENABLED 1
+ #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\
+ 0 : _PyUnicode_Ready((PyObject *)(op)))
+ #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u)
+ #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i)
+ #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) PyUnicode_MAX_CHAR_VALUE(u)
+ #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u)
+ #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u)
+ #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i)
+ #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch)
+ #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
+#else
+ #define CYTHON_PEP393_ENABLED 0
+ #define PyUnicode_1BYTE_KIND 1
+ #define PyUnicode_2BYTE_KIND 2
+ #define PyUnicode_4BYTE_KIND 4
+ #define __Pyx_PyUnicode_READY(op) (0)
+ #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u)
+ #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i]))
+ #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) ((sizeof(Py_UNICODE) == 2) ? 65535 : 1114111)
+ #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE))
+ #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u))
+ #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i]))
+ #define __Pyx_PyUnicode_WRITE(k, d, i, ch) (((void)(k)), ((Py_UNICODE*)d)[i] = ch)
+ #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_SIZE(u))
+#endif
+#if CYTHON_COMPILING_IN_PYPY
+ #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b)
+ #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b)
+#else
+ #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b)
+ #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\
+ PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b))
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains)
+ #define PyUnicode_Contains(u, s) PySequence_Contains(u, s)
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check)
+ #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type)
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format)
+ #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt)
+#endif
+#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b))
+#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b))
+#if PY_MAJOR_VERSION >= 3
+ #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b)
+#else
+ #define __Pyx_PyString_Format(a, b) PyString_Format(a, b)
+#endif
+#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII)
+ #define PyObject_ASCII(o) PyObject_Repr(o)
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define PyBaseString_Type PyUnicode_Type
+ #define PyStringObject PyUnicodeObject
+ #define PyString_Type PyUnicode_Type
+ #define PyString_Check PyUnicode_Check
+ #define PyString_CheckExact PyUnicode_CheckExact
+ #define PyObject_Unicode PyObject_Str
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj)
+ #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj)
+#else
+ #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj))
+ #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj))
+#endif
+#ifndef PySet_CheckExact
+ #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type)
+#endif
+#if CYTHON_ASSUME_SAFE_MACROS
+ #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq)
+#else
+ #define __Pyx_PySequence_SIZE(seq) PySequence_Size(seq)
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define PyIntObject PyLongObject
+ #define PyInt_Type PyLong_Type
+ #define PyInt_Check(op) PyLong_Check(op)
+ #define PyInt_CheckExact(op) PyLong_CheckExact(op)
+ #define PyInt_FromString PyLong_FromString
+ #define PyInt_FromUnicode PyLong_FromUnicode
+ #define PyInt_FromLong PyLong_FromLong
+ #define PyInt_FromSize_t PyLong_FromSize_t
+ #define PyInt_FromSsize_t PyLong_FromSsize_t
+ #define PyInt_AsLong PyLong_AsLong
+ #define PyInt_AS_LONG PyLong_AS_LONG
+ #define PyInt_AsSsize_t PyLong_AsSsize_t
+ #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask
+ #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
+ #define PyNumber_Int PyNumber_Long
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define PyBoolObject PyLongObject
+#endif
+#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY
+ #ifndef PyUnicode_InternFromString
+ #define PyUnicode_InternFromString(s) PyUnicode_FromString(s)
+ #endif
+#endif
+#if PY_VERSION_HEX < 0x030200A4
+ typedef long Py_hash_t;
+ #define __Pyx_PyInt_FromHash_t PyInt_FromLong
+ #define __Pyx_PyInt_AsHash_t PyInt_AsLong
+#else
+ #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t
+ #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define __Pyx_PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : (Py_INCREF(func), func))
+#else
+ #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass)
+#endif
+#if CYTHON_USE_ASYNC_SLOTS
+ #if PY_VERSION_HEX >= 0x030500B1
+ #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods
+ #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async)
+ #else
+ #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved))
+ #endif
+#else
+ #define __Pyx_PyType_AsAsync(obj) NULL
+#endif
+#ifndef __Pyx_PyAsyncMethodsStruct
+ typedef struct {
+ unaryfunc am_await;
+ unaryfunc am_aiter;
+ unaryfunc am_anext;
+ } __Pyx_PyAsyncMethodsStruct;
+#endif
+
+#if defined(WIN32) || defined(MS_WINDOWS)
+ #define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+#ifdef NAN
+#define __PYX_NAN() ((float) NAN)
+#else
+static CYTHON_INLINE float __PYX_NAN() {
+ float value;
+ memset(&value, 0xFF, sizeof(value));
+ return value;
+}
+#endif
+#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL)
+#define __Pyx_truncl trunc
+#else
+#define __Pyx_truncl truncl
+#endif
+
+
+#define __PYX_ERR(f_index, lineno, Ln_error) \
+{ \
+ __pyx_filename = __pyx_f[f_index]; __pyx_lineno = lineno; __pyx_clineno = __LINE__; goto Ln_error; \
+}
+
+#ifndef __PYX_EXTERN_C
+ #ifdef __cplusplus
+ #define __PYX_EXTERN_C extern "C"
+ #else
+ #define __PYX_EXTERN_C extern
+ #endif
+#endif
+
+#define __PYX_HAVE__gensim__models__doc2vec_corpusfile
+#define __PYX_HAVE_API__gensim__models__doc2vec_corpusfile
+/* Early includes */
+#include <string.h>
+#include <stdio.h>
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "ios"
+#include "new"
+#include "stdexcept"
+#include "typeinfo"
+#include <string>
+#include <vector>
+#include "voidptr.h"
+#include <utility>
+#include <unordered_map>
+#include "fast_line_sentence.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif /* _OPENMP */
+
+#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS)
+#define CYTHON_WITHOUT_ASSERTIONS
+#endif
+
+typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding;
+ const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry;
+
+#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0
+#define __PYX_DEFAULT_STRING_ENCODING ""
+#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString
+#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#define __Pyx_uchar_cast(c) ((unsigned char)c)
+#define __Pyx_long_cast(x) ((long)x)
+#define __Pyx_fits_Py_ssize_t(v, type, is_signed) (\
+ (sizeof(type) < sizeof(Py_ssize_t)) ||\
+ (sizeof(type) > sizeof(Py_ssize_t) &&\
+ likely(v < (type)PY_SSIZE_T_MAX ||\
+ v == (type)PY_SSIZE_T_MAX) &&\
+ (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\
+ v == (type)PY_SSIZE_T_MIN))) ||\
+ (sizeof(type) == sizeof(Py_ssize_t) &&\
+ (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\
+ v == (type)PY_SSIZE_T_MAX))) )
+#if defined (__cplusplus) && __cplusplus >= 201103L
+ #include <cstdlib>
+ #define __Pyx_sst_abs(value) std::abs(value)
+#elif SIZEOF_INT >= SIZEOF_SIZE_T
+ #define __Pyx_sst_abs(value) abs(value)
+#elif SIZEOF_LONG >= SIZEOF_SIZE_T
+ #define __Pyx_sst_abs(value) labs(value)
+#elif defined (_MSC_VER)
+ #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value))
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ #define __Pyx_sst_abs(value) llabs(value)
+#elif defined (__GNUC__)
+ #define __Pyx_sst_abs(value) __builtin_llabs(value)
+#else
+ #define __Pyx_sst_abs(value) ((value<0) ? -value : value)
+#endif
+static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*);
+static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length);
+#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s))
+#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l)
+#define __Pyx_PyBytes_FromString PyBytes_FromString
+#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*);
+#if PY_MAJOR_VERSION < 3
+ #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString
+ #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#else
+ #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString
+ #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize
+#endif
+#define __Pyx_PyBytes_AsWritableString(s) ((char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableSString(s) ((signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableUString(s) ((unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsString(s) ((const char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsSString(s) ((const signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsUString(s) ((const unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyObject_AsWritableString(s) ((char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsWritableSString(s) ((signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsWritableUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsSString(s) ((const signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsUString(s) ((const unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_FromCString(s) __Pyx_PyObject_FromString((const char*)s)
+#define __Pyx_PyBytes_FromCString(s) __Pyx_PyBytes_FromString((const char*)s)
+#define __Pyx_PyByteArray_FromCString(s) __Pyx_PyByteArray_FromString((const char*)s)
+#define __Pyx_PyStr_FromCString(s) __Pyx_PyStr_FromString((const char*)s)
+#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s)
+static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) {
+ const Py_UNICODE *u_end = u;
+ while (*u_end++) ;
+ return (size_t)(u_end - u - 1);
+}
+#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
+#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
+#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode
+#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj)
+#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None)
+static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b);
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x);
+#define __Pyx_PySequence_Tuple(obj)\
+ (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj))
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
+#if CYTHON_ASSUME_SAFE_MACROS
+#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x))
+#else
+#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x)
+#endif
+#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x))
+#if PY_MAJOR_VERSION >= 3
+#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x))
+#else
+#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x))
+#endif
+#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Float(x))
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+static int __Pyx_sys_getdefaultencoding_not_ascii;
+static int __Pyx_init_sys_getdefaultencoding_params(void) {
+ PyObject* sys;
+ PyObject* default_encoding = NULL;
+ PyObject* ascii_chars_u = NULL;
+ PyObject* ascii_chars_b = NULL;
+ const char* default_encoding_c;
+ sys = PyImport_ImportModule("sys");
+ if (!sys) goto bad;
+ default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL);
+ Py_DECREF(sys);
+ if (!default_encoding) goto bad;
+ default_encoding_c = PyBytes_AsString(default_encoding);
+ if (!default_encoding_c) goto bad;
+ if (strcmp(default_encoding_c, "ascii") == 0) {
+ __Pyx_sys_getdefaultencoding_not_ascii = 0;
+ } else {
+ char ascii_chars[128];
+ int c;
+ for (c = 0; c < 128; c++) {
+ ascii_chars[c] = c;
+ }
+ __Pyx_sys_getdefaultencoding_not_ascii = 1;
+ ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL);
+ if (!ascii_chars_u) goto bad;
+ ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL);
+ if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) {
+ PyErr_Format(
+ PyExc_ValueError,
+ "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.",
+ default_encoding_c);
+ goto bad;
+ }
+ Py_DECREF(ascii_chars_u);
+ Py_DECREF(ascii_chars_b);
+ }
+ Py_DECREF(default_encoding);
+ return 0;
+bad:
+ Py_XDECREF(default_encoding);
+ Py_XDECREF(ascii_chars_u);
+ Py_XDECREF(ascii_chars_b);
+ return -1;
+}
+#endif
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL)
+#else
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL)
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+static char* __PYX_DEFAULT_STRING_ENCODING;
+static int __Pyx_init_sys_getdefaultencoding_params(void) {
+ PyObject* sys;
+ PyObject* default_encoding = NULL;
+ char* default_encoding_c;
+ sys = PyImport_ImportModule("sys");
+ if (!sys) goto bad;
+ default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+ Py_DECREF(sys);
+ if (!default_encoding) goto bad;
+ default_encoding_c = PyBytes_AsString(default_encoding);
+ if (!default_encoding_c) goto bad;
+ __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c));
+ if (!__PYX_DEFAULT_STRING_ENCODING) goto bad;
+ strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c);
+ Py_DECREF(default_encoding);
+ return 0;
+bad:
+ Py_XDECREF(default_encoding);
+ return -1;
+}
+#endif
+#endif
+
+
+/* Test for GCC > 2.95 */
+#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))
+ #define likely(x) __builtin_expect(!!(x), 1)
+ #define unlikely(x) __builtin_expect(!!(x), 0)
+#else /* !__GNUC__ or GCC < 2.95 */
+ #define likely(x) (x)
+ #define unlikely(x) (x)
+#endif /* __GNUC__ */
+static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; }
+
+static PyObject *__pyx_m = NULL;
+static PyObject *__pyx_d;
+static PyObject *__pyx_b;
+static PyObject *__pyx_cython_runtime = NULL;
+static PyObject *__pyx_empty_tuple;
+static PyObject *__pyx_empty_bytes;
+static PyObject *__pyx_empty_unicode;
+static int __pyx_lineno;
+static int __pyx_clineno = 0;
+static const char * __pyx_cfilenm= __FILE__;
+static const char *__pyx_filename;
+
+/* Header.proto */
+#if !defined(CYTHON_CCOMPLEX)
+ #if defined(__cplusplus)
+ #define CYTHON_CCOMPLEX 1
+ #elif defined(_Complex_I)
+ #define CYTHON_CCOMPLEX 1
+ #else
+ #define CYTHON_CCOMPLEX 0
+ #endif
+#endif
+#if CYTHON_CCOMPLEX
+ #ifdef __cplusplus
+ #include <complex>
+ #else
+ #include <complex.h>
+ #endif
+#endif
+#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__)
+ #undef _Complex_I
+ #define _Complex_I 1.0fj
+#endif
+
+
+static const char *__pyx_f[] = {
+ "gensim/models/doc2vec_corpusfile.pyx",
+ "__init__.pxd",
+ "type.pxd",
+ "gensim/models/word2vec_corpusfile.pxd",
+};
+/* NoFastGil.proto */
+#define __Pyx_PyGILState_Ensure PyGILState_Ensure
+#define __Pyx_PyGILState_Release PyGILState_Release
+#define __Pyx_FastGIL_Remember()
+#define __Pyx_FastGIL_Forget()
+#define __Pyx_FastGilFuncInit()
+
+/* ForceInitThreads.proto */
+#ifndef __PYX_FORCE_INIT_THREADS
+ #define __PYX_FORCE_INIT_THREADS 0
+#endif
+
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":730
+ * # in Cython to enable them only on the right systems.
+ *
+ * ctypedef npy_int8 int8_t # <<<<<<<<<<<<<<
+ * ctypedef npy_int16 int16_t
+ * ctypedef npy_int32 int32_t
+ */
+typedef npy_int8 __pyx_t_5numpy_int8_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":731
+ *
+ * ctypedef npy_int8 int8_t
+ * ctypedef npy_int16 int16_t # <<<<<<<<<<<<<<
+ * ctypedef npy_int32 int32_t
+ * ctypedef npy_int64 int64_t
+ */
+typedef npy_int16 __pyx_t_5numpy_int16_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":732
+ * ctypedef npy_int8 int8_t
+ * ctypedef npy_int16 int16_t
+ * ctypedef npy_int32 int32_t # <<<<<<<<<<<<<<
+ * ctypedef npy_int64 int64_t
+ * #ctypedef npy_int96 int96_t
+ */
+typedef npy_int32 __pyx_t_5numpy_int32_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":733
+ * ctypedef npy_int16 int16_t
+ * ctypedef npy_int32 int32_t
+ * ctypedef npy_int64 int64_t # <<<<<<<<<<<<<<
+ * #ctypedef npy_int96 int96_t
+ * #ctypedef npy_int128 int128_t
+ */
+typedef npy_int64 __pyx_t_5numpy_int64_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":737
+ * #ctypedef npy_int128 int128_t
+ *
+ * ctypedef npy_uint8 uint8_t # <<<<<<<<<<<<<<
+ * ctypedef npy_uint16 uint16_t
+ * ctypedef npy_uint32 uint32_t
+ */
+typedef npy_uint8 __pyx_t_5numpy_uint8_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":738
+ *
+ * ctypedef npy_uint8 uint8_t
+ * ctypedef npy_uint16 uint16_t # <<<<<<<<<<<<<<
+ * ctypedef npy_uint32 uint32_t
+ * ctypedef npy_uint64 uint64_t
+ */
+typedef npy_uint16 __pyx_t_5numpy_uint16_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":739
+ * ctypedef npy_uint8 uint8_t
+ * ctypedef npy_uint16 uint16_t
+ * ctypedef npy_uint32 uint32_t # <<<<<<<<<<<<<<
+ * ctypedef npy_uint64 uint64_t
+ * #ctypedef npy_uint96 uint96_t
+ */
+typedef npy_uint32 __pyx_t_5numpy_uint32_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":740
+ * ctypedef npy_uint16 uint16_t
+ * ctypedef npy_uint32 uint32_t
+ * ctypedef npy_uint64 uint64_t # <<<<<<<<<<<<<<
+ * #ctypedef npy_uint96 uint96_t
+ * #ctypedef npy_uint128 uint128_t
+ */
+typedef npy_uint64 __pyx_t_5numpy_uint64_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":744
+ * #ctypedef npy_uint128 uint128_t
+ *
+ * ctypedef npy_float32 float32_t # <<<<<<<<<<<<<<
+ * ctypedef npy_float64 float64_t
+ * #ctypedef npy_float80 float80_t
+ */
+typedef npy_float32 __pyx_t_5numpy_float32_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":745
+ *
+ * ctypedef npy_float32 float32_t
+ * ctypedef npy_float64 float64_t # <<<<<<<<<<<<<<
+ * #ctypedef npy_float80 float80_t
+ * #ctypedef npy_float128 float128_t
+ */
+typedef npy_float64 __pyx_t_5numpy_float64_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":754
+ * # The int types are mapped a bit surprising --
+ * # numpy.int corresponds to 'l' and numpy.long to 'q'
+ * ctypedef npy_long int_t # <<<<<<<<<<<<<<
+ * ctypedef npy_longlong long_t
+ * ctypedef npy_longlong longlong_t
+ */
+typedef npy_long __pyx_t_5numpy_int_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":755
+ * # numpy.int corresponds to 'l' and numpy.long to 'q'
+ * ctypedef npy_long int_t
+ * ctypedef npy_longlong long_t # <<<<<<<<<<<<<<
+ * ctypedef npy_longlong longlong_t
+ *
+ */
+typedef npy_longlong __pyx_t_5numpy_long_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":756
+ * ctypedef npy_long int_t
+ * ctypedef npy_longlong long_t
+ * ctypedef npy_longlong longlong_t # <<<<<<<<<<<<<<
+ *
+ * ctypedef npy_ulong uint_t
+ */
+typedef npy_longlong __pyx_t_5numpy_longlong_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":758
+ * ctypedef npy_longlong longlong_t
+ *
+ * ctypedef npy_ulong uint_t # <<<<<<<<<<<<<<
+ * ctypedef npy_ulonglong ulong_t
+ * ctypedef npy_ulonglong ulonglong_t
+ */
+typedef npy_ulong __pyx_t_5numpy_uint_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":759
+ *
+ * ctypedef npy_ulong uint_t
+ * ctypedef npy_ulonglong ulong_t # <<<<<<<<<<<<<<
+ * ctypedef npy_ulonglong ulonglong_t
+ *
+ */
+typedef npy_ulonglong __pyx_t_5numpy_ulong_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":760
+ * ctypedef npy_ulong uint_t
+ * ctypedef npy_ulonglong ulong_t
+ * ctypedef npy_ulonglong ulonglong_t # <<<<<<<<<<<<<<
+ *
+ * ctypedef npy_intp intp_t
+ */
+typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":762
+ * ctypedef npy_ulonglong ulonglong_t
+ *
+ * ctypedef npy_intp intp_t # <<<<<<<<<<<<<<
+ * ctypedef npy_uintp uintp_t
+ *
+ */
+typedef npy_intp __pyx_t_5numpy_intp_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":763
+ *
+ * ctypedef npy_intp intp_t
+ * ctypedef npy_uintp uintp_t # <<<<<<<<<<<<<<
+ *
+ * ctypedef npy_double float_t
+ */
+typedef npy_uintp __pyx_t_5numpy_uintp_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":765
+ * ctypedef npy_uintp uintp_t
+ *
+ * ctypedef npy_double float_t # <<<<<<<<<<<<<<
+ * ctypedef npy_double double_t
+ * ctypedef npy_longdouble longdouble_t
+ */
+typedef npy_double __pyx_t_5numpy_float_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":766
+ *
+ * ctypedef npy_double float_t
+ * ctypedef npy_double double_t # <<<<<<<<<<<<<<
+ * ctypedef npy_longdouble longdouble_t
+ *
+ */
+typedef npy_double __pyx_t_5numpy_double_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":767
+ * ctypedef npy_double float_t
+ * ctypedef npy_double double_t
+ * ctypedef npy_longdouble longdouble_t # <<<<<<<<<<<<<<
+ *
+ * ctypedef npy_cfloat cfloat_t
+ */
+typedef npy_longdouble __pyx_t_5numpy_longdouble_t;
+
+/* "word2vec_inner.pxd":19
+ * void* PyCObject_AsVoidPtr(object obj)
+ *
+ * ctypedef np.float32_t REAL_t # <<<<<<<<<<<<<<
+ *
+ * # BLAS routine signatures
+ */
+typedef __pyx_t_5numpy_float32_t __pyx_t_6gensim_6models_14word2vec_inner_REAL_t;
+
+/* "gensim/models/word2vec_corpusfile.pxd":21
+ * cimport numpy as np
+ *
+ * ctypedef np.float32_t REAL_t # <<<<<<<<<<<<<<
+ *
+ *
+ */
+typedef __pyx_t_5numpy_float32_t __pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t;
+/* Declarations.proto */
+#if CYTHON_CCOMPLEX
+ #ifdef __cplusplus
+ typedef ::std::complex< float > __pyx_t_float_complex;
+ #else
+ typedef float _Complex __pyx_t_float_complex;
+ #endif
+#else
+ typedef struct { float real, imag; } __pyx_t_float_complex;
+#endif
+static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float);
+
+/* Declarations.proto */
+#if CYTHON_CCOMPLEX
+ #ifdef __cplusplus
+ typedef ::std::complex< double > __pyx_t_double_complex;
+ #else
+ typedef double _Complex __pyx_t_double_complex;
+ #endif
+#else
+ typedef struct { double real, imag; } __pyx_t_double_complex;
+#endif
+static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double);
+
+
+/*--- Type declarations ---*/
+struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence;
+struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonVocab;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":769
+ * ctypedef npy_longdouble longdouble_t
+ *
+ * ctypedef npy_cfloat cfloat_t # <<<<<<<<<<<<<<
+ * ctypedef npy_cdouble cdouble_t
+ * ctypedef npy_clongdouble clongdouble_t
+ */
+typedef npy_cfloat __pyx_t_5numpy_cfloat_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":770
+ *
+ * ctypedef npy_cfloat cfloat_t
+ * ctypedef npy_cdouble cdouble_t # <<<<<<<<<<<<<<
+ * ctypedef npy_clongdouble clongdouble_t
+ *
+ */
+typedef npy_cdouble __pyx_t_5numpy_cdouble_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":771
+ * ctypedef npy_cfloat cfloat_t
+ * ctypedef npy_cdouble cdouble_t
+ * ctypedef npy_clongdouble clongdouble_t # <<<<<<<<<<<<<<
+ *
+ * ctypedef npy_cdouble complex_t
+ */
+typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t;
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":773
+ * ctypedef npy_clongdouble clongdouble_t
+ *
+ * ctypedef npy_cdouble complex_t # <<<<<<<<<<<<<<
+ *
+ * cdef inline object PyArray_MultiIterNew1(a):
+ */
+typedef npy_cdouble __pyx_t_5numpy_complex_t;
+struct __pyx_t_6gensim_6models_14word2vec_inner_Word2VecConfig;
+struct __pyx_opt_args_6gensim_6models_14word2vec_inner_init_w2v_config;
+
+/* "word2vec_inner.pxd":22
+ *
+ * # BLAS routine signatures
+ * ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
+ * ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
+ * ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ */
+typedef void (*__pyx_t_6gensim_6models_14word2vec_inner_scopy_ptr)(int const *, float const *, int const *, float *, int const *);
+
+/* "word2vec_inner.pxd":23
+ * # BLAS routine signatures
+ * ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil
+ * ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
+ * ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ * ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ */
+typedef void (*__pyx_t_6gensim_6models_14word2vec_inner_saxpy_ptr)(int const *, float const *, float const *, int const *, float *, int const *);
+
+/* "word2vec_inner.pxd":24
+ * ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil
+ * ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
+ * ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
+ * ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ * ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil
+ */
+typedef float (*__pyx_t_6gensim_6models_14word2vec_inner_sdot_ptr)(int const *, float const *, int const *, float const *, int const *);
+
+/* "word2vec_inner.pxd":25
+ * ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
+ * ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ * ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
+ * ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil
+ * ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
+ */
+typedef double (*__pyx_t_6gensim_6models_14word2vec_inner_dsdot_ptr)(int const *, float const *, int const *, float const *, int const *);
+
+/* "word2vec_inner.pxd":26
+ * ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ * ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ * ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil # <<<<<<<<<<<<<<
+ * ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
+ *
+ */
+typedef double (*__pyx_t_6gensim_6models_14word2vec_inner_snrm2_ptr)(int const *, float const *, int const *);
+
+/* "word2vec_inner.pxd":27
+ * ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ * ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil
+ * ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil # <<<<<<<<<<<<<<
+ *
+ * cdef scopy_ptr scopy
+ */
+typedef void (*__pyx_t_6gensim_6models_14word2vec_inner_sscal_ptr)(int const *, float const *, float const *, int const *);
+
+/* "word2vec_inner.pxd":44
+ *
+ * # function implementations swapped based on BLAS detected in word2vec_inner.pyx init()
+ * ctypedef REAL_t (*our_dot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
+ * ctypedef void (*our_saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
+ *
+ */
+typedef __pyx_t_6gensim_6models_14word2vec_inner_REAL_t (*__pyx_t_6gensim_6models_14word2vec_inner_our_dot_ptr)(int const *, float const *, int const *, float const *, int const *);
+
+/* "word2vec_inner.pxd":45
+ * # function implementations swapped based on BLAS detected in word2vec_inner.pyx init()
+ * ctypedef REAL_t (*our_dot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ * ctypedef void (*our_saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
+ *
+ * cdef our_dot_ptr our_dot
+ */
+typedef void (*__pyx_t_6gensim_6models_14word2vec_inner_our_saxpy_ptr)(int const *, float const *, float const *, int const *, float *, int const *);
+
+/* "word2vec_inner.pxd":51
+ *
+ *
+ * cdef struct Word2VecConfig: # <<<<<<<<<<<<<<
+ * int hs, negative, sample, compute_loss, size, window, cbow_mean, workers
+ * REAL_t running_training_loss, alpha
+ */
+struct __pyx_t_6gensim_6models_14word2vec_inner_Word2VecConfig {
+ int hs;
+ int negative;
+ int sample;
+ int compute_loss;
+ int size;
+ int window;
+ int cbow_mean;
+ int workers;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t running_training_loss;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t alpha;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *syn0;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *word_locks;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *work;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *neu1;
+ int codelens[0x2710];
+ __pyx_t_5numpy_uint32_t indexes[0x2710];
+ __pyx_t_5numpy_uint32_t reduced_windows[0x2710];
+ int sentence_idx[(0x2710 + 1)];
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *syn1;
+ __pyx_t_5numpy_uint32_t *points[0x2710];
+ __pyx_t_5numpy_uint8_t *codes[0x2710];
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *syn1neg;
+ __pyx_t_5numpy_uint32_t *cum_table;
+ unsigned PY_LONG_LONG cum_table_len;
+ unsigned PY_LONG_LONG next_random;
+};
+
+/* "word2vec_inner.pxd":125
+ *
+ *
+ * cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1=*) # <<<<<<<<<<<<<<
+ */
+struct __pyx_opt_args_6gensim_6models_14word2vec_inner_init_w2v_config {
+ int __pyx_n;
+ PyObject *_neu1;
+};
+struct __pyx_t_6gensim_6models_13doc2vec_inner_Doc2VecConfig;
+struct __pyx_opt_args_6gensim_6models_13doc2vec_inner_init_d2v_config;
+
+/* "gensim/models/doc2vec_inner.pxd":23
+ *
+ *
+ * cdef struct Doc2VecConfig: # <<<<<<<<<<<<<<
+ * int hs, negative, sample, learn_doctags, learn_words, learn_hidden, train_words, cbow_mean
+ * int document_len, doctag_len, window, expected_doctag_len, null_word_index, workers, docvecs_count
+ */
+struct __pyx_t_6gensim_6models_13doc2vec_inner_Doc2VecConfig {
+ int hs;
+ int negative;
+ int sample;
+ int learn_doctags;
+ int learn_words;
+ int learn_hidden;
+ int train_words;
+ int cbow_mean;
+ int document_len;
+ int doctag_len;
+ int window;
+ int expected_doctag_len;
+ int null_word_index;
+ int workers;
+ int docvecs_count;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *word_vectors;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *doctag_vectors;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *word_locks;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *doctag_locks;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *work;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *neu1;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t alpha;
+ int layer1_size;
+ int vector_size;
+ int codelens[0x2710];
+ __pyx_t_5numpy_uint32_t indexes[0x2710];
+ __pyx_t_5numpy_uint32_t doctag_indexes[0x2710];
+ __pyx_t_5numpy_uint32_t window_indexes[0x2710];
+ __pyx_t_5numpy_uint32_t reduced_windows[0x2710];
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *syn1;
+ __pyx_t_5numpy_uint32_t *points[0x2710];
+ __pyx_t_5numpy_uint8_t *codes[0x2710];
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *syn1neg;
+ __pyx_t_5numpy_uint32_t *cum_table;
+ unsigned PY_LONG_LONG cum_table_len;
+ unsigned PY_LONG_LONG next_random;
+};
+
+/* "gensim/models/doc2vec_inner.pxd":91
+ *
+ *
+ * cdef init_d2v_config(Doc2VecConfig *c, model, alpha, learn_doctags, learn_words, learn_hidden, train_words=*, work=*, # <<<<<<<<<<<<<<
+ * neu1=*, word_vectors=*, word_locks=*, doctag_vectors=*, doctag_locks=*, docvecs_count=*)
+ */
+struct __pyx_opt_args_6gensim_6models_13doc2vec_inner_init_d2v_config {
+ int __pyx_n;
+ PyObject *train_words;
+ PyObject *work;
+ PyObject *neu1;
+ PyObject *word_vectors;
+ PyObject *word_locks;
+ PyObject *doctag_vectors;
+ PyObject *doctag_locks;
+ PyObject *docvecs_count;
+};
+struct __pyx_t_6gensim_6models_19word2vec_corpusfile_VocabItem;
+
+/* "gensim/models/word2vec_corpusfile.pxd":47
+ *
+ *
+ * cdef struct VocabItem: # <<<<<<<<<<<<<<
+ * long long sample_int
+ * np.uint32_t index
+ */
+struct __pyx_t_6gensim_6models_19word2vec_corpusfile_VocabItem {
+ PY_LONG_LONG sample_int;
+ __pyx_t_5numpy_uint32_t index;
+ __pyx_t_5numpy_uint8_t *code;
+ int code_len;
+ __pyx_t_5numpy_uint32_t *point;
+ int subword_idx_len;
+ __pyx_t_5numpy_uint32_t *subword_idx;
+};
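+
+/* VocabItem is the C-level mirror of a vocabulary entry used during corpus-file training:
+   the downsampling threshold (sample_int), the word's index, its hierarchical-softmax
+   code/point arrays, and subword indexes (subword_idx), the latter presumably populated
+   only on the FastText path. */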
+
+/* "gensim/models/word2vec_corpusfile.pxd":59
+ *
+ *
+ * ctypedef unordered_map[string, VocabItem] cvocab_t # <<<<<<<<<<<<<<
+ *
+ * cdef class CythonVocab:
+ */
+typedef std::unordered_map<std::string,struct __pyx_t_6gensim_6models_19word2vec_corpusfile_VocabItem>  __pyx_t_6gensim_6models_19word2vec_corpusfile_cvocab_t;
+
+/* "gensim/models/word2vec_corpusfile.pxd":33
+ *
+ *
+ * cdef class CythonLineSentence: # <<<<<<<<<<<<<<
+ * cdef FastLineSentence* _thisptr
+ * cdef public bytes source
+ */
+struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence {
+ PyObject_HEAD
+ struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *__pyx_vtab;
+ FastLineSentence *_thisptr;
+ PyObject *source;
+ size_t max_sentence_length;
+ size_t max_words_in_batch;
+ size_t offset;
+  std::vector<std::vector<std::string> > buf_data;
+};
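+
+/* CythonLineSentence wraps the C++ FastLineSentence reader: _thisptr drives the underlying
+   file reader, while max_sentence_length, max_words_in_batch, offset and buf_data hold the
+   chunking/buffering state used when handing batches of sentences to the training loop. */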
+
+
+/* "gensim/models/word2vec_corpusfile.pxd":61
+ * ctypedef unordered_map[string, VocabItem] cvocab_t
+ *
+ * cdef class CythonVocab: # <<<<<<<<<<<<<<
+ * cdef cvocab_t vocab
+ * cdef subword_arrays
+ */
+struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonVocab {
+ PyObject_HEAD
+ struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonVocab *__pyx_vtab;
+ __pyx_t_6gensim_6models_19word2vec_corpusfile_cvocab_t vocab;
+ PyObject *subword_arrays;
+};
+
+
+
+/* "gensim/models/word2vec_corpusfile.pxd":33
+ *
+ *
+ * cdef class CythonLineSentence: # <<<<<<<<<<<<<<
+ * cdef FastLineSentence* _thisptr
+ * cdef public bytes source
+ */
+
+struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence {
+ bool (*is_eof)(struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *, int __pyx_skip_dispatch);
+  std::vector<std::string> (*read_sentence)(struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *, int __pyx_skip_dispatch);
+  std::vector<std::vector<std::string> > (*_read_chunked_sentence)(struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *, int __pyx_skip_dispatch);
+  std::vector<std::vector<std::string> > (*_chunk_sentence)(struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *, std::vector<std::string> , int __pyx_skip_dispatch);
+ void (*reset)(struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *, int __pyx_skip_dispatch);
+  std::vector<std::vector<std::string> > (*next_batch)(struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *, int __pyx_skip_dispatch);
+};
+static struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *__pyx_vtabptr_6gensim_6models_19word2vec_corpusfile_CythonLineSentence;
+
+
+/* "gensim/models/word2vec_corpusfile.pxd":61
+ * ctypedef unordered_map[string, VocabItem] cvocab_t
+ *
+ * cdef class CythonVocab: # <<<<<<<<<<<<<<
+ * cdef cvocab_t vocab
+ * cdef subword_arrays
+ */
+
+struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonVocab {
+ __pyx_t_6gensim_6models_19word2vec_corpusfile_cvocab_t *(*get_vocab_ptr)(struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonVocab *);
+};
+static struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonVocab *__pyx_vtabptr_6gensim_6models_19word2vec_corpusfile_CythonVocab;
+
+/* --- Runtime support code (head) --- */
+/* Refnanny.proto */
+#ifndef CYTHON_REFNANNY
+ #define CYTHON_REFNANNY 0
+#endif
+#if CYTHON_REFNANNY
+ typedef struct {
+ void (*INCREF)(void*, PyObject*, int);
+ void (*DECREF)(void*, PyObject*, int);
+ void (*GOTREF)(void*, PyObject*, int);
+ void (*GIVEREF)(void*, PyObject*, int);
+ void* (*SetupContext)(const char*, int, const char*);
+ void (*FinishContext)(void**);
+ } __Pyx_RefNannyAPIStruct;
+ static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL;
+ static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname);
+ #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL;
+#ifdef WITH_THREAD
+ #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+ if (acquire_gil) {\
+ PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\
+ __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\
+ PyGILState_Release(__pyx_gilstate_save);\
+ } else {\
+ __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\
+ }
+#else
+ #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+ __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__)
+#endif
+ #define __Pyx_RefNannyFinishContext()\
+ __Pyx_RefNanny->FinishContext(&__pyx_refnanny)
+ #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+ #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+ #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+ #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+ #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0)
+ #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0)
+ #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0)
+ #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0)
+#else
+ #define __Pyx_RefNannyDeclarations
+ #define __Pyx_RefNannySetupContext(name, acquire_gil)
+ #define __Pyx_RefNannyFinishContext()
+ #define __Pyx_INCREF(r) Py_INCREF(r)
+ #define __Pyx_DECREF(r) Py_DECREF(r)
+ #define __Pyx_GOTREF(r)
+ #define __Pyx_GIVEREF(r)
+ #define __Pyx_XINCREF(r) Py_XINCREF(r)
+ #define __Pyx_XDECREF(r) Py_XDECREF(r)
+ #define __Pyx_XGOTREF(r)
+ #define __Pyx_XGIVEREF(r)
+#endif
+#define __Pyx_XDECREF_SET(r, v) do {\
+ PyObject *tmp = (PyObject *) r;\
+ r = v; __Pyx_XDECREF(tmp);\
+ } while (0)
+#define __Pyx_DECREF_SET(r, v) do {\
+ PyObject *tmp = (PyObject *) r;\
+ r = v; __Pyx_DECREF(tmp);\
+ } while (0)
+#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0)
+#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0)
+
+/* PyObjectGetAttrStr.proto */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n)
+#endif
+
+/* GetBuiltinName.proto */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name);
+
+/* RaiseArgTupleInvalid.proto */
+static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact,
+ Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found);
+
+/* RaiseDoubleKeywords.proto */
+static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name);
+
+/* ParseKeywords.proto */
+static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\
+ PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\
+ const char* function_name);
+
+/* PyObjectCall.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw);
+#else
+#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw)
+#endif
+
+/* ExtTypeTest.proto */
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type);
+
+/* PyThreadStateGet.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyThreadState_declare PyThreadState *__pyx_tstate;
+#define __Pyx_PyThreadState_assign __pyx_tstate = __Pyx_PyThreadState_Current;
+#define __Pyx_PyErr_Occurred() __pyx_tstate->curexc_type
+#else
+#define __Pyx_PyThreadState_declare
+#define __Pyx_PyThreadState_assign
+#define __Pyx_PyErr_Occurred() PyErr_Occurred()
+#endif
+
+/* PyErrFetchRestore.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL)
+#define __Pyx_ErrRestoreWithState(type, value, tb) __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb)
+#define __Pyx_ErrFetchWithState(type, value, tb) __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb) __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb)
+#define __Pyx_ErrFetch(type, value, tb) __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
+static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL))
+#else
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#endif
+#else
+#define __Pyx_PyErr_Clear() PyErr_Clear()
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#define __Pyx_ErrRestoreWithState(type, value, tb) PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchWithState(type, value, tb) PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestoreInState(tstate, type, value, tb) PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchInState(tstate, type, value, tb) PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb) PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetch(type, value, tb) PyErr_Fetch(type, value, tb)
+#endif
+
+/* RaiseException.proto */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause);
+
+/* PyCFunctionFastCall.proto */
+#if CYTHON_FAST_PYCCALL
+static CYTHON_INLINE PyObject *__Pyx_PyCFunction_FastCall(PyObject *func, PyObject **args, Py_ssize_t nargs);
+#else
+#define __Pyx_PyCFunction_FastCall(func, args, nargs) (assert(0), NULL)
+#endif
+
+/* PyFunctionFastCall.proto */
+#if CYTHON_FAST_PYCALL
+#define __Pyx_PyFunction_FastCall(func, args, nargs)\
+ __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL)
+#if 1 || PY_VERSION_HEX < 0x030600B1
+static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs);
+#else
+#define __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs) _PyFunction_FastCallDict(func, args, nargs, kwargs)
+#endif
+#endif
+
+/* PyObjectCallMethO.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg);
+#endif
+
+/* PyObjectCallOneArg.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg);
+
+/* DictGetItem.proto */
+#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY
+static PyObject *__Pyx_PyDict_GetItem(PyObject *d, PyObject* key);
+#define __Pyx_PyObject_Dict_GetItem(obj, name)\
+ (likely(PyDict_CheckExact(obj)) ?\
+ __Pyx_PyDict_GetItem(obj, name) : PyObject_GetItem(obj, name))
+#else
+#define __Pyx_PyDict_GetItem(d, key) PyObject_GetItem(d, key)
+#define __Pyx_PyObject_Dict_GetItem(obj, name) PyObject_GetItem(obj, name)
+#endif
+
+/* RaiseTooManyValuesToUnpack.proto */
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected);
+
+/* RaiseNeedMoreValuesToUnpack.proto */
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index);
+
+/* RaiseNoneIterError.proto */
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void);
+
+/* SaveResetException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_ExceptionSave(type, value, tb) __Pyx__ExceptionSave(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#define __Pyx_ExceptionReset(type, value, tb) __Pyx__ExceptionReset(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
+#else
+#define __Pyx_ExceptionSave(type, value, tb) PyErr_GetExcInfo(type, value, tb)
+#define __Pyx_ExceptionReset(type, value, tb) PyErr_SetExcInfo(type, value, tb)
+#endif
+
+/* PyErrExceptionMatches.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err)
+static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err);
+#else
+#define __Pyx_PyErr_ExceptionMatches(err) PyErr_ExceptionMatches(err)
+#endif
+
+/* GetException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_GetException(type, value, tb) __Pyx__GetException(__pyx_tstate, type, value, tb)
+static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#else
+static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb);
+#endif
+
+/* GetVTable.proto */
+static void* __Pyx_GetVtable(PyObject *dict);
+
+/* Import.proto */
+static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level);
+
+/* ImportFrom.proto */
+static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name);
+
+/* CLineInTraceback.proto */
+#ifdef CYTHON_CLINE_IN_TRACEBACK
+#define __Pyx_CLineForTraceback(tstate, c_line) (((CYTHON_CLINE_IN_TRACEBACK)) ? c_line : 0)
+#else
+static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line);
+#endif
+
+/* CodeObjectCache.proto */
+typedef struct {
+ PyCodeObject* code_object;
+ int code_line;
+} __Pyx_CodeObjectCacheEntry;
+struct __Pyx_CodeObjectCache {
+ int count;
+ int max_count;
+ __Pyx_CodeObjectCacheEntry* entries;
+};
+static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL};
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line);
+static PyCodeObject *__pyx_find_code_object(int code_line);
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object);
+
+/* AddTraceback.proto */
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+ int py_line, const char *filename);
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value);
+
+/* None.proto */
+static CYTHON_INLINE int __Pyx_ErrOccurredWithGIL(void); /* proto */
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value);
+
+/* None.proto */
+#include
+
+/* RealImag.proto */
+#if CYTHON_CCOMPLEX
+ #ifdef __cplusplus
+ #define __Pyx_CREAL(z) ((z).real())
+ #define __Pyx_CIMAG(z) ((z).imag())
+ #else
+ #define __Pyx_CREAL(z) (__real__(z))
+ #define __Pyx_CIMAG(z) (__imag__(z))
+ #endif
+#else
+ #define __Pyx_CREAL(z) ((z).real)
+ #define __Pyx_CIMAG(z) ((z).imag)
+#endif
+#if defined(__cplusplus) && CYTHON_CCOMPLEX\
+ && (defined(_WIN32) || defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )) || __cplusplus >= 201103)
+ #define __Pyx_SET_CREAL(z,x) ((z).real(x))
+ #define __Pyx_SET_CIMAG(z,y) ((z).imag(y))
+#else
+ #define __Pyx_SET_CREAL(z,x) __Pyx_CREAL(z) = (x)
+ #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y)
+#endif
+
+/* Arithmetic.proto */
+#if CYTHON_CCOMPLEX
+ #define __Pyx_c_eq_float(a, b) ((a)==(b))
+ #define __Pyx_c_sum_float(a, b) ((a)+(b))
+ #define __Pyx_c_diff_float(a, b) ((a)-(b))
+ #define __Pyx_c_prod_float(a, b) ((a)*(b))
+ #define __Pyx_c_quot_float(a, b) ((a)/(b))
+ #define __Pyx_c_neg_float(a) (-(a))
+ #ifdef __cplusplus
+ #define __Pyx_c_is_zero_float(z) ((z)==(float)0)
+ #define __Pyx_c_conj_float(z) (::std::conj(z))
+ #if 1
+ #define __Pyx_c_abs_float(z) (::std::abs(z))
+ #define __Pyx_c_pow_float(a, b) (::std::pow(a, b))
+ #endif
+ #else
+ #define __Pyx_c_is_zero_float(z) ((z)==0)
+ #define __Pyx_c_conj_float(z) (conjf(z))
+ #if 1
+ #define __Pyx_c_abs_float(z) (cabsf(z))
+ #define __Pyx_c_pow_float(a, b) (cpowf(a, b))
+ #endif
+ #endif
+#else
+ static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex);
+ static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex);
+ #if 1
+ static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ #endif
+#endif
+
+/* Arithmetic.proto */
+#if CYTHON_CCOMPLEX
+ #define __Pyx_c_eq_double(a, b) ((a)==(b))
+ #define __Pyx_c_sum_double(a, b) ((a)+(b))
+ #define __Pyx_c_diff_double(a, b) ((a)-(b))
+ #define __Pyx_c_prod_double(a, b) ((a)*(b))
+ #define __Pyx_c_quot_double(a, b) ((a)/(b))
+ #define __Pyx_c_neg_double(a) (-(a))
+ #ifdef __cplusplus
+ #define __Pyx_c_is_zero_double(z) ((z)==(double)0)
+ #define __Pyx_c_conj_double(z) (::std::conj(z))
+ #if 1
+ #define __Pyx_c_abs_double(z) (::std::abs(z))
+ #define __Pyx_c_pow_double(a, b) (::std::pow(a, b))
+ #endif
+ #else
+ #define __Pyx_c_is_zero_double(z) ((z)==0)
+ #define __Pyx_c_conj_double(z) (conj(z))
+ #if 1
+ #define __Pyx_c_abs_double(z) (cabs(z))
+ #define __Pyx_c_pow_double(a, b) (cpow(a, b))
+ #endif
+ #endif
+#else
+ static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex);
+ static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex);
+ #if 1
+ static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ #endif
+#endif
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__NPY_TYPES(enum NPY_TYPES value);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *);
+
+/* FastTypeChecks.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type)
+static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2);
+#else
+#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type)
+#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type)
+#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2))
+#endif
+#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception)
+
+/* CheckBinaryVersion.proto */
+static int __Pyx_check_binary_version(void);
+
+/* PyIdentifierFromString.proto */
+#if !defined(__Pyx_PyIdentifier_FromString)
+#if PY_MAJOR_VERSION < 3
+ #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s)
+#else
+ #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s)
+#endif
+#endif
+
+/* ModuleImport.proto */
+static PyObject *__Pyx_ImportModule(const char *name);
+
+/* TypeImport.proto */
+static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name, size_t size, int strict);
+
+/* VoidPtrImport.proto */
+static int __Pyx_ImportVoidPtr(PyObject *module, const char *name, void **p, const char *sig);
+
+/* FunctionImport.proto */
+static int __Pyx_ImportFunction(PyObject *module, const char *funcname, void (**f)(void), const char *sig);
+
+/* InitStrings.proto */
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t);
+
+
+/* Module declarations from 'cython' */
+
+/* Module declarations from 'cpython.buffer' */
+
+/* Module declarations from 'libc.string' */
+
+/* Module declarations from 'libc.stdio' */
+
+/* Module declarations from '__builtin__' */
+
+/* Module declarations from 'cpython.type' */
+static PyTypeObject *__pyx_ptype_7cpython_4type_type = 0;
+
+/* Module declarations from 'cpython' */
+
+/* Module declarations from 'cpython.object' */
+
+/* Module declarations from 'cpython.ref' */
+
+/* Module declarations from 'cpython.mem' */
+
+/* Module declarations from 'numpy' */
+
+/* Module declarations from 'numpy' */
+static PyTypeObject *__pyx_ptype_5numpy_dtype = 0;
+static PyTypeObject *__pyx_ptype_5numpy_flatiter = 0;
+static PyTypeObject *__pyx_ptype_5numpy_broadcast = 0;
+static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0;
+static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0;
+static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *, char *, char *, int *); /*proto*/
+
+/* Module declarations from 'libcpp.string' */
+
+/* Module declarations from 'libcpp.vector' */
+
+/* Module declarations from 'gensim.models.word2vec_inner' */
+static __pyx_t_6gensim_6models_14word2vec_inner_scopy_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_scopy = 0;
+#define __pyx_v_6gensim_6models_14word2vec_inner_scopy (*__pyx_vp_6gensim_6models_14word2vec_inner_scopy)
+static __pyx_t_6gensim_6models_14word2vec_inner_saxpy_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_saxpy = 0;
+#define __pyx_v_6gensim_6models_14word2vec_inner_saxpy (*__pyx_vp_6gensim_6models_14word2vec_inner_saxpy)
+static __pyx_t_6gensim_6models_14word2vec_inner_sdot_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_sdot = 0;
+#define __pyx_v_6gensim_6models_14word2vec_inner_sdot (*__pyx_vp_6gensim_6models_14word2vec_inner_sdot)
+static __pyx_t_6gensim_6models_14word2vec_inner_dsdot_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_dsdot = 0;
+#define __pyx_v_6gensim_6models_14word2vec_inner_dsdot (*__pyx_vp_6gensim_6models_14word2vec_inner_dsdot)
+static __pyx_t_6gensim_6models_14word2vec_inner_snrm2_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_snrm2 = 0;
+#define __pyx_v_6gensim_6models_14word2vec_inner_snrm2 (*__pyx_vp_6gensim_6models_14word2vec_inner_snrm2)
+static __pyx_t_6gensim_6models_14word2vec_inner_sscal_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_sscal = 0;
+#define __pyx_v_6gensim_6models_14word2vec_inner_sscal (*__pyx_vp_6gensim_6models_14word2vec_inner_sscal)
+static __pyx_t_6gensim_6models_14word2vec_inner_REAL_t (*__pyx_vp_6gensim_6models_14word2vec_inner_EXP_TABLE)[0x3E8] = 0;
+#define __pyx_v_6gensim_6models_14word2vec_inner_EXP_TABLE (*__pyx_vp_6gensim_6models_14word2vec_inner_EXP_TABLE)
+static __pyx_t_6gensim_6models_14word2vec_inner_our_dot_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_our_dot = 0;
+#define __pyx_v_6gensim_6models_14word2vec_inner_our_dot (*__pyx_vp_6gensim_6models_14word2vec_inner_our_dot)
+static __pyx_t_6gensim_6models_14word2vec_inner_our_saxpy_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_our_saxpy = 0;
+#define __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy (*__pyx_vp_6gensim_6models_14word2vec_inner_our_saxpy)
+static unsigned PY_LONG_LONG (*__pyx_f_6gensim_6models_14word2vec_inner_random_int32)(unsigned PY_LONG_LONG *); /*proto*/
+
+/* Module declarations from 'gensim.models.doc2vec_inner' */
+static void (*__pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs)(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, int, int, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *); /*proto*/
+static unsigned PY_LONG_LONG (*__pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_neg)(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5numpy_uint32_t const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, unsigned PY_LONG_LONG, int, int, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *); /*proto*/
+static void (*__pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs)(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, int const , int); /*proto*/
+static unsigned PY_LONG_LONG (*__pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_neg)(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, unsigned PY_LONG_LONG, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, int const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, int const , int); /*proto*/
+static void (*__pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs)(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, int const , int const , int); /*proto*/
+static unsigned PY_LONG_LONG (*__pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_neg)(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, unsigned PY_LONG_LONG, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, int const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t const , __pyx_t_6gensim_6models_14word2vec_inner_REAL_t *, int const , int const , int); /*proto*/
+static PyObject *(*__pyx_f_6gensim_6models_13doc2vec_inner_init_d2v_config)(struct __pyx_t_6gensim_6models_13doc2vec_inner_Doc2VecConfig *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *, struct __pyx_opt_args_6gensim_6models_13doc2vec_inner_init_d2v_config *__pyx_optional_args); /*proto*/
+
+/* Module declarations from 'libcpp.utility' */
+
+/* Module declarations from 'libcpp.unordered_map' */
+
+/* Module declarations from 'libcpp' */
+
+/* Module declarations from 'gensim.models.word2vec_corpusfile' */
+static PyTypeObject *__pyx_ptype_6gensim_6models_19word2vec_corpusfile_CythonLineSentence = 0;
+static PyTypeObject *__pyx_ptype_6gensim_6models_19word2vec_corpusfile_CythonVocab = 0;
+static __pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t (*__pyx_f_6gensim_6models_19word2vec_corpusfile_get_alpha)(__pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t, __pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t, int, int); /*proto*/
+static __pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t (*__pyx_f_6gensim_6models_19word2vec_corpusfile_get_next_alpha)(__pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t, __pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t, int, int, int, int, int, int); /*proto*/
+
+/* Module declarations from 'gensim.models.doc2vec_corpusfile' */
+static int __pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE;
+static __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v_6gensim_6models_18doc2vec_corpusfile_ONEF;
+static void __pyx_f_6gensim_6models_18doc2vec_corpusfile_prepare_c_structures_for_batch(std::vector<std::string> &, int, int, int, int *, int *, unsigned PY_LONG_LONG *, __pyx_t_6gensim_6models_19word2vec_corpusfile_cvocab_t *, __pyx_t_5numpy_uint32_t *, int *, __pyx_t_5numpy_uint8_t **, __pyx_t_5numpy_uint32_t **, __pyx_t_5numpy_uint32_t *, int *, int, int, int); /*proto*/
+#define __Pyx_MODULE_NAME "gensim.models.doc2vec_corpusfile"
+extern int __pyx_module_is_main_gensim__models__doc2vec_corpusfile;
+int __pyx_module_is_main_gensim__models__doc2vec_corpusfile = 0;
+
+/* Implementation of 'gensim.models.doc2vec_corpusfile' */
+static PyObject *__pyx_builtin_ImportError;
+static PyObject *__pyx_builtin_range;
+static PyObject *__pyx_builtin_ValueError;
+static PyObject *__pyx_builtin_RuntimeError;
+static const char __pyx_k_c[] = "c";
+static const char __pyx_k_i[] = "i";
+static const char __pyx_k_j[] = "j";
+static const char __pyx_k_k[] = "k";
+static const char __pyx_k_m[] = "m";
+static const char __pyx_k_n[] = "n";
+static const char __pyx_k_np[] = "np";
+static const char __pyx_k__10[] = "*";
+static const char __pyx_k_main[] = "__main__";
+static const char __pyx_k_neu1[] = "neu1";
+static const char __pyx_k_test[] = "__test__";
+static const char __pyx_k_work[] = "work";
+static const char __pyx_k_alpha[] = "alpha";
+static const char __pyx_k_count[] = "count";
+static const char __pyx_k_fblas[] = "fblas";
+static const char __pyx_k_model[] = "model";
+static const char __pyx_k_numpy[] = "numpy";
+static const char __pyx_k_range[] = "range";
+static const char __pyx_k_vocab[] = "vocab";
+static const char __pyx_k_epochs[] = "epochs";
+static const char __pyx_k_import[] = "__import__";
+static const char __pyx_k_offset[] = "offset";
+static const char __pyx_k_alpha_2[] = "_alpha";
+static const char __pyx_k_doc_tag[] = "_doc_tag";
+static const char __pyx_k_idx_end[] = "idx_end";
+static const char __pyx_k_sent_idx[] = "sent_idx";
+static const char __pyx_k_cur_epoch[] = "_cur_epoch";
+static const char __pyx_k_doc_words[] = "doc_words";
+static const char __pyx_k_end_alpha[] = "end_alpha";
+static const char __pyx_k_idx_start[] = "idx_start";
+static const char __pyx_k_inv_count[] = "inv_count";
+static const char __pyx_k_min_alpha[] = "min_alpha";
+static const char __pyx_k_ValueError[] = "ValueError";
+static const char __pyx_k_num_epochs[] = "num_epochs";
+static const char __pyx_k_pyx_vtable[] = "__pyx_vtable__";
+static const char __pyx_k_word_locks[] = "word_locks";
+static const char __pyx_k_ImportError[] = "ImportError";
+static const char __pyx_k_corpus_file[] = "corpus_file";
+static const char __pyx_k_cur_epoch_2[] = "cur_epoch";
+static const char __pyx_k_learn_words[] = "learn_words";
+static const char __pyx_k_start_alpha[] = "start_alpha";
+static const char __pyx_k_total_words[] = "total_words";
+static const char __pyx_k_train_words[] = "train_words";
+static const char __pyx_k_RuntimeError[] = "RuntimeError";
+static const char __pyx_k_cython_vocab[] = "_cython_vocab";
+static const char __pyx_k_doctag_locks[] = "doctag_locks";
+static const char __pyx_k_document_len[] = "document_len";
+static const char __pyx_k_input_stream[] = "input_stream";
+static const char __pyx_k_learn_hidden[] = "learn_hidden";
+static const char __pyx_k_start_doctag[] = "start_doctag";
+static const char __pyx_k_word_vectors[] = "word_vectors";
+static const char __pyx_k_docvecs_count[] = "docvecs_count";
+static const char __pyx_k_learn_doctags[] = "learn_doctags";
+static const char __pyx_k_doctag_vectors[] = "doctag_vectors";
+static const char __pyx_k_expected_words[] = "_expected_words";
+static const char __pyx_k_effective_words[] = "effective_words";
+static const char __pyx_k_total_documents[] = "total_documents";
+static const char __pyx_k_expected_words_2[] = "expected_words";
+static const char __pyx_k_expected_examples[] = "_expected_examples";
+static const char __pyx_k_scipy_linalg_blas[] = "scipy.linalg.blas";
+static const char __pyx_k_CORPUSFILE_VERSION[] = "CORPUSFILE_VERSION";
+static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback";
+static const char __pyx_k_d2v_train_epoch_dm[] = "d2v_train_epoch_dm";
+static const char __pyx_k_expected_examples_2[] = "expected_examples";
+static const char __pyx_k_d2v_train_epoch_dbow[] = "d2v_train_epoch_dbow";
+static const char __pyx_k_total_effective_words[] = "total_effective_words";
+static const char __pyx_k_d2v_train_epoch_dm_concat[] = "d2v_train_epoch_dm_concat";
+static const char __pyx_k_ndarray_is_not_C_contiguous[] = "ndarray is not C contiguous";
+static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multiarray failed to import";
+static const char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)";
+static const char __pyx_k_Format_string_allocated_too_shor[] = "Format string allocated too short, see comment in numpy.pxd";
+static const char __pyx_k_Non_native_byte_order_not_suppor[] = "Non-native byte order not supported";
+static const char __pyx_k_Optimized_cython_functions_for_f[] = "Optimized cython functions for file-based training :class:`~gensim.models.doc2vec.Doc2Vec` model.";
+static const char __pyx_k_gensim_models_doc2vec_corpusfile[] = "gensim/models/doc2vec_corpusfile.pyx";
+static const char __pyx_k_ndarray_is_not_Fortran_contiguou[] = "ndarray is not Fortran contiguous";
+static const char __pyx_k_numpy_core_umath_failed_to_impor[] = "numpy.core.umath failed to import";
+static const char __pyx_k_Format_string_allocated_too_shor_2[] = "Format string allocated too short.";
+static const char __pyx_k_gensim_models_doc2vec_corpusfile_2[] = "gensim.models.doc2vec_corpusfile";
+static PyObject *__pyx_n_s_CORPUSFILE_VERSION;
+static PyObject *__pyx_kp_u_Format_string_allocated_too_shor;
+static PyObject *__pyx_kp_u_Format_string_allocated_too_shor_2;
+static PyObject *__pyx_n_s_ImportError;
+static PyObject *__pyx_kp_u_Non_native_byte_order_not_suppor;
+static PyObject *__pyx_n_s_RuntimeError;
+static PyObject *__pyx_n_s_ValueError;
+static PyObject *__pyx_n_s__10;
+static PyObject *__pyx_n_s_alpha;
+static PyObject *__pyx_n_s_alpha_2;
+static PyObject *__pyx_n_s_c;
+static PyObject *__pyx_n_s_cline_in_traceback;
+static PyObject *__pyx_n_s_corpus_file;
+static PyObject *__pyx_n_s_count;
+static PyObject *__pyx_n_s_cur_epoch;
+static PyObject *__pyx_n_s_cur_epoch_2;
+static PyObject *__pyx_n_s_cython_vocab;
+static PyObject *__pyx_n_s_d2v_train_epoch_dbow;
+static PyObject *__pyx_n_s_d2v_train_epoch_dm;
+static PyObject *__pyx_n_s_d2v_train_epoch_dm_concat;
+static PyObject *__pyx_n_s_doc_tag;
+static PyObject *__pyx_n_s_doc_words;
+static PyObject *__pyx_n_s_doctag_locks;
+static PyObject *__pyx_n_s_doctag_vectors;
+static PyObject *__pyx_n_s_document_len;
+static PyObject *__pyx_n_s_docvecs_count;
+static PyObject *__pyx_n_s_effective_words;
+static PyObject *__pyx_n_s_end_alpha;
+static PyObject *__pyx_n_s_epochs;
+static PyObject *__pyx_n_s_expected_examples;
+static PyObject *__pyx_n_s_expected_examples_2;
+static PyObject *__pyx_n_s_expected_words;
+static PyObject *__pyx_n_s_expected_words_2;
+static PyObject *__pyx_n_s_fblas;
+static PyObject *__pyx_kp_s_gensim_models_doc2vec_corpusfile;
+static PyObject *__pyx_n_s_gensim_models_doc2vec_corpusfile_2;
+static PyObject *__pyx_n_s_i;
+static PyObject *__pyx_n_s_idx_end;
+static PyObject *__pyx_n_s_idx_start;
+static PyObject *__pyx_n_s_import;
+static PyObject *__pyx_n_s_input_stream;
+static PyObject *__pyx_n_s_inv_count;
+static PyObject *__pyx_n_s_j;
+static PyObject *__pyx_n_s_k;
+static PyObject *__pyx_n_s_learn_doctags;
+static PyObject *__pyx_n_s_learn_hidden;
+static PyObject *__pyx_n_s_learn_words;
+static PyObject *__pyx_n_s_m;
+static PyObject *__pyx_n_s_main;
+static PyObject *__pyx_n_s_min_alpha;
+static PyObject *__pyx_n_s_model;
+static PyObject *__pyx_n_s_n;
+static PyObject *__pyx_kp_u_ndarray_is_not_C_contiguous;
+static PyObject *__pyx_kp_u_ndarray_is_not_Fortran_contiguou;
+static PyObject *__pyx_n_s_neu1;
+static PyObject *__pyx_n_s_np;
+static PyObject *__pyx_n_s_num_epochs;
+static PyObject *__pyx_n_s_numpy;
+static PyObject *__pyx_kp_s_numpy_core_multiarray_failed_to;
+static PyObject *__pyx_kp_s_numpy_core_umath_failed_to_impor;
+static PyObject *__pyx_n_s_offset;
+static PyObject *__pyx_n_s_pyx_vtable;
+static PyObject *__pyx_n_s_range;
+static PyObject *__pyx_n_s_scipy_linalg_blas;
+static PyObject *__pyx_n_s_sent_idx;
+static PyObject *__pyx_n_s_start_alpha;
+static PyObject *__pyx_n_s_start_doctag;
+static PyObject *__pyx_n_s_test;
+static PyObject *__pyx_n_s_total_documents;
+static PyObject *__pyx_n_s_total_effective_words;
+static PyObject *__pyx_n_s_total_words;
+static PyObject *__pyx_n_s_train_words;
+static PyObject *__pyx_kp_u_unknown_dtype_code_in_numpy_pxd;
+static PyObject *__pyx_n_s_vocab;
+static PyObject *__pyx_n_s_word_locks;
+static PyObject *__pyx_n_s_word_vectors;
+static PyObject *__pyx_n_s_work;
+static PyObject *__pyx_pf_6gensim_6models_18doc2vec_corpusfile_d2v_train_epoch_dbow(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_corpus_file, PyObject *__pyx_v_offset, PyObject *__pyx_v_start_doctag, PyObject *__pyx_v__cython_vocab, PyObject *__pyx_v__cur_epoch, PyObject *__pyx_v__expected_examples, PyObject *__pyx_v__expected_words, PyObject *__pyx_v_work, PyObject *__pyx_v_neu1, PyObject *__pyx_v_docvecs_count, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_train_words, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks); /* proto */
+static PyObject *__pyx_pf_6gensim_6models_18doc2vec_corpusfile_2d2v_train_epoch_dm(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_corpus_file, PyObject *__pyx_v_offset, PyObject *__pyx_v_start_doctag, PyObject *__pyx_v__cython_vocab, PyObject *__pyx_v__cur_epoch, PyObject *__pyx_v__expected_examples, PyObject *__pyx_v__expected_words, PyObject *__pyx_v_work, PyObject *__pyx_v_neu1, PyObject *__pyx_v_docvecs_count, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks); /* proto */
+static PyObject *__pyx_pf_6gensim_6models_18doc2vec_corpusfile_4d2v_train_epoch_dm_concat(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_corpus_file, PyObject *__pyx_v_offset, PyObject *__pyx_v_start_doctag, PyObject *__pyx_v__cython_vocab, PyObject *__pyx_v__cur_epoch, PyObject *__pyx_v__expected_examples, PyObject *__pyx_v__expected_words, PyObject *__pyx_v_work, PyObject *__pyx_v_neu1, PyObject *__pyx_v_docvecs_count, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks); /* proto */
+static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
+static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info); /* proto */
+static PyObject *__pyx_int_1;
+static PyObject *__pyx_tuple_;
+static PyObject *__pyx_tuple__2;
+static PyObject *__pyx_tuple__3;
+static PyObject *__pyx_tuple__4;
+static PyObject *__pyx_tuple__5;
+static PyObject *__pyx_tuple__6;
+static PyObject *__pyx_tuple__7;
+static PyObject *__pyx_tuple__8;
+static PyObject *__pyx_tuple__9;
+static PyObject *__pyx_tuple__11;
+static PyObject *__pyx_tuple__13;
+static PyObject *__pyx_tuple__15;
+static PyObject *__pyx_codeobj__12;
+static PyObject *__pyx_codeobj__14;
+static PyObject *__pyx_codeobj__16;
+/* Late includes */
+
+/* "gensim/models/doc2vec_corpusfile.pyx":57
+ *
+ *
+ * cdef void prepare_c_structures_for_batch(vector[string] &doc_words, int sample, int hs, int window, int *total_words, # <<<<<<<<<<<<<<
+ * int *effective_words, unsigned long long *next_random, cvocab_t *vocab,
+ * np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points,
+ */
+
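+/* The function below converts one document into the flat C buffers consumed by the training
+   kernels: it skips tokens that are missing from the vocab or removed by downsampling, copies
+   each surviving token's index (plus its code/point arrays under hierarchical softmax), stops
+   at MAX_DOCUMENT_LEN tokens, pre-draws reduced window sizes when train_words is set, and
+   counts the document tag as one additional effective word when it is a trainable doctag. */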
+static void __pyx_f_6gensim_6models_18doc2vec_corpusfile_prepare_c_structures_for_batch(std::vector<std::string> &__pyx_v_doc_words, int __pyx_v_sample, int __pyx_v_hs, int __pyx_v_window, int *__pyx_v_total_words, int *__pyx_v_effective_words, unsigned PY_LONG_LONG *__pyx_v_next_random, __pyx_t_6gensim_6models_19word2vec_corpusfile_cvocab_t *__pyx_v_vocab, __pyx_t_5numpy_uint32_t *__pyx_v_indexes, int *__pyx_v_codelens, __pyx_t_5numpy_uint8_t **__pyx_v_codes, __pyx_t_5numpy_uint32_t **__pyx_v_points, __pyx_t_5numpy_uint32_t *__pyx_v_reduced_windows, int *__pyx_v_document_len, int __pyx_v_train_words, int __pyx_v_docvecs_count, int __pyx_v_doc_tag) {
+ struct __pyx_t_6gensim_6models_19word2vec_corpusfile_VocabItem __pyx_v_predict_word;
+ std::string __pyx_v_token;
+ int __pyx_v_i;
+ long __pyx_t_1;
+  std::vector<std::string> ::iterator __pyx_t_2;
+ std::string __pyx_t_3;
+ int __pyx_t_4;
+ int __pyx_t_5;
+ __pyx_t_5numpy_uint32_t __pyx_t_6;
+ int __pyx_t_7;
+ __pyx_t_5numpy_uint8_t *__pyx_t_8;
+ __pyx_t_5numpy_uint32_t *__pyx_t_9;
+ int __pyx_t_10;
+ int __pyx_t_11;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":64
+ * cdef VocabItem predict_word
+ * cdef string token
+ * cdef int i = 0 # <<<<<<<<<<<<<<
+ *
+ * total_words[0] += doc_words.size()
+ */
+ __pyx_v_i = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":66
+ * cdef int i = 0
+ *
+ * total_words[0] += doc_words.size() # <<<<<<<<<<<<<<
+ *
+ * for token in doc_words:
+ */
+ __pyx_t_1 = 0;
+ (__pyx_v_total_words[__pyx_t_1]) = ((__pyx_v_total_words[__pyx_t_1]) + __pyx_v_doc_words.size());
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":68
+ * total_words[0] += doc_words.size()
+ *
+ * for token in doc_words: # <<<<<<<<<<<<<<
+ * if vocab[0].find(token) == vocab[0].end(): # shrink document to leave out word
+ * continue # leaving i unchanged
+ */
+ __pyx_t_2 = __pyx_v_doc_words.begin();
+ for (;;) {
+ if (!(__pyx_t_2 != __pyx_v_doc_words.end())) break;
+ __pyx_t_3 = *__pyx_t_2;
+ ++__pyx_t_2;
+ __pyx_v_token = __pyx_t_3;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":69
+ *
+ * for token in doc_words:
+ * if vocab[0].find(token) == vocab[0].end(): # shrink document to leave out word # <<<<<<<<<<<<<<
+ * continue # leaving i unchanged
+ *
+ */
+ __pyx_t_4 = (((__pyx_v_vocab[0]).find(__pyx_v_token) == (__pyx_v_vocab[0]).end()) != 0);
+ if (__pyx_t_4) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":70
+ * for token in doc_words:
+ * if vocab[0].find(token) == vocab[0].end(): # shrink document to leave out word
+ * continue # leaving i unchanged # <<<<<<<<<<<<<<
+ *
+ * predict_word = vocab[0][token]
+ */
+ goto __pyx_L3_continue;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":69
+ *
+ * for token in doc_words:
+ * if vocab[0].find(token) == vocab[0].end(): # shrink document to leave out word # <<<<<<<<<<<<<<
+ * continue # leaving i unchanged
+ *
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":72
+ * continue # leaving i unchanged
+ *
+ * predict_word = vocab[0][token] # <<<<<<<<<<<<<<
+ * if sample and predict_word.sample_int < random_int32(next_random):
+ * continue
+ */
+ __pyx_v_predict_word = ((__pyx_v_vocab[0])[__pyx_v_token]);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":73
+ *
+ * predict_word = vocab[0][token]
+ * if sample and predict_word.sample_int < random_int32(next_random): # <<<<<<<<<<<<<<
+ * continue
+ * indexes[i] = predict_word.index
+ */
+ __pyx_t_5 = (__pyx_v_sample != 0);
+ if (__pyx_t_5) {
+ } else {
+ __pyx_t_4 = __pyx_t_5;
+ goto __pyx_L7_bool_binop_done;
+ }
+ __pyx_t_5 = ((__pyx_v_predict_word.sample_int < __pyx_f_6gensim_6models_14word2vec_inner_random_int32(__pyx_v_next_random)) != 0);
+ __pyx_t_4 = __pyx_t_5;
+ __pyx_L7_bool_binop_done:;
+ if (__pyx_t_4) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":74
+ * predict_word = vocab[0][token]
+ * if sample and predict_word.sample_int < random_int32(next_random):
+ * continue # <<<<<<<<<<<<<<
+ * indexes[i] = predict_word.index
+ * if hs:
+ */
+ goto __pyx_L3_continue;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":73
+ *
+ * predict_word = vocab[0][token]
+ * if sample and predict_word.sample_int < random_int32(next_random): # <<<<<<<<<<<<<<
+ * continue
+ * indexes[i] = predict_word.index
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":75
+ * if sample and predict_word.sample_int < random_int32(next_random):
+ * continue
+ * indexes[i] = predict_word.index # <<<<<<<<<<<<<<
+ * if hs:
+ * codelens[i] = predict_word.code_len
+ */
+ __pyx_t_6 = __pyx_v_predict_word.index;
+ (__pyx_v_indexes[__pyx_v_i]) = __pyx_t_6;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":76
+ * continue
+ * indexes[i] = predict_word.index
+ * if hs: # <<<<<<<<<<<<<<
+ * codelens[i] = predict_word.code_len
+ * codes[i] = predict_word.code
+ */
+ __pyx_t_4 = (__pyx_v_hs != 0);
+ if (__pyx_t_4) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":77
+ * indexes[i] = predict_word.index
+ * if hs:
+ * codelens[i] = predict_word.code_len # <<<<<<<<<<<<<<
+ * codes[i] = predict_word.code
+ * points[i] = predict_word.point
+ */
+ __pyx_t_7 = __pyx_v_predict_word.code_len;
+ (__pyx_v_codelens[__pyx_v_i]) = __pyx_t_7;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":78
+ * if hs:
+ * codelens[i] = predict_word.code_len
+ * codes[i] = predict_word.code # <<<<<<<<<<<<<<
+ * points[i] = predict_word.point
+ *
+ */
+ __pyx_t_8 = __pyx_v_predict_word.code;
+ (__pyx_v_codes[__pyx_v_i]) = __pyx_t_8;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":79
+ * codelens[i] = predict_word.code_len
+ * codes[i] = predict_word.code
+ * points[i] = predict_word.point # <<<<<<<<<<<<<<
+ *
+ * effective_words[0] += 1
+ */
+ __pyx_t_9 = __pyx_v_predict_word.point;
+ (__pyx_v_points[__pyx_v_i]) = __pyx_t_9;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":76
+ * continue
+ * indexes[i] = predict_word.index
+ * if hs: # <<<<<<<<<<<<<<
+ * codelens[i] = predict_word.code_len
+ * codes[i] = predict_word.code
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":81
+ * points[i] = predict_word.point
+ *
+ * effective_words[0] += 1 # <<<<<<<<<<<<<<
+ * i += 1
+ * if i == MAX_DOCUMENT_LEN:
+ */
+ __pyx_t_1 = 0;
+ (__pyx_v_effective_words[__pyx_t_1]) = ((__pyx_v_effective_words[__pyx_t_1]) + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":82
+ *
+ * effective_words[0] += 1
+ * i += 1 # <<<<<<<<<<<<<<
+ * if i == MAX_DOCUMENT_LEN:
+ * break # TODO: log warning, tally overflow?
+ */
+ __pyx_v_i = (__pyx_v_i + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":83
+ * effective_words[0] += 1
+ * i += 1
+ * if i == MAX_DOCUMENT_LEN: # <<<<<<<<<<<<<<
+ * break # TODO: log warning, tally overflow?
+ * document_len[0] = i
+ */
+ __pyx_t_4 = ((__pyx_v_i == 0x2710) != 0);
+ if (__pyx_t_4) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":84
+ * i += 1
+ * if i == MAX_DOCUMENT_LEN:
+ * break # TODO: log warning, tally overflow? # <<<<<<<<<<<<<<
+ * document_len[0] = i
+ *
+ */
+ goto __pyx_L4_break;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":83
+ * effective_words[0] += 1
+ * i += 1
+ * if i == MAX_DOCUMENT_LEN: # <<<<<<<<<<<<<<
+ * break # TODO: log warning, tally overflow?
+ * document_len[0] = i
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":68
+ * total_words[0] += doc_words.size()
+ *
+ * for token in doc_words: # <<<<<<<<<<<<<<
+ * if vocab[0].find(token) == vocab[0].end(): # shrink document to leave out word
+ * continue # leaving i unchanged
+ */
+ __pyx_L3_continue:;
+ }
+ __pyx_L4_break:;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":85
+ * if i == MAX_DOCUMENT_LEN:
+ * break # TODO: log warning, tally overflow?
+ * document_len[0] = i # <<<<<<<<<<<<<<
+ *
+ * if train_words and reduced_windows != NULL:
+ */
+ (__pyx_v_document_len[0]) = __pyx_v_i;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":87
+ * document_len[0] = i
+ *
+ * if train_words and reduced_windows != NULL: # <<<<<<<<<<<<<<
+ * for i in range(document_len[0]):
+ * reduced_windows[i] = random_int32(next_random) % window
+ */
+ __pyx_t_5 = (__pyx_v_train_words != 0);
+ if (__pyx_t_5) {
+ } else {
+ __pyx_t_4 = __pyx_t_5;
+ goto __pyx_L12_bool_binop_done;
+ }
+ __pyx_t_5 = ((__pyx_v_reduced_windows != NULL) != 0);
+ __pyx_t_4 = __pyx_t_5;
+ __pyx_L12_bool_binop_done:;
+ if (__pyx_t_4) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":88
+ *
+ * if train_words and reduced_windows != NULL:
+ * for i in range(document_len[0]): # <<<<<<<<<<<<<<
+ * reduced_windows[i] = random_int32(next_random) % window
+ *
+ */
+ __pyx_t_7 = (__pyx_v_document_len[0]);
+ __pyx_t_10 = __pyx_t_7;
+ for (__pyx_t_11 = 0; __pyx_t_11 < __pyx_t_10; __pyx_t_11+=1) {
+ __pyx_v_i = __pyx_t_11;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":89
+ * if train_words and reduced_windows != NULL:
+ * for i in range(document_len[0]):
+ * reduced_windows[i] = random_int32(next_random) % window # <<<<<<<<<<<<<<
+ *
+ * if doc_tag < docvecs_count:
+ */
+ (__pyx_v_reduced_windows[__pyx_v_i]) = (__pyx_f_6gensim_6models_14word2vec_inner_random_int32(__pyx_v_next_random) % __pyx_v_window);
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":87
+ * document_len[0] = i
+ *
+ * if train_words and reduced_windows != NULL: # <<<<<<<<<<<<<<
+ * for i in range(document_len[0]):
+ * reduced_windows[i] = random_int32(next_random) % window
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":91
+ * reduced_windows[i] = random_int32(next_random) % window
+ *
+ * if doc_tag < docvecs_count: # <<<<<<<<<<<<<<
+ * effective_words[0] += 1
+ *
+ */
+ __pyx_t_4 = ((__pyx_v_doc_tag < __pyx_v_docvecs_count) != 0);
+ if (__pyx_t_4) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":92
+ *
+ * if doc_tag < docvecs_count:
+ * effective_words[0] += 1 # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_t_1 = 0;
+ (__pyx_v_effective_words[__pyx_t_1]) = ((__pyx_v_effective_words[__pyx_t_1]) + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":91
+ * reduced_windows[i] = random_int32(next_random) % window
+ *
+ * if doc_tag < docvecs_count: # <<<<<<<<<<<<<<
+ * effective_words[0] += 1
+ *
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":57
+ *
+ *
+ * cdef void prepare_c_structures_for_batch(vector[string] &doc_words, int sample, int hs, int window, int *total_words, # <<<<<<<<<<<<<<
+ * int *effective_words, unsigned long long *next_random, cvocab_t *vocab,
+ * np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points,
+ */
+
+ /* function exit code */
+}
+
+/* "gensim/models/doc2vec_corpusfile.pyx":95
+ *
+ *
+ * def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, # <<<<<<<<<<<<<<
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_6gensim_6models_18doc2vec_corpusfile_1d2v_train_epoch_dbow(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static char __pyx_doc_6gensim_6models_18doc2vec_corpusfile_d2v_train_epoch_dbow[] = "d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None, train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctag_locks=None)\nTrain distributed bag of words model (\"PV-DBOW\") by training on a corpus file.\n\n Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train`.\n\n Parameters\n ----------\n model : :class:`~gensim.models.doc2vec.Doc2Vec`\n The FastText model instance to train.\n corpus_file : str\n Path to corpus file.\n _cur_epoch : int\n Current epoch number. Used for calculating and decaying learning rate.\n work : np.ndarray\n Private working memory for each worker.\n neu1 : np.ndarray\n Private working memory for each worker.\n train_words : bool, optional\n Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** `learn_words`\n and `train_words` are set to True.\n learn_doctags : bool, optional\n Whether the tag vectors should be updated.\n learn_words : bool, optional\n Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**\n `learn_words` and `train_words` are set to True.\n learn_hidden : bool, optional\n Whether or not the weights of the hidden layer will be updated.\n word_vectors : numpy.ndarray, optional\n The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.\n word_locks : numpy.ndarray, optional\n A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates,\n a value of 1 allows to update word-vectors.\n doctag_vectors : numpy.ndarray, optional\n Vector representations of the tags. If None, these will be retrieved from the model.\n doctag_locks"" : numpy.ndarray, optional\n The lock factors for each tag, same as `word_locks`, but for document-vectors.\n\n Returns\n -------\n int\n Number of words in the input document that were actually used for training.\n\n ";
+static PyMethodDef __pyx_mdef_6gensim_6models_18doc2vec_corpusfile_1d2v_train_epoch_dbow = {"d2v_train_epoch_dbow", (PyCFunction)__pyx_pw_6gensim_6models_18doc2vec_corpusfile_1d2v_train_epoch_dbow, METH_VARARGS|METH_KEYWORDS, __pyx_doc_6gensim_6models_18doc2vec_corpusfile_d2v_train_epoch_dbow};
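+
+/* Typical call path (a sketch, not part of the generated code): Doc2Vec.train(corpus_file=...)
+   dispatches to d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab,
+   _cur_epoch, _expected_examples, _expected_words, work, neu1, docvecs_count) for the PV-DBOW
+   case, leaving the remaining keyword arguments at their defaults unless the caller overrides
+   them; the wrapper below parses those arguments. */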
+static PyObject *__pyx_pw_6gensim_6models_18doc2vec_corpusfile_1d2v_train_epoch_dbow(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+ PyObject *__pyx_v_model = 0;
+ PyObject *__pyx_v_corpus_file = 0;
+ PyObject *__pyx_v_offset = 0;
+ PyObject *__pyx_v_start_doctag = 0;
+ PyObject *__pyx_v__cython_vocab = 0;
+ PyObject *__pyx_v__cur_epoch = 0;
+ PyObject *__pyx_v__expected_examples = 0;
+ PyObject *__pyx_v__expected_words = 0;
+ PyObject *__pyx_v_work = 0;
+ PyObject *__pyx_v_neu1 = 0;
+ PyObject *__pyx_v_docvecs_count = 0;
+ PyObject *__pyx_v_word_vectors = 0;
+ PyObject *__pyx_v_word_locks = 0;
+ PyObject *__pyx_v_train_words = 0;
+ PyObject *__pyx_v_learn_doctags = 0;
+ PyObject *__pyx_v_learn_words = 0;
+ PyObject *__pyx_v_learn_hidden = 0;
+ PyObject *__pyx_v_doctag_vectors = 0;
+ PyObject *__pyx_v_doctag_locks = 0;
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("d2v_train_epoch_dbow (wrapper)", 0);
+ {
+ static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_model,&__pyx_n_s_corpus_file,&__pyx_n_s_offset,&__pyx_n_s_start_doctag,&__pyx_n_s_cython_vocab,&__pyx_n_s_cur_epoch,&__pyx_n_s_expected_examples,&__pyx_n_s_expected_words,&__pyx_n_s_work,&__pyx_n_s_neu1,&__pyx_n_s_docvecs_count,&__pyx_n_s_word_vectors,&__pyx_n_s_word_locks,&__pyx_n_s_train_words,&__pyx_n_s_learn_doctags,&__pyx_n_s_learn_words,&__pyx_n_s_learn_hidden,&__pyx_n_s_doctag_vectors,&__pyx_n_s_doctag_locks,0};
+ PyObject* values[19] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":96
+ *
+ * def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None, # <<<<<<<<<<<<<<
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ * doctag_vectors=None, doctag_locks=None):
+ */
+ values[11] = ((PyObject *)Py_None);
+ values[12] = ((PyObject *)Py_None);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":97
+ * def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, # <<<<<<<<<<<<<<
+ * doctag_vectors=None, doctag_locks=None):
+ * """Train distributed bag of words model ("PV-DBOW") by training on a corpus file.
+ */
+ values[13] = ((PyObject *)Py_False);
+ values[14] = ((PyObject *)Py_True);
+ values[15] = ((PyObject *)Py_True);
+ values[16] = ((PyObject *)Py_True);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":98
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ * doctag_vectors=None, doctag_locks=None): # <<<<<<<<<<<<<<
+ * """Train distributed bag of words model ("PV-DBOW") by training on a corpus file.
+ *
+ */
+ values[17] = ((PyObject *)Py_None);
+ values[18] = ((PyObject *)Py_None);
+ if (unlikely(__pyx_kwds)) {
+ Py_ssize_t kw_args;
+ const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+ switch (pos_args) {
+ case 19: values[18] = PyTuple_GET_ITEM(__pyx_args, 18);
+ CYTHON_FALLTHROUGH;
+ case 18: values[17] = PyTuple_GET_ITEM(__pyx_args, 17);
+ CYTHON_FALLTHROUGH;
+ case 17: values[16] = PyTuple_GET_ITEM(__pyx_args, 16);
+ CYTHON_FALLTHROUGH;
+ case 16: values[15] = PyTuple_GET_ITEM(__pyx_args, 15);
+ CYTHON_FALLTHROUGH;
+ case 15: values[14] = PyTuple_GET_ITEM(__pyx_args, 14);
+ CYTHON_FALLTHROUGH;
+ case 14: values[13] = PyTuple_GET_ITEM(__pyx_args, 13);
+ CYTHON_FALLTHROUGH;
+ case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12);
+ CYTHON_FALLTHROUGH;
+ case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11);
+ CYTHON_FALLTHROUGH;
+ case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10);
+ CYTHON_FALLTHROUGH;
+ case 10: values[9] = PyTuple_GET_ITEM(__pyx_args, 9);
+ CYTHON_FALLTHROUGH;
+ case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8);
+ CYTHON_FALLTHROUGH;
+ case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7);
+ CYTHON_FALLTHROUGH;
+ case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6);
+ CYTHON_FALLTHROUGH;
+ case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
+ CYTHON_FALLTHROUGH;
+ case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ CYTHON_FALLTHROUGH;
+ case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ CYTHON_FALLTHROUGH;
+ case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ CYTHON_FALLTHROUGH;
+ case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ CYTHON_FALLTHROUGH;
+ case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ CYTHON_FALLTHROUGH;
+ case 0: break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ kw_args = PyDict_Size(__pyx_kwds);
+ switch (pos_args) {
+ case 0:
+ if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_model)) != 0)) kw_args--;
+ else goto __pyx_L5_argtuple_error;
+ CYTHON_FALLTHROUGH;
+ case 1:
+ if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_corpus_file)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 1); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 2:
+ if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_offset)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 2); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 3:
+ if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_start_doctag)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 3); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 4:
+ if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_cython_vocab)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 4); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 5:
+ if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_cur_epoch)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 5); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 6:
+ if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_expected_examples)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 6); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 7:
+ if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_expected_words)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 7); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 8:
+ if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_work)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 8); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 9:
+ if (likely((values[9] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_neu1)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 9); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 10:
+ if (likely((values[10] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_docvecs_count)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, 10); __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 11:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_word_vectors);
+ if (value) { values[11] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 12:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_word_locks);
+ if (value) { values[12] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 13:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_train_words);
+ if (value) { values[13] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 14:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_learn_doctags);
+ if (value) { values[14] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 15:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_learn_words);
+ if (value) { values[15] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 16:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_learn_hidden);
+ if (value) { values[16] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 17:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_doctag_vectors);
+ if (value) { values[17] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 18:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_doctag_locks);
+ if (value) { values[18] = value; kw_args--; }
+ }
+ }
+ if (unlikely(kw_args > 0)) {
+ if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "d2v_train_epoch_dbow") < 0)) __PYX_ERR(0, 95, __pyx_L3_error)
+ }
+ } else {
+ switch (PyTuple_GET_SIZE(__pyx_args)) {
+ case 19: values[18] = PyTuple_GET_ITEM(__pyx_args, 18);
+ CYTHON_FALLTHROUGH;
+ case 18: values[17] = PyTuple_GET_ITEM(__pyx_args, 17);
+ CYTHON_FALLTHROUGH;
+ case 17: values[16] = PyTuple_GET_ITEM(__pyx_args, 16);
+ CYTHON_FALLTHROUGH;
+ case 16: values[15] = PyTuple_GET_ITEM(__pyx_args, 15);
+ CYTHON_FALLTHROUGH;
+ case 15: values[14] = PyTuple_GET_ITEM(__pyx_args, 14);
+ CYTHON_FALLTHROUGH;
+ case 14: values[13] = PyTuple_GET_ITEM(__pyx_args, 13);
+ CYTHON_FALLTHROUGH;
+ case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12);
+ CYTHON_FALLTHROUGH;
+ case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11);
+ CYTHON_FALLTHROUGH;
+ case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10);
+ values[9] = PyTuple_GET_ITEM(__pyx_args, 9);
+ values[8] = PyTuple_GET_ITEM(__pyx_args, 8);
+ values[7] = PyTuple_GET_ITEM(__pyx_args, 7);
+ values[6] = PyTuple_GET_ITEM(__pyx_args, 6);
+ values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
+ values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ }
+ __pyx_v_model = values[0];
+ __pyx_v_corpus_file = values[1];
+ __pyx_v_offset = values[2];
+ __pyx_v_start_doctag = values[3];
+ __pyx_v__cython_vocab = values[4];
+ __pyx_v__cur_epoch = values[5];
+ __pyx_v__expected_examples = values[6];
+ __pyx_v__expected_words = values[7];
+ __pyx_v_work = values[8];
+ __pyx_v_neu1 = values[9];
+ __pyx_v_docvecs_count = values[10];
+ __pyx_v_word_vectors = values[11];
+ __pyx_v_word_locks = values[12];
+ __pyx_v_train_words = values[13];
+ __pyx_v_learn_doctags = values[14];
+ __pyx_v_learn_words = values[15];
+ __pyx_v_learn_hidden = values[16];
+ __pyx_v_doctag_vectors = values[17];
+ __pyx_v_doctag_locks = values[18];
+ }
+ goto __pyx_L4_argument_unpacking_done;
+ __pyx_L5_argtuple_error:;
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dbow", 0, 11, 19, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 95, __pyx_L3_error)
+ __pyx_L3_error:;
+ __Pyx_AddTraceback("gensim.models.doc2vec_corpusfile.d2v_train_epoch_dbow", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __Pyx_RefNannyFinishContext();
+ return NULL;
+ __pyx_L4_argument_unpacking_done:;
+ __pyx_r = __pyx_pf_6gensim_6models_18doc2vec_corpusfile_d2v_train_epoch_dbow(__pyx_self, __pyx_v_model, __pyx_v_corpus_file, __pyx_v_offset, __pyx_v_start_doctag, __pyx_v__cython_vocab, __pyx_v__cur_epoch, __pyx_v__expected_examples, __pyx_v__expected_words, __pyx_v_work, __pyx_v_neu1, __pyx_v_docvecs_count, __pyx_v_word_vectors, __pyx_v_word_locks, __pyx_v_train_words, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, __pyx_v_doctag_vectors, __pyx_v_doctag_locks);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":95
+ *
+ *
+ * def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, # <<<<<<<<<<<<<<
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ */
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_pf_6gensim_6models_18doc2vec_corpusfile_d2v_train_epoch_dbow(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_corpus_file, PyObject *__pyx_v_offset, PyObject *__pyx_v_start_doctag, PyObject *__pyx_v__cython_vocab, PyObject *__pyx_v__cur_epoch, PyObject *__pyx_v__expected_examples, PyObject *__pyx_v__expected_words, PyObject *__pyx_v_work, PyObject *__pyx_v_neu1, PyObject *__pyx_v_docvecs_count, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_train_words, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks) {
+ struct __pyx_t_6gensim_6models_13doc2vec_inner_Doc2VecConfig __pyx_v_c;
+ int __pyx_v_cur_epoch;
+ int __pyx_v_num_epochs;
+ int __pyx_v_expected_examples;
+ int __pyx_v_expected_words;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v_start_alpha;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v_end_alpha;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v__alpha;
+ struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *__pyx_v_input_stream = 0;
+ struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonVocab *__pyx_v_vocab = 0;
+ int __pyx_v_i;
+ int __pyx_v_j;
+ int __pyx_v_document_len;
+ int __pyx_v_effective_words;
+ int __pyx_v_total_effective_words;
+ int __pyx_v_total_documents;
+ int __pyx_v_total_words;
+ std::vector<std::string> __pyx_v_doc_words;
+ int __pyx_v__doc_tag;
+ long __pyx_v_k;
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ PyObject *__pyx_t_2 = NULL;
+ int __pyx_t_3;
+ int __pyx_t_4;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_t_5;
+ __pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t __pyx_t_6;
+ PyObject *__pyx_t_7 = NULL;
+ struct __pyx_opt_args_6gensim_6models_13doc2vec_inner_init_d2v_config __pyx_t_8;
+ int __pyx_t_9;
+ std::vector<std::string> __pyx_t_10;
+ __pyx_t_6gensim_6models_19word2vec_corpusfile_cvocab_t *__pyx_t_11;
+ int __pyx_t_12;
+ long __pyx_t_13;
+ long __pyx_t_14;
+ int __pyx_t_15;
+ PyObject *__pyx_t_16 = NULL;
+ PyObject *__pyx_t_17 = NULL;
+ __Pyx_RefNannySetupContext("d2v_train_epoch_dbow", 0);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":143
+ * cdef Doc2VecConfig c
+ *
+ * cdef int cur_epoch = _cur_epoch # <<<<<<<<<<<<<<
+ * cdef int num_epochs = model.epochs
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ */
+ __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v__cur_epoch); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 143, __pyx_L1_error)
+ __pyx_v_cur_epoch = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":144
+ *
+ * cdef int cur_epoch = _cur_epoch
+ * cdef int num_epochs = model.epochs # <<<<<<<<<<<<<<
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_epochs); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 144, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_t_2); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 144, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_num_epochs = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":145
+ * cdef int cur_epoch = _cur_epoch
+ * cdef int num_epochs = model.epochs
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples) # <<<<<<<<<<<<<<
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ * cdef REAL_t start_alpha = model.alpha
+ */
+ __pyx_t_3 = (__pyx_v__expected_examples == Py_None);
+ if ((__pyx_t_3 != 0)) {
+ __pyx_t_1 = -1;
+ } else {
+ __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_v__expected_examples); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 145, __pyx_L1_error)
+ __pyx_t_1 = __pyx_t_4;
+ }
+ __pyx_v_expected_examples = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":146
+ * cdef int num_epochs = model.epochs
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words) # <<<<<<<<<<<<<<
+ * cdef REAL_t start_alpha = model.alpha
+ * cdef REAL_t end_alpha = model.min_alpha
+ */
+ __pyx_t_3 = (__pyx_v__expected_words == Py_None);
+ if ((__pyx_t_3 != 0)) {
+ __pyx_t_1 = -1;
+ } else {
+ __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_v__expected_words); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 146, __pyx_L1_error)
+ __pyx_t_1 = __pyx_t_4;
+ }
+ __pyx_v_expected_words = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":147
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ * cdef REAL_t start_alpha = model.alpha # <<<<<<<<<<<<<<
+ * cdef REAL_t end_alpha = model.min_alpha
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_alpha); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 147, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_5 = __pyx_PyFloat_AsFloat(__pyx_t_2); if (unlikely((__pyx_t_5 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 147, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_start_alpha = __pyx_t_5;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":148
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ * cdef REAL_t start_alpha = model.alpha
+ * cdef REAL_t end_alpha = model.min_alpha # <<<<<<<<<<<<<<
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)
+ *
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_min_alpha); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 148, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_5 = __pyx_PyFloat_AsFloat(__pyx_t_2); if (unlikely((__pyx_t_5 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 148, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_end_alpha = __pyx_t_5;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":149
+ * cdef REAL_t start_alpha = model.alpha
+ * cdef REAL_t end_alpha = model.min_alpha
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs) # <<<<<<<<<<<<<<
+ *
+ * cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_alpha); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_6 = __pyx_PyFloat_AsFloat(__pyx_t_2); if (unlikely((__pyx_t_6 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 149, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v__alpha = __pyx_f_6gensim_6models_19word2vec_corpusfile_get_alpha(__pyx_t_6, __pyx_v_end_alpha, __pyx_v_cur_epoch, __pyx_v_num_epochs);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":151
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)
+ *
+ * cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset) # <<<<<<<<<<<<<<
+ * cdef CythonVocab vocab = _cython_vocab
+ *
+ */
+ __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 151, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_INCREF(__pyx_v_corpus_file);
+ __Pyx_GIVEREF(__pyx_v_corpus_file);
+ PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_v_corpus_file);
+ __Pyx_INCREF(__pyx_v_offset);
+ __Pyx_GIVEREF(__pyx_v_offset);
+ PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_v_offset);
+ __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_6gensim_6models_19word2vec_corpusfile_CythonLineSentence), __pyx_t_2, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 151, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_input_stream = ((struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_t_7);
+ __pyx_t_7 = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":152
+ *
+ * cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
+ * cdef CythonVocab vocab = _cython_vocab # <<<<<<<<<<<<<<
+ *
+ * cdef int i, j, document_len
+ */
+ if (!(likely(((__pyx_v__cython_vocab) == Py_None) || likely(__Pyx_TypeTest(__pyx_v__cython_vocab, __pyx_ptype_6gensim_6models_19word2vec_corpusfile_CythonVocab))))) __PYX_ERR(0, 152, __pyx_L1_error)
+ __pyx_t_7 = __pyx_v__cython_vocab;
+ __Pyx_INCREF(__pyx_t_7);
+ __pyx_v_vocab = ((struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonVocab *)__pyx_t_7);
+ __pyx_t_7 = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":155
+ *
+ * cdef int i, j, document_len
+ * cdef int effective_words = 0 # <<<<<<<<<<<<<<
+ * cdef int total_effective_words = 0, total_documents = 0, total_words = 0
+ * cdef int sent_idx, idx_start, idx_end
+ */
+ __pyx_v_effective_words = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":156
+ * cdef int i, j, document_len
+ * cdef int effective_words = 0
+ * cdef int total_effective_words = 0, total_documents = 0, total_words = 0 # <<<<<<<<<<<<<<
+ * cdef int sent_idx, idx_start, idx_end
+ *
+ */
+ __pyx_v_total_effective_words = 0;
+ __pyx_v_total_documents = 0;
+ __pyx_v_total_words = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":160
+ *
+ * cdef vector[string] doc_words
+ * cdef int _doc_tag = start_doctag # <<<<<<<<<<<<<<
+ *
+ * init_d2v_config(
+ */
+ __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_start_doctag); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 160, __pyx_L1_error)
+ __pyx_v__doc_tag = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":163
+ *
+ * init_d2v_config(
+ * &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=train_words, # <<<<<<<<<<<<<<
+ * work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks,
+ * doctag_vectors=doctag_vectors, doctag_locks=doctag_locks, docvecs_count=docvecs_count)
+ */
+ __pyx_t_7 = PyFloat_FromDouble(__pyx_v__alpha); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 163, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":162
+ * cdef int _doc_tag = start_doctag
+ *
+ * init_d2v_config( # <<<<<<<<<<<<<<
+ * &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=train_words,
+ * work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks,
+ */
+ __pyx_t_8.__pyx_n = 8;
+ __pyx_t_8.train_words = __pyx_v_train_words;
+ __pyx_t_8.work = __pyx_v_work;
+ __pyx_t_8.neu1 = __pyx_v_neu1;
+ __pyx_t_8.word_vectors = __pyx_v_word_vectors;
+ __pyx_t_8.word_locks = __pyx_v_word_locks;
+ __pyx_t_8.doctag_vectors = __pyx_v_doctag_vectors;
+ __pyx_t_8.doctag_locks = __pyx_v_doctag_locks;
+ __pyx_t_8.docvecs_count = __pyx_v_docvecs_count;
+ __pyx_t_2 = __pyx_f_6gensim_6models_13doc2vec_inner_init_d2v_config((&__pyx_v_c), __pyx_v_model, __pyx_t_7, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, &__pyx_t_8); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 162, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":168
+ *
+ * # release GIL & train on the full corpus, document by document
+ * with nogil: # <<<<<<<<<<<<<<
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ */
+ {
+ #ifdef WITH_THREAD
+ PyThreadState *_save;
+ Py_UNBLOCK_THREADS
+ __Pyx_FastGIL_Remember();
+ #endif
+ /*try:*/ {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":169
+ * # release GIL & train on the full corpus, document by document
+ * with nogil:
+ * input_stream.reset() # <<<<<<<<<<<<<<
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ * effective_words = 0
+ */
+ ((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_v_input_stream->__pyx_vtab)->reset(__pyx_v_input_stream, 0);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":170
+ * with nogil:
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers): # <<<<<<<<<<<<<<
+ * effective_words = 0
+ *
+ */
+ while (1) {
+ __pyx_t_9 = (((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_v_input_stream->__pyx_vtab)->is_eof(__pyx_v_input_stream, 0) != 0);
+ if (!__pyx_t_9) {
+ } else {
+ __pyx_t_3 = __pyx_t_9;
+ goto __pyx_L8_bool_binop_done;
+ }
+ __pyx_t_9 = ((__pyx_v_total_words > (__pyx_v_expected_words / __pyx_v_c.workers)) != 0);
+ __pyx_t_3 = __pyx_t_9;
+ __pyx_L8_bool_binop_done:;
+ __pyx_t_9 = ((!__pyx_t_3) != 0);
+ if (!__pyx_t_9) break;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":171
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ * effective_words = 0 # <<<<<<<<<<<<<<
+ *
+ * doc_words = input_stream.read_sentence()
+ */
+ __pyx_v_effective_words = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":173
+ * effective_words = 0
+ *
+ * doc_words = input_stream.read_sentence() # <<<<<<<<<<<<<<
+ *
+ * if doc_words.empty():
+ */
+ __pyx_t_10 = ((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_v_input_stream->__pyx_vtab)->read_sentence(__pyx_v_input_stream, 0); if (unlikely(__Pyx_ErrOccurredWithGIL())) __PYX_ERR(0, 173, __pyx_L4_error)
+ __pyx_v_doc_words = __pyx_t_10;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":175
+ * doc_words = input_stream.read_sentence()
+ *
+ * if doc_words.empty(): # <<<<<<<<<<<<<<
+ * continue
+ *
+ */
+ __pyx_t_9 = (__pyx_v_doc_words.empty() != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":176
+ *
+ * if doc_words.empty():
+ * continue # <<<<<<<<<<<<<<
+ *
+ * prepare_c_structures_for_batch(
+ */
+ goto __pyx_L6_continue;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":175
+ * doc_words = input_stream.read_sentence()
+ *
+ * if doc_words.empty(): # <<<<<<<<<<<<<<
+ * continue
+ *
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":180
+ * prepare_c_structures_for_batch(
+ * doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
+ * &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points, # <<<<<<<<<<<<<<
+ * c.reduced_windows, &document_len, c.train_words, c.docvecs_count, _doc_tag)
+ *
+ */
+ __pyx_t_11 = ((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonVocab *)__pyx_v_vocab->__pyx_vtab)->get_vocab_ptr(__pyx_v_vocab); if (unlikely(__Pyx_ErrOccurredWithGIL())) __PYX_ERR(0, 180, __pyx_L4_error)
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":178
+ * continue
+ *
+ * prepare_c_structures_for_batch( # <<<<<<<<<<<<<<
+ * doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
+ * &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points,
+ */
+ __pyx_f_6gensim_6models_18doc2vec_corpusfile_prepare_c_structures_for_batch(__pyx_v_doc_words, __pyx_v_c.sample, __pyx_v_c.hs, __pyx_v_c.window, (&__pyx_v_total_words), (&__pyx_v_effective_words), (&__pyx_v_c.next_random), __pyx_t_11, __pyx_v_c.indexes, __pyx_v_c.codelens, __pyx_v_c.codes, __pyx_v_c.points, __pyx_v_c.reduced_windows, (&__pyx_v_document_len), __pyx_v_c.train_words, __pyx_v_c.docvecs_count, __pyx_v__doc_tag);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":183
+ * c.reduced_windows, &document_len, c.train_words, c.docvecs_count, _doc_tag)
+ *
+ * for i in range(document_len): # <<<<<<<<<<<<<<
+ * if c.train_words: # simultaneous skip-gram wordvec-training
+ * j = i - c.window + c.reduced_windows[i]
+ */
+ __pyx_t_1 = __pyx_v_document_len;
+ __pyx_t_4 = __pyx_t_1;
+ for (__pyx_t_12 = 0; __pyx_t_12 < __pyx_t_4; __pyx_t_12+=1) {
+ __pyx_v_i = __pyx_t_12;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":184
+ *
+ * for i in range(document_len):
+ * if c.train_words: # simultaneous skip-gram wordvec-training # <<<<<<<<<<<<<<
+ * j = i - c.window + c.reduced_windows[i]
+ * if j < 0:
+ */
+ __pyx_t_9 = (__pyx_v_c.train_words != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":185
+ * for i in range(document_len):
+ * if c.train_words: # simultaneous skip-gram wordvec-training
+ * j = i - c.window + c.reduced_windows[i] # <<<<<<<<<<<<<<
+ * if j < 0:
+ * j = 0
+ */
+ __pyx_v_j = ((__pyx_v_i - __pyx_v_c.window) + (__pyx_v_c.reduced_windows[__pyx_v_i]));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":186
+ * if c.train_words: # simultaneous skip-gram wordvec-training
+ * j = i - c.window + c.reduced_windows[i]
+ * if j < 0: # <<<<<<<<<<<<<<
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ */
+ __pyx_t_9 = ((__pyx_v_j < 0) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":187
+ * j = i - c.window + c.reduced_windows[i]
+ * if j < 0:
+ * j = 0 # <<<<<<<<<<<<<<
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ * if k > document_len:
+ */
+ __pyx_v_j = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":186
+ * if c.train_words: # simultaneous skip-gram wordvec-training
+ * j = i - c.window + c.reduced_windows[i]
+ * if j < 0: # <<<<<<<<<<<<<<
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":188
+ * if j < 0:
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i] # <<<<<<<<<<<<<<
+ * if k > document_len:
+ * k = document_len
+ */
+ __pyx_v_k = (((__pyx_v_i + __pyx_v_c.window) + 1) - (__pyx_v_c.reduced_windows[__pyx_v_i]));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":189
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ * if k > document_len: # <<<<<<<<<<<<<<
+ * k = document_len
+ * for j in range(j, k):
+ */
+ __pyx_t_9 = ((__pyx_v_k > __pyx_v_document_len) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":190
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ * if k > document_len:
+ * k = document_len # <<<<<<<<<<<<<<
+ * for j in range(j, k):
+ * if j == i:
+ */
+ __pyx_v_k = __pyx_v_document_len;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":189
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ * if k > document_len: # <<<<<<<<<<<<<<
+ * k = document_len
+ * for j in range(j, k):
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":191
+ * if k > document_len:
+ * k = document_len
+ * for j in range(j, k): # <<<<<<<<<<<<<<
+ * if j == i:
+ * continue
+ */
+ __pyx_t_13 = __pyx_v_k;
+ __pyx_t_14 = __pyx_t_13;
+ for (__pyx_t_15 = __pyx_v_j; __pyx_t_15 < __pyx_t_14; __pyx_t_15+=1) {
+ __pyx_v_j = __pyx_t_15;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":192
+ * k = document_len
+ * for j in range(j, k):
+ * if j == i: # <<<<<<<<<<<<<<
+ * continue
+ * if c.hs:
+ */
+ __pyx_t_9 = ((__pyx_v_j == __pyx_v_i) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":193
+ * for j in range(j, k):
+ * if j == i:
+ * continue # <<<<<<<<<<<<<<
+ * if c.hs:
+ * # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
+ */
+ goto __pyx_L16_continue;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":192
+ * k = document_len
+ * for j in range(j, k):
+ * if j == i: # <<<<<<<<<<<<<<
+ * continue
+ * if c.hs:
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":194
+ * if j == i:
+ * continue
+ * if c.hs: # <<<<<<<<<<<<<<
+ * # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
+ * fast_document_dbow_hs(
+ */
+ __pyx_t_9 = (__pyx_v_c.hs != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":196
+ * if c.hs:
+ * # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
+ * fast_document_dbow_hs( # <<<<<<<<<<<<<<
+ * c.points[i], c.codes[i], c.codelens[i], c.word_vectors, c.syn1, c.layer1_size,
+ * c.indexes[j], c.alpha, c.work, c.learn_words, c.learn_hidden, c.word_locks)
+ */
+ __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs((__pyx_v_c.points[__pyx_v_i]), (__pyx_v_c.codes[__pyx_v_i]), (__pyx_v_c.codelens[__pyx_v_i]), __pyx_v_c.word_vectors, __pyx_v_c.syn1, __pyx_v_c.layer1_size, (__pyx_v_c.indexes[__pyx_v_j]), __pyx_v_c.alpha, __pyx_v_c.work, __pyx_v_c.learn_words, __pyx_v_c.learn_hidden, __pyx_v_c.word_locks);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":194
+ * if j == i:
+ * continue
+ * if c.hs: # <<<<<<<<<<<<<<
+ * # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
+ * fast_document_dbow_hs(
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":200
+ * c.indexes[j], c.alpha, c.work, c.learn_words, c.learn_hidden, c.word_locks)
+ *
+ * if c.negative: # <<<<<<<<<<<<<<
+ * # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
+ * c.next_random = fast_document_dbow_neg(
+ */
+ __pyx_t_9 = (__pyx_v_c.negative != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":202
+ * if c.negative:
+ * # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
+ * c.next_random = fast_document_dbow_neg( # <<<<<<<<<<<<<<
+ * c.negative, c.cum_table, c.cum_table_len, c.word_vectors, c.syn1neg,
+ * c.layer1_size, c.indexes[i], c.indexes[j], c.alpha, c.work,
+ */
+ __pyx_v_c.next_random = __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_neg(__pyx_v_c.negative, __pyx_v_c.cum_table, __pyx_v_c.cum_table_len, __pyx_v_c.word_vectors, __pyx_v_c.syn1neg, __pyx_v_c.layer1_size, (__pyx_v_c.indexes[__pyx_v_i]), (__pyx_v_c.indexes[__pyx_v_j]), __pyx_v_c.alpha, __pyx_v_c.work, __pyx_v_c.next_random, __pyx_v_c.learn_words, __pyx_v_c.learn_hidden, __pyx_v_c.word_locks);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":200
+ * c.indexes[j], c.alpha, c.work, c.learn_words, c.learn_hidden, c.word_locks)
+ *
+ * if c.negative: # <<<<<<<<<<<<<<
+ * # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
+ * c.next_random = fast_document_dbow_neg(
+ */
+ }
+ __pyx_L16_continue:;
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":184
+ *
+ * for i in range(document_len):
+ * if c.train_words: # simultaneous skip-gram wordvec-training # <<<<<<<<<<<<<<
+ * j = i - c.window + c.reduced_windows[i]
+ * if j < 0:
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":208
+ *
+ * # docvec-training
+ * if _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * if c.hs:
+ * fast_document_dbow_hs(
+ */
+ __pyx_t_9 = ((__pyx_v__doc_tag < __pyx_v_c.docvecs_count) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":209
+ * # docvec-training
+ * if _doc_tag < c.docvecs_count:
+ * if c.hs: # <<<<<<<<<<<<<<
+ * fast_document_dbow_hs(
+ * c.points[i], c.codes[i], c.codelens[i], c.doctag_vectors, c.syn1, c.layer1_size,
+ */
+ __pyx_t_9 = (__pyx_v_c.hs != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":210
+ * if _doc_tag < c.docvecs_count:
+ * if c.hs:
+ * fast_document_dbow_hs( # <<<<<<<<<<<<<<
+ * c.points[i], c.codes[i], c.codelens[i], c.doctag_vectors, c.syn1, c.layer1_size,
+ * _doc_tag, c.alpha, c.work, c.learn_doctags, c.learn_hidden, c.doctag_locks)
+ */
+ __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs((__pyx_v_c.points[__pyx_v_i]), (__pyx_v_c.codes[__pyx_v_i]), (__pyx_v_c.codelens[__pyx_v_i]), __pyx_v_c.doctag_vectors, __pyx_v_c.syn1, __pyx_v_c.layer1_size, __pyx_v__doc_tag, __pyx_v_c.alpha, __pyx_v_c.work, __pyx_v_c.learn_doctags, __pyx_v_c.learn_hidden, __pyx_v_c.doctag_locks);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":209
+ * # docvec-training
+ * if _doc_tag < c.docvecs_count:
+ * if c.hs: # <<<<<<<<<<<<<<
+ * fast_document_dbow_hs(
+ * c.points[i], c.codes[i], c.codelens[i], c.doctag_vectors, c.syn1, c.layer1_size,
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":214
+ * _doc_tag, c.alpha, c.work, c.learn_doctags, c.learn_hidden, c.doctag_locks)
+ *
+ * if c.negative: # <<<<<<<<<<<<<<
+ * c.next_random = fast_document_dbow_neg(
+ * c.negative, c.cum_table, c.cum_table_len, c.doctag_vectors, c.syn1neg,
+ */
+ __pyx_t_9 = (__pyx_v_c.negative != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":215
+ *
+ * if c.negative:
+ * c.next_random = fast_document_dbow_neg( # <<<<<<<<<<<<<<
+ * c.negative, c.cum_table, c.cum_table_len, c.doctag_vectors, c.syn1neg,
+ * c.layer1_size, c.indexes[i], _doc_tag, c.alpha, c.work, c.next_random,
+ */
+ __pyx_v_c.next_random = __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_neg(__pyx_v_c.negative, __pyx_v_c.cum_table, __pyx_v_c.cum_table_len, __pyx_v_c.doctag_vectors, __pyx_v_c.syn1neg, __pyx_v_c.layer1_size, (__pyx_v_c.indexes[__pyx_v_i]), __pyx_v__doc_tag, __pyx_v_c.alpha, __pyx_v_c.work, __pyx_v_c.next_random, __pyx_v_c.learn_doctags, __pyx_v_c.learn_hidden, __pyx_v_c.doctag_locks);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":214
+ * _doc_tag, c.alpha, c.work, c.learn_doctags, c.learn_hidden, c.doctag_locks)
+ *
+ * if c.negative: # <<<<<<<<<<<<<<
+ * c.next_random = fast_document_dbow_neg(
+ * c.negative, c.cum_table, c.cum_table_len, c.doctag_vectors, c.syn1neg,
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":208
+ *
+ * # docvec-training
+ * if _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * if c.hs:
+ * fast_document_dbow_hs(
+ */
+ }
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":220
+ * c.learn_doctags, c.learn_hidden, c.doctag_locks)
+ *
+ * total_documents += 1 # <<<<<<<<<<<<<<
+ * total_effective_words += effective_words
+ * _doc_tag += 1
+ */
+ __pyx_v_total_documents = (__pyx_v_total_documents + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":221
+ *
+ * total_documents += 1
+ * total_effective_words += effective_words # <<<<<<<<<<<<<<
+ * _doc_tag += 1
+ *
+ */
+ __pyx_v_total_effective_words = (__pyx_v_total_effective_words + __pyx_v_effective_words);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":222
+ * total_documents += 1
+ * total_effective_words += effective_words
+ * _doc_tag += 1 # <<<<<<<<<<<<<<
+ *
+ * c.alpha = get_next_alpha(
+ */
+ __pyx_v__doc_tag = (__pyx_v__doc_tag + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":224
+ * _doc_tag += 1
+ *
+ * c.alpha = get_next_alpha( # <<<<<<<<<<<<<<
+ * start_alpha, end_alpha, total_documents, total_words,
+ * expected_examples, expected_words, cur_epoch, num_epochs)
+ */
+ __pyx_v_c.alpha = __pyx_f_6gensim_6models_19word2vec_corpusfile_get_next_alpha(__pyx_v_start_alpha, __pyx_v_end_alpha, __pyx_v_total_documents, __pyx_v_total_words, __pyx_v_expected_examples, __pyx_v_expected_words, __pyx_v_cur_epoch, __pyx_v_num_epochs);
+ __pyx_L6_continue:;
+ }
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":168
+ *
+ * # release GIL & train on the full corpus, document by document
+ * with nogil: # <<<<<<<<<<<<<<
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ */
+ /*finally:*/ {
+ /*normal exit:*/{
+ #ifdef WITH_THREAD
+ __Pyx_FastGIL_Forget();
+ Py_BLOCK_THREADS
+ #endif
+ goto __pyx_L5;
+ }
+ __pyx_L4_error: {
+ #ifdef WITH_THREAD
+ __Pyx_FastGIL_Forget();
+ Py_BLOCK_THREADS
+ #endif
+ goto __pyx_L1_error;
+ }
+ __pyx_L5:;
+ }
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":228
+ * expected_examples, expected_words, cur_epoch, num_epochs)
+ *
+ * return total_documents, total_effective_words, total_words # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_total_documents); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 228, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_7 = __Pyx_PyInt_From_int(__pyx_v_total_effective_words); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 228, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+ __pyx_t_16 = __Pyx_PyInt_From_int(__pyx_v_total_words); if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 228, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_16);
+ __pyx_t_17 = PyTuple_New(3); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 228, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_17);
+ __Pyx_GIVEREF(__pyx_t_2);
+ PyTuple_SET_ITEM(__pyx_t_17, 0, __pyx_t_2);
+ __Pyx_GIVEREF(__pyx_t_7);
+ PyTuple_SET_ITEM(__pyx_t_17, 1, __pyx_t_7);
+ __Pyx_GIVEREF(__pyx_t_16);
+ PyTuple_SET_ITEM(__pyx_t_17, 2, __pyx_t_16);
+ __pyx_t_2 = 0;
+ __pyx_t_7 = 0;
+ __pyx_t_16 = 0;
+ __pyx_r = __pyx_t_17;
+ __pyx_t_17 = 0;
+ goto __pyx_L0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":95
+ *
+ *
+ * def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, # <<<<<<<<<<<<<<
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_2);
+ __Pyx_XDECREF(__pyx_t_7);
+ __Pyx_XDECREF(__pyx_t_16);
+ __Pyx_XDECREF(__pyx_t_17);
+ __Pyx_AddTraceback("gensim.models.doc2vec_corpusfile.d2v_train_epoch_dbow", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __Pyx_XDECREF((PyObject *)__pyx_v_input_stream);
+ __Pyx_XDECREF((PyObject *)__pyx_v_vocab);
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "gensim/models/doc2vec_corpusfile.pyx":231
+ *
+ *
+ * def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, # <<<<<<<<<<<<<<
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctag_locks=None):
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_6gensim_6models_18doc2vec_corpusfile_3d2v_train_epoch_dm(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static char __pyx_doc_6gensim_6models_18doc2vec_corpusfile_2d2v_train_epoch_dm[] = "d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None, learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctag_locks=None)\nTrain distributed memory model (\"PV-DM\") by training on a corpus file.\n This method implements the DM model with a projection (input) layer that is either the sum or mean of the context\n vectors, depending on the model's `dm_mean` configuration field.\n\n Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train`.\n\n Parameters\n ----------\n model : :class:`~gensim.models.doc2vec.Doc2Vec`\n The FastText model instance to train.\n corpus_file : str\n Path to corpus file.\n _cur_epoch : int\n Current epoch number. Used for calculating and decaying learning rate.\n work : np.ndarray\n Private working memory for each worker.\n neu1 : np.ndarray\n Private working memory for each worker.\n learn_doctags : bool, optional\n Whether the tag vectors should be updated.\n learn_words : bool, optional\n Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**\n `learn_words` and `train_words` are set to True.\n learn_hidden : bool, optional\n Whether or not the weights of the hidden layer will be updated.\n word_vectors : numpy.ndarray, optional\n The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.\n word_locks : numpy.ndarray, optional\n A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates,\n a value of 1 allows to update word-vectors.\n doctag_vectors : numpy.ndarray, optional\n Vector representations of the tags. If None, these will be retrieved from the model.\n doctag_locks : numpy.ndarray, optional""\n The lock factors for each tag, same as `word_locks`, but for document-vectors.\n\n Returns\n -------\n int\n Number of words in the input document that were actually used for training.\n\n ";
+static PyMethodDef __pyx_mdef_6gensim_6models_18doc2vec_corpusfile_3d2v_train_epoch_dm = {"d2v_train_epoch_dm", (PyCFunction)__pyx_pw_6gensim_6models_18doc2vec_corpusfile_3d2v_train_epoch_dm, METH_VARARGS|METH_KEYWORDS, __pyx_doc_6gensim_6models_18doc2vec_corpusfile_2d2v_train_epoch_dm};
+static PyObject *__pyx_pw_6gensim_6models_18doc2vec_corpusfile_3d2v_train_epoch_dm(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+ PyObject *__pyx_v_model = 0;
+ PyObject *__pyx_v_corpus_file = 0;
+ PyObject *__pyx_v_offset = 0;
+ PyObject *__pyx_v_start_doctag = 0;
+ PyObject *__pyx_v__cython_vocab = 0;
+ PyObject *__pyx_v__cur_epoch = 0;
+ PyObject *__pyx_v__expected_examples = 0;
+ PyObject *__pyx_v__expected_words = 0;
+ PyObject *__pyx_v_work = 0;
+ PyObject *__pyx_v_neu1 = 0;
+ PyObject *__pyx_v_docvecs_count = 0;
+ PyObject *__pyx_v_word_vectors = 0;
+ PyObject *__pyx_v_word_locks = 0;
+ PyObject *__pyx_v_learn_doctags = 0;
+ PyObject *__pyx_v_learn_words = 0;
+ PyObject *__pyx_v_learn_hidden = 0;
+ PyObject *__pyx_v_doctag_vectors = 0;
+ PyObject *__pyx_v_doctag_locks = 0;
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("d2v_train_epoch_dm (wrapper)", 0);
+ {
+ static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_model,&__pyx_n_s_corpus_file,&__pyx_n_s_offset,&__pyx_n_s_start_doctag,&__pyx_n_s_cython_vocab,&__pyx_n_s_cur_epoch,&__pyx_n_s_expected_examples,&__pyx_n_s_expected_words,&__pyx_n_s_work,&__pyx_n_s_neu1,&__pyx_n_s_docvecs_count,&__pyx_n_s_word_vectors,&__pyx_n_s_word_locks,&__pyx_n_s_learn_doctags,&__pyx_n_s_learn_words,&__pyx_n_s_learn_hidden,&__pyx_n_s_doctag_vectors,&__pyx_n_s_doctag_locks,0};
+ PyObject* values[18] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":232
+ *
+ * def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None, # <<<<<<<<<<<<<<
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctag_locks=None):
+ * """Train distributed memory model ("PV-DM") by training on a corpus file.
+ */
+ values[11] = ((PyObject *)Py_None);
+ values[12] = ((PyObject *)Py_None);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":233
+ * def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctag_locks=None): # <<<<<<<<<<<<<<
+ * """Train distributed memory model ("PV-DM") by training on a corpus file.
+ * This method implements the DM model with a projection (input) layer that is either the sum or mean of the context
+ */
+ values[13] = ((PyObject *)Py_True);
+ values[14] = ((PyObject *)Py_True);
+ values[15] = ((PyObject *)Py_True);
+ values[16] = ((PyObject *)Py_None);
+ values[17] = ((PyObject *)Py_None);
+ if (unlikely(__pyx_kwds)) {
+ Py_ssize_t kw_args;
+ const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+ switch (pos_args) {
+ case 18: values[17] = PyTuple_GET_ITEM(__pyx_args, 17);
+ CYTHON_FALLTHROUGH;
+ case 17: values[16] = PyTuple_GET_ITEM(__pyx_args, 16);
+ CYTHON_FALLTHROUGH;
+ case 16: values[15] = PyTuple_GET_ITEM(__pyx_args, 15);
+ CYTHON_FALLTHROUGH;
+ case 15: values[14] = PyTuple_GET_ITEM(__pyx_args, 14);
+ CYTHON_FALLTHROUGH;
+ case 14: values[13] = PyTuple_GET_ITEM(__pyx_args, 13);
+ CYTHON_FALLTHROUGH;
+ case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12);
+ CYTHON_FALLTHROUGH;
+ case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11);
+ CYTHON_FALLTHROUGH;
+ case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10);
+ CYTHON_FALLTHROUGH;
+ case 10: values[9] = PyTuple_GET_ITEM(__pyx_args, 9);
+ CYTHON_FALLTHROUGH;
+ case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8);
+ CYTHON_FALLTHROUGH;
+ case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7);
+ CYTHON_FALLTHROUGH;
+ case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6);
+ CYTHON_FALLTHROUGH;
+ case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
+ CYTHON_FALLTHROUGH;
+ case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ CYTHON_FALLTHROUGH;
+ case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ CYTHON_FALLTHROUGH;
+ case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ CYTHON_FALLTHROUGH;
+ case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ CYTHON_FALLTHROUGH;
+ case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ CYTHON_FALLTHROUGH;
+ case 0: break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ kw_args = PyDict_Size(__pyx_kwds);
+ switch (pos_args) {
+ case 0:
+ if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_model)) != 0)) kw_args--;
+ else goto __pyx_L5_argtuple_error;
+ CYTHON_FALLTHROUGH;
+ case 1:
+ if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_corpus_file)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 1); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 2:
+ if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_offset)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 2); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 3:
+ if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_start_doctag)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 3); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 4:
+ if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_cython_vocab)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 4); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 5:
+ if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_cur_epoch)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 5); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 6:
+ if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_expected_examples)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 6); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 7:
+ if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_expected_words)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 7); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 8:
+ if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_work)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 8); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 9:
+ if (likely((values[9] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_neu1)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 9); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 10:
+ if (likely((values[10] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_docvecs_count)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, 10); __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 11:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_word_vectors);
+ if (value) { values[11] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 12:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_word_locks);
+ if (value) { values[12] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 13:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_learn_doctags);
+ if (value) { values[13] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 14:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_learn_words);
+ if (value) { values[14] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 15:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_learn_hidden);
+ if (value) { values[15] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 16:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_doctag_vectors);
+ if (value) { values[16] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 17:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_doctag_locks);
+ if (value) { values[17] = value; kw_args--; }
+ }
+ }
+ if (unlikely(kw_args > 0)) {
+ if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "d2v_train_epoch_dm") < 0)) __PYX_ERR(0, 231, __pyx_L3_error)
+ }
+ } else {
+ switch (PyTuple_GET_SIZE(__pyx_args)) {
+ case 18: values[17] = PyTuple_GET_ITEM(__pyx_args, 17);
+ CYTHON_FALLTHROUGH;
+ case 17: values[16] = PyTuple_GET_ITEM(__pyx_args, 16);
+ CYTHON_FALLTHROUGH;
+ case 16: values[15] = PyTuple_GET_ITEM(__pyx_args, 15);
+ CYTHON_FALLTHROUGH;
+ case 15: values[14] = PyTuple_GET_ITEM(__pyx_args, 14);
+ CYTHON_FALLTHROUGH;
+ case 14: values[13] = PyTuple_GET_ITEM(__pyx_args, 13);
+ CYTHON_FALLTHROUGH;
+ case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12);
+ CYTHON_FALLTHROUGH;
+ case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11);
+ CYTHON_FALLTHROUGH;
+ case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10);
+ values[9] = PyTuple_GET_ITEM(__pyx_args, 9);
+ values[8] = PyTuple_GET_ITEM(__pyx_args, 8);
+ values[7] = PyTuple_GET_ITEM(__pyx_args, 7);
+ values[6] = PyTuple_GET_ITEM(__pyx_args, 6);
+ values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
+ values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ }
+ __pyx_v_model = values[0];
+ __pyx_v_corpus_file = values[1];
+ __pyx_v_offset = values[2];
+ __pyx_v_start_doctag = values[3];
+ __pyx_v__cython_vocab = values[4];
+ __pyx_v__cur_epoch = values[5];
+ __pyx_v__expected_examples = values[6];
+ __pyx_v__expected_words = values[7];
+ __pyx_v_work = values[8];
+ __pyx_v_neu1 = values[9];
+ __pyx_v_docvecs_count = values[10];
+ __pyx_v_word_vectors = values[11];
+ __pyx_v_word_locks = values[12];
+ __pyx_v_learn_doctags = values[13];
+ __pyx_v_learn_words = values[14];
+ __pyx_v_learn_hidden = values[15];
+ __pyx_v_doctag_vectors = values[16];
+ __pyx_v_doctag_locks = values[17];
+ }
+ goto __pyx_L4_argument_unpacking_done;
+ __pyx_L5_argtuple_error:;
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm", 0, 11, 18, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 231, __pyx_L3_error)
+ __pyx_L3_error:;
+ __Pyx_AddTraceback("gensim.models.doc2vec_corpusfile.d2v_train_epoch_dm", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __Pyx_RefNannyFinishContext();
+ return NULL;
+ __pyx_L4_argument_unpacking_done:;
+ __pyx_r = __pyx_pf_6gensim_6models_18doc2vec_corpusfile_2d2v_train_epoch_dm(__pyx_self, __pyx_v_model, __pyx_v_corpus_file, __pyx_v_offset, __pyx_v_start_doctag, __pyx_v__cython_vocab, __pyx_v__cur_epoch, __pyx_v__expected_examples, __pyx_v__expected_words, __pyx_v_work, __pyx_v_neu1, __pyx_v_docvecs_count, __pyx_v_word_vectors, __pyx_v_word_locks, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, __pyx_v_doctag_vectors, __pyx_v_doctag_locks);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":231
+ *
+ *
+ * def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, # <<<<<<<<<<<<<<
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctag_locks=None):
+ */
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_pf_6gensim_6models_18doc2vec_corpusfile_2d2v_train_epoch_dm(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_corpus_file, PyObject *__pyx_v_offset, PyObject *__pyx_v_start_doctag, PyObject *__pyx_v__cython_vocab, PyObject *__pyx_v__cur_epoch, PyObject *__pyx_v__expected_examples, PyObject *__pyx_v__expected_words, PyObject *__pyx_v_work, PyObject *__pyx_v_neu1, PyObject *__pyx_v_docvecs_count, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks) {
+ struct __pyx_t_6gensim_6models_13doc2vec_inner_Doc2VecConfig __pyx_v_c;
+ int __pyx_v_cur_epoch;
+ int __pyx_v_num_epochs;
+ int __pyx_v_expected_examples;
+ int __pyx_v_expected_words;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v_start_alpha;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v_end_alpha;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v__alpha;
+ struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *__pyx_v_input_stream = 0;
+ struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonVocab *__pyx_v_vocab = 0;
+ int __pyx_v_i;
+ int __pyx_v_j;
+ int __pyx_v_k;
+ int __pyx_v_m;
+ int __pyx_v_document_len;
+ int __pyx_v_effective_words;
+ int __pyx_v_total_effective_words;
+ int __pyx_v_total_documents;
+ int __pyx_v_total_words;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v_count;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v_inv_count;
+ std::vector<std::string> __pyx_v_doc_words;
+ int __pyx_v__doc_tag;
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ PyObject *__pyx_t_2 = NULL;
+ int __pyx_t_3;
+ int __pyx_t_4;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_t_5;
+ __pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t __pyx_t_6;
+ PyObject *__pyx_t_7 = NULL;
+ struct __pyx_opt_args_6gensim_6models_13doc2vec_inner_init_d2v_config __pyx_t_8;
+ int __pyx_t_9;
+ std::vector<std::string> __pyx_t_10;
+ __pyx_t_6gensim_6models_19word2vec_corpusfile_cvocab_t *__pyx_t_11;
+ int __pyx_t_12;
+ int __pyx_t_13;
+ int __pyx_t_14;
+ int __pyx_t_15;
+ PyObject *__pyx_t_16 = NULL;
+ PyObject *__pyx_t_17 = NULL;
+ __Pyx_RefNannySetupContext("d2v_train_epoch_dm", 0);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":277
+ * cdef Doc2VecConfig c
+ *
+ * cdef int cur_epoch = _cur_epoch # <<<<<<<<<<<<<<
+ * cdef int num_epochs = model.epochs
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ */
+ __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v__cur_epoch); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 277, __pyx_L1_error)
+ __pyx_v_cur_epoch = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":278
+ *
+ * cdef int cur_epoch = _cur_epoch
+ * cdef int num_epochs = model.epochs # <<<<<<<<<<<<<<
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_epochs); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 278, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_t_2); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 278, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_num_epochs = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":279
+ * cdef int cur_epoch = _cur_epoch
+ * cdef int num_epochs = model.epochs
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples) # <<<<<<<<<<<<<<
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ * cdef REAL_t start_alpha = model.alpha
+ */
+ __pyx_t_3 = (__pyx_v__expected_examples == Py_None);
+ if ((__pyx_t_3 != 0)) {
+ __pyx_t_1 = -1;
+ } else {
+ __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_v__expected_examples); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 279, __pyx_L1_error)
+ __pyx_t_1 = __pyx_t_4;
+ }
+ __pyx_v_expected_examples = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":280
+ * cdef int num_epochs = model.epochs
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words) # <<<<<<<<<<<<<<
+ * cdef REAL_t start_alpha = model.alpha
+ * cdef REAL_t end_alpha = model.min_alpha
+ */
+ __pyx_t_3 = (__pyx_v__expected_words == Py_None);
+ if ((__pyx_t_3 != 0)) {
+ __pyx_t_1 = -1;
+ } else {
+ __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_v__expected_words); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 280, __pyx_L1_error)
+ __pyx_t_1 = __pyx_t_4;
+ }
+ __pyx_v_expected_words = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":281
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ * cdef REAL_t start_alpha = model.alpha # <<<<<<<<<<<<<<
+ * cdef REAL_t end_alpha = model.min_alpha
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_alpha); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 281, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_5 = __pyx_PyFloat_AsFloat(__pyx_t_2); if (unlikely((__pyx_t_5 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 281, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_start_alpha = __pyx_t_5;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":282
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ * cdef REAL_t start_alpha = model.alpha
+ * cdef REAL_t end_alpha = model.min_alpha # <<<<<<<<<<<<<<
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)
+ *
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_min_alpha); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 282, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_5 = __pyx_PyFloat_AsFloat(__pyx_t_2); if (unlikely((__pyx_t_5 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 282, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_end_alpha = __pyx_t_5;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":283
+ * cdef REAL_t start_alpha = model.alpha
+ * cdef REAL_t end_alpha = model.min_alpha
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs) # <<<<<<<<<<<<<<
+ *
+ * cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_alpha); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 283, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_6 = __pyx_PyFloat_AsFloat(__pyx_t_2); if (unlikely((__pyx_t_6 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 283, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v__alpha = __pyx_f_6gensim_6models_19word2vec_corpusfile_get_alpha(__pyx_t_6, __pyx_v_end_alpha, __pyx_v_cur_epoch, __pyx_v_num_epochs);
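+  /* Editor's note: get_alpha() comes from word2vec_corpusfile and presumably performs the usual
+   * linear per-epoch decay, roughly alpha = start_alpha - (start_alpha - end_alpha) * cur_epoch / num_epochs,
+   * so each epoch begins with a progressively smaller learning rate. */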
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":285
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)
+ *
+ * cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset) # <<<<<<<<<<<<<<
+ * cdef CythonVocab vocab = _cython_vocab
+ *
+ */
+ __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 285, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_INCREF(__pyx_v_corpus_file);
+ __Pyx_GIVEREF(__pyx_v_corpus_file);
+ PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_v_corpus_file);
+ __Pyx_INCREF(__pyx_v_offset);
+ __Pyx_GIVEREF(__pyx_v_offset);
+ PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_v_offset);
+ __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_6gensim_6models_19word2vec_corpusfile_CythonLineSentence), __pyx_t_2, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 285, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_input_stream = ((struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_t_7);
+ __pyx_t_7 = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":286
+ *
+ * cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
+ * cdef CythonVocab vocab = _cython_vocab # <<<<<<<<<<<<<<
+ *
+ * cdef int i, j, k, m, document_len
+ */
+ if (!(likely(((__pyx_v__cython_vocab) == Py_None) || likely(__Pyx_TypeTest(__pyx_v__cython_vocab, __pyx_ptype_6gensim_6models_19word2vec_corpusfile_CythonVocab))))) __PYX_ERR(0, 286, __pyx_L1_error)
+ __pyx_t_7 = __pyx_v__cython_vocab;
+ __Pyx_INCREF(__pyx_t_7);
+ __pyx_v_vocab = ((struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonVocab *)__pyx_t_7);
+ __pyx_t_7 = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":289
+ *
+ * cdef int i, j, k, m, document_len
+ * cdef int effective_words = 0 # <<<<<<<<<<<<<<
+ * cdef int total_effective_words = 0, total_documents = 0, total_words = 0
+ * cdef int sent_idx, idx_start, idx_end
+ */
+ __pyx_v_effective_words = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":290
+ * cdef int i, j, k, m, document_len
+ * cdef int effective_words = 0
+ * cdef int total_effective_words = 0, total_documents = 0, total_words = 0 # <<<<<<<<<<<<<<
+ * cdef int sent_idx, idx_start, idx_end
+ * cdef REAL_t count, inv_count = 1.0
+ */
+ __pyx_v_total_effective_words = 0;
+ __pyx_v_total_documents = 0;
+ __pyx_v_total_words = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":292
+ * cdef int total_effective_words = 0, total_documents = 0, total_words = 0
+ * cdef int sent_idx, idx_start, idx_end
+ * cdef REAL_t count, inv_count = 1.0 # <<<<<<<<<<<<<<
+ *
+ * cdef vector[string] doc_words
+ */
+ __pyx_v_inv_count = 1.0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":295
+ *
+ * cdef vector[string] doc_words
+ * cdef int _doc_tag = start_doctag # <<<<<<<<<<<<<<
+ *
+ * init_d2v_config(
+ */
+ __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_start_doctag); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 295, __pyx_L1_error)
+ __pyx_v__doc_tag = __pyx_t_1;
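+  /* _doc_tag starts at the doctag index assigned to this worker (start_doctag) and is incremented
+   * once per document below, so consecutive lines of the corpus file map to consecutive
+   * document-tag vectors. */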
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":298
+ *
+ * init_d2v_config(
+ * &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False, # <<<<<<<<<<<<<<
+ * work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks,
+ * doctag_vectors=doctag_vectors, doctag_locks=doctag_locks, docvecs_count=docvecs_count)
+ */
+ __pyx_t_7 = PyFloat_FromDouble(__pyx_v__alpha); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 298, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":297
+ * cdef int _doc_tag = start_doctag
+ *
+ * init_d2v_config( # <<<<<<<<<<<<<<
+ * &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False,
+ * work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks,
+ */
+ __pyx_t_8.__pyx_n = 8;
+ __pyx_t_8.train_words = Py_False;
+ __pyx_t_8.work = __pyx_v_work;
+ __pyx_t_8.neu1 = __pyx_v_neu1;
+ __pyx_t_8.word_vectors = __pyx_v_word_vectors;
+ __pyx_t_8.word_locks = __pyx_v_word_locks;
+ __pyx_t_8.doctag_vectors = __pyx_v_doctag_vectors;
+ __pyx_t_8.doctag_locks = __pyx_v_doctag_locks;
+ __pyx_t_8.docvecs_count = __pyx_v_docvecs_count;
+ __pyx_t_2 = __pyx_f_6gensim_6models_13doc2vec_inner_init_d2v_config((&__pyx_v_c), __pyx_v_model, __pyx_t_7, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, &__pyx_t_8); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 297, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
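+  /* init_d2v_config() populates the Doc2VecConfig struct `c` with the model hyperparameters and raw
+   * pointers to the weight matrices and work buffers (word_vectors, doctag_vectors, syn1/syn1neg,
+   * the lock arrays, work and neu1), so the nogil loop below can train without touching Python
+   * objects.  Note that train_words is fixed to False on this code path. */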
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":303
+ *
+ * # release GIL & train on the full corpus, document by document
+ * with nogil: # <<<<<<<<<<<<<<
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ */
+ {
+ #ifdef WITH_THREAD
+ PyThreadState *_save;
+ Py_UNBLOCK_THREADS
+ __Pyx_FastGIL_Remember();
+ #endif
+ /*try:*/ {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":304
+ * # release GIL & train on the full corpus, document by document
+ * with nogil:
+ * input_stream.reset() # <<<<<<<<<<<<<<
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ * effective_words = 0
+ */
+ ((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_v_input_stream->__pyx_vtab)->reset(__pyx_v_input_stream, 0);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":305
+ * with nogil:
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers): # <<<<<<<<<<<<<<
+ * effective_words = 0
+ *
+ */
+ while (1) {
+ __pyx_t_9 = (((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_v_input_stream->__pyx_vtab)->is_eof(__pyx_v_input_stream, 0) != 0);
+ if (!__pyx_t_9) {
+ } else {
+ __pyx_t_3 = __pyx_t_9;
+ goto __pyx_L8_bool_binop_done;
+ }
+ __pyx_t_9 = ((__pyx_v_total_words > (__pyx_v_expected_words / __pyx_v_c.workers)) != 0);
+ __pyx_t_3 = __pyx_t_9;
+ __pyx_L8_bool_binop_done:;
+ __pyx_t_9 = ((!__pyx_t_3) != 0);
+ if (!__pyx_t_9) break;
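+        /* One document (one line of the corpus file) is processed per iteration.  Each worker stops
+         * at end-of-file or once it has consumed its share of the corpus, expected_words / c.workers
+         * words, which appears to be how the file-based mode splits work across threads without a
+         * job queue. */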
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":306
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ * effective_words = 0 # <<<<<<<<<<<<<<
+ *
+ * doc_words = input_stream.read_sentence()
+ */
+ __pyx_v_effective_words = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":308
+ * effective_words = 0
+ *
+ * doc_words = input_stream.read_sentence() # <<<<<<<<<<<<<<
+ *
+ * if doc_words.empty():
+ */
+ __pyx_t_10 = ((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_v_input_stream->__pyx_vtab)->read_sentence(__pyx_v_input_stream, 0); if (unlikely(__Pyx_ErrOccurredWithGIL())) __PYX_ERR(0, 308, __pyx_L4_error)
+ __pyx_v_doc_words = __pyx_t_10;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":310
+ * doc_words = input_stream.read_sentence()
+ *
+ * if doc_words.empty(): # <<<<<<<<<<<<<<
+ * continue
+ *
+ */
+ __pyx_t_9 = (__pyx_v_doc_words.empty() != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":311
+ *
+ * if doc_words.empty():
+ * continue # <<<<<<<<<<<<<<
+ *
+ * prepare_c_structures_for_batch(
+ */
+ goto __pyx_L6_continue;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":310
+ * doc_words = input_stream.read_sentence()
+ *
+ * if doc_words.empty(): # <<<<<<<<<<<<<<
+ * continue
+ *
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":315
+ * prepare_c_structures_for_batch(
+ * doc_words, c.sample, c.hs, c.window, &total_words, &effective_words, &c.next_random,
+ * vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points, c.reduced_windows, # <<<<<<<<<<<<<<
+ * &document_len, c.train_words, c.docvecs_count, _doc_tag)
+ *
+ */
+ __pyx_t_11 = ((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonVocab *)__pyx_v_vocab->__pyx_vtab)->get_vocab_ptr(__pyx_v_vocab); if (unlikely(__Pyx_ErrOccurredWithGIL())) __PYX_ERR(0, 315, __pyx_L4_error)
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":313
+ * continue
+ *
+ * prepare_c_structures_for_batch( # <<<<<<<<<<<<<<
+ * doc_words, c.sample, c.hs, c.window, &total_words, &effective_words, &c.next_random,
+ * vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points, c.reduced_windows,
+ */
+ __pyx_f_6gensim_6models_18doc2vec_corpusfile_prepare_c_structures_for_batch(__pyx_v_doc_words, __pyx_v_c.sample, __pyx_v_c.hs, __pyx_v_c.window, (&__pyx_v_total_words), (&__pyx_v_effective_words), (&__pyx_v_c.next_random), __pyx_t_11, __pyx_v_c.indexes, __pyx_v_c.codelens, __pyx_v_c.codes, __pyx_v_c.points, __pyx_v_c.reduced_windows, (&__pyx_v_document_len), __pyx_v_c.train_words, __pyx_v_c.docvecs_count, __pyx_v__doc_tag);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":318
+ * &document_len, c.train_words, c.docvecs_count, _doc_tag)
+ *
+ * for i in range(document_len): # <<<<<<<<<<<<<<
+ * j = i - c.window + c.reduced_windows[i]
+ * if j < 0:
+ */
+ __pyx_t_1 = __pyx_v_document_len;
+ __pyx_t_4 = __pyx_t_1;
+ for (__pyx_t_12 = 0; __pyx_t_12 < __pyx_t_4; __pyx_t_12+=1) {
+ __pyx_v_i = __pyx_t_12;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":319
+ *
+ * for i in range(document_len):
+ * j = i - c.window + c.reduced_windows[i] # <<<<<<<<<<<<<<
+ * if j < 0:
+ * j = 0
+ */
+ __pyx_v_j = ((__pyx_v_i - __pyx_v_c.window) + (__pyx_v_c.reduced_windows[__pyx_v_i]));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":320
+ * for i in range(document_len):
+ * j = i - c.window + c.reduced_windows[i]
+ * if j < 0: # <<<<<<<<<<<<<<
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ */
+ __pyx_t_9 = ((__pyx_v_j < 0) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":321
+ * j = i - c.window + c.reduced_windows[i]
+ * if j < 0:
+ * j = 0 # <<<<<<<<<<<<<<
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ * if k > document_len:
+ */
+ __pyx_v_j = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":320
+ * for i in range(document_len):
+ * j = i - c.window + c.reduced_windows[i]
+ * if j < 0: # <<<<<<<<<<<<<<
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":322
+ * if j < 0:
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i] # <<<<<<<<<<<<<<
+ * if k > document_len:
+ * k = document_len
+ */
+ __pyx_v_k = (((__pyx_v_i + __pyx_v_c.window) + 1) - (__pyx_v_c.reduced_windows[__pyx_v_i]));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":323
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ * if k > document_len: # <<<<<<<<<<<<<<
+ * k = document_len
+ *
+ */
+ __pyx_t_9 = ((__pyx_v_k > __pyx_v_document_len) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":324
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ * if k > document_len:
+ * k = document_len # <<<<<<<<<<<<<<
+ *
+ * # compose l1 (in _neu1) & clear _work
+ */
+ __pyx_v_k = __pyx_v_document_len;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":323
+ * j = 0
+ * k = i + c.window + 1 - c.reduced_windows[i]
+ * if k > document_len: # <<<<<<<<<<<<<<
+ * k = document_len
+ *
+ */
+ }
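+          /* [j, k) is the effective context window around position i: nominally i-window .. i+window,
+           * shrunk on both sides by the random reduced_windows[i] (the usual dynamic-window trick)
+           * and clamped to the document boundaries. */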
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":327
+ *
+ * # compose l1 (in _neu1) & clear _work
+ * memset(c.neu1, 0, c.layer1_size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
+ * count = 0.0
+ * for m in range(j, k):
+ */
+ (void)(memset(__pyx_v_c.neu1, 0, (__pyx_v_c.layer1_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":328
+ * # compose l1 (in _neu1) & clear _work
+ * memset(c.neu1, 0, c.layer1_size * cython.sizeof(REAL_t))
+ * count = 0.0 # <<<<<<<<<<<<<<
+ * for m in range(j, k):
+ * if m == i:
+ */
+ __pyx_v_count = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)0.0);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":329
+ * memset(c.neu1, 0, c.layer1_size * cython.sizeof(REAL_t))
+ * count = 0.0
+ * for m in range(j, k): # <<<<<<<<<<<<<<
+ * if m == i:
+ * continue
+ */
+ __pyx_t_13 = __pyx_v_k;
+ __pyx_t_14 = __pyx_t_13;
+ for (__pyx_t_15 = __pyx_v_j; __pyx_t_15 < __pyx_t_14; __pyx_t_15+=1) {
+ __pyx_v_m = __pyx_t_15;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":330
+ * count = 0.0
+ * for m in range(j, k):
+ * if m == i: # <<<<<<<<<<<<<<
+ * continue
+ * else:
+ */
+ __pyx_t_9 = ((__pyx_v_m == __pyx_v_i) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":331
+ * for m in range(j, k):
+ * if m == i:
+ * continue # <<<<<<<<<<<<<<
+ * else:
+ * count += ONEF
+ */
+ goto __pyx_L15_continue;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":330
+ * count = 0.0
+ * for m in range(j, k):
+ * if m == i: # <<<<<<<<<<<<<<
+ * continue
+ * else:
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":333
+ * continue
+ * else:
+ * count += ONEF # <<<<<<<<<<<<<<
+ * our_saxpy(&c.layer1_size, &ONEF, &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE, c.neu1, &ONE)
+ *
+ */
+ /*else*/ {
+ __pyx_v_count = (__pyx_v_count + __pyx_v_6gensim_6models_18doc2vec_corpusfile_ONEF);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":334
+ * else:
+ * count += ONEF
+ * our_saxpy(&c.layer1_size, &ONEF, &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE, c.neu1, &ONE) # <<<<<<<<<<<<<<
+ *
+ * if _doc_tag < c.docvecs_count:
+ */
+ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_c.layer1_size), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONEF), (&(__pyx_v_c.word_vectors[((__pyx_v_c.indexes[__pyx_v_m]) * __pyx_v_c.layer1_size)])), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE), __pyx_v_c.neu1, (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE));
+ }
+ __pyx_L15_continue:;
+ }
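+          /* At this point c.neu1 holds the sum of the context word vectors in [j, k) excluding the
+           * centre word i, and count is the number of vectors accumulated. */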
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":336
+ * our_saxpy(&c.layer1_size, &ONEF, &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE, c.neu1, &ONE)
+ *
+ * if _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * count += ONEF
+ * our_saxpy(&c.layer1_size, &ONEF, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE, c.neu1, &ONE)
+ */
+ __pyx_t_9 = ((__pyx_v__doc_tag < __pyx_v_c.docvecs_count) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":337
+ *
+ * if _doc_tag < c.docvecs_count:
+ * count += ONEF # <<<<<<<<<<<<<<
+ * our_saxpy(&c.layer1_size, &ONEF, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE, c.neu1, &ONE)
+ * if count > (0.5):
+ */
+ __pyx_v_count = (__pyx_v_count + __pyx_v_6gensim_6models_18doc2vec_corpusfile_ONEF);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":338
+ * if _doc_tag < c.docvecs_count:
+ * count += ONEF
+ * our_saxpy(&c.layer1_size, &ONEF, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE, c.neu1, &ONE) # <<<<<<<<<<<<<<
+ * if count > (0.5):
+ * inv_count = ONEF/count
+ */
+ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_c.layer1_size), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONEF), (&(__pyx_v_c.doctag_vectors[(__pyx_v__doc_tag * __pyx_v_c.layer1_size)])), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE), __pyx_v_c.neu1, (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":336
+ * our_saxpy(&c.layer1_size, &ONEF, &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE, c.neu1, &ONE)
+ *
+ * if _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * count += ONEF
+ * our_saxpy(&c.layer1_size, &ONEF, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE, c.neu1, &ONE)
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":339
+ * count += ONEF
+ * our_saxpy(&c.layer1_size, &ONEF, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE, c.neu1, &ONE)
+ * if count > (0.5): # <<<<<<<<<<<<<<
+ * inv_count = ONEF/count
+ * if c.cbow_mean:
+ */
+ __pyx_t_9 = ((__pyx_v_count > ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)0.5)) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":340
+ * our_saxpy(&c.layer1_size, &ONEF, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE, c.neu1, &ONE)
+ * if count > (0.5):
+ * inv_count = ONEF/count # <<<<<<<<<<<<<<
+ * if c.cbow_mean:
+ * sscal(&c.layer1_size, &inv_count, c.neu1, &ONE) # (does this need BLAS-variants like saxpy?)
+ */
+ __pyx_v_inv_count = (__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONEF / __pyx_v_count);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":339
+ * count += ONEF
+ * our_saxpy(&c.layer1_size, &ONEF, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE, c.neu1, &ONE)
+ * if count > (0.5): # <<<<<<<<<<<<<<
+ * inv_count = ONEF/count
+ * if c.cbow_mean:
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":341
+ * if count > (0.5):
+ * inv_count = ONEF/count
+ * if c.cbow_mean: # <<<<<<<<<<<<<<
+ * sscal(&c.layer1_size, &inv_count, c.neu1, &ONE) # (does this need BLAS-variants like saxpy?)
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error
+ */
+ __pyx_t_9 = (__pyx_v_c.cbow_mean != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":342
+ * inv_count = ONEF/count
+ * if c.cbow_mean:
+ * sscal(&c.layer1_size, &inv_count, c.neu1, &ONE) # (does this need BLAS-variants like saxpy?) # <<<<<<<<<<<<<<
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error
+ * if c.hs:
+ */
+ __pyx_v_6gensim_6models_14word2vec_inner_sscal((&__pyx_v_c.layer1_size), (&__pyx_v_inv_count), __pyx_v_c.neu1, (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":341
+ * if count > (0.5):
+ * inv_count = ONEF/count
+ * if c.cbow_mean: # <<<<<<<<<<<<<<
+ * sscal(&c.layer1_size, &inv_count, c.neu1, &ONE) # (does this need BLAS-variants like saxpy?)
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error
+ */
+ }
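+          /* With cbow_mean set, neu1 becomes the average of the context word vectors plus the
+           * document vector, i.e. neu1 = (sum of context vectors + doctag vector) / count; otherwise
+           * the raw sum is used and the 1/count scaling is instead applied to the accumulated error
+           * below. */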
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":343
+ * if c.cbow_mean:
+ * sscal(&c.layer1_size, &inv_count, c.neu1, &ONE) # (does this need BLAS-variants like saxpy?)
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error # <<<<<<<<<<<<<<
+ * if c.hs:
+ * fast_document_dm_hs(
+ */
+ (void)(memset(__pyx_v_c.work, 0, (__pyx_v_c.layer1_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":344
+ * sscal(&c.layer1_size, &inv_count, c.neu1, &ONE) # (does this need BLAS-variants like saxpy?)
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error
+ * if c.hs: # <<<<<<<<<<<<<<
+ * fast_document_dm_hs(
+ * c.points[i], c.codes[i], c.codelens[i], c.neu1,
+ */
+ __pyx_t_9 = (__pyx_v_c.hs != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":345
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error
+ * if c.hs:
+ * fast_document_dm_hs( # <<<<<<<<<<<<<<
+ * c.points[i], c.codes[i], c.codelens[i], c.neu1,
+ * c.syn1, c.alpha, c.work, c.layer1_size, c.learn_hidden)
+ */
+ __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs((__pyx_v_c.points[__pyx_v_i]), (__pyx_v_c.codes[__pyx_v_i]), (__pyx_v_c.codelens[__pyx_v_i]), __pyx_v_c.neu1, __pyx_v_c.syn1, __pyx_v_c.alpha, __pyx_v_c.work, __pyx_v_c.layer1_size, __pyx_v_c.learn_hidden);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":344
+ * sscal(&c.layer1_size, &inv_count, c.neu1, &ONE) # (does this need BLAS-variants like saxpy?)
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error
+ * if c.hs: # <<<<<<<<<<<<<<
+ * fast_document_dm_hs(
+ * c.points[i], c.codes[i], c.codelens[i], c.neu1,
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":349
+ * c.syn1, c.alpha, c.work, c.layer1_size, c.learn_hidden)
+ *
+ * if c.negative: # <<<<<<<<<<<<<<
+ * c.next_random = fast_document_dm_neg(
+ * c.negative, c.cum_table, c.cum_table_len, c.next_random, c.neu1,
+ */
+ __pyx_t_9 = (__pyx_v_c.negative != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":350
+ *
+ * if c.negative:
+ * c.next_random = fast_document_dm_neg( # <<<<<<<<<<<<<<
+ * c.negative, c.cum_table, c.cum_table_len, c.next_random, c.neu1,
+ * c.syn1neg, c.indexes[i], c.alpha, c.work, c.layer1_size, c.learn_hidden)
+ */
+ __pyx_v_c.next_random = __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_neg(__pyx_v_c.negative, __pyx_v_c.cum_table, __pyx_v_c.cum_table_len, __pyx_v_c.next_random, __pyx_v_c.neu1, __pyx_v_c.syn1neg, (__pyx_v_c.indexes[__pyx_v_i]), __pyx_v_c.alpha, __pyx_v_c.work, __pyx_v_c.layer1_size, __pyx_v_c.learn_hidden);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":349
+ * c.syn1, c.alpha, c.work, c.layer1_size, c.learn_hidden)
+ *
+ * if c.negative: # <<<<<<<<<<<<<<
+ * c.next_random = fast_document_dm_neg(
+ * c.negative, c.cum_table, c.cum_table_len, c.next_random, c.neu1,
+ */
+ }
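+          /* The update for the centre word runs through hierarchical softmax (fast_document_dm_hs)
+           * and/or negative sampling (fast_document_dm_neg), depending on how the model was
+           * configured; both accumulate the input-layer error into c.work. */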
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":354
+ * c.syn1neg, c.indexes[i], c.alpha, c.work, c.layer1_size, c.learn_hidden)
+ *
+ * if not c.cbow_mean: # <<<<<<<<<<<<<<
+ * sscal(&c.layer1_size, &inv_count, c.work, &ONE) # (does this need BLAS-variants like saxpy?)
+ * # apply accumulated error in work
+ */
+ __pyx_t_9 = ((!(__pyx_v_c.cbow_mean != 0)) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":355
+ *
+ * if not c.cbow_mean:
+ * sscal(&c.layer1_size, &inv_count, c.work, &ONE) # (does this need BLAS-variants like saxpy?) # <<<<<<<<<<<<<<
+ * # apply accumulated error in work
+ * if c.learn_doctags and _doc_tag < c.docvecs_count:
+ */
+ __pyx_v_6gensim_6models_14word2vec_inner_sscal((&__pyx_v_c.layer1_size), (&__pyx_v_inv_count), __pyx_v_c.work, (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":354
+ * c.syn1neg, c.indexes[i], c.alpha, c.work, c.layer1_size, c.learn_hidden)
+ *
+ * if not c.cbow_mean: # <<<<<<<<<<<<<<
+ * sscal(&c.layer1_size, &inv_count, c.work, &ONE) # (does this need BLAS-variants like saxpy?)
+ * # apply accumulated error in work
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":357
+ * sscal(&c.layer1_size, &inv_count, c.work, &ONE) # (does this need BLAS-variants like saxpy?)
+ * # apply accumulated error in work
+ * if c.learn_doctags and _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * our_saxpy(&c.layer1_size, &c.doctag_locks[_doc_tag], c.work,
+ * &ONE, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE)
+ */
+ __pyx_t_3 = (__pyx_v_c.learn_doctags != 0);
+ if (__pyx_t_3) {
+ } else {
+ __pyx_t_9 = __pyx_t_3;
+ goto __pyx_L25_bool_binop_done;
+ }
+ __pyx_t_3 = ((__pyx_v__doc_tag < __pyx_v_c.docvecs_count) != 0);
+ __pyx_t_9 = __pyx_t_3;
+ __pyx_L25_bool_binop_done:;
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":358
+ * # apply accumulated error in work
+ * if c.learn_doctags and _doc_tag < c.docvecs_count:
+ * our_saxpy(&c.layer1_size, &c.doctag_locks[_doc_tag], c.work, # <<<<<<<<<<<<<<
+ * &ONE, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE)
+ * if c.learn_words:
+ */
+ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_c.layer1_size), (&(__pyx_v_c.doctag_locks[__pyx_v__doc_tag])), __pyx_v_c.work, (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE), (&(__pyx_v_c.doctag_vectors[(__pyx_v__doc_tag * __pyx_v_c.layer1_size)])), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":357
+ * sscal(&c.layer1_size, &inv_count, c.work, &ONE) # (does this need BLAS-variants like saxpy?)
+ * # apply accumulated error in work
+ * if c.learn_doctags and _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * our_saxpy(&c.layer1_size, &c.doctag_locks[_doc_tag], c.work,
+ * &ONE, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE)
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":360
+ * our_saxpy(&c.layer1_size, &c.doctag_locks[_doc_tag], c.work,
+ * &ONE, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE)
+ * if c.learn_words: # <<<<<<<<<<<<<<
+ * for m in range(j, k):
+ * if m == i:
+ */
+ __pyx_t_9 = (__pyx_v_c.learn_words != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":361
+ * &ONE, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE)
+ * if c.learn_words:
+ * for m in range(j, k): # <<<<<<<<<<<<<<
+ * if m == i:
+ * continue
+ */
+ __pyx_t_13 = __pyx_v_k;
+ __pyx_t_14 = __pyx_t_13;
+ for (__pyx_t_15 = __pyx_v_j; __pyx_t_15 < __pyx_t_14; __pyx_t_15+=1) {
+ __pyx_v_m = __pyx_t_15;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":362
+ * if c.learn_words:
+ * for m in range(j, k):
+ * if m == i: # <<<<<<<<<<<<<<
+ * continue
+ * else:
+ */
+ __pyx_t_9 = ((__pyx_v_m == __pyx_v_i) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":363
+ * for m in range(j, k):
+ * if m == i:
+ * continue # <<<<<<<<<<<<<<
+ * else:
+ * our_saxpy(&c.layer1_size, &c.word_locks[c.indexes[m]], c.work, &ONE,
+ */
+ goto __pyx_L28_continue;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":362
+ * if c.learn_words:
+ * for m in range(j, k):
+ * if m == i: # <<<<<<<<<<<<<<
+ * continue
+ * else:
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":365
+ * continue
+ * else:
+ * our_saxpy(&c.layer1_size, &c.word_locks[c.indexes[m]], c.work, &ONE, # <<<<<<<<<<<<<<
+ * &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE)
+ *
+ */
+ /*else*/ {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":366
+ * else:
+ * our_saxpy(&c.layer1_size, &c.word_locks[c.indexes[m]], c.work, &ONE,
+ * &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE) # <<<<<<<<<<<<<<
+ *
+ * total_documents += 1
+ */
+ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_c.layer1_size), (&(__pyx_v_c.word_locks[(__pyx_v_c.indexes[__pyx_v_m])])), __pyx_v_c.work, (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE), (&(__pyx_v_c.word_vectors[((__pyx_v_c.indexes[__pyx_v_m]) * __pyx_v_c.layer1_size)])), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE));
+ }
+ __pyx_L28_continue:;
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":360
+ * our_saxpy(&c.layer1_size, &c.doctag_locks[_doc_tag], c.work,
+ * &ONE, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE)
+ * if c.learn_words: # <<<<<<<<<<<<<<
+ * for m in range(j, k):
+ * if m == i:
+ */
+ }
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":368
+ * &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE)
+ *
+ * total_documents += 1 # <<<<<<<<<<<<<<
+ * total_effective_words += effective_words
+ * _doc_tag += 1
+ */
+ __pyx_v_total_documents = (__pyx_v_total_documents + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":369
+ *
+ * total_documents += 1
+ * total_effective_words += effective_words # <<<<<<<<<<<<<<
+ * _doc_tag += 1
+ *
+ */
+ __pyx_v_total_effective_words = (__pyx_v_total_effective_words + __pyx_v_effective_words);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":370
+ * total_documents += 1
+ * total_effective_words += effective_words
+ * _doc_tag += 1 # <<<<<<<<<<<<<<
+ *
+ * c.alpha = get_next_alpha(start_alpha, end_alpha, total_documents, total_words, expected_examples,
+ */
+ __pyx_v__doc_tag = (__pyx_v__doc_tag + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":372
+ * _doc_tag += 1
+ *
+ * c.alpha = get_next_alpha(start_alpha, end_alpha, total_documents, total_words, expected_examples, # <<<<<<<<<<<<<<
+ * expected_words, cur_epoch, num_epochs)
+ *
+ */
+ __pyx_v_c.alpha = __pyx_f_6gensim_6models_19word2vec_corpusfile_get_next_alpha(__pyx_v_start_alpha, __pyx_v_end_alpha, __pyx_v_total_documents, __pyx_v_total_words, __pyx_v_expected_examples, __pyx_v_expected_words, __pyx_v_cur_epoch, __pyx_v_num_epochs);
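+        /* get_next_alpha() presumably continues the learning-rate decay within the epoch, based on
+         * progress (documents/words processed versus expected_examples/expected_words), so c.alpha
+         * moves smoothly from start_alpha towards end_alpha over the run. */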
+ __pyx_L6_continue:;
+ }
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":303
+ *
+ * # release GIL & train on the full corpus, document by document
+ * with nogil: # <<<<<<<<<<<<<<
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ */
+ /*finally:*/ {
+ /*normal exit:*/{
+ #ifdef WITH_THREAD
+ __Pyx_FastGIL_Forget();
+ Py_BLOCK_THREADS
+ #endif
+ goto __pyx_L5;
+ }
+ __pyx_L4_error: {
+ #ifdef WITH_THREAD
+ __Pyx_FastGIL_Forget();
+ Py_BLOCK_THREADS
+ #endif
+ goto __pyx_L1_error;
+ }
+ __pyx_L5:;
+ }
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":375
+ * expected_words, cur_epoch, num_epochs)
+ *
+ * return total_documents, total_effective_words, total_words # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_total_documents); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 375, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_7 = __Pyx_PyInt_From_int(__pyx_v_total_effective_words); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 375, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+ __pyx_t_16 = __Pyx_PyInt_From_int(__pyx_v_total_words); if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 375, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_16);
+ __pyx_t_17 = PyTuple_New(3); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 375, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_17);
+ __Pyx_GIVEREF(__pyx_t_2);
+ PyTuple_SET_ITEM(__pyx_t_17, 0, __pyx_t_2);
+ __Pyx_GIVEREF(__pyx_t_7);
+ PyTuple_SET_ITEM(__pyx_t_17, 1, __pyx_t_7);
+ __Pyx_GIVEREF(__pyx_t_16);
+ PyTuple_SET_ITEM(__pyx_t_17, 2, __pyx_t_16);
+ __pyx_t_2 = 0;
+ __pyx_t_7 = 0;
+ __pyx_t_16 = 0;
+ __pyx_r = __pyx_t_17;
+ __pyx_t_17 = 0;
+ goto __pyx_L0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":231
+ *
+ *
+ * def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, # <<<<<<<<<<<<<<
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctag_locks=None):
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_2);
+ __Pyx_XDECREF(__pyx_t_7);
+ __Pyx_XDECREF(__pyx_t_16);
+ __Pyx_XDECREF(__pyx_t_17);
+ __Pyx_AddTraceback("gensim.models.doc2vec_corpusfile.d2v_train_epoch_dm", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __Pyx_XDECREF((PyObject *)__pyx_v_input_stream);
+ __Pyx_XDECREF((PyObject *)__pyx_v_vocab);
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
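+/* Editor's summary of d2v_train_epoch_dm above (PV-DM over a corpus file, single worker):
+ *   1. read the next document (line) until EOF or this worker's word quota is reached;
+ *   2. build the index/code/point arrays and random reduced windows for the document;
+ *   3. for each position i, sum (or average) the context word vectors and the document vector
+ *      into neu1;
+ *   4. run hierarchical-softmax and/or negative-sampling updates for the centre word,
+ *      accumulating the input-layer error in work;
+ *   5. apply the lock-scaled error to the document vector and, if learn_words, to the context
+ *      word vectors;
+ *   6. advance the doctag, update the counters, decay the learning rate, and finally return
+ *      (total_documents, total_effective_words, total_words). */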
+
+/* "gensim/models/doc2vec_corpusfile.pyx":378
+ *
+ *
+ * def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, # <<<<<<<<<<<<<<
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None,
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_6gensim_6models_18doc2vec_corpusfile_5d2v_train_epoch_dm_concat(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static char __pyx_doc_6gensim_6models_18doc2vec_corpusfile_4d2v_train_epoch_dm_concat[] = "d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None, learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctag_locks=None)\nTrain distributed memory model (\"PV-DM\") by training on a corpus file, using a concatenation of the context\n window word vectors (rather than a sum or average).\n This might be slower since the input at each batch will be significantly larger.\n\n Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train`.\n\n Parameters\n ----------\n model : :class:`~gensim.models.doc2vec.Doc2Vec`\n The FastText model instance to train.\n corpus_file : str\n Path to corpus file.\n _cur_epoch : int\n Current epoch number. Used for calculating and decaying learning rate.\n work : np.ndarray\n Private working memory for each worker.\n neu1 : np.ndarray\n Private working memory for each worker.\n learn_doctags : bool, optional\n Whether the tag vectors should be updated.\n learn_words : bool, optional\n Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**\n `learn_words` and `train_words` are set to True.\n learn_hidden : bool, optional\n Whether or not the weights of the hidden layer will be updated.\n word_vectors : numpy.ndarray, optional\n The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.\n word_locks : numpy.ndarray, optional\n A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates,\n a value of 1 allows to update word-vectors.\n doctag_vectors : numpy.ndarray, optional\n Vector representations of the tags. If None, these will be retrieved from the model.\n doctag_locks : numpy.ndarray, optional""\n The lock factors for each tag, same as `word_locks`, but for document-vectors.\n\n Returns\n -------\n int\n Number of words in the input document that were actually used for training.\n\n ";
+static PyMethodDef __pyx_mdef_6gensim_6models_18doc2vec_corpusfile_5d2v_train_epoch_dm_concat = {"d2v_train_epoch_dm_concat", (PyCFunction)__pyx_pw_6gensim_6models_18doc2vec_corpusfile_5d2v_train_epoch_dm_concat, METH_VARARGS|METH_KEYWORDS, __pyx_doc_6gensim_6models_18doc2vec_corpusfile_4d2v_train_epoch_dm_concat};
+static PyObject *__pyx_pw_6gensim_6models_18doc2vec_corpusfile_5d2v_train_epoch_dm_concat(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+ PyObject *__pyx_v_model = 0;
+ PyObject *__pyx_v_corpus_file = 0;
+ PyObject *__pyx_v_offset = 0;
+ PyObject *__pyx_v_start_doctag = 0;
+ PyObject *__pyx_v__cython_vocab = 0;
+ PyObject *__pyx_v__cur_epoch = 0;
+ PyObject *__pyx_v__expected_examples = 0;
+ PyObject *__pyx_v__expected_words = 0;
+ PyObject *__pyx_v_work = 0;
+ PyObject *__pyx_v_neu1 = 0;
+ PyObject *__pyx_v_docvecs_count = 0;
+ PyObject *__pyx_v_word_vectors = 0;
+ PyObject *__pyx_v_word_locks = 0;
+ PyObject *__pyx_v_learn_doctags = 0;
+ PyObject *__pyx_v_learn_words = 0;
+ PyObject *__pyx_v_learn_hidden = 0;
+ PyObject *__pyx_v_doctag_vectors = 0;
+ PyObject *__pyx_v_doctag_locks = 0;
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("d2v_train_epoch_dm_concat (wrapper)", 0);
+ {
+ static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_model,&__pyx_n_s_corpus_file,&__pyx_n_s_offset,&__pyx_n_s_start_doctag,&__pyx_n_s_cython_vocab,&__pyx_n_s_cur_epoch,&__pyx_n_s_expected_examples,&__pyx_n_s_expected_words,&__pyx_n_s_work,&__pyx_n_s_neu1,&__pyx_n_s_docvecs_count,&__pyx_n_s_word_vectors,&__pyx_n_s_word_locks,&__pyx_n_s_learn_doctags,&__pyx_n_s_learn_words,&__pyx_n_s_learn_hidden,&__pyx_n_s_doctag_vectors,&__pyx_n_s_doctag_locks,0};
+ PyObject* values[18] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":379
+ *
+ * def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None, # <<<<<<<<<<<<<<
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None,
+ * doctag_locks=None):
+ */
+ values[11] = ((PyObject *)Py_None);
+ values[12] = ((PyObject *)Py_None);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":380
+ * def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, # <<<<<<<<<<<<<<
+ * doctag_locks=None):
+ * """Train distributed memory model ("PV-DM") by training on a corpus file, using a concatenation of the context
+ */
+ values[13] = ((PyObject *)Py_True);
+ values[14] = ((PyObject *)Py_True);
+ values[15] = ((PyObject *)Py_True);
+ values[16] = ((PyObject *)Py_None);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":381
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None,
+ * doctag_locks=None): # <<<<<<<<<<<<<<
+ * """Train distributed memory model ("PV-DM") by training on a corpus file, using a concatenation of the context
+ * window word vectors (rather than a sum or average).
+ */
+ values[17] = ((PyObject *)Py_None);
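+  /* values[11..17] carry the defaults for the optional keyword arguments (word_vectors=None,
+   * word_locks=None, learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None,
+   * doctag_locks=None); the first 11 arguments are required, as the 11..18 bounds in the
+   * __Pyx_RaiseArgtupleInvalid calls below indicate. */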
+ if (unlikely(__pyx_kwds)) {
+ Py_ssize_t kw_args;
+ const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+ switch (pos_args) {
+ case 18: values[17] = PyTuple_GET_ITEM(__pyx_args, 17);
+ CYTHON_FALLTHROUGH;
+ case 17: values[16] = PyTuple_GET_ITEM(__pyx_args, 16);
+ CYTHON_FALLTHROUGH;
+ case 16: values[15] = PyTuple_GET_ITEM(__pyx_args, 15);
+ CYTHON_FALLTHROUGH;
+ case 15: values[14] = PyTuple_GET_ITEM(__pyx_args, 14);
+ CYTHON_FALLTHROUGH;
+ case 14: values[13] = PyTuple_GET_ITEM(__pyx_args, 13);
+ CYTHON_FALLTHROUGH;
+ case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12);
+ CYTHON_FALLTHROUGH;
+ case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11);
+ CYTHON_FALLTHROUGH;
+ case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10);
+ CYTHON_FALLTHROUGH;
+ case 10: values[9] = PyTuple_GET_ITEM(__pyx_args, 9);
+ CYTHON_FALLTHROUGH;
+ case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8);
+ CYTHON_FALLTHROUGH;
+ case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7);
+ CYTHON_FALLTHROUGH;
+ case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6);
+ CYTHON_FALLTHROUGH;
+ case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
+ CYTHON_FALLTHROUGH;
+ case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ CYTHON_FALLTHROUGH;
+ case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ CYTHON_FALLTHROUGH;
+ case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ CYTHON_FALLTHROUGH;
+ case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ CYTHON_FALLTHROUGH;
+ case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ CYTHON_FALLTHROUGH;
+ case 0: break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ kw_args = PyDict_Size(__pyx_kwds);
+ switch (pos_args) {
+ case 0:
+ if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_model)) != 0)) kw_args--;
+ else goto __pyx_L5_argtuple_error;
+ CYTHON_FALLTHROUGH;
+ case 1:
+ if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_corpus_file)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 1); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 2:
+ if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_offset)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 2); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 3:
+ if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_start_doctag)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 3); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 4:
+ if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_cython_vocab)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 4); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 5:
+ if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_cur_epoch)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 5); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 6:
+ if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_expected_examples)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 6); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 7:
+ if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_expected_words)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 7); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 8:
+ if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_work)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 8); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 9:
+ if (likely((values[9] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_neu1)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 9); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 10:
+ if (likely((values[10] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_docvecs_count)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, 10); __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 11:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_word_vectors);
+ if (value) { values[11] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 12:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_word_locks);
+ if (value) { values[12] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 13:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_learn_doctags);
+ if (value) { values[13] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 14:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_learn_words);
+ if (value) { values[14] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 15:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_learn_hidden);
+ if (value) { values[15] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 16:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_doctag_vectors);
+ if (value) { values[16] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 17:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_doctag_locks);
+ if (value) { values[17] = value; kw_args--; }
+ }
+ }
+ if (unlikely(kw_args > 0)) {
+ if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "d2v_train_epoch_dm_concat") < 0)) __PYX_ERR(0, 378, __pyx_L3_error)
+ }
+ } else {
+ switch (PyTuple_GET_SIZE(__pyx_args)) {
+ case 18: values[17] = PyTuple_GET_ITEM(__pyx_args, 17);
+ CYTHON_FALLTHROUGH;
+ case 17: values[16] = PyTuple_GET_ITEM(__pyx_args, 16);
+ CYTHON_FALLTHROUGH;
+ case 16: values[15] = PyTuple_GET_ITEM(__pyx_args, 15);
+ CYTHON_FALLTHROUGH;
+ case 15: values[14] = PyTuple_GET_ITEM(__pyx_args, 14);
+ CYTHON_FALLTHROUGH;
+ case 14: values[13] = PyTuple_GET_ITEM(__pyx_args, 13);
+ CYTHON_FALLTHROUGH;
+ case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12);
+ CYTHON_FALLTHROUGH;
+ case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11);
+ CYTHON_FALLTHROUGH;
+ case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10);
+ values[9] = PyTuple_GET_ITEM(__pyx_args, 9);
+ values[8] = PyTuple_GET_ITEM(__pyx_args, 8);
+ values[7] = PyTuple_GET_ITEM(__pyx_args, 7);
+ values[6] = PyTuple_GET_ITEM(__pyx_args, 6);
+ values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
+ values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ }
+ __pyx_v_model = values[0];
+ __pyx_v_corpus_file = values[1];
+ __pyx_v_offset = values[2];
+ __pyx_v_start_doctag = values[3];
+ __pyx_v__cython_vocab = values[4];
+ __pyx_v__cur_epoch = values[5];
+ __pyx_v__expected_examples = values[6];
+ __pyx_v__expected_words = values[7];
+ __pyx_v_work = values[8];
+ __pyx_v_neu1 = values[9];
+ __pyx_v_docvecs_count = values[10];
+ __pyx_v_word_vectors = values[11];
+ __pyx_v_word_locks = values[12];
+ __pyx_v_learn_doctags = values[13];
+ __pyx_v_learn_words = values[14];
+ __pyx_v_learn_hidden = values[15];
+ __pyx_v_doctag_vectors = values[16];
+ __pyx_v_doctag_locks = values[17];
+ }
+ goto __pyx_L4_argument_unpacking_done;
+ __pyx_L5_argtuple_error:;
+ __Pyx_RaiseArgtupleInvalid("d2v_train_epoch_dm_concat", 0, 11, 18, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 378, __pyx_L3_error)
+ __pyx_L3_error:;
+ __Pyx_AddTraceback("gensim.models.doc2vec_corpusfile.d2v_train_epoch_dm_concat", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __Pyx_RefNannyFinishContext();
+ return NULL;
+ __pyx_L4_argument_unpacking_done:;
+ __pyx_r = __pyx_pf_6gensim_6models_18doc2vec_corpusfile_4d2v_train_epoch_dm_concat(__pyx_self, __pyx_v_model, __pyx_v_corpus_file, __pyx_v_offset, __pyx_v_start_doctag, __pyx_v__cython_vocab, __pyx_v__cur_epoch, __pyx_v__expected_examples, __pyx_v__expected_words, __pyx_v_work, __pyx_v_neu1, __pyx_v_docvecs_count, __pyx_v_word_vectors, __pyx_v_word_locks, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, __pyx_v_doctag_vectors, __pyx_v_doctag_locks);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":378
+ *
+ *
+ * def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, # <<<<<<<<<<<<<<
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None,
+ */
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
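+/* The wrapper above only unpacks positional and keyword arguments; the actual training loop lives
+ * in the implementation function below, which mirrors d2v_train_epoch_dm but, per its docstring,
+ * concatenates the context-window vectors instead of summing/averaging them (hence the extra
+ * window-position bookkeeping variables such as `n`). */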
+
+static PyObject *__pyx_pf_6gensim_6models_18doc2vec_corpusfile_4d2v_train_epoch_dm_concat(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_corpus_file, PyObject *__pyx_v_offset, PyObject *__pyx_v_start_doctag, PyObject *__pyx_v__cython_vocab, PyObject *__pyx_v__cur_epoch, PyObject *__pyx_v__expected_examples, PyObject *__pyx_v__expected_words, PyObject *__pyx_v_work, PyObject *__pyx_v_neu1, PyObject *__pyx_v_docvecs_count, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks) {
+ struct __pyx_t_6gensim_6models_13doc2vec_inner_Doc2VecConfig __pyx_v_c;
+ int __pyx_v_cur_epoch;
+ int __pyx_v_num_epochs;
+ int __pyx_v_expected_examples;
+ int __pyx_v_expected_words;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v_start_alpha;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v_end_alpha;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_v__alpha;
+ struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *__pyx_v_input_stream = 0;
+ struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonVocab *__pyx_v_vocab = 0;
+ int __pyx_v_i;
+ int __pyx_v_j;
+ int __pyx_v_k;
+ int __pyx_v_m;
+ int __pyx_v_n;
+ int __pyx_v_document_len;
+ int __pyx_v_effective_words;
+ int __pyx_v_total_effective_words;
+ int __pyx_v_total_documents;
+ int __pyx_v_total_words;
+  std::vector<std::string>  __pyx_v_doc_words;
+ int __pyx_v__doc_tag;
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ PyObject *__pyx_t_2 = NULL;
+ int __pyx_t_3;
+ int __pyx_t_4;
+ __pyx_t_6gensim_6models_14word2vec_inner_REAL_t __pyx_t_5;
+ __pyx_t_6gensim_6models_19word2vec_corpusfile_REAL_t __pyx_t_6;
+ PyObject *__pyx_t_7 = NULL;
+ struct __pyx_opt_args_6gensim_6models_13doc2vec_inner_init_d2v_config __pyx_t_8;
+ int __pyx_t_9;
+  std::vector<std::string>  __pyx_t_10;
+ __pyx_t_6gensim_6models_19word2vec_corpusfile_cvocab_t *__pyx_t_11;
+ int __pyx_t_12;
+ int __pyx_t_13;
+ int __pyx_t_14;
+ int __pyx_t_15;
+ int __pyx_t_16;
+ long __pyx_t_17;
+ long __pyx_t_18;
+ PyObject *__pyx_t_19 = NULL;
+ PyObject *__pyx_t_20 = NULL;
+ __Pyx_RefNannySetupContext("d2v_train_epoch_dm_concat", 0);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":425
+ * cdef Doc2VecConfig c
+ *
+ * cdef int cur_epoch = _cur_epoch # <<<<<<<<<<<<<<
+ * cdef int num_epochs = model.epochs
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ */
+ __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v__cur_epoch); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 425, __pyx_L1_error)
+ __pyx_v_cur_epoch = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":426
+ *
+ * cdef int cur_epoch = _cur_epoch
+ * cdef int num_epochs = model.epochs # <<<<<<<<<<<<<<
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_epochs); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 426, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_t_2); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 426, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_num_epochs = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":427
+ * cdef int cur_epoch = _cur_epoch
+ * cdef int num_epochs = model.epochs
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples) # <<<<<<<<<<<<<<
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ * cdef REAL_t start_alpha = model.alpha
+ */
+ __pyx_t_3 = (__pyx_v__expected_examples == Py_None);
+ if ((__pyx_t_3 != 0)) {
+ __pyx_t_1 = -1;
+ } else {
+ __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_v__expected_examples); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 427, __pyx_L1_error)
+ __pyx_t_1 = __pyx_t_4;
+ }
+ __pyx_v_expected_examples = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":428
+ * cdef int num_epochs = model.epochs
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words) # <<<<<<<<<<<<<<
+ * cdef REAL_t start_alpha = model.alpha
+ * cdef REAL_t end_alpha = model.min_alpha
+ */
+ __pyx_t_3 = (__pyx_v__expected_words == Py_None);
+ if ((__pyx_t_3 != 0)) {
+ __pyx_t_1 = -1;
+ } else {
+ __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_v__expected_words); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 428, __pyx_L1_error)
+ __pyx_t_1 = __pyx_t_4;
+ }
+ __pyx_v_expected_words = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":429
+ * cdef int expected_examples = (-1 if _expected_examples is None else _expected_examples)
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ * cdef REAL_t start_alpha = model.alpha # <<<<<<<<<<<<<<
+ * cdef REAL_t end_alpha = model.min_alpha
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_alpha); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 429, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_5 = __pyx_PyFloat_AsFloat(__pyx_t_2); if (unlikely((__pyx_t_5 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 429, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_start_alpha = __pyx_t_5;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":430
+ * cdef int expected_words = (-1 if _expected_words is None else _expected_words)
+ * cdef REAL_t start_alpha = model.alpha
+ * cdef REAL_t end_alpha = model.min_alpha # <<<<<<<<<<<<<<
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)
+ *
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_min_alpha); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 430, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_5 = __pyx_PyFloat_AsFloat(__pyx_t_2); if (unlikely((__pyx_t_5 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 430, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_end_alpha = __pyx_t_5;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":431
+ * cdef REAL_t start_alpha = model.alpha
+ * cdef REAL_t end_alpha = model.min_alpha
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs) # <<<<<<<<<<<<<<
+ *
+ * cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
+ */
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_alpha); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 431, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_6 = __pyx_PyFloat_AsFloat(__pyx_t_2); if (unlikely((__pyx_t_6 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 431, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v__alpha = __pyx_f_6gensim_6models_19word2vec_corpusfile_get_alpha(__pyx_t_6, __pyx_v_end_alpha, __pyx_v_cur_epoch, __pyx_v_num_epochs);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":433
+ * cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)
+ *
+ * cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset) # <<<<<<<<<<<<<<
+ * cdef CythonVocab vocab = _cython_vocab
+ *
+ */
+ __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 433, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_INCREF(__pyx_v_corpus_file);
+ __Pyx_GIVEREF(__pyx_v_corpus_file);
+ PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_v_corpus_file);
+ __Pyx_INCREF(__pyx_v_offset);
+ __Pyx_GIVEREF(__pyx_v_offset);
+ PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_v_offset);
+ __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_6gensim_6models_19word2vec_corpusfile_CythonLineSentence), __pyx_t_2, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 433, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __pyx_v_input_stream = ((struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_t_7);
+ __pyx_t_7 = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":434
+ *
+ * cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
+ * cdef CythonVocab vocab = _cython_vocab # <<<<<<<<<<<<<<
+ *
+ * cdef int i, j, k, m, n, document_len
+ */
+ if (!(likely(((__pyx_v__cython_vocab) == Py_None) || likely(__Pyx_TypeTest(__pyx_v__cython_vocab, __pyx_ptype_6gensim_6models_19word2vec_corpusfile_CythonVocab))))) __PYX_ERR(0, 434, __pyx_L1_error)
+ __pyx_t_7 = __pyx_v__cython_vocab;
+ __Pyx_INCREF(__pyx_t_7);
+ __pyx_v_vocab = ((struct __pyx_obj_6gensim_6models_19word2vec_corpusfile_CythonVocab *)__pyx_t_7);
+ __pyx_t_7 = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":437
+ *
+ * cdef int i, j, k, m, n, document_len
+ * cdef int effective_words = 0 # <<<<<<<<<<<<<<
+ * cdef int total_effective_words = 0, total_documents = 0, total_words = 0
+ * cdef int sent_idx, idx_start, idx_end
+ */
+ __pyx_v_effective_words = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":438
+ * cdef int i, j, k, m, n, document_len
+ * cdef int effective_words = 0
+ * cdef int total_effective_words = 0, total_documents = 0, total_words = 0 # <<<<<<<<<<<<<<
+ * cdef int sent_idx, idx_start, idx_end
+ *
+ */
+ __pyx_v_total_effective_words = 0;
+ __pyx_v_total_documents = 0;
+ __pyx_v_total_words = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":442
+ *
+ * cdef vector[string] doc_words
+ * cdef int _doc_tag = start_doctag # <<<<<<<<<<<<<<
+ *
+ * init_d2v_config(
+ */
+ __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_start_doctag); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 442, __pyx_L1_error)
+ __pyx_v__doc_tag = __pyx_t_1;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":445
+ *
+ * init_d2v_config(
+ * &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False, # <<<<<<<<<<<<<<
+ * work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks,
+ * doctag_vectors=doctag_vectors, doctag_locks=doctag_locks, docvecs_count=docvecs_count)
+ */
+ __pyx_t_7 = PyFloat_FromDouble(__pyx_v__alpha); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 445, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":444
+ * cdef int _doc_tag = start_doctag
+ *
+ * init_d2v_config( # <<<<<<<<<<<<<<
+ * &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False,
+ * work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks,
+ */
+ __pyx_t_8.__pyx_n = 8;
+ __pyx_t_8.train_words = Py_False;
+ __pyx_t_8.work = __pyx_v_work;
+ __pyx_t_8.neu1 = __pyx_v_neu1;
+ __pyx_t_8.word_vectors = __pyx_v_word_vectors;
+ __pyx_t_8.word_locks = __pyx_v_word_locks;
+ __pyx_t_8.doctag_vectors = __pyx_v_doctag_vectors;
+ __pyx_t_8.doctag_locks = __pyx_v_doctag_locks;
+ __pyx_t_8.docvecs_count = __pyx_v_docvecs_count;
+ __pyx_t_2 = __pyx_f_6gensim_6models_13doc2vec_inner_init_d2v_config((&__pyx_v_c), __pyx_v_model, __pyx_t_7, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, &__pyx_t_8); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 444, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":450
+ *
+ * # release GIL & train on the full corpus, document by document
+ * with nogil: # <<<<<<<<<<<<<<
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ */
+ {
+ #ifdef WITH_THREAD
+ PyThreadState *_save;
+ Py_UNBLOCK_THREADS
+ __Pyx_FastGIL_Remember();
+ #endif
+ /*try:*/ {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":451
+ * # release GIL & train on the full corpus, document by document
+ * with nogil:
+ * input_stream.reset() # <<<<<<<<<<<<<<
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ * effective_words = 0
+ */
+ ((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_v_input_stream->__pyx_vtab)->reset(__pyx_v_input_stream, 0);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":452
+ * with nogil:
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers): # <<<<<<<<<<<<<<
+ * effective_words = 0
+ *
+ */
+ while (1) {
+ __pyx_t_9 = (((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_v_input_stream->__pyx_vtab)->is_eof(__pyx_v_input_stream, 0) != 0);
+ if (!__pyx_t_9) {
+ } else {
+ __pyx_t_3 = __pyx_t_9;
+ goto __pyx_L8_bool_binop_done;
+ }
+ __pyx_t_9 = ((__pyx_v_total_words > (__pyx_v_expected_words / __pyx_v_c.workers)) != 0);
+ __pyx_t_3 = __pyx_t_9;
+ __pyx_L8_bool_binop_done:;
+ __pyx_t_9 = ((!__pyx_t_3) != 0);
+ if (!__pyx_t_9) break;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":453
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ * effective_words = 0 # <<<<<<<<<<<<<<
+ *
+ * doc_words = input_stream.read_sentence()
+ */
+ __pyx_v_effective_words = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":455
+ * effective_words = 0
+ *
+ * doc_words = input_stream.read_sentence() # <<<<<<<<<<<<<<
+ * _doc_tag = total_documents
+ * c.doctag_len = _doc_tag < c.docvecs_count
+ */
+ __pyx_t_10 = ((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonLineSentence *)__pyx_v_input_stream->__pyx_vtab)->read_sentence(__pyx_v_input_stream, 0); if (unlikely(__Pyx_ErrOccurredWithGIL())) __PYX_ERR(0, 455, __pyx_L4_error)
+ __pyx_v_doc_words = __pyx_t_10;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":456
+ *
+ * doc_words = input_stream.read_sentence()
+ * _doc_tag = total_documents # <<<<<<<<<<<<<<
+ * c.doctag_len = _doc_tag < c.docvecs_count
+ *
+ */
+ __pyx_v__doc_tag = __pyx_v_total_documents;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":457
+ * doc_words = input_stream.read_sentence()
+ * _doc_tag = total_documents
+ * c.doctag_len = _doc_tag < c.docvecs_count # <<<<<<<<<<<<<<
+ *
+ * # skip doc either empty or without expected number of tags
+ */
+ __pyx_v_c.doctag_len = (__pyx_v__doc_tag < __pyx_v_c.docvecs_count);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":460
+ *
+ * # skip doc either empty or without expected number of tags
+ * if doc_words.empty() or c.expected_doctag_len != c.doctag_len: # <<<<<<<<<<<<<<
+ * continue
+ *
+ */
+ __pyx_t_3 = (__pyx_v_doc_words.empty() != 0);
+ if (!__pyx_t_3) {
+ } else {
+ __pyx_t_9 = __pyx_t_3;
+ goto __pyx_L11_bool_binop_done;
+ }
+ __pyx_t_3 = ((__pyx_v_c.expected_doctag_len != __pyx_v_c.doctag_len) != 0);
+ __pyx_t_9 = __pyx_t_3;
+ __pyx_L11_bool_binop_done:;
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":461
+ * # skip doc either empty or without expected number of tags
+ * if doc_words.empty() or c.expected_doctag_len != c.doctag_len:
+ * continue # <<<<<<<<<<<<<<
+ *
+ * prepare_c_structures_for_batch(
+ */
+ goto __pyx_L6_continue;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":460
+ *
+ * # skip doc either empty or without expected number of tags
+ * if doc_words.empty() or c.expected_doctag_len != c.doctag_len: # <<<<<<<<<<<<<<
+ * continue
+ *
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":465
+ * prepare_c_structures_for_batch(
+ * doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
+ * &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, # <<<<<<<<<<<<<<
+ * c.points, NULL, &document_len, c.train_words, c.docvecs_count, _doc_tag)
+ *
+ */
+ __pyx_t_11 = ((struct __pyx_vtabstruct_6gensim_6models_19word2vec_corpusfile_CythonVocab *)__pyx_v_vocab->__pyx_vtab)->get_vocab_ptr(__pyx_v_vocab); if (unlikely(__Pyx_ErrOccurredWithGIL())) __PYX_ERR(0, 465, __pyx_L4_error)
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":463
+ * continue
+ *
+ * prepare_c_structures_for_batch( # <<<<<<<<<<<<<<
+ * doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
+ * &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes,
+ */
+ __pyx_f_6gensim_6models_18doc2vec_corpusfile_prepare_c_structures_for_batch(__pyx_v_doc_words, __pyx_v_c.sample, __pyx_v_c.hs, __pyx_v_c.window, (&__pyx_v_total_words), (&__pyx_v_effective_words), (&__pyx_v_c.next_random), __pyx_t_11, __pyx_v_c.indexes, __pyx_v_c.codelens, __pyx_v_c.codes, __pyx_v_c.points, NULL, (&__pyx_v_document_len), __pyx_v_c.train_words, __pyx_v_c.docvecs_count, __pyx_v__doc_tag);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":468
+ * c.points, NULL, &document_len, c.train_words, c.docvecs_count, _doc_tag)
+ *
+ * for i in range(document_len): # <<<<<<<<<<<<<<
+ * j = i - c.window # negative OK: will pad with null word
+ * k = i + c.window + 1 # past document end OK: will pad with null word
+ */
+ __pyx_t_1 = __pyx_v_document_len;
+ __pyx_t_4 = __pyx_t_1;
+ for (__pyx_t_12 = 0; __pyx_t_12 < __pyx_t_4; __pyx_t_12+=1) {
+ __pyx_v_i = __pyx_t_12;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":469
+ *
+ * for i in range(document_len):
+ * j = i - c.window # negative OK: will pad with null word # <<<<<<<<<<<<<<
+ * k = i + c.window + 1 # past document end OK: will pad with null word
+ *
+ */
+ __pyx_v_j = (__pyx_v_i - __pyx_v_c.window);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":470
+ * for i in range(document_len):
+ * j = i - c.window # negative OK: will pad with null word
+ * k = i + c.window + 1 # past document end OK: will pad with null word # <<<<<<<<<<<<<<
+ *
+ * # compose l1 & clear work
+ */
+ __pyx_v_k = ((__pyx_v_i + __pyx_v_c.window) + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":473
+ *
+ * # compose l1 & clear work
+ * if _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * # doc vector(s)
+ * memcpy(&c.neu1[0], &c.doctag_vectors[_doc_tag * c.vector_size],
+ */
+ __pyx_t_9 = ((__pyx_v__doc_tag < __pyx_v_c.docvecs_count) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":475
+ * if _doc_tag < c.docvecs_count:
+ * # doc vector(s)
+ * memcpy(&c.neu1[0], &c.doctag_vectors[_doc_tag * c.vector_size], # <<<<<<<<<<<<<<
+ * c.vector_size * cython.sizeof(REAL_t))
+ * n = 0
+ */
+ (void)(memcpy((&(__pyx_v_c.neu1[0])), (&(__pyx_v_c.doctag_vectors[(__pyx_v__doc_tag * __pyx_v_c.vector_size)])), (__pyx_v_c.vector_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":473
+ *
+ * # compose l1 & clear work
+ * if _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * # doc vector(s)
+ * memcpy(&c.neu1[0], &c.doctag_vectors[_doc_tag * c.vector_size],
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":477
+ * memcpy(&c.neu1[0], &c.doctag_vectors[_doc_tag * c.vector_size],
+ * c.vector_size * cython.sizeof(REAL_t))
+ * n = 0 # <<<<<<<<<<<<<<
+ * for m in range(j, k):
+ * # word vectors in window
+ */
+ __pyx_v_n = 0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":478
+ * c.vector_size * cython.sizeof(REAL_t))
+ * n = 0
+ * for m in range(j, k): # <<<<<<<<<<<<<<
+ * # word vectors in window
+ * if m == i:
+ */
+ __pyx_t_13 = __pyx_v_k;
+ __pyx_t_14 = __pyx_t_13;
+ for (__pyx_t_15 = __pyx_v_j; __pyx_t_15 < __pyx_t_14; __pyx_t_15+=1) {
+ __pyx_v_m = __pyx_t_15;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":480
+ * for m in range(j, k):
+ * # word vectors in window
+ * if m == i: # <<<<<<<<<<<<<<
+ * continue
+ * if m < 0 or m >= document_len:
+ */
+ __pyx_t_9 = ((__pyx_v_m == __pyx_v_i) != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":481
+ * # word vectors in window
+ * if m == i:
+ * continue # <<<<<<<<<<<<<<
+ * if m < 0 or m >= document_len:
+ * c.window_indexes[n] = c.null_word_index
+ */
+ goto __pyx_L16_continue;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":480
+ * for m in range(j, k):
+ * # word vectors in window
+ * if m == i: # <<<<<<<<<<<<<<
+ * continue
+ * if m < 0 or m >= document_len:
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":482
+ * if m == i:
+ * continue
+ * if m < 0 or m >= document_len: # <<<<<<<<<<<<<<
+ * c.window_indexes[n] = c.null_word_index
+ * else:
+ */
+ __pyx_t_3 = ((__pyx_v_m < 0) != 0);
+ if (!__pyx_t_3) {
+ } else {
+ __pyx_t_9 = __pyx_t_3;
+ goto __pyx_L20_bool_binop_done;
+ }
+ __pyx_t_3 = ((__pyx_v_m >= __pyx_v_document_len) != 0);
+ __pyx_t_9 = __pyx_t_3;
+ __pyx_L20_bool_binop_done:;
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":483
+ * continue
+ * if m < 0 or m >= document_len:
+ * c.window_indexes[n] = c.null_word_index # <<<<<<<<<<<<<<
+ * else:
+ * c.window_indexes[n] = c.indexes[m]
+ */
+ __pyx_t_16 = __pyx_v_c.null_word_index;
+ (__pyx_v_c.window_indexes[__pyx_v_n]) = __pyx_t_16;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":482
+ * if m == i:
+ * continue
+ * if m < 0 or m >= document_len: # <<<<<<<<<<<<<<
+ * c.window_indexes[n] = c.null_word_index
+ * else:
+ */
+ goto __pyx_L19;
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":485
+ * c.window_indexes[n] = c.null_word_index
+ * else:
+ * c.window_indexes[n] = c.indexes[m] # <<<<<<<<<<<<<<
+ * n += 1
+ * for m in range(2 * c.window):
+ */
+ /*else*/ {
+ (__pyx_v_c.window_indexes[__pyx_v_n]) = (__pyx_v_c.indexes[__pyx_v_m]);
+ }
+ __pyx_L19:;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":486
+ * else:
+ * c.window_indexes[n] = c.indexes[m]
+ * n += 1 # <<<<<<<<<<<<<<
+ * for m in range(2 * c.window):
+ * memcpy(&c.neu1[(c.doctag_len + m) * c.vector_size], &c.word_vectors[c.window_indexes[m] * c.vector_size],
+ */
+ __pyx_v_n = (__pyx_v_n + 1);
+ __pyx_L16_continue:;
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":487
+ * c.window_indexes[n] = c.indexes[m]
+ * n += 1
+ * for m in range(2 * c.window): # <<<<<<<<<<<<<<
+ * memcpy(&c.neu1[(c.doctag_len + m) * c.vector_size], &c.word_vectors[c.window_indexes[m] * c.vector_size],
+ * c.vector_size * cython.sizeof(REAL_t))
+ */
+ __pyx_t_17 = (2 * __pyx_v_c.window);
+ __pyx_t_18 = __pyx_t_17;
+ for (__pyx_t_13 = 0; __pyx_t_13 < __pyx_t_18; __pyx_t_13+=1) {
+ __pyx_v_m = __pyx_t_13;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":488
+ * n += 1
+ * for m in range(2 * c.window):
+ * memcpy(&c.neu1[(c.doctag_len + m) * c.vector_size], &c.word_vectors[c.window_indexes[m] * c.vector_size], # <<<<<<<<<<<<<<
+ * c.vector_size * cython.sizeof(REAL_t))
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error
+ */
+ (void)(memcpy((&(__pyx_v_c.neu1[((__pyx_v_c.doctag_len + __pyx_v_m) * __pyx_v_c.vector_size)])), (&(__pyx_v_c.word_vectors[((__pyx_v_c.window_indexes[__pyx_v_m]) * __pyx_v_c.vector_size)])), (__pyx_v_c.vector_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))));
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":490
+ * memcpy(&c.neu1[(c.doctag_len + m) * c.vector_size], &c.word_vectors[c.window_indexes[m] * c.vector_size],
+ * c.vector_size * cython.sizeof(REAL_t))
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error # <<<<<<<<<<<<<<
+ *
+ * if c.hs:
+ */
+ (void)(memset(__pyx_v_c.work, 0, (__pyx_v_c.layer1_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":492
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error
+ *
+ * if c.hs: # <<<<<<<<<<<<<<
+ * fast_document_dmc_hs(
+ * c.points[i], c.codes[i], c.codelens[i], c.neu1, c.syn1,
+ */
+ __pyx_t_9 = (__pyx_v_c.hs != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":493
+ *
+ * if c.hs:
+ * fast_document_dmc_hs( # <<<<<<<<<<<<<<
+ * c.points[i], c.codes[i], c.codelens[i], c.neu1, c.syn1,
+ * c.alpha, c.work, c.layer1_size, c.vector_size, c.learn_hidden)
+ */
+ __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs((__pyx_v_c.points[__pyx_v_i]), (__pyx_v_c.codes[__pyx_v_i]), (__pyx_v_c.codelens[__pyx_v_i]), __pyx_v_c.neu1, __pyx_v_c.syn1, __pyx_v_c.alpha, __pyx_v_c.work, __pyx_v_c.layer1_size, __pyx_v_c.vector_size, __pyx_v_c.learn_hidden);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":492
+ * memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error
+ *
+ * if c.hs: # <<<<<<<<<<<<<<
+ * fast_document_dmc_hs(
+ * c.points[i], c.codes[i], c.codelens[i], c.neu1, c.syn1,
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":497
+ * c.alpha, c.work, c.layer1_size, c.vector_size, c.learn_hidden)
+ *
+ * if c.negative: # <<<<<<<<<<<<<<
+ * c.next_random = fast_document_dmc_neg(
+ * c.negative, c.cum_table, c.cum_table_len, c.next_random, c.neu1, c.syn1neg,
+ */
+ __pyx_t_9 = (__pyx_v_c.negative != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":498
+ *
+ * if c.negative:
+ * c.next_random = fast_document_dmc_neg( # <<<<<<<<<<<<<<
+ * c.negative, c.cum_table, c.cum_table_len, c.next_random, c.neu1, c.syn1neg,
+ * c.indexes[i], c.alpha, c.work, c.layer1_size, c.vector_size, c.learn_hidden)
+ */
+ __pyx_v_c.next_random = __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_neg(__pyx_v_c.negative, __pyx_v_c.cum_table, __pyx_v_c.cum_table_len, __pyx_v_c.next_random, __pyx_v_c.neu1, __pyx_v_c.syn1neg, (__pyx_v_c.indexes[__pyx_v_i]), __pyx_v_c.alpha, __pyx_v_c.work, __pyx_v_c.layer1_size, __pyx_v_c.vector_size, __pyx_v_c.learn_hidden);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":497
+ * c.alpha, c.work, c.layer1_size, c.vector_size, c.learn_hidden)
+ *
+ * if c.negative: # <<<<<<<<<<<<<<
+ * c.next_random = fast_document_dmc_neg(
+ * c.negative, c.cum_table, c.cum_table_len, c.next_random, c.neu1, c.syn1neg,
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":502
+ * c.indexes[i], c.alpha, c.work, c.layer1_size, c.vector_size, c.learn_hidden)
+ *
+ * if c.learn_doctags and _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * our_saxpy(&c.vector_size, &c.doctag_locks[_doc_tag], &c.work[m * c.vector_size],
+ * &ONE, &c.doctag_vectors[_doc_tag * c.vector_size], &ONE)
+ */
+ __pyx_t_3 = (__pyx_v_c.learn_doctags != 0);
+ if (__pyx_t_3) {
+ } else {
+ __pyx_t_9 = __pyx_t_3;
+ goto __pyx_L27_bool_binop_done;
+ }
+ __pyx_t_3 = ((__pyx_v__doc_tag < __pyx_v_c.docvecs_count) != 0);
+ __pyx_t_9 = __pyx_t_3;
+ __pyx_L27_bool_binop_done:;
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":503
+ *
+ * if c.learn_doctags and _doc_tag < c.docvecs_count:
+ * our_saxpy(&c.vector_size, &c.doctag_locks[_doc_tag], &c.work[m * c.vector_size], # <<<<<<<<<<<<<<
+ * &ONE, &c.doctag_vectors[_doc_tag * c.vector_size], &ONE)
+ * if c.learn_words:
+ */
+ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_c.vector_size), (&(__pyx_v_c.doctag_locks[__pyx_v__doc_tag])), (&(__pyx_v_c.work[(__pyx_v_m * __pyx_v_c.vector_size)])), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE), (&(__pyx_v_c.doctag_vectors[(__pyx_v__doc_tag * __pyx_v_c.vector_size)])), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE));
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":502
+ * c.indexes[i], c.alpha, c.work, c.layer1_size, c.vector_size, c.learn_hidden)
+ *
+ * if c.learn_doctags and _doc_tag < c.docvecs_count: # <<<<<<<<<<<<<<
+ * our_saxpy(&c.vector_size, &c.doctag_locks[_doc_tag], &c.work[m * c.vector_size],
+ * &ONE, &c.doctag_vectors[_doc_tag * c.vector_size], &ONE)
+ */
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":505
+ * our_saxpy(&c.vector_size, &c.doctag_locks[_doc_tag], &c.work[m * c.vector_size],
+ * &ONE, &c.doctag_vectors[_doc_tag * c.vector_size], &ONE)
+ * if c.learn_words: # <<<<<<<<<<<<<<
+ * for m in range(2 * c.window):
+ * our_saxpy(&c.vector_size, &c.word_locks[c.window_indexes[m]], &c.work[(c.doctag_len + m) * c.vector_size],
+ */
+ __pyx_t_9 = (__pyx_v_c.learn_words != 0);
+ if (__pyx_t_9) {
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":506
+ * &ONE, &c.doctag_vectors[_doc_tag * c.vector_size], &ONE)
+ * if c.learn_words:
+ * for m in range(2 * c.window): # <<<<<<<<<<<<<<
+ * our_saxpy(&c.vector_size, &c.word_locks[c.window_indexes[m]], &c.work[(c.doctag_len + m) * c.vector_size],
+ * &ONE, &c.word_vectors[c.window_indexes[m] * c.vector_size], &ONE)
+ */
+ __pyx_t_17 = (2 * __pyx_v_c.window);
+ __pyx_t_18 = __pyx_t_17;
+ for (__pyx_t_13 = 0; __pyx_t_13 < __pyx_t_18; __pyx_t_13+=1) {
+ __pyx_v_m = __pyx_t_13;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":507
+ * if c.learn_words:
+ * for m in range(2 * c.window):
+ * our_saxpy(&c.vector_size, &c.word_locks[c.window_indexes[m]], &c.work[(c.doctag_len + m) * c.vector_size], # <<<<<<<<<<<<<<
+ * &ONE, &c.word_vectors[c.window_indexes[m] * c.vector_size], &ONE)
+ *
+ */
+ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_c.vector_size), (&(__pyx_v_c.word_locks[(__pyx_v_c.window_indexes[__pyx_v_m])])), (&(__pyx_v_c.work[((__pyx_v_c.doctag_len + __pyx_v_m) * __pyx_v_c.vector_size)])), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE), (&(__pyx_v_c.word_vectors[((__pyx_v_c.window_indexes[__pyx_v_m]) * __pyx_v_c.vector_size)])), (&__pyx_v_6gensim_6models_18doc2vec_corpusfile_ONE));
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":505
+ * our_saxpy(&c.vector_size, &c.doctag_locks[_doc_tag], &c.work[m * c.vector_size],
+ * &ONE, &c.doctag_vectors[_doc_tag * c.vector_size], &ONE)
+ * if c.learn_words: # <<<<<<<<<<<<<<
+ * for m in range(2 * c.window):
+ * our_saxpy(&c.vector_size, &c.word_locks[c.window_indexes[m]], &c.work[(c.doctag_len + m) * c.vector_size],
+ */
+ }
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":510
+ * &ONE, &c.word_vectors[c.window_indexes[m] * c.vector_size], &ONE)
+ *
+ * total_documents += 1 # <<<<<<<<<<<<<<
+ * total_effective_words += effective_words
+ * _doc_tag += 1
+ */
+ __pyx_v_total_documents = (__pyx_v_total_documents + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":511
+ *
+ * total_documents += 1
+ * total_effective_words += effective_words # <<<<<<<<<<<<<<
+ * _doc_tag += 1
+ *
+ */
+ __pyx_v_total_effective_words = (__pyx_v_total_effective_words + __pyx_v_effective_words);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":512
+ * total_documents += 1
+ * total_effective_words += effective_words
+ * _doc_tag += 1 # <<<<<<<<<<<<<<
+ *
+ * c.alpha = get_next_alpha(start_alpha, end_alpha, total_documents, total_words, expected_examples,
+ */
+ __pyx_v__doc_tag = (__pyx_v__doc_tag + 1);
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":514
+ * _doc_tag += 1
+ *
+ * c.alpha = get_next_alpha(start_alpha, end_alpha, total_documents, total_words, expected_examples, # <<<<<<<<<<<<<<
+ * expected_words, cur_epoch, num_epochs)
+ *
+ */
+ __pyx_v_c.alpha = __pyx_f_6gensim_6models_19word2vec_corpusfile_get_next_alpha(__pyx_v_start_alpha, __pyx_v_end_alpha, __pyx_v_total_documents, __pyx_v_total_words, __pyx_v_expected_examples, __pyx_v_expected_words, __pyx_v_cur_epoch, __pyx_v_num_epochs);
+ __pyx_L6_continue:;
+ }
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":450
+ *
+ * # release GIL & train on the full corpus, document by document
+ * with nogil: # <<<<<<<<<<<<<<
+ * input_stream.reset()
+ * while not (input_stream.is_eof() or total_words > expected_words / c.workers):
+ */
+ /*finally:*/ {
+ /*normal exit:*/{
+ #ifdef WITH_THREAD
+ __Pyx_FastGIL_Forget();
+ Py_BLOCK_THREADS
+ #endif
+ goto __pyx_L5;
+ }
+ __pyx_L4_error: {
+ #ifdef WITH_THREAD
+ __Pyx_FastGIL_Forget();
+ Py_BLOCK_THREADS
+ #endif
+ goto __pyx_L1_error;
+ }
+ __pyx_L5:;
+ }
+ }
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":517
+ * expected_words, cur_epoch, num_epochs)
+ *
+ * return total_documents, total_effective_words, total_words # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_total_documents); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 517, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_7 = __Pyx_PyInt_From_int(__pyx_v_total_effective_words); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 517, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+ __pyx_t_19 = __Pyx_PyInt_From_int(__pyx_v_total_words); if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 517, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_19);
+ __pyx_t_20 = PyTuple_New(3); if (unlikely(!__pyx_t_20)) __PYX_ERR(0, 517, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_20);
+ __Pyx_GIVEREF(__pyx_t_2);
+ PyTuple_SET_ITEM(__pyx_t_20, 0, __pyx_t_2);
+ __Pyx_GIVEREF(__pyx_t_7);
+ PyTuple_SET_ITEM(__pyx_t_20, 1, __pyx_t_7);
+ __Pyx_GIVEREF(__pyx_t_19);
+ PyTuple_SET_ITEM(__pyx_t_20, 2, __pyx_t_19);
+ __pyx_t_2 = 0;
+ __pyx_t_7 = 0;
+ __pyx_t_19 = 0;
+ __pyx_r = __pyx_t_20;
+ __pyx_t_20 = 0;
+ goto __pyx_L0;
+
+ /* "gensim/models/doc2vec_corpusfile.pyx":378
+ *
+ *
+ * def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, # <<<<<<<<<<<<<<
+ * _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None,
+ * learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None,
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_2);
+ __Pyx_XDECREF(__pyx_t_7);
+ __Pyx_XDECREF(__pyx_t_19);
+ __Pyx_XDECREF(__pyx_t_20);
+ __Pyx_AddTraceback("gensim.models.doc2vec_corpusfile.d2v_train_epoch_dm_concat", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __Pyx_XDECREF((PyObject *)__pyx_v_input_stream);
+ __Pyx_XDECREF((PyObject *)__pyx_v_vocab);
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":215
+ * # experimental exception made for __getbuffer__ and __releasebuffer__
+ * # -- the details of this may change.
+ * def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<<
+ * # This implementation of getbuffer is geared towards Cython
+ * # requirements, and does not yet fulfill the PEP.
+ */
+
+/* Python wrapper */
+static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0);
+ __pyx_r = __pyx_pf_5numpy_7ndarray___getbuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+ int __pyx_v_i;
+ int __pyx_v_ndim;
+ int __pyx_v_endian_detector;
+ int __pyx_v_little_endian;
+ int __pyx_v_t;
+ char *__pyx_v_f;
+ PyArray_Descr *__pyx_v_descr = 0;
+ int __pyx_v_offset;
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ int __pyx_t_2;
+ PyObject *__pyx_t_3 = NULL;
+ int __pyx_t_4;
+ int __pyx_t_5;
+ int __pyx_t_6;
+ PyObject *__pyx_t_7 = NULL;
+ char *__pyx_t_8;
+ if (__pyx_v_info == NULL) {
+ PyErr_SetString(PyExc_BufferError, "PyObject_GetBuffer: view==NULL argument is obsolete");
+ return -1;
+ }
+ __Pyx_RefNannySetupContext("__getbuffer__", 0);
+ __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None);
+ __Pyx_GIVEREF(__pyx_v_info->obj);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":222
+ *
+ * cdef int i, ndim
+ * cdef int endian_detector = 1 # <<<<<<<<<<<<<<
+ * cdef bint little_endian = ((&endian_detector)[0] != 0)
+ *
+ */
+ __pyx_v_endian_detector = 1;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":223
+ * cdef int i, ndim
+ * cdef int endian_detector = 1
+ * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<<
+ *
+ * ndim = PyArray_NDIM(self)
+ */
+ __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":225
+ * cdef bint little_endian = ((&endian_detector)[0] != 0)
+ *
+ * ndim = PyArray_NDIM(self) # <<<<<<<<<<<<<<
+ *
+ * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ */
+ __pyx_v_ndim = PyArray_NDIM(__pyx_v_self);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":227
+ * ndim = PyArray_NDIM(self)
+ *
+ * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<<
+ * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ * raise ValueError(u"ndarray is not C contiguous")
+ */
+ __pyx_t_2 = (((__pyx_v_flags & PyBUF_C_CONTIGUOUS) == PyBUF_C_CONTIGUOUS) != 0);
+ if (__pyx_t_2) {
+ } else {
+ __pyx_t_1 = __pyx_t_2;
+ goto __pyx_L4_bool_binop_done;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":228
+ *
+ * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): # <<<<<<<<<<<<<<
+ * raise ValueError(u"ndarray is not C contiguous")
+ *
+ */
+ __pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_C_CONTIGUOUS) != 0)) != 0);
+ __pyx_t_1 = __pyx_t_2;
+ __pyx_L4_bool_binop_done:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":227
+ * ndim = PyArray_NDIM(self)
+ *
+ * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<<
+ * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ * raise ValueError(u"ndarray is not C contiguous")
+ */
+ if (unlikely(__pyx_t_1)) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":229
+ * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ * raise ValueError(u"ndarray is not C contiguous") # <<<<<<<<<<<<<<
+ *
+ * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ */
+ __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 229, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(1, 229, __pyx_L1_error)
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":227
+ * ndim = PyArray_NDIM(self)
+ *
+ * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<<
+ * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ * raise ValueError(u"ndarray is not C contiguous")
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":231
+ * raise ValueError(u"ndarray is not C contiguous")
+ *
+ * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<<
+ * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ * raise ValueError(u"ndarray is not Fortran contiguous")
+ */
+ __pyx_t_2 = (((__pyx_v_flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS) != 0);
+ if (__pyx_t_2) {
+ } else {
+ __pyx_t_1 = __pyx_t_2;
+ goto __pyx_L7_bool_binop_done;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":232
+ *
+ * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): # <<<<<<<<<<<<<<
+ * raise ValueError(u"ndarray is not Fortran contiguous")
+ *
+ */
+ __pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_F_CONTIGUOUS) != 0)) != 0);
+ __pyx_t_1 = __pyx_t_2;
+ __pyx_L7_bool_binop_done:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":231
+ * raise ValueError(u"ndarray is not C contiguous")
+ *
+ * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<<
+ * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ * raise ValueError(u"ndarray is not Fortran contiguous")
+ */
+ if (unlikely(__pyx_t_1)) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233
+ * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ * raise ValueError(u"ndarray is not Fortran contiguous") # <<<<<<<<<<<<<<
+ *
+ * info.buf = PyArray_DATA(self)
+ */
+ __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 233, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(1, 233, __pyx_L1_error)
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":231
+ * raise ValueError(u"ndarray is not C contiguous")
+ *
+ * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<<
+ * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ * raise ValueError(u"ndarray is not Fortran contiguous")
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":235
+ * raise ValueError(u"ndarray is not Fortran contiguous")
+ *
+ * info.buf = PyArray_DATA(self) # <<<<<<<<<<<<<<
+ * info.ndim = ndim
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ */
+ __pyx_v_info->buf = PyArray_DATA(__pyx_v_self);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":236
+ *
+ * info.buf = PyArray_DATA(self)
+ * info.ndim = ndim # <<<<<<<<<<<<<<
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ * # Allocate new buffer for strides and shape info.
+ */
+ __pyx_v_info->ndim = __pyx_v_ndim;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237
+ * info.buf = PyArray_DATA(self)
+ * info.ndim = ndim
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<<
+ * # Allocate new buffer for strides and shape info.
+ * # This is allocated as one block, strides first.
+ */
+ __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0);
+ if (__pyx_t_1) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":240
+ * # Allocate new buffer for strides and shape info.
+ * # This is allocated as one block, strides first.
+ * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) # <<<<<<<<<<<<<<
+ * info.shape = info.strides + ndim
+ * for i in range(ndim):
+ */
+ __pyx_v_info->strides = ((Py_ssize_t *)PyObject_Malloc((((sizeof(Py_ssize_t)) * 2) * ((size_t)__pyx_v_ndim))));
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":241
+ * # This is allocated as one block, strides first.
+ * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim)
+ * info.shape = info.strides + ndim # <<<<<<<<<<<<<<
+ * for i in range(ndim):
+ * info.strides[i] = PyArray_STRIDES(self)[i]
+ */
+ __pyx_v_info->shape = (__pyx_v_info->strides + __pyx_v_ndim);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":242
+ * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim)
+ * info.shape = info.strides + ndim
+ * for i in range(ndim): # <<<<<<<<<<<<<<
+ * info.strides[i] = PyArray_STRIDES(self)[i]
+ * info.shape[i] = PyArray_DIMS(self)[i]
+ */
+ __pyx_t_4 = __pyx_v_ndim;
+ __pyx_t_5 = __pyx_t_4;
+ for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) {
+ __pyx_v_i = __pyx_t_6;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":243
+ * info.shape = info.strides + ndim
+ * for i in range(ndim):
+ * info.strides[i] = PyArray_STRIDES(self)[i] # <<<<<<<<<<<<<<
+ * info.shape[i] = PyArray_DIMS(self)[i]
+ * else:
+ */
+ (__pyx_v_info->strides[__pyx_v_i]) = (PyArray_STRIDES(__pyx_v_self)[__pyx_v_i]);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":244
+ * for i in range(ndim):
+ * info.strides[i] = PyArray_STRIDES(self)[i]
+ * info.shape[i] = PyArray_DIMS(self)[i] # <<<<<<<<<<<<<<
+ * else:
+ * info.strides = PyArray_STRIDES(self)
+ */
+ (__pyx_v_info->shape[__pyx_v_i]) = (PyArray_DIMS(__pyx_v_self)[__pyx_v_i]);
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237
+ * info.buf = PyArray_DATA(self)
+ * info.ndim = ndim
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<<
+ * # Allocate new buffer for strides and shape info.
+ * # This is allocated as one block, strides first.
+ */
+ goto __pyx_L9;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":246
+ * info.shape[i] = PyArray_DIMS(self)[i]
+ * else:
+ * info.strides = PyArray_STRIDES(self) # <<<<<<<<<<<<<<
+ * info.shape = PyArray_DIMS(self)
+ * info.suboffsets = NULL
+ */
+ /*else*/ {
+ __pyx_v_info->strides = ((Py_ssize_t *)PyArray_STRIDES(__pyx_v_self));
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":247
+ * else:
+ * info.strides = PyArray_STRIDES(self)
+ * info.shape = PyArray_DIMS(self) # <<<<<<<<<<<<<<
+ * info.suboffsets = NULL
+ * info.itemsize = PyArray_ITEMSIZE(self)
+ */
+ __pyx_v_info->shape = ((Py_ssize_t *)PyArray_DIMS(__pyx_v_self));
+ }
+ __pyx_L9:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":248
+ * info.strides = PyArray_STRIDES(self)
+ * info.shape = PyArray_DIMS(self)
+ * info.suboffsets = NULL # <<<<<<<<<<<<<<
+ * info.itemsize = PyArray_ITEMSIZE(self)
+ * info.readonly = not PyArray_ISWRITEABLE(self)
+ */
+ __pyx_v_info->suboffsets = NULL;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":249
+ * info.shape = PyArray_DIMS(self)
+ * info.suboffsets = NULL
+ * info.itemsize = PyArray_ITEMSIZE(self) # <<<<<<<<<<<<<<
+ * info.readonly = not PyArray_ISWRITEABLE(self)
+ *
+ */
+ __pyx_v_info->itemsize = PyArray_ITEMSIZE(__pyx_v_self);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":250
+ * info.suboffsets = NULL
+ * info.itemsize = PyArray_ITEMSIZE(self)
+ * info.readonly = not PyArray_ISWRITEABLE(self) # <<<<<<<<<<<<<<
+ *
+ * cdef int t
+ */
+ __pyx_v_info->readonly = (!(PyArray_ISWRITEABLE(__pyx_v_self) != 0));
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":253
+ *
+ * cdef int t
+ * cdef char* f = NULL # <<<<<<<<<<<<<<
+ * cdef dtype descr = self.descr
+ * cdef int offset
+ */
+ __pyx_v_f = NULL;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":254
+ * cdef int t
+ * cdef char* f = NULL
+ * cdef dtype descr = self.descr # <<<<<<<<<<<<<<
+ * cdef int offset
+ *
+ */
+ __pyx_t_3 = ((PyObject *)__pyx_v_self->descr);
+ __Pyx_INCREF(__pyx_t_3);
+ __pyx_v_descr = ((PyArray_Descr *)__pyx_t_3);
+ __pyx_t_3 = 0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":257
+ * cdef int offset
+ *
+ * info.obj = self # <<<<<<<<<<<<<<
+ *
+ * if not PyDataType_HASFIELDS(descr):
+ */
+ __Pyx_INCREF(((PyObject *)__pyx_v_self));
+ __Pyx_GIVEREF(((PyObject *)__pyx_v_self));
+ __Pyx_GOTREF(__pyx_v_info->obj);
+ __Pyx_DECREF(__pyx_v_info->obj);
+ __pyx_v_info->obj = ((PyObject *)__pyx_v_self);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259
+ * info.obj = self
+ *
+ * if not PyDataType_HASFIELDS(descr): # <<<<<<<<<<<<<<
+ * t = descr.type_num
+ * if ((descr.byteorder == c'>' and little_endian) or
+ */
+ __pyx_t_1 = ((!(PyDataType_HASFIELDS(__pyx_v_descr) != 0)) != 0);
+ if (__pyx_t_1) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":260
+ *
+ * if not PyDataType_HASFIELDS(descr):
+ * t = descr.type_num # <<<<<<<<<<<<<<
+ * if ((descr.byteorder == c'>' and little_endian) or
+ * (descr.byteorder == c'<' and not little_endian)):
+ */
+ __pyx_t_4 = __pyx_v_descr->type_num;
+ __pyx_v_t = __pyx_t_4;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":261
+ * if not PyDataType_HASFIELDS(descr):
+ * t = descr.type_num
+ * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<<
+ * (descr.byteorder == c'<' and not little_endian)):
+ * raise ValueError(u"Non-native byte order not supported")
+ */
+ __pyx_t_2 = ((__pyx_v_descr->byteorder == '>') != 0);
+ if (!__pyx_t_2) {
+ goto __pyx_L15_next_or;
+ } else {
+ }
+ __pyx_t_2 = (__pyx_v_little_endian != 0);
+ if (!__pyx_t_2) {
+ } else {
+ __pyx_t_1 = __pyx_t_2;
+ goto __pyx_L14_bool_binop_done;
+ }
+ __pyx_L15_next_or:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":262
+ * t = descr.type_num
+ * if ((descr.byteorder == c'>' and little_endian) or
+ * (descr.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<<
+ * raise ValueError(u"Non-native byte order not supported")
+ * if t == NPY_BYTE: f = "b"
+ */
+ __pyx_t_2 = ((__pyx_v_descr->byteorder == '<') != 0);
+ if (__pyx_t_2) {
+ } else {
+ __pyx_t_1 = __pyx_t_2;
+ goto __pyx_L14_bool_binop_done;
+ }
+ __pyx_t_2 = ((!(__pyx_v_little_endian != 0)) != 0);
+ __pyx_t_1 = __pyx_t_2;
+ __pyx_L14_bool_binop_done:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":261
+ * if not PyDataType_HASFIELDS(descr):
+ * t = descr.type_num
+ * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<<
+ * (descr.byteorder == c'<' and not little_endian)):
+ * raise ValueError(u"Non-native byte order not supported")
+ */
+ if (unlikely(__pyx_t_1)) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":263
+ * if ((descr.byteorder == c'>' and little_endian) or
+ * (descr.byteorder == c'<' and not little_endian)):
+ * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<<
+ * if t == NPY_BYTE: f = "b"
+ * elif t == NPY_UBYTE: f = "B"
+ */
+ __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 263, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(1, 263, __pyx_L1_error)
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":261
+ * if not PyDataType_HASFIELDS(descr):
+ * t = descr.type_num
+ * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<<
+ * (descr.byteorder == c'<' and not little_endian)):
+ * raise ValueError(u"Non-native byte order not supported")
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":264
+ * (descr.byteorder == c'<' and not little_endian)):
+ * raise ValueError(u"Non-native byte order not supported")
+ * if t == NPY_BYTE: f = "b" # <<<<<<<<<<<<<<
+ * elif t == NPY_UBYTE: f = "B"
+ * elif t == NPY_SHORT: f = "h"
+ */
+ switch (__pyx_v_t) {
+ case NPY_BYTE:
+ __pyx_v_f = ((char *)"b");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":265
+ * raise ValueError(u"Non-native byte order not supported")
+ * if t == NPY_BYTE: f = "b"
+ * elif t == NPY_UBYTE: f = "B" # <<<<<<<<<<<<<<
+ * elif t == NPY_SHORT: f = "h"
+ * elif t == NPY_USHORT: f = "H"
+ */
+ case NPY_UBYTE:
+ __pyx_v_f = ((char *)"B");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":266
+ * if t == NPY_BYTE: f = "b"
+ * elif t == NPY_UBYTE: f = "B"
+ * elif t == NPY_SHORT: f = "h" # <<<<<<<<<<<<<<
+ * elif t == NPY_USHORT: f = "H"
+ * elif t == NPY_INT: f = "i"
+ */
+ case NPY_SHORT:
+ __pyx_v_f = ((char *)"h");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":267
+ * elif t == NPY_UBYTE: f = "B"
+ * elif t == NPY_SHORT: f = "h"
+ * elif t == NPY_USHORT: f = "H" # <<<<<<<<<<<<<<
+ * elif t == NPY_INT: f = "i"
+ * elif t == NPY_UINT: f = "I"
+ */
+ case NPY_USHORT:
+ __pyx_v_f = ((char *)"H");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":268
+ * elif t == NPY_SHORT: f = "h"
+ * elif t == NPY_USHORT: f = "H"
+ * elif t == NPY_INT: f = "i" # <<<<<<<<<<<<<<
+ * elif t == NPY_UINT: f = "I"
+ * elif t == NPY_LONG: f = "l"
+ */
+ case NPY_INT:
+ __pyx_v_f = ((char *)"i");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":269
+ * elif t == NPY_USHORT: f = "H"
+ * elif t == NPY_INT: f = "i"
+ * elif t == NPY_UINT: f = "I" # <<<<<<<<<<<<<<
+ * elif t == NPY_LONG: f = "l"
+ * elif t == NPY_ULONG: f = "L"
+ */
+ case NPY_UINT:
+ __pyx_v_f = ((char *)"I");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":270
+ * elif t == NPY_INT: f = "i"
+ * elif t == NPY_UINT: f = "I"
+ * elif t == NPY_LONG: f = "l" # <<<<<<<<<<<<<<
+ * elif t == NPY_ULONG: f = "L"
+ * elif t == NPY_LONGLONG: f = "q"
+ */
+ case NPY_LONG:
+ __pyx_v_f = ((char *)"l");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":271
+ * elif t == NPY_UINT: f = "I"
+ * elif t == NPY_LONG: f = "l"
+ * elif t == NPY_ULONG: f = "L" # <<<<<<<<<<<<<<
+ * elif t == NPY_LONGLONG: f = "q"
+ * elif t == NPY_ULONGLONG: f = "Q"
+ */
+ case NPY_ULONG:
+ __pyx_v_f = ((char *)"L");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":272
+ * elif t == NPY_LONG: f = "l"
+ * elif t == NPY_ULONG: f = "L"
+ * elif t == NPY_LONGLONG: f = "q" # <<<<<<<<<<<<<<
+ * elif t == NPY_ULONGLONG: f = "Q"
+ * elif t == NPY_FLOAT: f = "f"
+ */
+ case NPY_LONGLONG:
+ __pyx_v_f = ((char *)"q");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":273
+ * elif t == NPY_ULONG: f = "L"
+ * elif t == NPY_LONGLONG: f = "q"
+ * elif t == NPY_ULONGLONG: f = "Q" # <<<<<<<<<<<<<<
+ * elif t == NPY_FLOAT: f = "f"
+ * elif t == NPY_DOUBLE: f = "d"
+ */
+ case NPY_ULONGLONG:
+ __pyx_v_f = ((char *)"Q");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274
+ * elif t == NPY_LONGLONG: f = "q"
+ * elif t == NPY_ULONGLONG: f = "Q"
+ * elif t == NPY_FLOAT: f = "f" # <<<<<<<<<<<<<<
+ * elif t == NPY_DOUBLE: f = "d"
+ * elif t == NPY_LONGDOUBLE: f = "g"
+ */
+ case NPY_FLOAT:
+ __pyx_v_f = ((char *)"f");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":275
+ * elif t == NPY_ULONGLONG: f = "Q"
+ * elif t == NPY_FLOAT: f = "f"
+ * elif t == NPY_DOUBLE: f = "d" # <<<<<<<<<<<<<<
+ * elif t == NPY_LONGDOUBLE: f = "g"
+ * elif t == NPY_CFLOAT: f = "Zf"
+ */
+ case NPY_DOUBLE:
+ __pyx_v_f = ((char *)"d");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":276
+ * elif t == NPY_FLOAT: f = "f"
+ * elif t == NPY_DOUBLE: f = "d"
+ * elif t == NPY_LONGDOUBLE: f = "g" # <<<<<<<<<<<<<<
+ * elif t == NPY_CFLOAT: f = "Zf"
+ * elif t == NPY_CDOUBLE: f = "Zd"
+ */
+ case NPY_LONGDOUBLE:
+ __pyx_v_f = ((char *)"g");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":277
+ * elif t == NPY_DOUBLE: f = "d"
+ * elif t == NPY_LONGDOUBLE: f = "g"
+ * elif t == NPY_CFLOAT: f = "Zf" # <<<<<<<<<<<<<<
+ * elif t == NPY_CDOUBLE: f = "Zd"
+ * elif t == NPY_CLONGDOUBLE: f = "Zg"
+ */
+ case NPY_CFLOAT:
+ __pyx_v_f = ((char *)"Zf");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":278
+ * elif t == NPY_LONGDOUBLE: f = "g"
+ * elif t == NPY_CFLOAT: f = "Zf"
+ * elif t == NPY_CDOUBLE: f = "Zd" # <<<<<<<<<<<<<<
+ * elif t == NPY_CLONGDOUBLE: f = "Zg"
+ * elif t == NPY_OBJECT: f = "O"
+ */
+ case NPY_CDOUBLE:
+ __pyx_v_f = ((char *)"Zd");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":279
+ * elif t == NPY_CFLOAT: f = "Zf"
+ * elif t == NPY_CDOUBLE: f = "Zd"
+ * elif t == NPY_CLONGDOUBLE: f = "Zg" # <<<<<<<<<<<<<<
+ * elif t == NPY_OBJECT: f = "O"
+ * else:
+ */
+ case NPY_CLONGDOUBLE:
+ __pyx_v_f = ((char *)"Zg");
+ break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":280
+ * elif t == NPY_CDOUBLE: f = "Zd"
+ * elif t == NPY_CLONGDOUBLE: f = "Zg"
+ * elif t == NPY_OBJECT: f = "O" # <<<<<<<<<<<<<<
+ * else:
+ * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ */
+ case NPY_OBJECT:
+ __pyx_v_f = ((char *)"O");
+ break;
+ default:
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":282
+ * elif t == NPY_OBJECT: f = "O"
+ * else:
+ * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<<
+ * info.format = f
+ * return
+ */
+ __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_t); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 282, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_7 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_t_3); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 282, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_7);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_7); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 282, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(1, 282, __pyx_L1_error)
+ break;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":283
+ * else:
+ * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ * info.format = f # <<<<<<<<<<<<<<
+ * return
+ * else:
+ */
+ __pyx_v_info->format = __pyx_v_f;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":284
+ * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ * info.format = f
+ * return # <<<<<<<<<<<<<<
+ * else:
+ * info.format = PyObject_Malloc(_buffer_format_string_len)
+ */
+ __pyx_r = 0;
+ goto __pyx_L0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259
+ * info.obj = self
+ *
+ * if not PyDataType_HASFIELDS(descr): # <<<<<<<<<<<<<<
+ * t = descr.type_num
+ * if ((descr.byteorder == c'>' and little_endian) or
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":286
+ * return
+ * else:
+ * info.format = PyObject_Malloc(_buffer_format_string_len) # <<<<<<<<<<<<<<
+ * info.format[0] = c'^' # Native data types, manual alignment
+ * offset = 0
+ */
+ /*else*/ {
+ __pyx_v_info->format = ((char *)PyObject_Malloc(0xFF));
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":287
+ * else:
+ * info.format = PyObject_Malloc(_buffer_format_string_len)
+ * info.format[0] = c'^' # Native data types, manual alignment # <<<<<<<<<<<<<<
+ * offset = 0
+ * f = _util_dtypestring(descr, info.format + 1,
+ */
+ (__pyx_v_info->format[0]) = '^';
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":288
+ * info.format = PyObject_Malloc(_buffer_format_string_len)
+ * info.format[0] = c'^' # Native data types, manual alignment
+ * offset = 0 # <<<<<<<<<<<<<<
+ * f = _util_dtypestring(descr, info.format + 1,
+ * info.format + _buffer_format_string_len,
+ */
+ __pyx_v_offset = 0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":289
+ * info.format[0] = c'^' # Native data types, manual alignment
+ * offset = 0
+ * f = _util_dtypestring(descr, info.format + 1, # <<<<<<<<<<<<<<
+ * info.format + _buffer_format_string_len,
+ * &offset)
+ */
+ __pyx_t_8 = __pyx_f_5numpy__util_dtypestring(__pyx_v_descr, (__pyx_v_info->format + 1), (__pyx_v_info->format + 0xFF), (&__pyx_v_offset)); if (unlikely(__pyx_t_8 == ((char *)NULL))) __PYX_ERR(1, 289, __pyx_L1_error)
+ __pyx_v_f = __pyx_t_8;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":292
+ * info.format + _buffer_format_string_len,
+ * &offset)
+ * f[0] = c'\0' # Terminate format string # <<<<<<<<<<<<<<
+ *
+ * def __releasebuffer__(ndarray self, Py_buffer* info):
+ */
+ (__pyx_v_f[0]) = '\x00';
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":215
+ * # experimental exception made for __getbuffer__ and __releasebuffer__
+ * # -- the details of this may change.
+ * def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<<
+ * # This implementation of getbuffer is geared towards Cython
+ * # requirements, and does not yet fulfill the PEP.
+ */
+
+ /* function exit code */
+ __pyx_r = 0;
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_3);
+ __Pyx_XDECREF(__pyx_t_7);
+ __Pyx_AddTraceback("numpy.ndarray.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = -1;
+ if (__pyx_v_info->obj != NULL) {
+ __Pyx_GOTREF(__pyx_v_info->obj);
+ __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+ }
+ goto __pyx_L2;
+ __pyx_L0:;
+ if (__pyx_v_info->obj == Py_None) {
+ __Pyx_GOTREF(__pyx_v_info->obj);
+ __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+ }
+ __pyx_L2:;
+ __Pyx_XDECREF((PyObject *)__pyx_v_descr);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":294
+ * f[0] = c'\0' # Terminate format string
+ *
+ * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<<
+ * if PyArray_HASFIELDS(self):
+ * PyObject_Free(info.format)
+ */
+
+/* Python wrapper */
+static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info); /*proto*/
+static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info) {
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__releasebuffer__ (wrapper)", 0);
+ __pyx_pf_5numpy_7ndarray_2__releasebuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info) {
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ __Pyx_RefNannySetupContext("__releasebuffer__", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":295
+ *
+ * def __releasebuffer__(ndarray self, Py_buffer* info):
+ * if PyArray_HASFIELDS(self): # <<<<<<<<<<<<<<
+ * PyObject_Free(info.format)
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ */
+ __pyx_t_1 = (PyArray_HASFIELDS(__pyx_v_self) != 0);
+ if (__pyx_t_1) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":296
+ * def __releasebuffer__(ndarray self, Py_buffer* info):
+ * if PyArray_HASFIELDS(self):
+ * PyObject_Free(info.format) # <<<<<<<<<<<<<<
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ * PyObject_Free(info.strides)
+ */
+ PyObject_Free(__pyx_v_info->format);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":295
+ *
+ * def __releasebuffer__(ndarray self, Py_buffer* info):
+ * if PyArray_HASFIELDS(self): # <<<<<<<<<<<<<<
+ * PyObject_Free(info.format)
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":297
+ * if PyArray_HASFIELDS(self):
+ * PyObject_Free(info.format)
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<<
+ * PyObject_Free(info.strides)
+ * # info.shape was stored after info.strides in the same block
+ */
+ __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0);
+ if (__pyx_t_1) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":298
+ * PyObject_Free(info.format)
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ * PyObject_Free(info.strides) # <<<<<<<<<<<<<<
+ * # info.shape was stored after info.strides in the same block
+ *
+ */
+ PyObject_Free(__pyx_v_info->strides);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":297
+ * if PyArray_HASFIELDS(self):
+ * PyObject_Free(info.format)
+ * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<<
+ * PyObject_Free(info.strides)
+ * # info.shape was stored after info.strides in the same block
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":294
+ * f[0] = c'\0' # Terminate format string
+ *
+ * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<<
+ * if PyArray_HASFIELDS(self):
+ * PyObject_Free(info.format)
+ */
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":775
+ * ctypedef npy_cdouble complex_t
+ *
+ * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(1, a)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":776
+ *
+ * cdef inline object PyArray_MultiIterNew1(a):
+ * return PyArray_MultiIterNew(1, a) # <<<<<<<<<<<<<<
+ *
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 776, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":775
+ * ctypedef npy_cdouble complex_t
+ *
+ * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(1, a)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":778
+ * return PyArray_MultiIterNew(1, a)
+ *
+ * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(2, a, b)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":779
+ *
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ * return PyArray_MultiIterNew(2, a, b) # <<<<<<<<<<<<<<
+ *
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 779, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":778
+ * return PyArray_MultiIterNew(1, a)
+ *
+ * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(2, a, b)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":781
+ * return PyArray_MultiIterNew(2, a, b)
+ *
+ * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(3, a, b, c)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":782
+ *
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ * return PyArray_MultiIterNew(3, a, b, c) # <<<<<<<<<<<<<<
+ *
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 782, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":781
+ * return PyArray_MultiIterNew(2, a, b)
+ *
+ * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(3, a, b, c)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":784
+ * return PyArray_MultiIterNew(3, a, b, c)
+ *
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(4, a, b, c, d)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":785
+ *
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ * return PyArray_MultiIterNew(4, a, b, c, d) # <<<<<<<<<<<<<<
+ *
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 785, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":784
+ * return PyArray_MultiIterNew(3, a, b, c)
+ *
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(4, a, b, c, d)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":787
+ * return PyArray_MultiIterNew(4, a, b, c, d)
+ *
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(5, a, b, c, d, e)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":788
+ *
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ * return PyArray_MultiIterNew(5, a, b, c, d, e) # <<<<<<<<<<<<<<
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 788, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":787
+ * return PyArray_MultiIterNew(4, a, b, c, d)
+ *
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(5, a, b, c, d, e)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":790
+ * return PyArray_MultiIterNew(5, a, b, c, d, e)
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<<
+ * if PyDataType_HASSUBARRAY(d):
+ * return d.subarray.shape
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__pyx_v_d) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ __Pyx_RefNannySetupContext("PyDataType_SHAPE", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<<
+ * return d.subarray.shape
+ * else:
+ */
+ __pyx_t_1 = (PyDataType_HASSUBARRAY(__pyx_v_d) != 0);
+ if (__pyx_t_1) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":792
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ * if PyDataType_HASSUBARRAY(d):
+ * return d.subarray.shape # <<<<<<<<<<<<<<
+ * else:
+ * return ()
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __Pyx_INCREF(((PyObject*)__pyx_v_d->subarray->shape));
+ __pyx_r = ((PyObject*)__pyx_v_d->subarray->shape);
+ goto __pyx_L0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<<
+ * return d.subarray.shape
+ * else:
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794
+ * return d.subarray.shape
+ * else:
+ * return () # <<<<<<<<<<<<<<
+ *
+ * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL:
+ */
+ /*else*/ {
+ __Pyx_XDECREF(__pyx_r);
+ __Pyx_INCREF(__pyx_empty_tuple);
+ __pyx_r = __pyx_empty_tuple;
+ goto __pyx_L0;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":790
+ * return PyArray_MultiIterNew(5, a, b, c, d, e)
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<<
+ * if PyDataType_HASSUBARRAY(d):
+ * return d.subarray.shape
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":796
+ * return ()
+ *
+ * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<<
+ * # Recursive utility function used in __getbuffer__ to get format
+ * # string. The new location in the format string is returned.
+ */
+
+static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx_v_descr, char *__pyx_v_f, char *__pyx_v_end, int *__pyx_v_offset) {
+ PyArray_Descr *__pyx_v_child = 0;
+ int __pyx_v_endian_detector;
+ int __pyx_v_little_endian;
+ PyObject *__pyx_v_fields = 0;
+ PyObject *__pyx_v_childname = NULL;
+ PyObject *__pyx_v_new_offset = NULL;
+ PyObject *__pyx_v_t = NULL;
+ char *__pyx_r;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ Py_ssize_t __pyx_t_2;
+ PyObject *__pyx_t_3 = NULL;
+ PyObject *__pyx_t_4 = NULL;
+ int __pyx_t_5;
+ int __pyx_t_6;
+ int __pyx_t_7;
+ long __pyx_t_8;
+ char *__pyx_t_9;
+ __Pyx_RefNannySetupContext("_util_dtypestring", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":801
+ *
+ * cdef dtype child
+ * cdef int endian_detector = 1 # <<<<<<<<<<<<<<
+ * cdef bint little_endian = ((&endian_detector)[0] != 0)
+ * cdef tuple fields
+ */
+ __pyx_v_endian_detector = 1;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":802
+ * cdef dtype child
+ * cdef int endian_detector = 1
+ * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<<
+ * cdef tuple fields
+ *
+ */
+ __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":805
+ * cdef tuple fields
+ *
+ * for childname in descr.names: # <<<<<<<<<<<<<<
+ * fields = descr.fields[childname]
+ * child, new_offset = fields
+ */
+ if (unlikely(__pyx_v_descr->names == Py_None)) {
+ PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
+ __PYX_ERR(1, 805, __pyx_L1_error)
+ }
+ __pyx_t_1 = __pyx_v_descr->names; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0;
+ for (;;) {
+ if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break;
+ #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+ __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_3); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(1, 805, __pyx_L1_error)
+ #else
+ __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 805, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ #endif
+ __Pyx_XDECREF_SET(__pyx_v_childname, __pyx_t_3);
+ __pyx_t_3 = 0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":806
+ *
+ * for childname in descr.names:
+ * fields = descr.fields[childname] # <<<<<<<<<<<<<<
+ * child, new_offset = fields
+ *
+ */
+ if (unlikely(__pyx_v_descr->fields == Py_None)) {
+ PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+ __PYX_ERR(1, 806, __pyx_L1_error)
+ }
+ __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_descr->fields, __pyx_v_childname); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 806, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ if (!(likely(PyTuple_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "tuple", Py_TYPE(__pyx_t_3)->tp_name), 0))) __PYX_ERR(1, 806, __pyx_L1_error)
+ __Pyx_XDECREF_SET(__pyx_v_fields, ((PyObject*)__pyx_t_3));
+ __pyx_t_3 = 0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":807
+ * for childname in descr.names:
+ * fields = descr.fields[childname]
+ * child, new_offset = fields # <<<<<<<<<<<<<<
+ *
+ * if (end - f) - (new_offset - offset[0]) < 15:
+ */
+ if (likely(__pyx_v_fields != Py_None)) {
+ PyObject* sequence = __pyx_v_fields;
+ Py_ssize_t size = __Pyx_PySequence_SIZE(sequence);
+ if (unlikely(size != 2)) {
+ if (size > 2) __Pyx_RaiseTooManyValuesError(2);
+ else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
+ __PYX_ERR(1, 807, __pyx_L1_error)
+ }
+ #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+ __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0);
+ __pyx_t_4 = PyTuple_GET_ITEM(sequence, 1);
+ __Pyx_INCREF(__pyx_t_3);
+ __Pyx_INCREF(__pyx_t_4);
+ #else
+ __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 807, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 807, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ #endif
+ } else {
+ __Pyx_RaiseNoneNotIterableError(); __PYX_ERR(1, 807, __pyx_L1_error)
+ }
+ if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_dtype))))) __PYX_ERR(1, 807, __pyx_L1_error)
+ __Pyx_XDECREF_SET(__pyx_v_child, ((PyArray_Descr *)__pyx_t_3));
+ __pyx_t_3 = 0;
+ __Pyx_XDECREF_SET(__pyx_v_new_offset, __pyx_t_4);
+ __pyx_t_4 = 0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":809
+ * child, new_offset = fields
+ *
+ * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<<
+ * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
+ *
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 809, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyNumber_Subtract(__pyx_v_new_offset, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 809, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 809, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = ((((__pyx_v_end - __pyx_v_f) - ((int)__pyx_t_5)) < 15) != 0);
+ if (unlikely(__pyx_t_6)) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":810
+ *
+ * if (end - f) - (new_offset - offset[0]) < 15:
+ * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") # <<<<<<<<<<<<<<
+ *
+ * if ((child.byteorder == c'>' and little_endian) or
+ */
+ __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 810, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(1, 810, __pyx_L1_error)
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":809
+ * child, new_offset = fields
+ *
+ * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<<
+ * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
+ *
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":812
+ * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
+ *
+ * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<<
+ * (child.byteorder == c'<' and not little_endian)):
+ * raise ValueError(u"Non-native byte order not supported")
+ */
+ __pyx_t_7 = ((__pyx_v_child->byteorder == '>') != 0);
+ if (!__pyx_t_7) {
+ goto __pyx_L8_next_or;
+ } else {
+ }
+ __pyx_t_7 = (__pyx_v_little_endian != 0);
+ if (!__pyx_t_7) {
+ } else {
+ __pyx_t_6 = __pyx_t_7;
+ goto __pyx_L7_bool_binop_done;
+ }
+ __pyx_L8_next_or:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":813
+ *
+ * if ((child.byteorder == c'>' and little_endian) or
+ * (child.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<<
+ * raise ValueError(u"Non-native byte order not supported")
+ * # One could encode it in the format string and have Cython
+ */
+ __pyx_t_7 = ((__pyx_v_child->byteorder == '<') != 0);
+ if (__pyx_t_7) {
+ } else {
+ __pyx_t_6 = __pyx_t_7;
+ goto __pyx_L7_bool_binop_done;
+ }
+ __pyx_t_7 = ((!(__pyx_v_little_endian != 0)) != 0);
+ __pyx_t_6 = __pyx_t_7;
+ __pyx_L7_bool_binop_done:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":812
+ * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
+ *
+ * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<<
+ * (child.byteorder == c'<' and not little_endian)):
+ * raise ValueError(u"Non-native byte order not supported")
+ */
+ if (unlikely(__pyx_t_6)) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":814
+ * if ((child.byteorder == c'>' and little_endian) or
+ * (child.byteorder == c'<' and not little_endian)):
+ * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<<
+ * # One could encode it in the format string and have Cython
+ * # complain instead, BUT: < and > in format strings also imply
+ */
+ __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 814, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(1, 814, __pyx_L1_error)
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":812
+ * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
+ *
+ * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<<
+ * (child.byteorder == c'<' and not little_endian)):
+ * raise ValueError(u"Non-native byte order not supported")
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":824
+ *
+ * # Output padding bytes
+ * while offset[0] < new_offset: # <<<<<<<<<<<<<<
+ * f[0] = 120 # "x"; pad byte
+ * f += 1
+ */
+ while (1) {
+ __pyx_t_3 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 824, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PyObject_RichCompare(__pyx_t_3, __pyx_v_new_offset, Py_LT); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 824, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 824, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ if (!__pyx_t_6) break;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":825
+ * # Output padding bytes
+ * while offset[0] < new_offset:
+ * f[0] = 120 # "x"; pad byte # <<<<<<<<<<<<<<
+ * f += 1
+ * offset[0] += 1
+ */
+ (__pyx_v_f[0]) = 0x78;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":826
+ * while offset[0] < new_offset:
+ * f[0] = 120 # "x"; pad byte
+ * f += 1 # <<<<<<<<<<<<<<
+ * offset[0] += 1
+ *
+ */
+ __pyx_v_f = (__pyx_v_f + 1);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":827
+ * f[0] = 120 # "x"; pad byte
+ * f += 1
+ * offset[0] += 1 # <<<<<<<<<<<<<<
+ *
+ * offset[0] += child.itemsize
+ */
+ __pyx_t_8 = 0;
+ (__pyx_v_offset[__pyx_t_8]) = ((__pyx_v_offset[__pyx_t_8]) + 1);
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":829
+ * offset[0] += 1
+ *
+ * offset[0] += child.itemsize # <<<<<<<<<<<<<<
+ *
+ * if not PyDataType_HASFIELDS(child):
+ */
+ __pyx_t_8 = 0;
+ (__pyx_v_offset[__pyx_t_8]) = ((__pyx_v_offset[__pyx_t_8]) + __pyx_v_child->elsize);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":831
+ * offset[0] += child.itemsize
+ *
+ * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<<
+ * t = child.type_num
+ * if end - f < 5:
+ */
+ __pyx_t_6 = ((!(PyDataType_HASFIELDS(__pyx_v_child) != 0)) != 0);
+ if (__pyx_t_6) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":832
+ *
+ * if not PyDataType_HASFIELDS(child):
+ * t = child.type_num # <<<<<<<<<<<<<<
+ * if end - f < 5:
+ * raise RuntimeError(u"Format string allocated too short.")
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_child->type_num); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 832, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __Pyx_XDECREF_SET(__pyx_v_t, __pyx_t_4);
+ __pyx_t_4 = 0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":833
+ * if not PyDataType_HASFIELDS(child):
+ * t = child.type_num
+ * if end - f < 5: # <<<<<<<<<<<<<<
+ * raise RuntimeError(u"Format string allocated too short.")
+ *
+ */
+ __pyx_t_6 = (((__pyx_v_end - __pyx_v_f) < 5) != 0);
+ if (unlikely(__pyx_t_6)) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":834
+ * t = child.type_num
+ * if end - f < 5:
+ * raise RuntimeError(u"Format string allocated too short.") # <<<<<<<<<<<<<<
+ *
+ * # Until ticket #99 is fixed, use integers to avoid warnings
+ */
+ __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 834, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __PYX_ERR(1, 834, __pyx_L1_error)
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":833
+ * if not PyDataType_HASFIELDS(child):
+ * t = child.type_num
+ * if end - f < 5: # <<<<<<<<<<<<<<
+ * raise RuntimeError(u"Format string allocated too short.")
+ *
+ */
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":837
+ *
+ * # Until ticket #99 is fixed, use integers to avoid warnings
+ * if t == NPY_BYTE: f[0] = 98 #"b" # <<<<<<<<<<<<<<
+ * elif t == NPY_UBYTE: f[0] = 66 #"B"
+ * elif t == NPY_SHORT: f[0] = 104 #"h"
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_BYTE); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 837, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 837, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 837, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 98;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":838
+ * # Until ticket #99 is fixed, use integers to avoid warnings
+ * if t == NPY_BYTE: f[0] = 98 #"b"
+ * elif t == NPY_UBYTE: f[0] = 66 #"B" # <<<<<<<<<<<<<<
+ * elif t == NPY_SHORT: f[0] = 104 #"h"
+ * elif t == NPY_USHORT: f[0] = 72 #"H"
+ */
+ __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_UBYTE); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 838, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 838, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 838, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 66;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":839
+ * if t == NPY_BYTE: f[0] = 98 #"b"
+ * elif t == NPY_UBYTE: f[0] = 66 #"B"
+ * elif t == NPY_SHORT: f[0] = 104 #"h" # <<<<<<<<<<<<<<
+ * elif t == NPY_USHORT: f[0] = 72 #"H"
+ * elif t == NPY_INT: f[0] = 105 #"i"
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_SHORT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 839, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 839, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 839, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 0x68;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":840
+ * elif t == NPY_UBYTE: f[0] = 66 #"B"
+ * elif t == NPY_SHORT: f[0] = 104 #"h"
+ * elif t == NPY_USHORT: f[0] = 72 #"H" # <<<<<<<<<<<<<<
+ * elif t == NPY_INT: f[0] = 105 #"i"
+ * elif t == NPY_UINT: f[0] = 73 #"I"
+ */
+ __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_USHORT); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 840, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 840, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 840, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 72;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":841
+ * elif t == NPY_SHORT: f[0] = 104 #"h"
+ * elif t == NPY_USHORT: f[0] = 72 #"H"
+ * elif t == NPY_INT: f[0] = 105 #"i" # <<<<<<<<<<<<<<
+ * elif t == NPY_UINT: f[0] = 73 #"I"
+ * elif t == NPY_LONG: f[0] = 108 #"l"
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_INT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 841, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 841, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 841, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 0x69;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":842
+ * elif t == NPY_USHORT: f[0] = 72 #"H"
+ * elif t == NPY_INT: f[0] = 105 #"i"
+ * elif t == NPY_UINT: f[0] = 73 #"I" # <<<<<<<<<<<<<<
+ * elif t == NPY_LONG: f[0] = 108 #"l"
+ * elif t == NPY_ULONG: f[0] = 76 #"L"
+ */
+ __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_UINT); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 842, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 842, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 842, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 73;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":843
+ * elif t == NPY_INT: f[0] = 105 #"i"
+ * elif t == NPY_UINT: f[0] = 73 #"I"
+ * elif t == NPY_LONG: f[0] = 108 #"l" # <<<<<<<<<<<<<<
+ * elif t == NPY_ULONG: f[0] = 76 #"L"
+ * elif t == NPY_LONGLONG: f[0] = 113 #"q"
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONG); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 843, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 843, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 843, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 0x6C;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":844
+ * elif t == NPY_UINT: f[0] = 73 #"I"
+ * elif t == NPY_LONG: f[0] = 108 #"l"
+ * elif t == NPY_ULONG: f[0] = 76 #"L" # <<<<<<<<<<<<<<
+ * elif t == NPY_LONGLONG: f[0] = 113 #"q"
+ * elif t == NPY_ULONGLONG: f[0] = 81 #"Q"
+ */
+ __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_ULONG); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 844, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 844, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 844, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 76;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":845
+ * elif t == NPY_LONG: f[0] = 108 #"l"
+ * elif t == NPY_ULONG: f[0] = 76 #"L"
+ * elif t == NPY_LONGLONG: f[0] = 113 #"q" # <<<<<<<<<<<<<<
+ * elif t == NPY_ULONGLONG: f[0] = 81 #"Q"
+ * elif t == NPY_FLOAT: f[0] = 102 #"f"
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONGLONG); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 845, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 845, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 845, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 0x71;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":846
+ * elif t == NPY_ULONG: f[0] = 76 #"L"
+ * elif t == NPY_LONGLONG: f[0] = 113 #"q"
+ * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" # <<<<<<<<<<<<<<
+ * elif t == NPY_FLOAT: f[0] = 102 #"f"
+ * elif t == NPY_DOUBLE: f[0] = 100 #"d"
+ */
+ __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_ULONGLONG); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 846, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 846, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 846, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 81;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":847
+ * elif t == NPY_LONGLONG: f[0] = 113 #"q"
+ * elif t == NPY_ULONGLONG: f[0] = 81 #"Q"
+ * elif t == NPY_FLOAT: f[0] = 102 #"f" # <<<<<<<<<<<<<<
+ * elif t == NPY_DOUBLE: f[0] = 100 #"d"
+ * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g"
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_FLOAT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 847, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 847, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 847, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 0x66;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":848
+ * elif t == NPY_ULONGLONG: f[0] = 81 #"Q"
+ * elif t == NPY_FLOAT: f[0] = 102 #"f"
+ * elif t == NPY_DOUBLE: f[0] = 100 #"d" # <<<<<<<<<<<<<<
+ * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g"
+ * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf
+ */
+ __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_DOUBLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 848, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 848, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 848, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 0x64;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":849
+ * elif t == NPY_FLOAT: f[0] = 102 #"f"
+ * elif t == NPY_DOUBLE: f[0] = 100 #"d"
+ * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" # <<<<<<<<<<<<<<
+ * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf
+ * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONGDOUBLE); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 849, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 849, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 849, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 0x67;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":850
+ * elif t == NPY_DOUBLE: f[0] = 100 #"d"
+ * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g"
+ * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf # <<<<<<<<<<<<<<
+ * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd
+ * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
+ */
+ __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CFLOAT); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 850, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 850, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 850, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 90;
+ (__pyx_v_f[1]) = 0x66;
+ __pyx_v_f = (__pyx_v_f + 1);
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":851
+ * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g"
+ * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf
+ * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd # <<<<<<<<<<<<<<
+ * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
+ * elif t == NPY_OBJECT: f[0] = 79 #"O"
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CDOUBLE); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 851, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 851, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 851, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 90;
+ (__pyx_v_f[1]) = 0x64;
+ __pyx_v_f = (__pyx_v_f + 1);
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":852
+ * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf
+ * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd
+ * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg # <<<<<<<<<<<<<<
+ * elif t == NPY_OBJECT: f[0] = 79 #"O"
+ * else:
+ */
+ __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CLONGDOUBLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 852, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 852, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 852, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ if (__pyx_t_6) {
+ (__pyx_v_f[0]) = 90;
+ (__pyx_v_f[1]) = 0x67;
+ __pyx_v_f = (__pyx_v_f + 1);
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":853
+ * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd
+ * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
+ * elif t == NPY_OBJECT: f[0] = 79 #"O" # <<<<<<<<<<<<<<
+ * else:
+ * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ */
+ __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_OBJECT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 853, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 853, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 853, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ if (likely(__pyx_t_6)) {
+ (__pyx_v_f[0]) = 79;
+ goto __pyx_L15;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":855
+ * elif t == NPY_OBJECT: f[0] = 79 #"O"
+ * else:
+ * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<<
+ * f += 1
+ * else:
+ */
+ /*else*/ {
+ __pyx_t_3 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_v_t); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 855, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 855, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+ __PYX_ERR(1, 855, __pyx_L1_error)
+ }
+ __pyx_L15:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":856
+ * else:
+ * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ * f += 1 # <<<<<<<<<<<<<<
+ * else:
+ * # Cython ignores struct boundary information ("T{...}"),
+ */
+ __pyx_v_f = (__pyx_v_f + 1);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":831
+ * offset[0] += child.itemsize
+ *
+ * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<<
+ * t = child.type_num
+ * if end - f < 5:
+ */
+ goto __pyx_L13;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":860
+ * # Cython ignores struct boundary information ("T{...}"),
+ * # so don't output it
+ * f = _util_dtypestring(child, f, end, offset) # <<<<<<<<<<<<<<
+ * return f
+ *
+ */
+ /*else*/ {
+ __pyx_t_9 = __pyx_f_5numpy__util_dtypestring(__pyx_v_child, __pyx_v_f, __pyx_v_end, __pyx_v_offset); if (unlikely(__pyx_t_9 == ((char *)NULL))) __PYX_ERR(1, 860, __pyx_L1_error)
+ __pyx_v_f = __pyx_t_9;
+ }
+ __pyx_L13:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":805
+ * cdef tuple fields
+ *
+ * for childname in descr.names: # <<<<<<<<<<<<<<
+ * fields = descr.fields[childname]
+ * child, new_offset = fields
+ */
+ }
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":861
+ * # so don't output it
+ * f = _util_dtypestring(child, f, end, offset)
+ * return f # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_r = __pyx_v_f;
+ goto __pyx_L0;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":796
+ * return ()
+ *
+ * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<<
+ * # Recursive utility function used in __getbuffer__ to get format
+ * # string. The new location in the format string is returned.
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_XDECREF(__pyx_t_3);
+ __Pyx_XDECREF(__pyx_t_4);
+ __Pyx_AddTraceback("numpy._util_dtypestring", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __Pyx_XDECREF((PyObject *)__pyx_v_child);
+ __Pyx_XDECREF(__pyx_v_fields);
+ __Pyx_XDECREF(__pyx_v_childname);
+ __Pyx_XDECREF(__pyx_v_new_offset);
+ __Pyx_XDECREF(__pyx_v_t);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":977
+ *
+ *
+ * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<<
+ * cdef PyObject* baseptr
+ * if base is None:
+ */
+
+static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) {
+ PyObject *__pyx_v_baseptr;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ int __pyx_t_2;
+ __Pyx_RefNannySetupContext("set_array_base", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":979
+ * cdef inline void set_array_base(ndarray arr, object base):
+ * cdef PyObject* baseptr
+ * if base is None: # <<<<<<<<<<<<<<
+ * baseptr = NULL
+ * else:
+ */
+ __pyx_t_1 = (__pyx_v_base == Py_None);
+ __pyx_t_2 = (__pyx_t_1 != 0);
+ if (__pyx_t_2) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":980
+ * cdef PyObject* baseptr
+ * if base is None:
+ * baseptr = NULL # <<<<<<<<<<<<<<
+ * else:
+ * Py_INCREF(base) # important to do this before decref below!
+ */
+ __pyx_v_baseptr = NULL;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":979
+ * cdef inline void set_array_base(ndarray arr, object base):
+ * cdef PyObject* baseptr
+ * if base is None: # <<<<<<<<<<<<<<
+ * baseptr = NULL
+ * else:
+ */
+ goto __pyx_L3;
+ }
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":982
+ * baseptr = NULL
+ * else:
+ * Py_INCREF(base) # important to do this before decref below! # <<<<<<<<<<<<<<
+ * baseptr = base
+ * Py_XDECREF(arr.base)
+ */
+ /*else*/ {
+ Py_INCREF(__pyx_v_base);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":983
+ * else:
+ * Py_INCREF(base) # important to do this before decref below!
+ * baseptr = base # <<<<<<<<<<<<<<
+ * Py_XDECREF(arr.base)
+ * arr.base = baseptr
+ */
+ __pyx_v_baseptr = ((PyObject *)__pyx_v_base);
+ }
+ __pyx_L3:;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":984
+ * Py_INCREF(base) # important to do this before decref below!
+ * baseptr = base
+ * Py_XDECREF(arr.base) # <<<<<<<<<<<<<<
+ * arr.base = baseptr
+ *
+ */
+ Py_XDECREF(__pyx_v_arr->base);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":985
+ * baseptr = base
+ * Py_XDECREF(arr.base)
+ * arr.base = baseptr # <<<<<<<<<<<<<<
+ *
+ * cdef inline object get_array_base(ndarray arr):
+ */
+ __pyx_v_arr->base = __pyx_v_baseptr;
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":977
+ *
+ *
+ * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<<
+ * cdef PyObject* baseptr
+ * if base is None:
+ */
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+}
+
+/* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":987
+ * arr.base = baseptr
+ *
+ * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<<
+ * if arr.base is NULL:
+ * return None
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ __Pyx_RefNannySetupContext("get_array_base", 0);
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":988
+ *
+ * cdef inline object get_array_base(ndarray arr):
+ * if arr.base is NULL: # <<<<<<<<<<<<<<
+ * return None
+ * else:
+ */
+ __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0);
+ if (__pyx_t_1) {
+
+ /* "../../.virtualenvs/math/local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":989
+ * cdef inline object get_array_base(ndarray arr):
+ * if arr.base is NULL:
+ * return None # <<<<<<<<<<<<<<
+ * else:
+ * return