Commit

Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim into fix_load_wv

pushpankar committed Feb 5, 2018
2 parents c157d79 + 17c47ee commit 164cf63
Showing 63 changed files with 11,932 additions and 4,317 deletions.
148 changes: 148 additions & 0 deletions CHANGELOG.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions docs/notebooks/doc2vec-lee.ipynb
@@ -610,11 +610,11 @@
"source": [
"# Pick a random document from the test corpus and infer a vector from the model\n",
"doc_id = random.randint(0, len(test_corpus) - 1)\n",
"inferred_vector = model.infer_vector(test_corpus[doc_id].words)\n",
"inferred_vector = model.infer_vector(test_corpus[doc_id])\n",
"sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))\n",
"\n",
"# Compare and print the most/median/least similar documents from the train corpus\n",
"print('Test Document ({}): «{}»\\n'.format(doc_id, ' '.join(test_corpus[doc_id].words)))\n",
"print('Test Document ({}): «{}»\\n'.format(doc_id, ' '.join(test_corpus[doc_id])))\n",
"print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\\n' % model)\n",
"for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:\n",
" print(u'%s %s: «%s»\\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))"
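The notebook change works because this tutorial builds its test corpus without tags: each test_corpus[doc_id] is already a plain token list and has no .words attribute. A minimal sketch of the corrected inference flow (toy corpus invented; parameter names as of gensim 3.3):

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    # training documents carry tags; test documents stay bare token lists
    train_corpus = [
        TaggedDocument(words=['human', 'computer', 'interaction'], tags=[0]),
        TaggedDocument(words=['graph', 'minors', 'survey'], tags=[1]),
    ]
    test_corpus = [['human', 'graph', 'survey']]

    model = Doc2Vec(vector_size=50, min_count=1, epochs=40)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    doc_id = 0
    inferred_vector = model.infer_vector(test_corpus[doc_id])  # a token list, not .words
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))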
6 changes: 3 additions & 3 deletions docs/src/_static/css/style.css
@@ -596,7 +596,7 @@ a.googlegroups{

#middle1{
float:left;
width:540px;
width:440px;
padding-top: 53px;
}

@@ -613,13 +613,13 @@

#right1 {
float:right;
width: 351px;
width: 451px;
padding-top: 53px;
}

.consulting-banner {
background-color: #ff2010;
background-color: rgba(255, 32, 16, 0.2);
background-color: rgba(255, 32, 16, 0.4);
padding: 1px 10px;
-moz-border-radius: 15px;
border-radius: 15px;
5 changes: 5 additions & 0 deletions docs/src/apiref.rst
@@ -55,6 +55,11 @@ Modules:
models/wrappers/wordrank
models/wrappers/varembed
models/wrappers/fasttext
models/deprecated/doc2vec
models/deprecated/fasttext
models/deprecated/word2vec
models/deprecated/keyedvectors
models/deprecated/fasttext_wrapper
similarities/docsim
similarities/index
sklearn_api/atmodel
4 changes: 2 additions & 2 deletions docs/src/conf.py
@@ -53,9 +53,9 @@
# built documents.
#
# The short X.Y version.
version = '3.2'
version = '3.3'
# The full version, including alpha/beta/rc tags.
release = '3.2.0'
release = '3.3.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
4 changes: 2 additions & 2 deletions docs/src/dist_lsi.rst
@@ -120,14 +120,14 @@ Distributed LSA on Wikipedia
First, download and prepare the Wikipedia corpus as per :doc:`wiki`, then load
the corpus iterator with::

>>> import logging, gensim, bz2
>>> import logging, gensim
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

>>> # load id->word mapping (the dictionary)
>>> id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
>>> # load corpus iterator
>>> mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
>>> # mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output
>>> # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2') # use this if you compressed the TFIDF output

>>> print(mm)
MmCorpus(3199665 documents, 100000 features, 495547400 non-zero entries)
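The dropped bz2 wrapper reflects that gensim opens files through smart_open, which picks a decompressor from the file extension, so passing the .bz2 path directly behaves the same. A sketch, assuming the files from the wiki preprocessing step exist:

    import gensim

    # both calls load the same corpus; smart_open decompresses .bz2/.gz
    # transparently based on the extension
    mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
    # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2')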
10 changes: 5 additions & 5 deletions docs/src/gensim_theme/layout.html
@@ -33,7 +33,7 @@

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<!-- Google Tag Manager - JD-20170831 -->
<!-- Google Tag Manager - JD-20170831 -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
@@ -79,10 +79,10 @@ <h1 class="h1gensim">
</div>
<div id="right1">
<div class="consulting-banner">
<h3><a href="http://rare-technologies.com/">Get Expert Help</a></h3>
<p>machine learning, NLP, data mining</p>
<p>custom SW design, development, optimizations</p>
<p>• corporate trainings &amp; IT consulting</p>
<h3>Get Expert Help From The Gensim Authors</h3>
<p><a href="https://rare-technologies.com/">Consulting</a> in Machine Learning &amp; NLP</p>
<p>Commercial document similarity engine: <a href="https://scaletext.com/">ScaleText.ai</a></p>
<p><a href="https://rare-technologies.com/corporate-training/">Corporate trainings</a> in Python Data Science and Deep Learning</p>
</div>
</div>
</div>
9 changes: 9 additions & 0 deletions docs/src/models/deprecated/doc2vec.rst
@@ -0,0 +1,9 @@
:mod:`models.deprecated.doc2vec` -- Deep learning with paragraph2vec
====================================================================

.. automodule:: gensim.models.deprecated.doc2vec
:synopsis: Deep learning with doc2vec
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
10 changes: 10 additions & 0 deletions docs/src/models/deprecated/fasttext.rst
@@ -0,0 +1,10 @@
:mod:`models.deprecated.fasttext` -- FastText model
===================================================

.. automodule:: gensim.models.deprecated.fasttext
:synopsis: FastText model
:members:
:inherited-members:
:special-members: __getitem__
:undoc-members:
:show-inheritance:
10 changes: 10 additions & 0 deletions docs/src/models/deprecated/fasttext_wrapper.rst
@@ -0,0 +1,10 @@
:mod:`models.deprecated.fasttext_wrapper` -- Wrapper for Facebook implementation of FastText model
==================================================================================================

.. automodule:: gensim.models.deprecated.fasttext_wrapper
:synopsis: FastText model
:members:
:inherited-members:
:special-members: __getitem__
:undoc-members:
:show-inheritance:
9 changes: 9 additions & 0 deletions docs/src/models/deprecated/keyedvectors.rst
@@ -0,0 +1,9 @@
:mod:`models.deprecated.keyedvectors` -- Store and query word vectors
=====================================================================

.. automodule:: gensim.models.deprecated.keyedvectors
:synopsis: Store and query word vectors
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
9 changes: 9 additions & 0 deletions docs/src/models/deprecated/word2vec.rst
@@ -0,0 +1,9 @@
:mod:`models.deprecated.word2vec` -- Deep learning with word2vec
================================================================

.. automodule:: gensim.models.deprecated.word2vec
:synopsis: Deep learning with word2vec
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
6 changes: 3 additions & 3 deletions docs/src/models/wrappers/fasttext.rst
@@ -1,8 +1,8 @@
:mod:`models.wrappers.fasttext` -- FastText Word Embeddings
===========================================================
:mod:`models.wrappers.fasttext` -- Wrapper for FastText implementation from Facebook
====================================================================================

.. automodule:: gensim.models.wrappers.fasttext
:synopsis: FastText Embeddings
:synopsis: FastText
:members:
:inherited-members:
:undoc-members:
8 changes: 4 additions & 4 deletions docs/src/wiki.rst
@@ -38,14 +38,14 @@ Latent Semantic Analysis

First let's load the corpus iterator and dictionary, created in the second step above::

>>> import logging, gensim, bz2
>>> import logging, gensim
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

>>> # load id->word mapping (the dictionary), one of the results of step 2 above
>>> id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
>>> # load corpus iterator
>>> mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
>>> # mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output (recommended)
>>> # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2') # use this if you compressed the TFIDF output (recommended)

>>> print(mm)
MmCorpus(3931787 documents, 100000 features, 756379027 non-zero entries)
@@ -93,14 +93,14 @@ Latent Dirichlet Allocation

As with Latent Semantic Analysis above, first load the corpus iterator and dictionary::

>>> import logging, gensim, bz2
>>> import logging, gensim
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

>>> # load id->word mapping (the dictionary), one of the results of step 2 above
>>> id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
>>> # load corpus iterator
>>> mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
>>> # mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output
>>> # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2') # use this if you compressed the TFIDF output

>>> print(mm)
MmCorpus(3931787 documents, 100000 features, 756379027 non-zero entries)
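For context, the wiki tutorial continues from this point by training online LDA on the loaded corpus; roughly (a sketch with the tutorial's usual parameters, adjust to taste):

    import gensim

    id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
    mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')

    # online LDA: a single pass, updating once per chunk of 10k documents
    lda = gensim.models.ldamodel.LdaModel(
        corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)
    lda.print_topics(5)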
2 changes: 1 addition & 1 deletion gensim/__init__.py
@@ -6,7 +6,7 @@
from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization, utils # noqa:F401
import logging

__version__ = '3.2.0'
__version__ = '3.3.0'


class NullHandler(logging.Handler):
2 changes: 1 addition & 1 deletion gensim/corpora/ucicorpus.py
@@ -207,7 +207,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False)
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())
            num_terms = 1 + max([-1] + list(id2word))

        # write out vocabulary
        fname_vocab = utils.smart_extension(fname, '.vocab')
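The one-line change above is Python 3 compatibility: dict.keys() returns a view there, and concatenating a list with a view raises TypeError, while iterating the dict yields its keys on both Python versions. A quick illustration:

    id2word = {0: 'human', 1: 'computer', 5: 'graph'}

    # Python 2: id2word.keys() was a list, so [-1] + id2word.keys() worked
    # Python 3: keys() is a view, and list + view raises TypeError
    num_terms = 1 + max([-1] + list(id2word))  # iterating a dict yields its keys
    print(num_terms)  # 6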
62 changes: 51 additions & 11 deletions gensim/corpora/wikicorpus.py
@@ -22,7 +22,6 @@
"""


import bz2
import logging
import multiprocessing
@@ -45,7 +44,6 @@
TOKEN_MIN_LEN = 2
TOKEN_MAX_LEN = 15


RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE)
"""Comments."""
RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE)
@@ -78,6 +76,8 @@
"""Categories."""
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
"""Remove File and Image templates."""
RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
"""Capture interlinks text and article linked"""

IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
@@ -93,34 +93,70 @@
"""


def filter_wiki(raw):
def find_interlinks(raw):
    """Find all interlinks to other articles in the dump.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.

    Returns
    -------
    dict
        Mapping from the linked article to the actual text found.

    """
    filtered = filter_wiki(raw, promote_remaining=False, simplify_links=False)
    interlinks_raw = re.findall(RE_P16, filtered)

    interlinks = {}
    for parts in [i.split('|') for i in interlinks_raw]:
        actual_title = parts[0]
        try:
            interlink_text = parts[1]
            interlinks[actual_title] = interlink_text
        except IndexError:
            interlinks[actual_title] = actual_title

    legit_interlinks = {i: j for i, j in interlinks.items() if '[' not in i and ']' not in i}
    return legit_interlinks


def filter_wiki(raw, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `raw`, leaving only text.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.
    promote_remaining : bool
        Whether uncaught markup should be promoted to plain text.
    simplify_links : bool
        Whether links should be simplified, keeping only their description text.

    Returns
    -------
    str
        `raw` without markup.

    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    return remove_markup(text)
    return remove_markup(text, promote_remaining, simplify_links)


def remove_markup(text):
def remove_markup(text, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `text`, leaving only text.

    Parameters
    ----------
    text : str
        String containing markup.
    promote_remaining : bool
        Whether uncaught markup should be promoted to plain text.
    simplify_links : bool
        Whether links should be simplified, keeping only their description text.

    Returns
    -------
@@ -145,8 +181,11 @@ def remove_markup(text):
    text = re.sub(RE_P11, '', text)  # remove all remaining tags
    text = re.sub(RE_P14, '', text)  # remove categories
    text = re.sub(RE_P5, '\\3', text)  # remove urls, keep description
    text = re.sub(RE_P6, '\\2', text)  # simplify links, keep description only

    if simplify_links:
        text = re.sub(RE_P6, '\\2', text)  # simplify links, keep description only

    # remove table markup
    text = text.replace('||', '\n|')  # each table cell on a separate line
    text = re.sub(RE_P12, '\n', text)  # remove formatting lines
    text = re.sub(RE_P13, '\n\\3', text)  # leave only cell content
@@ -156,9 +195,9 @@ def remove_markup(text):
        if old == text or iters > 2:
            break

    # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
    # TODO is this really desirable?
    text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text
    if promote_remaining:
        text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text

    return text


@@ -333,7 +372,7 @@ def extract_pages(f, filter_namespaces=False):
                text = None

            pageid = elem.find(pageid_path).text
            yield title, text or "", pageid # empty page will yield None
            yield title, text or "", pageid  # empty page will yield None

            # Prune the element tree, as per
            # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
@@ -461,6 +500,7 @@ class WikiCorpus(TextCorpus):
    >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format and mapping

    """

    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                 filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
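Taken together, the new RE_P16 pattern and the promote_remaining/simplify_links switches let find_interlinks() keep link markup intact long enough to harvest it. A small sketch against the new functions (sample markup invented):

    from gensim.corpora.wikicorpus import filter_wiki, find_interlinks

    raw = "A [[political philosophy]] article with a [[Self-governance|piped link]]."

    # default filtering drops all markup, keeping only link descriptions
    filter_wiki(raw)
    # -> 'A political philosophy article with a piped link.'

    # find_interlinks() maps each linked article to its anchor text;
    # links without a pipe map to themselves
    find_interlinks(raw)
    # -> {'political philosophy': 'political philosophy', 'Self-governance': 'piped link'}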
5 changes: 2 additions & 3 deletions gensim/matutils.py
@@ -897,10 +897,9 @@ def hellinger(vec1, vec2):
    if isbow(vec1) and isbow(vec2):
        # if it is a BoW format, instead of converting to dense we use dictionaries to calculate appropriate distance
        vec1, vec2 = dict(vec1), dict(vec2)
        if len(vec2) < len(vec1):
            vec1, vec2 = vec2, vec1  # swap references so that we iterate over the shorter vector
        indices = set(list(vec1.keys()) + list(vec2.keys()))
        sim = np.sqrt(
            0.5 * sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))
            0.5 * sum((np.sqrt(vec1.get(index, 0.0)) - np.sqrt(vec2.get(index, 0.0)))**2 for index in indices)
        )
        return sim
    else:
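The hellinger() fix restores symmetry: the old BoW branch iterated only over the shorter vector, silently dropping terms present only in the longer one. A worked check of the corrected formula (toy weights invented):

    import numpy as np

    vec1 = {0: 0.5, 1: 0.5}  # BoW as dict: term id -> weight
    vec2 = {0: 1.0}

    # old behaviour: iterate over the shorter vector only, ignoring index 1
    old = np.sqrt(0.5 * (np.sqrt(vec1[0]) - np.sqrt(vec2[0])) ** 2)  # ~0.207, too small

    # fixed behaviour: sum over the union of indices
    indices = set(list(vec1.keys()) + list(vec2.keys()))
    new = np.sqrt(0.5 * sum((np.sqrt(vec1.get(i, 0.0)) - np.sqrt(vec2.get(i, 0.0))) ** 2 for i in indices))
    print(old, new)  # 0.207..., 0.541...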
1 change: 1 addition & 0 deletions gensim/models/__init__.py
@@ -23,6 +23,7 @@
from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401

from . import wrappers # noqa:F401
from . import deprecated # noqa:F401

from gensim import interfaces, utils

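Importing the deprecated subpackage keeps the pre-refactor implementations importable, which is what lets models pickled by older gensim versions be loaded and upgraded. A hedged sketch (the file name is hypothetical):

    from gensim.models.deprecated.word2vec import Word2Vec as OldWord2Vec

    # the deprecated classes keep the old API; use them only to read legacy pickles
    old_model = OldWord2Vec.load('w2v_trained_with_gensim_2x.model')  # hypothetical file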
