Commit

Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim into fix_load_wv

pushpankar committed Feb 5, 2018
2 parents c157d79 + 17c47ee commit 164cf63
Showing 63 changed files with 11,932 additions and 4,317 deletions.
148 changes: 148 additions & 0 deletions CHANGELOG.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions docs/notebooks/doc2vec-lee.ipynb
@@ -610,11 +610,11 @@
"source": [
"# Pick a random document from the test corpus and infer a vector from the model\n",
"doc_id = random.randint(0, len(test_corpus) - 1)\n",
"inferred_vector = model.infer_vector(test_corpus[doc_id].words)\n",
"inferred_vector = model.infer_vector(test_corpus[doc_id])\n",
"sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))\n",
"\n",
"# Compare and print the most/median/least similar documents from the train corpus\n",
"print('Test Document ({}): «{}»\\n'.format(doc_id, ' '.join(test_corpus[doc_id].words)))\n",
"print('Test Document ({}): «{}»\\n'.format(doc_id, ' '.join(test_corpus[doc_id])))\n",
"print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\\n' % model)\n",
"for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:\n",
" print(u'%s %s: «%s»\\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))"
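The notebook change works because this tutorial builds its test corpus without tags: each test_corpus[doc_id] is already a plain token list and has no .words attribute. A minimal sketch of the corrected inference flow (toy corpus invented; parameter names as of gensim 3.3):

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    # training documents carry tags; test documents stay bare token lists
    train_corpus = [
        TaggedDocument(words=['human', 'computer', 'interaction'], tags=[0]),
        TaggedDocument(words=['graph', 'minors', 'survey'], tags=[1]),
    ]
    test_corpus = [['human', 'graph', 'survey']]

    model = Doc2Vec(vector_size=50, min_count=1, epochs=40)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    doc_id = 0
    inferred_vector = model.infer_vector(test_corpus[doc_id])  # a token list, not .words
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))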
6 changes: 3 additions & 3 deletions docs/src/_static/css/style.css
@@ -596,7 +596,7 @@ a.googlegroups{

#middle1{
float:left;
width:540px;
width:440px;
padding-top: 53px;
}

@@ -613,13 +613,13 @@

#right1 {
float:right;
width: 351px;
width: 451px;
padding-top: 53px;
}

.consulting-banner {
background-color: #ff2010;
background-color: rgba(255, 32, 16, 0.2);
background-color: rgba(255, 32, 16, 0.4);
padding: 1px 10px;
-moz-border-radius: 15px;
border-radius: 15px;
5 changes: 5 additions & 0 deletions docs/src/apiref.rst
@@ -55,6 +55,11 @@ Modules:
models/wrappers/wordrank
models/wrappers/varembed
models/wrappers/fasttext
models/deprecated/doc2vec
models/deprecated/fasttext
models/deprecated/word2vec
models/deprecated/keyedvectors
models/deprecated/fasttext_wrapper
similarities/docsim
similarities/index
sklearn_api/atmodel
4 changes: 2 additions & 2 deletions docs/src/conf.py
@@ -53,9 +53,9 @@
# built documents.
#
# The short X.Y version.
version = '3.2'
version = '3.3'
# The full version, including alpha/beta/rc tags.
release = '3.2.0'
release = '3.3.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
4 changes: 2 additions & 2 deletions docs/src/dist_lsi.rst
@@ -120,14 +120,14 @@ Distributed LSA on Wikipedia
First, download and prepare the Wikipedia corpus as per :doc:`wiki`, then load
the corpus iterator with::

>>> import logging, gensim, bz2
>>> import logging, gensim
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

>>> # load id->word mapping (the dictionary)
>>> id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
>>> # load corpus iterator
>>> mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
>>> # mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output
>>> # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2') # use this if you compressed the TFIDF output

>>> print(mm)
MmCorpus(3199665 documents, 100000 features, 495547400 non-zero entries)
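The dropped bz2 wrapper reflects that gensim opens files through smart_open, which picks a decompressor from the file extension, so passing the .bz2 path directly behaves the same. A sketch, assuming the files from the wiki preprocessing step exist:

    import gensim

    # both calls load the same corpus; smart_open decompresses .bz2/.gz
    # transparently based on the extension
    mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
    # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2')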
10 changes: 5 additions & 5 deletions docs/src/gensim_theme/layout.html
@@ -33,7 +33,7 @@

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<!-- Google Tag Manager - JD-20170831 -->
<!-- Google Tag Manager - JD-20170831 -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
@@ -79,10 +79,10 @@ <h1 class="h1gensim">
</div>
<div id="right1">
<div class="consulting-banner">
<h3><a href="http://rare-technologies.com/">Get Expert Help</a></h3>
<p>machine learning, NLP, data mining</p>
<p>custom SW design, development, optimizations</p>
<p>• corporate trainings &amp; IT consulting</p>
<h3>Get Expert Help From The Gensim Authors</h3>
<p><a href="https://rare-technologies.com/">Consulting</a> in Machine Learning &amp; NLP</p>
<p>Commercial document similarity engine: <a href="https://scaletext.com/">ScaleText.ai</a></p>
<p><a href="https://rare-technologies.com/corporate-training/">Corporate trainings</a> in Python Data Science and Deep Learning</p>
</div>
</div>
</div>
9 changes: 9 additions & 0 deletions docs/src/models/deprecated/doc2vec.rst
@@ -0,0 +1,9 @@
:mod:`models.deprecated.doc2vec` -- Deep learning with paragraph2vec
====================================================================

.. automodule:: gensim.models.deprecated.doc2vec
:synopsis: Deep learning with doc2vec
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
10 changes: 10 additions & 0 deletions docs/src/models/deprecated/fasttext.rst
@@ -0,0 +1,10 @@
:mod:`models.deprecated.fasttext` -- FastText model
===================================================

.. automodule:: gensim.models.deprecated.fasttext
:synopsis: FastText model
:members:
:inherited-members:
:special-members: __getitem__
:undoc-members:
:show-inheritance:
10 changes: 10 additions & 0 deletions docs/src/models/deprecated/fasttext_wrapper.rst
@@ -0,0 +1,10 @@
:mod:`models.deprecated.fasttext_wrapper` -- Wrapper for Facebook implementation of FastText model
==================================================================================================

.. automodule:: gensim.models.deprecated.fasttext_wrapper
:synopsis: FastText model
:members:
:inherited-members:
:special-members: __getitem__
:undoc-members:
:show-inheritance:
9 changes: 9 additions & 0 deletions docs/src/models/deprecated/keyedvectors.rst
@@ -0,0 +1,9 @@
:mod:`models.deprecated.keyedvectors` -- Store and query word vectors
=====================================================================

.. automodule:: gensim.models.deprecated.keyedvectors
:synopsis: Store and query word vectors
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
9 changes: 9 additions & 0 deletions docs/src/models/deprecated/word2vec.rst
@@ -0,0 +1,9 @@
:mod:`models.deprecated.word2vec` -- Deep learning with word2vec
================================================================

.. automodule:: gensim.models.deprecated.word2vec
:synopsis: Deep learning with word2vec
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
6 changes: 3 additions & 3 deletions docs/src/models/wrappers/fasttext.rst
@@ -1,8 +1,8 @@
:mod:`models.wrappers.fasttext` -- FastText Word Embeddings
===========================================================
:mod:`models.wrappers.fasttext` -- Wrapper for FastText implementation from Facebook
====================================================================================

.. automodule:: gensim.models.wrappers.fasttext
:synopsis: FastText Embeddings
:synopsis: FastText
:members:
:inherited-members:
:undoc-members:
8 changes: 4 additions & 4 deletions docs/src/wiki.rst
@@ -38,14 +38,14 @@ Latent Semantic Analysis

First let's load the corpus iterator and dictionary, created in the second step above::

>>> import logging, gensim, bz2
>>> import logging, gensim
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

>>> # load id->word mapping (the dictionary), one of the results of step 2 above
>>> id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
>>> # load corpus iterator
>>> mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
>>> # mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output (recommended)
>>> # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2') # use this if you compressed the TFIDF output (recommended)

>>> print(mm)
MmCorpus(3931787 documents, 100000 features, 756379027 non-zero entries)
@@ -93,14 +93,14 @@ Latent Dirichlet Allocation

As with Latent Semantic Analysis above, first load the corpus iterator and dictionary::

>>> import logging, gensim, bz2
>>> import logging, gensim
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

>>> # load id->word mapping (the dictionary), one of the results of step 2 above
>>> id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
>>> # load corpus iterator
>>> mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
>>> # mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output
>>> # mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm.bz2') # use this if you compressed the TFIDF output

>>> print(mm)
MmCorpus(3931787 documents, 100000 features, 756379027 non-zero entries)
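For context, the wiki tutorial continues from this point by training online LDA on the loaded corpus; roughly (a sketch with the tutorial's usual parameters, adjust to taste):

    import gensim

    id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
    mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')

    # online LDA: a single pass, updating once per chunk of 10k documents
    lda = gensim.models.ldamodel.LdaModel(
        corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)
    lda.print_topics(5)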
2 changes: 1 addition & 1 deletion gensim/__init__.py
@@ -6,7 +6,7 @@
from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization, utils # noqa:F401
import logging

__version__ = '3.2.0'
__version__ = '3.3.0'


class NullHandler(logging.Handler):
2 changes: 1 addition & 1 deletion gensim/corpora/ucicorpus.py
@@ -207,7 +207,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False)
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())
            num_terms = 1 + max([-1] + list(id2word))

        # write out vocabulary
        fname_vocab = utils.smart_extension(fname, '.vocab')
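The one-line change above is Python 3 compatibility: dict.keys() returns a view there, and concatenating a list with a view raises TypeError, while iterating the dict yields its keys on both Python versions. A quick illustration:

    id2word = {0: 'human', 1: 'computer', 5: 'graph'}

    # Python 2: id2word.keys() was a list, so [-1] + id2word.keys() worked
    # Python 3: keys() is a view, and list + view raises TypeError
    num_terms = 1 + max([-1] + list(id2word))  # iterating a dict yields its keys
    print(num_terms)  # 6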
62 changes: 51 additions & 11 deletions gensim/corpora/wikicorpus.py
@@ -22,7 +22,6 @@
"""


import bz2
import logging
import multiprocessing
@@ -45,7 +44,6 @@
TOKEN_MIN_LEN = 2
TOKEN_MAX_LEN = 15


RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE)
"""Comments."""
RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE)
@@ -78,6 +76,8 @@
"""Categories."""
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
"""Remove File and Image templates."""
RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
"""Capture interlinks text and article linked"""

IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
@@ -93,34 +93,70 @@
"""


def filter_wiki(raw):
def find_interlinks(raw):
    """Find all interlinks to other articles in the dump.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.

    Returns
    -------
    dict
        Mapping from the linked article to the actual text found.

    """
    filtered = filter_wiki(raw, promote_remaining=False, simplify_links=False)
    interlinks_raw = re.findall(RE_P16, filtered)

    interlinks = {}
    for parts in [i.split('|') for i in interlinks_raw]:
        actual_title = parts[0]
        try:
            interlink_text = parts[1]
            interlinks[actual_title] = interlink_text
        except IndexError:
            interlinks[actual_title] = actual_title

    legit_interlinks = {i: j for i, j in interlinks.items() if '[' not in i and ']' not in i}
    return legit_interlinks


def filter_wiki(raw, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `raw`, leaving only text.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.
    promote_remaining : bool
        Whether uncaught markup should be promoted to plain text.
    simplify_links : bool
        Whether links should be simplified, keeping only their description text.

    Returns
    -------
    str
        `raw` without markup.

    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    return remove_markup(text)
    return remove_markup(text, promote_remaining, simplify_links)


def remove_markup(text):
def remove_markup(text, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `text`, leaving only text.

    Parameters
    ----------
    text : str
        String containing markup.
    promote_remaining : bool
        Whether uncaught markup should be promoted to plain text.
    simplify_links : bool
        Whether links should be simplified, keeping only their description text.

    Returns
    -------
@@ -145,8 +181,11 @@ def remove_markup(text):
    text = re.sub(RE_P11, '', text)  # remove all remaining tags
    text = re.sub(RE_P14, '', text)  # remove categories
    text = re.sub(RE_P5, '\\3', text)  # remove urls, keep description
    text = re.sub(RE_P6, '\\2', text)  # simplify links, keep description only

    if simplify_links:
        text = re.sub(RE_P6, '\\2', text)  # simplify links, keep description only

    # remove table markup
    text = text.replace('||', '\n|')  # each table cell on a separate line
    text = re.sub(RE_P12, '\n', text)  # remove formatting lines
    text = re.sub(RE_P13, '\n\\3', text)  # leave only cell content
@@ -156,9 +195,9 @@ def remove_markup(text):
        if old == text or iters > 2:
            break

    # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
    # TODO is this really desirable?
    text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text
    if promote_remaining:
        text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text

    return text


@@ -333,7 +372,7 @@ def extract_pages(f, filter_namespaces=False):
                text = None

            pageid = elem.find(pageid_path).text
            yield title, text or "", pageid # empty page will yield None
            yield title, text or "", pageid  # empty page will yield None

            # Prune the element tree, as per
            # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
@@ -461,6 +500,7 @@ class WikiCorpus(TextCorpus):
    >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format and mapping

    """

    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                 filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
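Taken together, the new RE_P16 pattern and the promote_remaining/simplify_links switches let find_interlinks() keep link markup intact long enough to harvest it. A small sketch against the new functions (sample markup invented):

    from gensim.corpora.wikicorpus import filter_wiki, find_interlinks

    raw = "A [[political philosophy]] article with a [[Self-governance|piped link]]."

    # default filtering drops all markup, keeping only link descriptions
    filter_wiki(raw)
    # -> 'A political philosophy article with a piped link.'

    # find_interlinks() maps each linked article to its anchor text;
    # links without a pipe map to themselves
    find_interlinks(raw)
    # -> {'political philosophy': 'political philosophy', 'Self-governance': 'piped link'}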
5 changes: 2 additions & 3 deletions gensim/matutils.py
@@ -897,10 +897,9 @@ def hellinger(vec1, vec2):
    if isbow(vec1) and isbow(vec2):
        # if it is a BoW format, instead of converting to dense we use dictionaries to calculate appropriate distance
        vec1, vec2 = dict(vec1), dict(vec2)
        if len(vec2) < len(vec1):
            vec1, vec2 = vec2, vec1  # swap references so that we iterate over the shorter vector
        indices = set(list(vec1.keys()) + list(vec2.keys()))
        sim = np.sqrt(
            0.5 * sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))
            0.5 * sum((np.sqrt(vec1.get(index, 0.0)) - np.sqrt(vec2.get(index, 0.0)))**2 for index in indices)
        )
        return sim
    else:
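The hellinger() fix restores symmetry: the old BoW branch iterated only over the shorter vector, silently dropping terms present only in the longer one. A worked check of the corrected formula (toy weights invented):

    import numpy as np

    vec1 = {0: 0.5, 1: 0.5}  # BoW as dict: term id -> weight
    vec2 = {0: 1.0}

    # old behaviour: iterate over the shorter vector only, ignoring index 1
    old = np.sqrt(0.5 * (np.sqrt(vec1[0]) - np.sqrt(vec2[0])) ** 2)  # ~0.207, too small

    # fixed behaviour: sum over the union of indices
    indices = set(list(vec1.keys()) + list(vec2.keys()))
    new = np.sqrt(0.5 * sum((np.sqrt(vec1.get(i, 0.0)) - np.sqrt(vec2.get(i, 0.0))) ** 2 for i in indices))
    print(old, new)  # 0.207..., 0.541...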
1 change: 1 addition & 0 deletions gensim/models/__init__.py
@@ -23,6 +23,7 @@
from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401

from . import wrappers # noqa:F401
from . import deprecated # noqa:F401

from gensim import interfaces, utils

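Importing the deprecated subpackage keeps the pre-refactor implementations importable, which is what lets models pickled by older gensim versions be loaded and upgraded. A hedged sketch (the file name is hypothetical):

    from gensim.models.deprecated.word2vec import Word2Vec as OldWord2Vec

    # the deprecated classes keep the old API; use them only to read legacy pickles
    old_model = OldWord2Vec.load('w2v_trained_with_gensim_2x.model')  # hypothetical file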
