Skip to content

Commit

Permalink
Annotate bleicorpus.py
Browse files Browse the repository at this point in the history
  • Loading branch information
anotherbugmaster committed Nov 23, 2017
1 parent cde582e commit 34cccfe
Showing 1 changed file with 28 additions and 7 deletions.
35 changes: 28 additions & 7 deletions gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""Blei's LDA-C format."""
"""
Blei's LDA-C format.
"""

from __future__ import with_statement

Expand Down Expand Up @@ -41,7 +43,9 @@ def __init__(self, fname, fname_vocab=None):
Args:
fname (str): serialized corpus's filename
fname_vocab (str): vocabulary file; takes precedence over fname.vocab
fname_vocab (:obj:`str`, optional): vocabulary file; takes precedence over fname.vocab
Raises:
IOError: If vocabulary file doesn't exist
"""
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s", fname)
Expand Down Expand Up @@ -76,6 +80,15 @@ def __iter__(self):
self.length = lineno + 1

def line2doc(self, line):
"""
Args:
line (str): document's string representation
Returns:
:obj:`list` of (:obj:`int`, :obj:`float`):
document's list representation
Raises:
ValueError: If format is invalid
"""
parts = utils.to_unicode(line).split()
if int(parts[0]) != len(parts) - 1:
raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
Expand All @@ -91,11 +104,14 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False):
There are actually two files saved: `fname` and `fname.vocab`, where
`fname.vocab` is the vocabulary file.
This function is automatically called by `BleiCorpus.serialize`; don't
call it directly, call `serialize` instead.
Args:
fname (str): filename
corpus (iterable): iterable that yields documents
id2word (:obj:`dict` of (:obj:`int`, :obj:`str`), optional):
mapping from word id to word
metadata (bool): any additional info
Returns:
:obj:`list` of :obj:`int`: documents' offsets (position of each saved document in the file)
"""
if id2word is None:
logger.info("no word id mapping provided; initializing from corpus")
Expand Down Expand Up @@ -124,7 +140,12 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False):

def docbyoffset(self, offset):
"""
Return the document stored at file position `offset`.
Return document corresponding to `offset`.
Args:
offset (int): position of the document in the file
Returns:
:obj:`list` of (:obj:`int`, :obj:`float`): document's list representation
"""
with utils.smart_open(self.fname) as f:
f.seek(offset)
Expand Down

0 comments on commit 34cccfe

Please sign in to comment.