Skip to content

Commit

Permalink
Document bleicorpus in Numpy style
Browse files Browse the repository at this point in the history
  • Loading branch information
anotherbugmaster committed Nov 23, 2017
1 parent 3b55037 commit 04c2e61
Showing 1 changed file with 52 additions and 29 deletions.
81 changes: 52 additions & 29 deletions gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Blei's LDA-C format.
"""
"""Blei's LDA-C format."""

from __future__ import with_statement

Expand Down Expand Up @@ -41,11 +39,18 @@ def __init__(self, fname, fname_vocab=None):
"""
Initialize the corpus from a file.
Args:
fname (str): serialized corpus's filename
fname_vocab (:obj:`str`, optional): vocabulary file; takes precedence over fname.vocab
Raises:
IOError: If vocabulary file doesn't exist
Parameters
----------
fname : str
Serialized corpus's filename
fname_vocab : str or None, optional
Vocabulary file; takes precedence over `fname.vocab`
Raises
------
IOError
If vocabulary file doesn't exist
"""
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s", fname)
Expand All @@ -70,9 +75,7 @@ def __init__(self, fname, fname_vocab=None):
self.id2word = dict(enumerate(words))

def __iter__(self):
"""
Iterate over the corpus, returning one sparse vector at a time.
"""
"""Iterate over the corpus, returning one sparse vector at a time."""
lineno = -1
with utils.smart_open(self.fname) as fin:
for lineno, line in enumerate(fin):
Expand All @@ -81,12 +84,20 @@ def __iter__(self):

def line2doc(self, line):
"""
Args:
line (str): document's string representation
Returns:
:obj:`list` of (:obj:`int`, :obj:`float`):
document's list representation
Raises:
Convert line to document.
Parameters
----------
line : str
Document's string representation
Returns
-------
list of (int, float)
Document's list representation
Raises
------
ValueError
If format is invalid
"""
parts = utils.to_unicode(line).split()
Expand All @@ -104,14 +115,21 @@ def _save_corpus(fname, corpus, id2word=None, metadata=False):
There are actually two files saved: `fname` and `fname.vocab`, where
`fname.vocab` is the vocabulary file.
Args:
fname (str): filename
corpus : yields documents
id2word (:obj:`dict` of (:obj:`str`, :obj:`str`), optional):
transforms id to word
metadata (bool): any additional info
Returns:
:obj:`list` of :obj:`int`: fields' offsets
Parameters
----------
fname : str
Filename
corpus : iterable
Iterable of documents
id2word : dict of (int, str), optional
Mapping from word id to word
metadata : bool, optional
Any additional info
Returns
-------
list of int
Fields' offsets
"""
if id2word is None:
logger.info("no word id mapping provided; initializing from corpus")
Expand Down Expand Up @@ -142,10 +160,15 @@ def docbyoffset(self, offset):
"""
Return document corresponding to `offset`.
Args:
offset (int): position of the document in the file
Returns:
:obj:`list` of (:obj:`int`, :obj:`float`): document's list representation
Parameters
----------
offset : int
Position of the document in the file
Returns
-------
list of (int, float)
Document's list representation
"""
with utils.smart_open(self.fname) as f:
f.seek(offset)
Expand Down

0 comments on commit 04c2e61

Please sign in to comment.