Skip to content

Commit

Permalink
Make save_corpus private
Browse files Browse the repository at this point in the history
  • Loading branch information
anotherbugmaster committed Nov 23, 2017
1 parent 3cc34ff commit cde582e
Show file tree
Hide file tree
Showing 10 changed files with 23 additions and 17 deletions.
14 changes: 8 additions & 6 deletions gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Blei's LDA-C format.
"""
"""Blei's LDA-C format."""

from __future__ import with_statement

Expand Down Expand Up @@ -41,8 +39,9 @@ def __init__(self, fname, fname_vocab=None):
"""
Initialize the corpus from a file.
`fname_vocab` is the file with vocabulary; if not specified, it defaults to
`fname.vocab`.
Args:
fname (str): filename of the serialized corpus
fname_vocab (str): vocabulary file; if not specified, it defaults to `fname.vocab`
"""
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s", fname)
Expand Down Expand Up @@ -85,7 +84,7 @@ def line2doc(self, line):
return doc

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
def __save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save a corpus in the LDA-C format.
Expand All @@ -94,6 +93,9 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
This function is automatically called by `BleiCorpus.serialize`; don't
call it directly, call `serialize` instead.
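Args:
fname (str): file to save the corpus to
corpus: the corpus to save
id2word: word id mapping; if None, it is initialized from `corpus`
metadata (bool): TODO(review): document; its effect is not visible in this hunk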
"""
if id2word is None:
logger.info("no word id mapping provided; initializing from corpus")
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/lowcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def __iter__(self):
yield self.line2doc(line)

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
def __save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save a corpus in the List-of-words format.
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/malletcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def line2doc(self, line):
return doc

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
def __save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save a corpus in the Mallet format.
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/mmcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __iter__(self):
yield doc # get rid of doc id, return the sparse vector only

@staticmethod
def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
"""
Save a corpus in the Matrix Market format to disk.
Expand Down
4 changes: 2 additions & 2 deletions gensim/corpora/sharded_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,7 +773,7 @@ def load(cls, fname, mmap=None):
return super(ShardedCorpus, cls).load(fname, mmap)

@staticmethod
def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs):
def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs):
"""
Implement a serialization interface. Do not call directly;
use the `serialize` method instead.
Expand Down Expand Up @@ -809,4 +809,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
Ignore the parameters id2word, index_fname, progress_cnt, labels
and metadata. They currently do nothing and are here only to
provide a compatible method signature with superclass."""
serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs)
serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs)
2 changes: 1 addition & 1 deletion gensim/corpora/svmlightcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def __iter__(self):
self.length = lineno + 1

@staticmethod
def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
"""
Save a corpus in the SVMlight format.
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/ucicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def create_dictionary(self):
return dictionary

@staticmethod
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
"""
Save a corpus in the UCI Bag-of-Words format.
Expand Down
8 changes: 6 additions & 2 deletions gensim/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,14 @@ def __len__(self):
# return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
def __save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save an existing `corpus` to disk.
Some formats also support saving the dictionary (`feature_id->word` mapping),
which can in this case be provided by the optional `id2word` parameter.
>>> MmCorpus.save_corpus('file.mm', corpus)
>>> MmCorpus.serialize('file.mm', corpus)  # `__save_corpus` is name-mangled and not callable from outside the class
Some corpora also support an index of where each document begins, so
that the documents on disk can be accessed in O(1) time (see the
Expand All @@ -103,6 +103,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
fmt = str(doc) # format the document appropriately...
fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk

def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None,
metadata=False):
pass


class TransformedCorpus(CorpusABC):
def __init__(self, obj, corpus, chunksize=None, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion gensim/models/wrappers/dtmmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def convert_input(self, corpus, time_slices):
"""
logger.info("serializing temporary corpus to %s", self.fcorpustxt())
# write out the corpus in a file format that DTM understands:
corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)
corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus)

with utils.smart_open(self.ftimeslices(), 'wb') as fout:
fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
Expand Down
2 changes: 1 addition & 1 deletion gensim/test/test_miislita.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def test_textcorpus(self):

# make sure serializing works
ftmp = get_tmpfile('test_textcorpus.mm')
corpora.MmCorpus.save_corpus(ftmp, miislita)
corpora.MmCorpus.__save_corpus(ftmp, miislita)
self.assertTrue(os.path.exists(ftmp))

# make sure deserializing gives the same result
Expand Down

0 comments on commit cde582e

Please sign in to comment.