From cde582ec204604c96e5dad60392f5d88d7bc8cbe Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Mon, 2 Oct 2017 14:42:31 +0500 Subject: [PATCH] Make `save_corpus` private --- gensim/corpora/bleicorpus.py | 14 ++++++++------ gensim/corpora/lowcorpus.py | 2 +- gensim/corpora/malletcorpus.py | 2 +- gensim/corpora/mmcorpus.py | 2 +- gensim/corpora/sharded_corpus.py | 4 ++-- gensim/corpora/svmlightcorpus.py | 2 +- gensim/corpora/ucicorpus.py | 2 +- gensim/interfaces.py | 8 ++++++-- gensim/models/wrappers/dtmmodel.py | 2 +- gensim/test/test_miislita.py | 2 +- 10 files changed, 23 insertions(+), 17 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 6bd96da716..273759aca6 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Blei's LDA-C format. -""" +"""Blei's LDA-C format.""" from __future__ import with_statement @@ -41,8 +39,9 @@ def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. - `fname_vocab` is the file with vocabulary; if not specified, it defaults to - `fname.vocab`. + Args: + fname (str): serialized corpus's filename + fname_vocab (str): vocabulary file; takes precedence over fname.vocab """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -85,7 +84,7 @@ def line2doc(self, line): return doc @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the LDA-C format. @@ -94,6 +93,9 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): This function is automatically called by `BleiCorpus.serialize`; don't call it directly, call `serialize` instead. 
+ + Args: + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index e293c998a1..49de7fb9cf 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -141,7 +141,7 @@ def __iter__(self): yield self.line2doc(line) @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the List-of-words format. diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index cacf0074bd..b6dc482dcc 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -67,7 +67,7 @@ def line2doc(self, line): return doc @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the Mallet format. diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 2158f0a526..1eaadfb332 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -38,7 +38,7 @@ def __iter__(self): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """ Save a corpus in the Matrix Market format to disk. 
diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 049e22f226..c0fdbfa409 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -773,7 +773,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. @@ -809,4 +809,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index c19aa321e2..0b43792ece 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -79,7 +79,7 @@ def __iter__(self): self.length = lineno + 1 @staticmethod - def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): + def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """ Save a corpus in the SVMlight format. 
diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index a8911ee07f..995ce3e6ad 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -192,7 +192,7 @@ def create_dictionary(self): return dictionary @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """ Save a corpus in the UCI Bag-of-Words format. diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 6cc7e8d872..8c831fd40f 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -74,14 +74,14 @@ def __len__(self): # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save an existing `corpus` to disk. Some formats also support saving the dictionary (`feature_id->word` mapping), which can in this case be provided by the optional `id2word` parameter. - >>> MmCorpus.save_corpus('file.mm', corpus) + >>> MmCorpus.serialize('file.mm', corpus) Some corpora also support an index of where each document begins, so that the documents on disk can be accessed in O(1) time (see the @@ -103,6 +103,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): fmt = str(doc) # format the document appropriately... 
fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk + def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, + metadata=False): + pass + class TransformedCorpus(CorpusABC): def __init__(self, obj, corpus, chunksize=None, **kwargs): diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 8bbadfc663..3eea2ab651 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -176,7 +176,7 @@ def convert_input(self, corpus, time_slices): """ logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) + corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index 344da1adb3..5863fc9f65 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -56,7 +56,7 @@ def test_textcorpus(self): # make sure serializing works ftmp = get_tmpfile('test_textcorpus.mm') - corpora.MmCorpus.save_corpus(ftmp, miislita) + corpora.MmCorpus.__save_corpus(ftmp, miislita) self.assertTrue(os.path.exists(ftmp)) # make sure deserializing gives the same result