Make save_corpus private

VaiyeBe · Nov 23, 2017 · cde582e · cde582e
1 parent 3cc34ff
commit cde582e
Show file tree

Hide file tree

Showing 10 changed files with 23 additions and 17 deletions.
diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py
@@ -5,9 +5,7 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
-"""
-Blei's LDA-C format.
-"""
+"""Blei's LDA-C format."""
 
 from __future__ import with_statement
 
@@ -41,8 +39,9 @@ def __init__(self, fname, fname_vocab=None):
         """
         Initialize the corpus from a file.
 
-        `fname_vocab` is the file with vocabulary; if not specified, it defaults to
-        `fname.vocab`.
+        Args:
+            fname (str): serialized corpus's filename
+            fname_vocab (str, optional): vocabulary file; if not specified, defaults to `fname.vocab`
         """
         IndexedCorpus.__init__(self, fname)
         logger.info("loading corpus from %s", fname)
@@ -85,7 +84,7 @@ def line2doc(self, line):
         return doc
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, metadata=False):
         """
         Save a corpus in the LDA-C format.
 
@@ -94,6 +93,9 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
 
         This function is automatically called by `BleiCorpus.serialize`; don't
         call it directly, call `serialize` instead.
+
+        Args:
+
         """
         if id2word is None:
             logger.info("no word id mapping provided; initializing from corpus")

diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py
@@ -141,7 +141,7 @@ def __iter__(self):
                     yield self.line2doc(line)
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, metadata=False):
         """
         Save a corpus in the List-of-words format.
 

diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py
@@ -67,7 +67,7 @@ def line2doc(self, line):
             return doc
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, metadata=False):
         """
         Save a corpus in the Mallet format.
 

diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py
@@ -38,7 +38,7 @@ def __iter__(self):
             yield doc  # get rid of doc id, return the sparse vector only
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
         """
         Save a corpus in the Matrix Market format to disk.
 

diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py
@@ -773,7 +773,7 @@ def load(cls, fname, mmap=None):
         return super(ShardedCorpus, cls).load(fname, mmap)
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs):
+    def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs):
         """
         Implement a serialization interface. Do not call directly;
         use the `serialize` method instead.
@@ -809,4 +809,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
         Ignore the parameters id2word, index_fname, progress_cnt, labels
         and metadata. They currently do nothing and are here only to
         provide a compatible method signature with superclass."""
-        serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs)
+        serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs)
diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py
@@ -79,7 +79,7 @@ def __iter__(self):
         self.length = lineno + 1
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
         """
         Save a corpus in the SVMlight format.
 

diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py
@@ -192,7 +192,7 @@ def create_dictionary(self):
         return dictionary
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
         """
         Save a corpus in the UCI Bag-of-Words format.
 

diff --git a/gensim/interfaces.py b/gensim/interfaces.py
@@ -74,14 +74,14 @@ def __len__(self):
 #        return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, metadata=False):
         """
         Save an existing `corpus` to disk.
 
         Some formats also support saving the dictionary (`feature_id->word` mapping),
         which can in this case be provided by the optional `id2word` parameter.
 
-        >>> MmCorpus.save_corpus('file.mm', corpus)
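+        >>> MmCorpus.serialize('file.mm', corpus)  # public entry point; `__save_corpus` is name-mangled and not callable under this name from outside the class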
 
         Some corpora also support an index of where each document begins, so
         that the documents on disk can be accessed in O(1) time (see the
@@ -103,6 +103,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
                 fmt = str(doc)  # format the document appropriately...
                 fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
 
+    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None,
+                  metadata=False):
+        pass
+
 
 class TransformedCorpus(CorpusABC):
     def __init__(self, obj, corpus, chunksize=None, **kwargs):

diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py
@@ -176,7 +176,7 @@ def convert_input(self, corpus, time_slices):
         """
         logger.info("serializing temporary corpus to %s", self.fcorpustxt())
         # write out the corpus in a file format that DTM understands:
-        corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)
+        corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus)
 
         with utils.smart_open(self.ftimeslices(), 'wb') as fout:
             fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))

diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py
@@ -56,7 +56,7 @@ def test_textcorpus(self):
 
         # make sure serializing works
         ftmp = get_tmpfile('test_textcorpus.mm')
-        corpora.MmCorpus.save_corpus(ftmp, miislita)
+        corpora.MmCorpus.__save_corpus(ftmp, miislita)
         self.assertTrue(os.path.exists(ftmp))
 
         # make sure deserializing gives the same result