From 3cc34fffb821af60f09d80b39dff0ae1a9a9cc4e Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Sat, 30 Sep 2017 15:39:56 +0500 Subject: [PATCH 01/14] Fix typo --- gensim/corpora/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index 0d51a9b903..aa122d1833 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -1,5 +1,5 @@ """ -This package contains implementations of various streaming corpus I/O format. +This package contains implementations of various streaming corpus I/O formats. """ # bring corpus classes directly into package namespace, to save some typing From cde582ec204604c96e5dad60392f5d88d7bc8cbe Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Mon, 2 Oct 2017 14:42:31 +0500 Subject: [PATCH 02/14] Make `save_corpus` private --- gensim/corpora/bleicorpus.py | 14 ++++++++------ gensim/corpora/lowcorpus.py | 2 +- gensim/corpora/malletcorpus.py | 2 +- gensim/corpora/mmcorpus.py | 2 +- gensim/corpora/sharded_corpus.py | 4 ++-- gensim/corpora/svmlightcorpus.py | 2 +- gensim/corpora/ucicorpus.py | 2 +- gensim/interfaces.py | 8 ++++++-- gensim/models/wrappers/dtmmodel.py | 2 +- gensim/test/test_miislita.py | 2 +- 10 files changed, 23 insertions(+), 17 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 6bd96da716..273759aca6 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Blei's LDA-C format. -""" +"""Blei's LDA-C format.""" from __future__ import with_statement @@ -41,8 +39,9 @@ def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. - `fname_vocab` is the file with vocabulary; if not specified, it defaults to - `fname.vocab`. + Args: + fname (str): serialized corpus's filename + fname_vocab (str): vocabulary file; takes precedence over fname.vocab """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -85,7 +84,7 @@ def line2doc(self, line): return doc @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the LDA-C format. @@ -94,6 +93,9 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): This function is automatically called by `BleiCorpus.serialize`; don't call it directly, call `serialize` instead. + + Args: + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index e293c998a1..49de7fb9cf 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -141,7 +141,7 @@ def __iter__(self): yield self.line2doc(line) @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the List-of-words format. diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index cacf0074bd..b6dc482dcc 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -67,7 +67,7 @@ def line2doc(self, line): return doc @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the Mallet format. diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 2158f0a526..1eaadfb332 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -38,7 +38,7 @@ def __iter__(self): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """ Save a corpus in the Matrix Market format to disk. diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 049e22f226..c0fdbfa409 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -773,7 +773,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. @@ -809,4 +809,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index c19aa321e2..0b43792ece 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -79,7 +79,7 @@ def __iter__(self): self.length = lineno + 1 @staticmethod - def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): + def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """ Save a corpus in the SVMlight format. diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index a8911ee07f..995ce3e6ad 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -192,7 +192,7 @@ def create_dictionary(self): return dictionary @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """ Save a corpus in the UCI Bag-of-Words format. diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 6cc7e8d872..8c831fd40f 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -74,14 +74,14 @@ def __len__(self): # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save an existing `corpus` to disk. Some formats also support saving the dictionary (`feature_id->word` mapping), which can in this case be provided by the optional `id2word` parameter. - >>> MmCorpus.save_corpus('file.mm', corpus) + >>> MmCorpus.__save_corpus('file.mm', corpus) Some corpora also support an index of where each document begins, so that the documents on disk can be accessed in O(1) time (see the @@ -103,6 +103,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): fmt = str(doc) # format the document appropriately... fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk + def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, + metadata=False): + pass + class TransformedCorpus(CorpusABC): def __init__(self, obj, corpus, chunksize=None, **kwargs): diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 8bbadfc663..3eea2ab651 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -176,7 +176,7 @@ def convert_input(self, corpus, time_slices): """ logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) + corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index 344da1adb3..5863fc9f65 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -56,7 +56,7 @@ def test_textcorpus(self): # make sure serializing works ftmp = get_tmpfile('test_textcorpus.mm') - corpora.MmCorpus.save_corpus(ftmp, miislita) + corpora.MmCorpus.__save_corpus(ftmp, miislita) self.assertTrue(os.path.exists(ftmp)) # make sure deserializing gives the same result From 34cccfe724419e88fa4bd652e99d3c839788b28e Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Mon, 2 Oct 2017 22:13:22 +0500 Subject: [PATCH 03/14] Annotate `bleicorpus.py` --- gensim/corpora/bleicorpus.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 273759aca6..ef966b77cb 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,7 +5,9 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Blei's LDA-C format.""" +""" +Blei's LDA-C format. +""" from __future__ import with_statement @@ -41,7 +43,9 @@ def __init__(self, fname, fname_vocab=None): Args: fname (str): serialized corpus's filename - fname_vocab (str): vocabulary file; takes precedence over fname.vocab + fname_vocab (:obj:`str`, optional): vocabulary file; takes precedence over fname.vocab + Raises: + IOError: If vocabulary file doesn't exist """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -76,6 +80,15 @@ def __iter__(self): self.length = lineno + 1 def line2doc(self, line): + """ + Args: + line (str): document's string representation + Returns: + :obj:`list` of (:obj:`int`, :obj:`float`): + document's list representation + Raises: + ValueError: If format is invalid + """ parts = utils.to_unicode(line).split() if int(parts[0]) != len(parts) - 1: raise ValueError("invalid format in %s: %s" % (self.fname, repr(line))) @@ -91,11 +104,14 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False): There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. - This function is automatically called by `BleiCorpus.serialize`; don't - call it directly, call `serialize` instead. - Args: - + fname (str): filename + corpus : yields documents + id2word (:obj:`dict` of (:obj:`str`, :obj:`str`), optional): + transforms id to word + metadata (bool): any additional info + Returns: + :obj:`list` of :obj:`int`: fields' offsets """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -124,7 +140,12 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False): def docbyoffset(self, offset): """ - Return the document stored at file position `offset`. + Return document corresponding to `offset`. + + Args: + offset (int): position of the document in the file + Returns: + :obj:`list` of (:obj:`int`, :obj:`float`): document's list representation """ with utils.smart_open(self.fname) as f: f.seek(offset) From 34bd9efd2d2de60a7f1c078231be10646092e531 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Mon, 2 Oct 2017 22:57:03 +0500 Subject: [PATCH 04/14] Make __save_corpus weakly private --- gensim/corpora/bleicorpus.py | 2 +- gensim/corpora/lowcorpus.py | 2 +- gensim/corpora/malletcorpus.py | 2 +- gensim/corpora/mmcorpus.py | 2 +- gensim/corpora/sharded_corpus.py | 4 ++-- gensim/corpora/svmlightcorpus.py | 2 +- gensim/corpora/ucicorpus.py | 2 +- gensim/interfaces.py | 4 ++-- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index ef966b77cb..d5eb0da8be 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -97,7 +97,7 @@ def line2doc(self, line): return doc @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def _save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the LDA-C format. diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 49de7fb9cf..5d2e19bbfd 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -141,7 +141,7 @@ def __iter__(self): yield self.line2doc(line) @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def _save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the List-of-words format. diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index b6dc482dcc..90de7a3c76 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -67,7 +67,7 @@ def line2doc(self, line): return doc @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def _save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the Mallet format. diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 1eaadfb332..0380e09066 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -38,7 +38,7 @@ def __iter__(self): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + def _save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """ Save a corpus in the Matrix Market format to disk. diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index c0fdbfa409..907c5d798b 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -773,7 +773,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): + def _save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. @@ -809,4 +809,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + serializer._save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 0b43792ece..419da57127 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -79,7 +79,7 @@ def __iter__(self): self.length = lineno + 1 @staticmethod - def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): + def _save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """ Save a corpus in the SVMlight format. diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 995ce3e6ad..84efcb406a 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -192,7 +192,7 @@ def create_dictionary(self): return dictionary @staticmethod - def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def _save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """ Save a corpus in the UCI Bag-of-Words format. diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 8c831fd40f..623d8625eb 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -74,14 +74,14 @@ def __len__(self): # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def _save_corpus(fname, corpus, id2word=None, metadata=False): """ Save an existing `corpus` to disk. Some formats also support saving the dictionary (`feature_id->word` mapping), which can in this case be provided by the optional `id2word` parameter. - >>> MmCorpus.__save_corpus('file.mm', corpus) + >>> MmCorpus._save_corpus('file.mm', corpus) Some corpora also support an index of where each document begins, so that the documents on disk can be accessed in O(1) time (see the From 3b550370fe4f4f732de02c6ac67cadc27df43420 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 3 Oct 2017 12:07:02 +0500 Subject: [PATCH 05/14] Fix _save_corpus[2] --- gensim/models/wrappers/dtmmodel.py | 2 +- gensim/test/test_miislita.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 3eea2ab651..d006cde4b2 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -176,7 +176,7 @@ def convert_input(self, corpus, time_slices): """ logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus) + corpora.BleiCorpus._save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index 5863fc9f65..fbcd77f9ac 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -56,7 +56,7 @@ def test_textcorpus(self): # make sure serializing works ftmp = get_tmpfile('test_textcorpus.mm') - corpora.MmCorpus.__save_corpus(ftmp, miislita) + corpora.MmCorpus._save_corpus(ftmp, miislita) self.assertTrue(os.path.exists(ftmp)) # make sure deserializing gives the same result From 04c2e61bd9ee39dc4d7f55639c9ea0bd294bf484 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 24 Oct 2017 15:12:36 +0500 Subject: [PATCH 06/14] Document bleicorpus in Numpy style --- gensim/corpora/bleicorpus.py | 81 +++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index d5eb0da8be..f5f98c21b2 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Blei's LDA-C format. -""" +"""Blei's LDA-C format.""" from __future__ import with_statement @@ -41,11 +39,18 @@ def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. - Args: - fname (str): serialized corpus's filename - fname_vocab (:obj:`str`, optional): vocabulary file; takes precedence over fname.vocab - Raises: - IOError: If vocabulary file doesn't exist + Parameters + ---------- + fname : str + Serialized corpus's filename + fname_vocab : str or None, optional + Vocabulary file; takes precedence over + + Raises + ------ + IOError + If vocabulary file doesn't exist + """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -70,9 +75,7 @@ def __init__(self, fname, fname_vocab=None): self.id2word = dict(enumerate(words)) def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. - """ + """Iterate over the corpus, returning one sparse vector at a time.""" lineno = -1 with utils.smart_open(self.fname) as fin: for lineno, line in enumerate(fin): @@ -81,12 +84,20 @@ def __iter__(self): def line2doc(self, line): """ - Args: - line (str): document's string representation - Returns: - :obj:`list` of (:obj:`int`, :obj:`float`): - document's list representation - Raises: + Convert line to document. + + Parameters + ---------- + line : str + Document's string representation + + Returns + ------- + list of (int, float) + document's list representation + + Raises + ------ ValueError: If format is invalid """ parts = utils.to_unicode(line).split() @@ -104,14 +115,21 @@ def _save_corpus(fname, corpus, id2word=None, metadata=False): There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. - Args: - fname (str): filename - corpus : yields documents - id2word (:obj:`dict` of (:obj:`str`, :obj:`str`), optional): - transforms id to word - metadata (bool): any additional info - Returns: - :obj:`list` of :obj:`int`: fields' offsets + Parameters + ---------- + fname : str + Filename + corpus : iterable + Iterable of documents + id2word : dict of (str, str), optional + Transforms id to word + metadata : bool + Any additional info + + Returns + ------- + list of int + Fields' offsets """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -142,10 +160,15 @@ def docbyoffset(self, offset): """ Return document corresponding to `offset`. - Args: - offset (int): position of the document in the file - Returns: - :obj:`list` of (:obj:`int`, :obj:`float`): document's list representation + Parameters + ---------- + offset : int + Position of the document in the file + + Returns + ------- + list of (int, float) + Document's list representation """ with utils.smart_open(self.fname) as f: f.seek(offset) From 6c6d118a2d9b4d5cff931328752df082cd46ae88 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 23:56:50 +0500 Subject: [PATCH 07/14] Fix tests on Arch Linux --- gensim/downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 8cf8f3590e..8b9af3c7f2 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -310,7 +310,7 @@ def _download(name): with open(part_path, "rb") as rfp: shutil.copyfileobj(rfp, wfp) os.remove(part_path) - os.rename(tmp_dir, data_folder_dir) + shutil.move(tmp_dir, data_folder_dir) else: url_data = "{base}/{fname}/{fname}.gz".format(base=DOWNLOAD_BASE_URL, fname=name) fname = "{fname}.gz".format(fname=name) @@ -323,7 +323,7 @@ def _download(name): else: shutil.rmtree(tmp_dir) raise Exception("Checksum comparison failed, try again") - os.rename(tmp_dir, data_folder_dir) + shutil.move(tmp_dir, data_folder_dir) def _get_filename(name): From 9ef0206a25b1a66c6a355db78fb6d239a475eb90 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Thu, 23 Nov 2017 05:44:02 +0500 Subject: [PATCH 08/14] Revert "Document bleicorpus in Numpy style" This reverts commit 04c2e61bd9ee39dc4d7f55639c9ea0bd294bf484. --- gensim/corpora/bleicorpus.py | 81 +++++++++++++----------------------- 1 file changed, 29 insertions(+), 52 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index f5f98c21b2..d5eb0da8be 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,7 +5,9 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Blei's LDA-C format.""" +""" +Blei's LDA-C format. +""" from __future__ import with_statement @@ -39,18 +41,11 @@ def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. - Parameters - ---------- - fname : str - Serialized corpus's filename - fname_vocab : str or None, optional - Vocabulary file; takes precedence over - - Raises - ------ - IOError - If vocabulary file doesn't exist - + Args: + fname (str): serialized corpus's filename + fname_vocab (:obj:`str`, optional): vocabulary file; takes precedence over fname.vocab + Raises: + IOError: If vocabulary file doesn't exist """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -75,7 +70,9 @@ def __init__(self, fname, fname_vocab=None): self.id2word = dict(enumerate(words)) def __iter__(self): - """Iterate over the corpus, returning one sparse vector at a time.""" + """ + Iterate over the corpus, returning one sparse vector at a time. + """ lineno = -1 with utils.smart_open(self.fname) as fin: for lineno, line in enumerate(fin): @@ -84,20 +81,12 @@ def __iter__(self): def line2doc(self, line): """ - Convert line to document. - - Parameters - ---------- - line : str - Document's string representation - - Returns - ------- - list of (int, float) - document's list representation - - Raises - ------ + Args: + line (str): document's string representation + Returns: + :obj:`list` of (:obj:`int`, :obj:`float`): + document's list representation + Raises: ValueError: If format is invalid """ parts = utils.to_unicode(line).split() @@ -115,21 +104,14 @@ def _save_corpus(fname, corpus, id2word=None, metadata=False): There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. - Parameters - ---------- - fname : str - Filename - corpus : iterable - Iterable of documents - id2word : dict of (str, str), optional - Transforms id to word - metadata : bool - Any additional info - - Returns - ------- - list of int - Fields' offsets + Args: + fname (str): filename + corpus : yields documents + id2word (:obj:`dict` of (:obj:`str`, :obj:`str`), optional): + transforms id to word + metadata (bool): any additional info + Returns: + :obj:`list` of :obj:`int`: fields' offsets """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -160,15 +142,10 @@ def docbyoffset(self, offset): """ Return document corresponding to `offset`. - Parameters - ---------- - offset : int - Position of the document in the file - - Returns - ------- - list of (int, float) - Document's list representation + Args: + offset (int): position of the document in the file + Returns: + :obj:`list` of (:obj:`int`, :obj:`float`): document's list representation """ with utils.smart_open(self.fname) as f: f.seek(offset) From 7f103f9b8326f59735300258c7583d8507637fd5 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Thu, 23 Nov 2017 05:44:09 +0500 Subject: [PATCH 09/14] Revert "Fix _save_corpus[2]" This reverts commit 3b550370fe4f4f732de02c6ac67cadc27df43420. --- gensim/models/wrappers/dtmmodel.py | 2 +- gensim/test/test_miislita.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index d006cde4b2..3eea2ab651 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -176,7 +176,7 @@ def convert_input(self, corpus, time_slices): """ logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus._save_corpus(self.fcorpustxt(), corpus) + corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index fbcd77f9ac..5863fc9f65 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -56,7 +56,7 @@ def test_textcorpus(self): # make sure serializing works ftmp = get_tmpfile('test_textcorpus.mm') - corpora.MmCorpus._save_corpus(ftmp, miislita) + corpora.MmCorpus.__save_corpus(ftmp, miislita) self.assertTrue(os.path.exists(ftmp)) # make sure deserializing gives the same result From 566430a262af94135c03cec82551c16ac58e0f2d Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Thu, 23 Nov 2017 05:44:11 +0500 Subject: [PATCH 10/14] Revert "Make __save_corpus weakly private" This reverts commit 34bd9efd2d2de60a7f1c078231be10646092e531. --- gensim/corpora/bleicorpus.py | 2 +- gensim/corpora/lowcorpus.py | 2 +- gensim/corpora/malletcorpus.py | 2 +- gensim/corpora/mmcorpus.py | 2 +- gensim/corpora/sharded_corpus.py | 4 ++-- gensim/corpora/svmlightcorpus.py | 2 +- gensim/corpora/ucicorpus.py | 2 +- gensim/interfaces.py | 4 ++-- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index d5eb0da8be..ef966b77cb 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -97,7 +97,7 @@ def line2doc(self, line): return doc @staticmethod - def _save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the LDA-C format. diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 5d2e19bbfd..49de7fb9cf 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -141,7 +141,7 @@ def __iter__(self): yield self.line2doc(line) @staticmethod - def _save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the List-of-words format. diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index 90de7a3c76..b6dc482dcc 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -67,7 +67,7 @@ def line2doc(self, line): return doc @staticmethod - def _save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the Mallet format. diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 0380e09066..1eaadfb332 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -38,7 +38,7 @@ def __iter__(self): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def _save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """ Save a corpus in the Matrix Market format to disk. diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 907c5d798b..c0fdbfa409 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -773,7 +773,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def _save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. @@ -809,4 +809,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer._save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 419da57127..0b43792ece 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -79,7 +79,7 @@ def __iter__(self): self.length = lineno + 1 @staticmethod - def _save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): + def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """ Save a corpus in the SVMlight format. diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 84efcb406a..995ce3e6ad 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -192,7 +192,7 @@ def create_dictionary(self): return dictionary @staticmethod - def _save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """ Save a corpus in the UCI Bag-of-Words format. diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 623d8625eb..8c831fd40f 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -74,14 +74,14 @@ def __len__(self): # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod - def _save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save an existing `corpus` to disk. Some formats also support saving the dictionary (`feature_id->word` mapping), which can in this case be provided by the optional `id2word` parameter. - >>> MmCorpus._save_corpus('file.mm', corpus) + >>> MmCorpus.__save_corpus('file.mm', corpus) Some corpora also support an index of where each document begins, so that the documents on disk can be accessed in O(1) time (see the From a583f4089e3ad63fdd9d5cb514d8be226f24cd2b Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Thu, 23 Nov 2017 05:44:12 +0500 Subject: [PATCH 11/14] Revert "Annotate `bleicorpus.py`" This reverts commit 34cccfe724419e88fa4bd652e99d3c839788b28e. --- gensim/corpora/bleicorpus.py | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index ef966b77cb..273759aca6 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Blei's LDA-C format. -""" +"""Blei's LDA-C format.""" from __future__ import with_statement @@ -43,9 +41,7 @@ def __init__(self, fname, fname_vocab=None): Args: fname (str): serialized corpus's filename - fname_vocab (:obj:`str`, optional): vocabulary file; takes precedence over fname.vocab - Raises: - IOError: If vocabulary file doesn't exist + fname_vocab (str): vocabulary file; takes precedence over fname.vocab """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -80,15 +76,6 @@ def __iter__(self): self.length = lineno + 1 def line2doc(self, line): - """ - Args: - line (str): document's string representation - Returns: - :obj:`list` of (:obj:`int`, :obj:`float`): - document's list representation - Raises: - ValueError: If format is invalid - """ parts = utils.to_unicode(line).split() if int(parts[0]) != len(parts) - 1: raise ValueError("invalid format in %s: %s" % (self.fname, repr(line))) @@ -104,14 +91,11 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False): There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. + This function is automatically called by `BleiCorpus.serialize`; don't + call it directly, call `serialize` instead. + Args: - fname (str): filename - corpus : yields documents - id2word (:obj:`dict` of (:obj:`str`, :obj:`str`), optional): - transforms id to word - metadata (bool): any additional info - Returns: - :obj:`list` of :obj:`int`: fields' offsets + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -140,12 +124,7 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False): def docbyoffset(self, offset): """ - Return document corresponding to `offset`. - - Args: - offset (int): position of the document in the file - Returns: - :obj:`list` of (:obj:`int`, :obj:`float`): document's list representation + Return the document stored at file position `offset`. """ with utils.smart_open(self.fname) as f: f.seek(offset) From 026b1f77841d1d03e9c20c325f46120eceabc424 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Thu, 23 Nov 2017 05:44:13 +0500 Subject: [PATCH 12/14] Revert "Make `save_corpus` private" This reverts commit cde582ec204604c96e5dad60392f5d88d7bc8cbe. --- gensim/corpora/bleicorpus.py | 14 ++++++-------- gensim/corpora/lowcorpus.py | 2 +- gensim/corpora/malletcorpus.py | 2 +- gensim/corpora/mmcorpus.py | 2 +- gensim/corpora/sharded_corpus.py | 4 ++-- gensim/corpora/svmlightcorpus.py | 2 +- gensim/corpora/ucicorpus.py | 2 +- gensim/interfaces.py | 8 ++------ gensim/models/wrappers/dtmmodel.py | 2 +- gensim/test/test_miislita.py | 2 +- 10 files changed, 17 insertions(+), 23 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 273759aca6..6bd96da716 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,7 +5,9 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Blei's LDA-C format.""" +""" +Blei's LDA-C format. +""" from __future__ import with_statement @@ -39,9 +41,8 @@ def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. - Args: - fname (str): serialized corpus's filename - fname_vocab (str): vocabulary file; takes precedence over fname.vocab + `fname_vocab` is the file with vocabulary; if not specified, it defaults to + `fname.vocab`. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -84,7 +85,7 @@ def line2doc(self, line): return doc @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the LDA-C format. @@ -93,9 +94,6 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False): This function is automatically called by `BleiCorpus.serialize`; don't call it directly, call `serialize` instead. - - Args: - """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 49de7fb9cf..e293c998a1 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -141,7 +141,7 @@ def __iter__(self): yield self.line2doc(line) @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the List-of-words format. diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index b6dc482dcc..cacf0074bd 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -67,7 +67,7 @@ def line2doc(self, line): return doc @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the Mallet format. diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 1eaadfb332..2158f0a526 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -38,7 +38,7 @@ def __iter__(self): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """ Save a corpus in the Matrix Market format to disk. diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index c0fdbfa409..049e22f226 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -773,7 +773,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): + def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. @@ -809,4 +809,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 0b43792ece..c19aa321e2 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -79,7 +79,7 @@ def __iter__(self): self.length = lineno + 1 @staticmethod - def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): + def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """ Save a corpus in the SVMlight format. diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 995ce3e6ad..a8911ee07f 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -192,7 +192,7 @@ def create_dictionary(self): return dictionary @staticmethod - def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """ Save a corpus in the UCI Bag-of-Words format. diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 8c831fd40f..6cc7e8d872 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -74,14 +74,14 @@ def __len__(self): # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def save_corpus(fname, corpus, id2word=None, metadata=False): """ Save an existing `corpus` to disk. Some formats also support saving the dictionary (`feature_id->word` mapping), which can in this case be provided by the optional `id2word` parameter. - >>> MmCorpus.__save_corpus('file.mm', corpus) + >>> MmCorpus.save_corpus('file.mm', corpus) Some corpora also support an index of where each document begins, so that the documents on disk can be accessed in O(1) time (see the @@ -103,10 +103,6 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False): fmt = str(doc) # format the document appropriately... fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk - def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, - metadata=False): - pass - class TransformedCorpus(CorpusABC): def __init__(self, obj, corpus, chunksize=None, **kwargs): diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 3eea2ab651..8bbadfc663 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -176,7 +176,7 @@ def convert_input(self, corpus, time_slices): """ logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus) + corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index 5863fc9f65..344da1adb3 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -56,7 +56,7 @@ def test_textcorpus(self): # make sure serializing works ftmp = get_tmpfile('test_textcorpus.mm') - corpora.MmCorpus.__save_corpus(ftmp, miislita) + corpora.MmCorpus.save_corpus(ftmp, miislita) self.assertTrue(os.path.exists(ftmp)) # make sure deserializing gives the same result From df9903a43ed65721f754a6cd0000e4238080be2e Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Thu, 23 Nov 2017 05:45:03 +0500 Subject: [PATCH 13/14] Revert "Fix typo" This reverts commit 3cc34fffb821af60f09d80b39dff0ae1a9a9cc4e. --- gensim/corpora/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index aa122d1833..0d51a9b903 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -1,5 +1,5 @@ """ -This package contains implementations of various streaming corpus I/O formats. +This package contains implementations of various streaming corpus I/O format. """ # bring corpus classes directly into package namespace, to save some typing From eaf8bd247643b1579397a8baa05b73e5270b74cd Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Thu, 23 Nov 2017 06:55:47 +0500 Subject: [PATCH 14/14] Handle the multiple partitions case --- gensim/downloader.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 8b9af3c7f2..8a787207f2 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -281,6 +281,7 @@ def _download(name): """ url_load_file = "{base}/{fname}/__init__.py".format(base=DOWNLOAD_BASE_URL, fname=name) data_folder_dir = os.path.join(base_dir, name) + data_folder_dir_tmp = data_folder_dir + '_tmp' tmp_dir = tempfile.mkdtemp() init_path = os.path.join(tmp_dir, "__init__.py") urllib.urlretrieve(url_load_file, init_path) @@ -310,7 +311,6 @@ def _download(name): with open(part_path, "rb") as rfp: shutil.copyfileobj(rfp, wfp) os.remove(part_path) - shutil.move(tmp_dir, data_folder_dir) else: url_data = "{base}/{fname}/{fname}.gz".format(base=DOWNLOAD_BASE_URL, fname=name) fname = "{fname}.gz".format(fname=name) @@ -323,7 +323,12 @@ def _download(name): else: shutil.rmtree(tmp_dir) raise Exception("Checksum comparison failed, try again") - shutil.move(tmp_dir, data_folder_dir) + + if os.path.exists(data_folder_dir_tmp): + os.remove(data_folder_dir_tmp) + + shutil.move(tmp_dir, data_folder_dir_tmp) + os.rename(data_folder_dir_tmp, data_folder_dir) def _get_filename(name):