diff --git a/gensim/.spyderproject b/gensim/.spyderproject deleted file mode 100644 index 956fe24eeb..0000000000 Binary files a/gensim/.spyderproject and /dev/null differ diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index af79a2fd5f..0dd0e30d2a 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -83,19 +83,17 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres if index_fname is None: index_fname = utils.smart_extension(fname, '.index') + kwargs = {'metadata': metadata} if progress_cnt is not None: - if labels is not None: - offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) - else: - offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) - else: - if labels is not None: - offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) - else: - offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata) + kwargs['progress_cnt'] = progress_cnt + + if labels is not None: + kwargs['labels'] = labels + + offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs) if offsets is None: - raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__) + raise NotImplementedError("Called serialize on class %s which doesn't support indexing!" % serializer.__name__) # store offsets persistently, using pickle # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return @@ -119,8 +117,7 @@ def __len__(self): def __getitem__(self, docno): if self.index is None: - raise RuntimeError("cannot call corpus[docid] without an index") - + raise RuntimeError("Cannot call corpus[docid] without an index") if isinstance(docno, (slice, list, numpy.ndarray)): return utils.SlicedCorpus(self, docno) elif isinstance(docno, six.integer_types + (numpy.integer,)): diff --git a/gensim/utils.py b/gensim/utils.py index f8c6219878..bca29e73cc 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -495,8 +495,7 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore= _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) logger.info("saved %s object", self.__class__.__name__) except TypeError: # `fname_or_handle` does not have write attribute - self._smart_save(fname_or_handle, separately, sep_limit, ignore, - pickle_protocol=pickle_protocol) + self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) def identity(p): @@ -730,17 +729,18 @@ def __init__(self, corpus, slice_): def __iter__(self): if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0: - return (self.corpus.docbyoffset(i) for i in - self.corpus.index[self.slice_]) - else: - return itertools.islice(self.corpus, self.slice_.start, - self.slice_.stop, self.slice_.step) + return (self.corpus.docbyoffset(i) for i in self.corpus.index[self.slice_]) + return itertools.islice(self.corpus, self.slice_.start, self.slice_.stop, self.slice_.step) def __len__(self): # check cached length, calculate if needed if self.length is None: if isinstance(self.slice_, (list, np.ndarray)): self.length = len(self.slice_) + elif isinstance(self.slice_, slice): + (start, end, step) = self.slice_.indices(len(self.corpus.index)) + diff = end - start + self.length = diff // step + (diff % step > 0) else: self.length = sum(1 for x in self) @@ -1038,7 +1038,7 @@ def has_pattern(): def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False, - stopwords=frozenset(), min_length=2, max_length=15): + stopwords=frozenset(), min_length=2, max_length=15): """ This function is only available when the optional 'pattern' package is installed.