Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

* optimize __len__ of SlicedCorpus #1679

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed gensim/.spyderproject
Binary file not shown.
21 changes: 9 additions & 12 deletions gensim/corpora/indexedcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,19 +83,17 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
         if index_fname is None:
             index_fname = utils.smart_extension(fname, '.index')

+        kwargs = {'metadata': metadata}
         if progress_cnt is not None:
-            if labels is not None:
-                offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
-            else:
-                offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
-        else:
-            if labels is not None:
-                offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
-            else:
-                offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)
+            kwargs['progress_cnt'] = progress_cnt
+
+        if labels is not None:
+            kwargs['labels'] = labels
+
+        offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

         if offsets is None:
-            raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__)
+            raise NotImplementedError("Called serialize on class %s which doesn't support indexing!" % serializer.__name__)

         # store offsets persistently, using pickle
         # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
Expand All @@ -119,8 +117,7 @@ def __len__(self):

     def __getitem__(self, docno):
         if self.index is None:
-            raise RuntimeError("cannot call corpus[docid] without an index")
-
+            raise RuntimeError("Cannot call corpus[docid] without an index")
         if isinstance(docno, (slice, list, numpy.ndarray)):
             return utils.SlicedCorpus(self, docno)
         elif isinstance(docno, six.integer_types + (numpy.integer,)):
Expand Down
16 changes: 8 additions & 8 deletions gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,8 +495,7 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=
             _pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
             logger.info("saved %s object", self.__class__.__name__)
         except TypeError: # `fname_or_handle` does not have write attribute
-            self._smart_save(fname_or_handle, separately, sep_limit, ignore,
-                             pickle_protocol=pickle_protocol)
+            self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol)


 def identity(p):
Expand Down Expand Up @@ -730,17 +729,18 @@ def __init__(self, corpus, slice_):

     def __iter__(self):
         if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0:
-            return (self.corpus.docbyoffset(i) for i in
-                    self.corpus.index[self.slice_])
-        else:
-            return itertools.islice(self.corpus, self.slice_.start,
-                                    self.slice_.stop, self.slice_.step)
+            return (self.corpus.docbyoffset(i) for i in self.corpus.index[self.slice_])
+        return itertools.islice(self.corpus, self.slice_.start, self.slice_.stop, self.slice_.step)

     def __len__(self):
         # check cached length, calculate if needed
         if self.length is None:
             if isinstance(self.slice_, (list, np.ndarray)):
                 self.length = len(self.slice_)
+            elif isinstance(self.slice_, slice):
+                (start, end, step) = self.slice_.indices(len(self.corpus.index))
+                diff = end - start
+                self.length = diff // step + (diff % step > 0)
             else:
                 self.length = sum(1 for x in self)

Expand Down Expand Up @@ -1038,7 +1038,7 @@ def has_pattern():


 def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
-        stopwords=frozenset(), min_length=2, max_length=15):
+              stopwords=frozenset(), min_length=2, max_length=15):
     """
     This function is only available when the optional 'pattern' package is installed.

Expand Down