From 6b4c3a8a155dce5f0128ac7b34218e26e84ff972 Mon Sep 17 00:00:00 2001 From: vlejd Date: Fri, 16 Jun 2017 23:43:22 +0200 Subject: [PATCH 1/4] Create local random generator for sample_text & add length --- gensim/corpora/textcorpus.py | 46 ++++++++++++++++++++++++++-------- gensim/test/test_textcorpus.py | 46 ++++++++++++++++++++++------------ 2 files changed, 66 insertions(+), 26 deletions(-) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index e52b60f32b..fa840968e1 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -98,28 +98,54 @@ def get_texts(self): else: yield utils.tokenize(line, lowercase=True) - def sample_texts(self, n): + def sample_texts(self, n, seed=None, length=None): """ - Yield n random texts from the corpus without replacement. + Yields n random documents from the corpus without replacement. - Given the the number of remaingin elements in stream is remaining and we need - to choose n elements, the probability for current element to be chosen is n/remaining. + Given the number of remaining documents in corpus, we need to choose n elements. + The probability for current element to be chosen is n/remaining. If we choose it, we just decreese the n and move to the next element. + Computing corpus length may be a costly operation so you can use optional paramter + length instead. + + Args: + n (int): number of documents we want to sample. + seed (int|None): if specified, use it as a seed for local random generator. + length (int|None): if specified, use it as guess of corpus length. + + Yeilds: + list[str]: document represented as list of tokens. See get_texts method. + + Raises: + ValueError: then n is invalid or length was set incorrectly. 
""" - length = len(self) - if not n <= length: - raise ValueError("sample larger than population") + random_generator = None + if seed is None: + random_generator = random + else: + random_generator = random.Random(seed) + + if length is None: + length = len(self) + if not n <= length: + raise ValueError("n is larger than length of corpus.") if not 0 <= n: - raise ValueError("negative sample size") + raise ValueError("Negative sample size.") for i, sample in enumerate(self.get_texts()): - remaining_in_stream = length - i - chance = random.randint(1, remaining_in_stream) + if i == length: + break + remaining_in_corpus = length - i + chance = random_generator.randint(1, remaining_in_corpus) if chance <= n: n -= 1 yield sample + if n != 0: + # This means that length was set to be smaller than nuber of items in stream. + raise ValueError("length smaller than number of documents in stream") + def __len__(self): if not hasattr(self, 'length'): # cache the corpus length diff --git a/gensim/test/test_textcorpus.py b/gensim/test/test_textcorpus.py index abf646eb97..e4700c1b7f 100644 --- a/gensim/test/test_textcorpus.py +++ b/gensim/test/test_textcorpus.py @@ -21,34 +21,48 @@ class TestTextCorpus(unittest.TestCase): # TODO add tests for other methods - def test_sample_text(self): - class TestTextCorpus(TextCorpus): - def __init__(self): - self.data = [["document1"], ["document2"]] + class DumyTextCorpus(TextCorpus): + def __init__(self): + self.size = 10 + self.data = [["document%s" % i] for i in range(self.size)] - def get_texts(self): - for document in self.data: - yield document + def get_texts(self): + for document in self.data: + yield document - corpus = TestTextCorpus() + def test_sample_text(self): + corpus = self.DumyTextCorpus() sample1 = list(corpus.sample_texts(1)) self.assertEqual(len(sample1), 1) - document1 = sample1[0] == ["document1"] - document2 = sample1[0] == ["document2"] - self.assertTrue(document1 or document2) + self.assertIn(sample1[0], corpus.data) 
- sample2 = list(corpus.sample_texts(2)) - self.assertEqual(len(sample2), 2) - self.assertEqual(sample2[0], ["document1"]) - self.assertEqual(sample2[1], ["document2"]) + sample2 = list(corpus.sample_texts(corpus.size)) + self.assertEqual(len(sample2), corpus.size) + for i in range(corpus.size): + self.assertEqual(sample2[i], ["document%s" % i]) with self.assertRaises(ValueError): - list(corpus.sample_texts(3)) + list(corpus.sample_texts(corpus.size + 1)) with self.assertRaises(ValueError): list(corpus.sample_texts(-1)) + def test_sample_text_length(self): + corpus = self.DumyTextCorpus() + sample1 = list(corpus.sample_texts(1, length=1)) + self.assertEqual(sample1[0], ["document0"]) + + sample2 = list(corpus.sample_texts(2, length=2)) + self.assertEqual(sample2[0], ["document0"]) + self.assertEqual(sample2[1], ["document1"]) + + def test_sample_text_seed(self): + corpus = self.DumyTextCorpus() + sample1 = list(corpus.sample_texts(5, seed=42)) + sample2 = list(corpus.sample_texts(5, seed=42)) + self.assertEqual(sample1, sample2) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From ab3f517b38b47b9f6de94645cdfd12600e9c9bc5 Mon Sep 17 00:00:00 2001 From: vlejd Date: Sat, 17 Jun 2017 11:30:39 +0200 Subject: [PATCH 2/4] Fix typos in textcorpus.py --- gensim/corpora/textcorpus.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index fa840968e1..9b65598ba4 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -100,24 +100,25 @@ def get_texts(self): def sample_texts(self, n, seed=None, length=None): """ - Yields n random documents from the corpus without replacement. + Yield n random documents from the corpus without replacement. - Given the number of remaining documents in corpus, we need to choose n elements. - The probability for current element to be chosen is n/remaining. 
+ Given the number of remaining documents in a corpus, we need to choose n elements. + The probability for the current element to be chosen is n/remaining. If we choose it, we just decreese the n and move to the next element. - Computing corpus length may be a costly operation so you can use optional paramter - length instead. + Computing the corpus length may be a costly operation so you can use the optional + parameter `length` instead. Args: n (int): number of documents we want to sample. seed (int|None): if specified, use it as a seed for local random generator. - length (int|None): if specified, use it as guess of corpus length. + length (int|None): if specified, use it as a guess of corpus length. + It must be positive and not greater than actual corpus length. Yeilds: - list[str]: document represented as list of tokens. See get_texts method. + list[str]: document represented as a list of tokens. See get_texts method. Raises: - ValueError: then n is invalid or length was set incorrectly. + ValueError: when n is invalid or length was set incorrectly. """ random_generator = None if seed is None: From c0e375b400c35259a3fbdb755f06a519a7e24027 Mon Sep 17 00:00:00 2001 From: vlejd Date: Sun, 18 Jun 2017 12:46:01 +0200 Subject: [PATCH 3/4] Fix typo in yields (yeilds) --- gensim/corpora/textcorpus.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 9b65598ba4..4e5e0b0db7 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -104,7 +104,7 @@ def sample_texts(self, n, seed=None, length=None): Given the number of remaining documents in a corpus, we need to choose n elements. The probability for the current element to be chosen is n/remaining. - If we choose it, we just decreese the n and move to the next element. + If we choose it, we just decrease the n and move to the next element. 
Computing the corpus length may be a costly operation so you can use the optional parameter `length` instead. @@ -114,7 +114,7 @@ def sample_texts(self, n, seed=None, length=None): length (int|None): if specified, use it as a guess of corpus length. It must be positive and not greater than actual corpus length. - Yeilds: + Yields: list[str]: document represented as a list of tokens. See get_texts method. Raises: @@ -144,7 +144,7 @@ def sample_texts(self, n, seed=None, length=None): yield sample if n != 0: - # This means that length was set to be smaller than nuber of items in stream. + # This means that length was set to be smaller than number of items in stream. raise ValueError("length smaller than number of documents in stream") def __len__(self): From 0015898f955719607a05b56d9e9a0cec4f7ce489 Mon Sep 17 00:00:00 2001 From: vlejd Date: Sun, 18 Jun 2017 14:44:13 +0200 Subject: [PATCH 4/4] Fix typos and clarify exception in sample_texts --- gensim/corpora/textcorpus.py | 5 +++-- gensim/test/test_textcorpus.py | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 4e5e0b0db7..00e69a6717 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -144,8 +144,9 @@ def sample_texts(self, n, seed=None, length=None): yield sample if n != 0: - # This means that length was set to be smaller than number of items in stream. - raise ValueError("length smaller than number of documents in stream") + # This means that length was set to be greater than number of items in corpus + # and we were not able to sample enough documents before the stream ended. 
+ raise ValueError("length greater than number of documents in corpus") def __len__(self): if not hasattr(self, 'length'): diff --git a/gensim/test/test_textcorpus.py b/gensim/test/test_textcorpus.py index e4700c1b7f..82e3d80960 100644 --- a/gensim/test/test_textcorpus.py +++ b/gensim/test/test_textcorpus.py @@ -21,7 +21,7 @@ class TestTextCorpus(unittest.TestCase): # TODO add tests for other methods - class DumyTextCorpus(TextCorpus): + class DummyTextCorpus(TextCorpus): def __init__(self): self.size = 10 self.data = [["document%s" % i] for i in range(self.size)] @@ -31,7 +31,7 @@ def get_texts(self): yield document def test_sample_text(self): - corpus = self.DumyTextCorpus() + corpus = self.DummyTextCorpus() sample1 = list(corpus.sample_texts(1)) self.assertEqual(len(sample1), 1) @@ -49,7 +49,7 @@ def test_sample_text(self): list(corpus.sample_texts(-1)) def test_sample_text_length(self): - corpus = self.DumyTextCorpus() + corpus = self.DummyTextCorpus() sample1 = list(corpus.sample_texts(1, length=1)) self.assertEqual(sample1[0], ["document0"]) @@ -58,7 +58,7 @@ def test_sample_text_length(self): self.assertEqual(sample2[1], ["document1"]) def test_sample_text_seed(self): - corpus = self.DumyTextCorpus() + corpus = self.DummyTextCorpus() sample1 = list(corpus.sample_texts(5, seed=42)) sample2 = list(corpus.sample_texts(5, seed=42)) self.assertEqual(sample1, sample2)