From 6b4c3a8a155dce5f0128ac7b34218e26e84ff972 Mon Sep 17 00:00:00 2001
From: vlejd <vladimir.v.macko@gmail.com>
Date: Fri, 16 Jun 2017 23:43:22 +0200
Subject: [PATCH 1/4] Create local random generator for sample_texts & add
 length parameter

---
 gensim/corpora/textcorpus.py   | 46 ++++++++++++++++++++++++++--------
 gensim/test/test_textcorpus.py | 46 ++++++++++++++++++++++------------
 2 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
index e52b60f32b..fa840968e1 100644
--- a/gensim/corpora/textcorpus.py
+++ b/gensim/corpora/textcorpus.py
@@ -98,28 +98,54 @@ def get_texts(self):
                 else:
                     yield utils.tokenize(line, lowercase=True)
 
-    def sample_texts(self, n):
+    def sample_texts(self, n, seed=None, length=None):
         """
-        Yield n random texts from the corpus without replacement.
+        Yields n random documents from the corpus without replacement.
 
-        Given the the number of remaingin elements in stream is remaining and we need
-        to choose n elements, the probability for current element to be chosen is n/remaining.
+        Given the number of remaining documents in corpus, we need to choose n elements.
+        The probability for current element to be chosen is n/remaining.
         If we choose it, we just decreese the n and move to the next element.
+        Computing corpus length may be a costly operation so you can use optional paramter
+        length instead.
+
+        Args:
+            n (int): number of documents we want to sample.
+            seed (int|None): if specified, use it as a seed for local random generator.
+            length (int|None): if specified, use it as guess of corpus length.
+
+        Yeilds:
+            list[str]: document represented as list of tokens. See get_texts method.
+
+        Raises:
+            ValueError: then n is invalid or length was set incorrectly.
         """
-        length = len(self)
-        if not n <= length:
-            raise ValueError("sample larger than population")
+        random_generator = None
+        if seed is None:
+            random_generator = random
+        else:
+            random_generator = random.Random(seed)
+
+        if length is None:
+            length = len(self)
 
+        if not n <= length:
+            raise ValueError("n is larger than length of corpus.")
         if not 0 <= n:
-            raise ValueError("negative sample size")
+            raise ValueError("Negative sample size.")
 
         for i, sample in enumerate(self.get_texts()):
-            remaining_in_stream = length - i
-            chance = random.randint(1, remaining_in_stream)
+            if i == length:
+                break
+            remaining_in_corpus = length - i
+            chance = random_generator.randint(1, remaining_in_corpus)
             if chance <= n:
                 n -= 1
                 yield sample
 
+        if n != 0:
+            # This means that length was set to be smaller than nuber of items in stream.
+            raise ValueError("length smaller than number of documents in stream")
+
     def __len__(self):
         if not hasattr(self, 'length'):
             # cache the corpus length
diff --git a/gensim/test/test_textcorpus.py b/gensim/test/test_textcorpus.py
index abf646eb97..e4700c1b7f 100644
--- a/gensim/test/test_textcorpus.py
+++ b/gensim/test/test_textcorpus.py
@@ -21,34 +21,48 @@
 class TestTextCorpus(unittest.TestCase):
     # TODO add tests for other methods
 
-    def test_sample_text(self):
-        class TestTextCorpus(TextCorpus):
-            def __init__(self):
-                self.data = [["document1"], ["document2"]]
+    class DumyTextCorpus(TextCorpus):
+        def __init__(self):
+            self.size = 10
+            self.data = [["document%s" % i] for i in range(self.size)]
 
-            def get_texts(self):
-                for document in self.data:
-                    yield document
+        def get_texts(self):
+            for document in self.data:
+                yield document
 
-        corpus = TestTextCorpus()
+    def test_sample_text(self):
+        corpus = self.DumyTextCorpus()
 
         sample1 = list(corpus.sample_texts(1))
         self.assertEqual(len(sample1), 1)
-        document1 = sample1[0] == ["document1"]
-        document2 = sample1[0] == ["document2"]
-        self.assertTrue(document1 or document2)
+        self.assertIn(sample1[0], corpus.data)
 
-        sample2 = list(corpus.sample_texts(2))
-        self.assertEqual(len(sample2), 2)
-        self.assertEqual(sample2[0], ["document1"])
-        self.assertEqual(sample2[1], ["document2"])
+        sample2 = list(corpus.sample_texts(corpus.size))
+        self.assertEqual(len(sample2), corpus.size)
+        for i in range(corpus.size):
+            self.assertEqual(sample2[i], ["document%s" % i])
 
         with self.assertRaises(ValueError):
-            list(corpus.sample_texts(3))
+            list(corpus.sample_texts(corpus.size + 1))
 
         with self.assertRaises(ValueError):
             list(corpus.sample_texts(-1))
 
+    def test_sample_text_length(self):
+        corpus = self.DumyTextCorpus()
+        sample1 = list(corpus.sample_texts(1, length=1))
+        self.assertEqual(sample1[0], ["document0"])
+
+        sample2 = list(corpus.sample_texts(2, length=2))
+        self.assertEqual(sample2[0], ["document0"])
+        self.assertEqual(sample2[1], ["document1"])
+
+    def test_sample_text_seed(self):
+        corpus = self.DumyTextCorpus()
+        sample1 = list(corpus.sample_texts(5, seed=42))
+        sample2 = list(corpus.sample_texts(5, seed=42))
+        self.assertEqual(sample1, sample2)
+
 
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

From ab3f517b38b47b9f6de94645cdfd12600e9c9bc5 Mon Sep 17 00:00:00 2001
From: vlejd <vladimir.v.macko@gmail.com>
Date: Sat, 17 Jun 2017 11:30:39 +0200
Subject: [PATCH 2/4] Fix typos in textcorpus.py

---
 gensim/corpora/textcorpus.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
index fa840968e1..9b65598ba4 100644
--- a/gensim/corpora/textcorpus.py
+++ b/gensim/corpora/textcorpus.py
@@ -100,24 +100,25 @@ def get_texts(self):
 
     def sample_texts(self, n, seed=None, length=None):
         """
-        Yields n random documents from the corpus without replacement.
+        Yield n random documents from the corpus without replacement.
 
-        Given the number of remaining documents in corpus, we need to choose n elements.
-        The probability for current element to be chosen is n/remaining.
+        Given the number of remaining documents in a corpus, we need to choose n elements.
+        The probability for the current element to be chosen is n/remaining.
         If we choose it, we just decreese the n and move to the next element.
-        Computing corpus length may be a costly operation so you can use optional paramter
-        length instead.
+        Computing the corpus length may be a costly operation so you can use the optional
+        parameter `length` instead.
 
         Args:
             n (int): number of documents we want to sample.
             seed (int|None): if specified, use it as a seed for local random generator.
-            length (int|None): if specified, use it as guess of corpus length.
+            length (int|None): if specified, use it as a guess of corpus length.
+                It must be positive and not greater than actual corpus length.
 
         Yeilds:
-            list[str]: document represented as list of tokens. See get_texts method.
+            list[str]: document represented as a list of tokens. See get_texts method.
 
         Raises:
-            ValueError: then n is invalid or length was set incorrectly.
+            ValueError: when n is invalid or length was set incorrectly.
         """
         random_generator = None
         if seed is None:

From c0e375b400c35259a3fbdb755f06a519a7e24027 Mon Sep 17 00:00:00 2001
From: vlejd <vladimir.v.macko@gmail.com>
Date: Sun, 18 Jun 2017 12:46:01 +0200
Subject: [PATCH 3/4] Fix typo in yields (yeilds)

---
 gensim/corpora/textcorpus.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
index 9b65598ba4..4e5e0b0db7 100644
--- a/gensim/corpora/textcorpus.py
+++ b/gensim/corpora/textcorpus.py
@@ -104,7 +104,7 @@ def sample_texts(self, n, seed=None, length=None):
 
         Given the number of remaining documents in a corpus, we need to choose n elements.
         The probability for the current element to be chosen is n/remaining.
-        If we choose it, we just decreese the n and move to the next element.
+        If we choose it, we just decrease the n and move to the next element.
         Computing the corpus length may be a costly operation so you can use the optional
         parameter `length` instead.
 
@@ -114,7 +114,7 @@ def sample_texts(self, n, seed=None, length=None):
             length (int|None): if specified, use it as a guess of corpus length.
                 It must be positive and not greater than actual corpus length.
 
-        Yeilds:
+        Yields:
             list[str]: document represented as a list of tokens. See get_texts method.
 
         Raises:
@@ -144,7 +144,7 @@ def sample_texts(self, n, seed=None, length=None):
                 yield sample
 
         if n != 0:
-            # This means that length was set to be smaller than nuber of items in stream.
+            # This means that length was set to be smaller than number of items in stream.
             raise ValueError("length smaller than number of documents in stream")
 
     def __len__(self):

From 0015898f955719607a05b56d9e9a0cec4f7ce489 Mon Sep 17 00:00:00 2001
From: vlejd <vladimir.v.macko@gmail.com>
Date: Sun, 18 Jun 2017 14:44:13 +0200
Subject: [PATCH 4/4] Fix typos and clarify exception in sample_texts

---
 gensim/corpora/textcorpus.py   | 5 +++--
 gensim/test/test_textcorpus.py | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
index 4e5e0b0db7..00e69a6717 100644
--- a/gensim/corpora/textcorpus.py
+++ b/gensim/corpora/textcorpus.py
@@ -144,8 +144,9 @@ def sample_texts(self, n, seed=None, length=None):
                 yield sample
 
         if n != 0:
-            # This means that length was set to be smaller than number of items in stream.
-            raise ValueError("length smaller than number of documents in stream")
+            # This means that length was set to be greater than number of items in corpus
+            # and we were not able to sample enough documents before the stream ended.
+            raise ValueError("length greater than number of documents in corpus")
 
     def __len__(self):
         if not hasattr(self, 'length'):
diff --git a/gensim/test/test_textcorpus.py b/gensim/test/test_textcorpus.py
index e4700c1b7f..82e3d80960 100644
--- a/gensim/test/test_textcorpus.py
+++ b/gensim/test/test_textcorpus.py
@@ -21,7 +21,7 @@
 class TestTextCorpus(unittest.TestCase):
     # TODO add tests for other methods
 
-    class DumyTextCorpus(TextCorpus):
+    class DummyTextCorpus(TextCorpus):
         def __init__(self):
             self.size = 10
             self.data = [["document%s" % i] for i in range(self.size)]
@@ -31,7 +31,7 @@ def get_texts(self):
                 yield document
 
     def test_sample_text(self):
-        corpus = self.DumyTextCorpus()
+        corpus = self.DummyTextCorpus()
 
         sample1 = list(corpus.sample_texts(1))
         self.assertEqual(len(sample1), 1)
@@ -49,7 +49,7 @@ def test_sample_text(self):
             list(corpus.sample_texts(-1))
 
     def test_sample_text_length(self):
-        corpus = self.DumyTextCorpus()
+        corpus = self.DummyTextCorpus()
         sample1 = list(corpus.sample_texts(1, length=1))
         self.assertEqual(sample1[0], ["document0"])
 
@@ -58,7 +58,7 @@ def test_sample_text_length(self):
         self.assertEqual(sample2[1], ["document1"])
 
     def test_sample_text_seed(self):
-        corpus = self.DumyTextCorpus()
+        corpus = self.DummyTextCorpus()
         sample1 = list(corpus.sample_texts(5, seed=42))
         sample2 = list(corpus.sample_texts(5, seed=42))
         self.assertEqual(sample1, sample2)