
Merge pull request #3978 from jreback/parser_iterator
BUG  (GH3967) csv parsers would loop infinitely if iterator=True but no chunksize specified
jreback committed Jun 21, 2013
2 parents 40064ec + 79e81a0 commit 78a71b1
Showing 3 changed files with 70 additions and 36 deletions.
2 changes: 2 additions & 0 deletions doc/source/release.rst
@@ -258,6 +258,8 @@ pandas 0.11.1
   - Fixed ``__truediv__`` in Python 2.7 with ``numexpr`` installed to actually do true division when dividing
     two integer arrays with at least 10000 cells total (:issue:`3764`)
   - Indexing with a string with seconds resolution not selecting from a time index (:issue:`3925`)
+  - csv parsers would loop infinitely if ``iterator=True`` but no ``chunksize`` was
+    specified (:issue:`3967`), python parser failing with ``chunksize=1``

 .. _Gh3616: https://github.com/pydata/pandas/issues/3616

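For context, a rough sketch of the two usages this release note covers. This is not part of the commit; it is written against a current pandas/Python for brevity, and the engine='python' keyword is assumed to be available:

import pandas as pd
from io import StringIO

data = "A,B,C\n1,2,3\n4,5,6\n7,8,9\n"

# iterator=True with no chunksize: read() returns the remaining rows in one
# DataFrame instead of the iterator spinning forever (GH3967).
reader = pd.read_csv(StringIO(data), iterator=True)
frame = reader.read()

# chunksize=1: each step of the iterator yields a one-row DataFrame, the case
# the release note says the python parser used to fail on.
chunks = list(pd.read_csv(StringIO(data), chunksize=1, engine='python'))
assert len(chunks) == 3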
86 changes: 50 additions & 36 deletions pandas/io/parsers.py
@@ -186,7 +186,7 @@ def _read(filepath_or_buffer, kwds):
         kwds['parse_dates'] = True

     # Extract some of the arguments (pass chunksize on).
-    iterator = kwds.pop('iterator', False)
+    iterator = kwds.get('iterator', False)
     nrows = kwds.pop('nrows', None)
     chunksize = kwds.get('chunksize', None)

@@ -569,8 +569,11 @@ def _clean_options(self, options, engine):

     def __iter__(self):
         try:
-            while True:
-                yield self.read(self.chunksize)
+            if self.chunksize:
+                while True:
+                    yield self.read(self.chunksize)
+            else:
+                yield self.read()
         except StopIteration:
             pass

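The shape of the fix above: when no chunksize is configured, a single read() drains the whole source, so the generator must yield exactly once rather than loop. A self-contained toy with the same control flow (illustrative names only, not the pandas internals):

class ChunkIterator(object):
    """Yield fixed-size chunks when a chunk size is set, else one full read."""

    def __init__(self, rows, chunksize=None):
        self.rows = rows
        self.chunksize = chunksize
        self.pos = 0

    def read(self, nrows=None):
        # mimic the parser: signal exhaustion with StopIteration
        if self.pos >= len(self.rows):
            raise StopIteration
        end = len(self.rows) if nrows is None else self.pos + nrows
        chunk = self.rows[self.pos:end]
        self.pos = min(end, len(self.rows))
        return chunk

    def __iter__(self):
        try:
            if self.chunksize:
                while True:
                    yield self.read(self.chunksize)
            else:
                # before the fix there was no else-branch: read() was retried
                # in the loop above and could spin forever when it never raised
                yield self.read()
        except StopIteration:
            pass

list(ChunkIterator([1, 2, 3, 4, 5], chunksize=2))   # [[1, 2], [3, 4], [5]]
list(ChunkIterator([1, 2, 3, 4, 5]))                # [[1, 2, 3, 4, 5]]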
@@ -1594,47 +1597,58 @@ def _rows_to_cols(self, content):
     def _get_lines(self, rows=None):
         source = self.data
         lines = self.buf
+        new_rows = None

         # already fetched some number
         if rows is not None:
-            rows -= len(self.buf)

-        if isinstance(source, list):
-            if self.pos > len(source):
-                raise StopIteration
-            if rows is None:
-                lines.extend(source[self.pos:])
-                self.pos = len(source)
+            # we already have the lines in the buffer
+            if len(self.buf) >= rows:
+                new_rows, self.buf = self.buf[:rows], self.buf[rows:]
+
+            # need some lines
             else:
-                lines.extend(source[self.pos:self.pos + rows])
-                self.pos += rows
-        else:
-            new_rows = []
-            try:
-                if rows is not None:
-                    for _ in xrange(rows):
-                        new_rows.append(next(source))
-                    lines.extend(new_rows)
+                rows -= len(self.buf)
+
+        if new_rows is None:
+            if isinstance(source, list):
+                if self.pos > len(source):
+                    raise StopIteration
+                if rows is None:
+                    lines.extend(source[self.pos:])
+                    self.pos = len(source)
                 else:
-                    rows = 0
-                    while True:
-                        try:
+                    lines.extend(source[self.pos:self.pos + rows])
+                    self.pos += rows
+            else:
+                new_rows = []
+                try:
+                    if rows is not None:
+                        for _ in xrange(rows):
                             new_rows.append(next(source))
-                            rows += 1
-                        except csv.Error, inst:
-                            if 'newline inside string' in str(inst):
-                                row_num = str(self.pos + rows)
-                                msg = ('EOF inside string starting with line '
-                                       + row_num)
-                                raise Exception(msg)
-                            raise
-            except StopIteration:
-                lines.extend(new_rows)
-                if len(lines) == 0:
-                    raise
-                self.pos += len(new_rows)
+                        lines.extend(new_rows)
+                    else:
+                        rows = 0
+                        while True:
+                            try:
+                                new_rows.append(next(source))
+                                rows += 1
+                            except csv.Error, inst:
+                                if 'newline inside string' in str(inst):
+                                    row_num = str(self.pos + rows)
+                                    msg = ('EOF inside string starting with line '
+                                           + row_num)
+                                    raise Exception(msg)
+                                raise
+                except StopIteration:
+                    lines.extend(new_rows)
+                    if len(lines) == 0:
+                        raise
+                    self.pos += len(new_rows)

-            self.buf = []
+                self.buf = []
+        else:
+            lines = new_rows

         if self.skip_footer:
             lines = lines[:-self.skip_footer]
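The other half of the change to _get_lines: when the read-ahead buffer (self.buf) already holds at least the requested number of lines, the request is served straight from the buffer instead of pulling from the underlying source again, which is what lets chunksize=1 work in the python parser. A simplified, hypothetical sketch of that rule (take_rows is illustrative, not a pandas function):

def take_rows(buf, source, pos, rows):
    """Return (lines, new_buf, new_pos), preferring already-buffered lines."""
    if rows is not None and len(buf) >= rows:
        # enough lines buffered: slice them off, keep the rest buffered
        return buf[:rows], buf[rows:], pos
    # otherwise only the shortfall comes from the underlying source
    need = None if rows is None else rows - len(buf)
    fetched = source[pos:] if need is None else source[pos:pos + need]
    return buf + fetched, [], pos + len(fetched)

take_rows(buf=['a,1'], source=['a,1', 'b,2', 'c,3'], pos=1, rows=1)
# -> (['a,1'], [], 1): a one-row request is satisfied from the buffer alone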
18 changes: 18 additions & 0 deletions pandas/io/tests/test_parsers.py
@@ -1037,6 +1037,24 @@ def test_iterator(self):
                              iterator=True)
         self.assert_(isinstance(treader, TextFileReader))

+        # stopping iteration when on chunksize is specified, GH 3967
+        data = """A,B,C
+foo,1,2,3
+bar,4,5,6
+baz,7,8,9
+"""
+        reader = self.read_csv(StringIO(data), iterator=True)
+        result = list(reader)
+        expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
+        tm.assert_frame_equal(result[0], expected)
+
+        # chunksize = 1
+        reader = self.read_csv(StringIO(data), chunksize=1)
+        result = list(reader)
+        expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
+        self.assert_(len(result) == 3)
+        tm.assert_frame_equal(pd.concat(result), expected)
+
     def test_header_not_first_line(self):
         data = """got,to,ignore,this,line
 got,to,ignore,this,line
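A note on self.read_csv in the test above: the parser tests route reads through a small per-class wrapper so the same test body runs against both the C and python engines. A hypothetical sketch of that pattern (not the literal pandas test harness):

import pandas as pd

class PythonParserTests(object):
    # each parser test class pins its engine in a thin wrapper
    def read_csv(self, *args, **kwds):
        kwds = dict(kwds, engine='python')
        return pd.read_csv(*args, **kwds)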
