From 79e81a03b402a7c11eccbf36578adc7b657a1321 Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 20 Jun 2013 20:58:42 -0400
Subject: [PATCH] BUG (GH3967) csv parsers would loop infinitely if
 ``iterator=True`` but no ``chunksize`` was specified (:issue:`3967`), python
 parser failing with ``chunksize=1``

---
 doc/source/release.rst          |  2 +
 pandas/io/parsers.py            | 86 +++++++++++++++++++--------------
 pandas/io/tests/test_parsers.py | 18 +++++++
 3 files changed, 70 insertions(+), 36 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 882826765d057..f16036692c8d3 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -258,6 +258,8 @@ pandas 0.11.1
   - Fixed ``__truediv__`` in Python 2.7 with ``numexpr`` installed to actually do true division when dividing
     two integer arrays with at least 10000 cells total (:issue:`3764`)
   - Indexing with a string with seconds resolution not selecting from a time index (:issue:`3925`)
+  - csv parsers would loop infinitely if ``iterator=True`` but no ``chunksize`` was
+    specified (:issue:`3967`), python parser failing with ``chunksize=1``

 .. _Gh3616: https://github.com/pydata/pandas/issues/3616

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 054363d8cda06..658532e80682d 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -186,7 +186,7 @@ def _read(filepath_or_buffer, kwds):
             kwds['parse_dates'] = True

     # Extract some of the arguments (pass chunksize on).
-    iterator = kwds.pop('iterator', False)
+    iterator = kwds.get('iterator', False)
     nrows = kwds.pop('nrows', None)
     chunksize = kwds.get('chunksize', None)

@@ -569,8 +569,11 @@ def _clean_options(self, options, engine):

     def __iter__(self):
         try:
-            while True:
-                yield self.read(self.chunksize)
+            if self.chunksize:
+                while True:
+                    yield self.read(self.chunksize)
+            else:
+                yield self.read()
         except StopIteration:
             pass

@@ -1594,47 +1597,58 @@ def _rows_to_cols(self, content):

     def _get_lines(self, rows=None):
         source = self.data
         lines = self.buf
+        new_rows = None  # already fetched some number

         if rows is not None:
-            rows -= len(self.buf)

-        if isinstance(source, list):
-            if self.pos > len(source):
-                raise StopIteration
-            if rows is None:
-                lines.extend(source[self.pos:])
-                self.pos = len(source)
+            # we already have the lines in the buffer
+            if len(self.buf) >= rows:
+                new_rows, self.buf = self.buf[:rows], self.buf[rows:]
+
+            # need some lines
             else:
-                lines.extend(source[self.pos:self.pos + rows])
-                self.pos += rows
-        else:
-            new_rows = []
-            try:
-                if rows is not None:
-                    for _ in xrange(rows):
-                        new_rows.append(next(source))
-                    lines.extend(new_rows)
+                rows -= len(self.buf)
+
+        if new_rows is None:
+            if isinstance(source, list):
+                if self.pos > len(source):
+                    raise StopIteration
+                if rows is None:
+                    lines.extend(source[self.pos:])
+                    self.pos = len(source)
                 else:
-                    rows = 0
-                    while True:
-                        try:
+                    lines.extend(source[self.pos:self.pos + rows])
+                    self.pos += rows
+            else:
+                new_rows = []
+                try:
+                    if rows is not None:
+                        for _ in xrange(rows):
                             new_rows.append(next(source))
-                            rows += 1
-                        except csv.Error, inst:
-                            if 'newline inside string' in str(inst):
-                                row_num = str(self.pos + rows)
-                                msg = ('EOF inside string starting with line '
-                                       + row_num)
-                                raise Exception(msg)
-                            raise
-            except StopIteration:
-                lines.extend(new_rows)
-                if len(lines) == 0:
-                    raise
-            self.pos += len(new_rows)
+                        lines.extend(new_rows)
+                    else:
+                        rows = 0
+                        while True:
+                            try:
+                                new_rows.append(next(source))
+                                rows += 1
+                            except csv.Error, inst:
+                                if 'newline inside string' in str(inst):
+                                    row_num = str(self.pos + rows)
+                                    msg = ('EOF inside string starting with line '
+                                           + row_num)
+                                    raise Exception(msg)
+                                raise
+                except StopIteration:
+                    lines.extend(new_rows)
+                    if len(lines) == 0:
+                        raise
+                self.pos += len(new_rows)

-        self.buf = []
+            self.buf = []
+        else:
+            lines = new_rows

         if self.skip_footer:
             lines = lines[:-self.skip_footer]
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index cc2dddd829302..f9e956f60dde6 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -1037,6 +1037,24 @@ def test_iterator(self):
                                  iterator=True)
         self.assert_(isinstance(treader, TextFileReader))

+        # stopping iteration when no chunksize is specified, GH 3967
+        data = """A,B,C
+foo,1,2,3
+bar,4,5,6
+baz,7,8,9
+"""
+        reader = self.read_csv(StringIO(data), iterator=True)
+        result = list(reader)
+        expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
+        tm.assert_frame_equal(result[0], expected)
+
+        # chunksize = 1
+        reader = self.read_csv(StringIO(data), chunksize=1)
+        result = list(reader)
+        expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
+        self.assert_(len(result) == 3)
+        tm.assert_frame_equal(pd.concat(result), expected)
+
     def test_header_not_first_line(self):
         data = """got,to,ignore,this,line
 got,to,ignore,this,line
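
Illustration (not part of the patch): a minimal sketch of the user-facing behavior this change targets, written against the public read_csv interface. It assumes a pandas version that includes this fix and uses io.StringIO; the sample data and variable names are made up for the example.

    from io import StringIO  # the patch-era tests used Python 2's StringIO module
    import pandas as pd

    data = "A,B,C\nfoo,1,2,3\nbar,4,5,6\nbaz,7,8,9\n"

    # iterator=True with no chunksize: iterating the returned TextFileReader
    # should yield the full frame once and then stop (before the fix it
    # looped forever, GH 3967).
    reader = pd.read_csv(StringIO(data), iterator=True)
    chunks = list(reader)
    assert len(chunks) == 1

    # chunksize=1 with the python engine: each iteration should yield a
    # single-row DataFrame, three chunks in total for this data.
    reader = pd.read_csv(StringIO(data), chunksize=1, engine='python')
    chunks = list(reader)
    assert len(chunks) == 3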