From 95410629fc382dc8e26743934fd6828f14350d76 Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Sat, 9 Mar 2019 19:44:56 -0600 Subject: [PATCH 1/5] DOC: file obj for to_csv must be newline='' --- doc/source/user_guide/io.rst | 4 ++-- pandas/core/generic.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b23a0f10e9e2b..1b5d96fa9c146 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1689,7 +1689,7 @@ The ``Series`` and ``DataFrame`` objects have an instance method ``to_csv`` whic allows storing the contents of the object as a comma-separated-values file. The function takes a number of arguments. Only the first is required. -* ``path_or_buf``: A string path to the file to write or a StringIO +* ``path_or_buf``: A string path to the file to write or a file object. If a file object is passed, it must be opened with ``newline=''`` * ``sep`` : Field delimiter for the output file (default ",") * ``na_rep``: A string representation of a missing value (default '') * ``float_format``: Format string for floating point numbers @@ -1702,7 +1702,7 @@ function takes a number of arguments. Only the first is required. * ``mode`` : Python write mode, default 'w' * ``encoding``: a string representing the encoding to use if the contents are non-ASCII, for Python versions prior to 3 -* ``line_terminator``: Character sequence denoting line end (default '\\n') +* ``line_terminator``: Character sequence denoting line end (default ``os.linesep``) * ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL). 
Note that if you have set a `float_format` then floats are converted to strings and csv.QUOTE_NONNUMERIC will treat them as non-numeric * ``quotechar``: Character used to quote fields (default '"') * ``doublequote``: Control quoting of ``quotechar`` in fields (default True) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0b81576404e2f..a37b745d3082a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2920,7 +2920,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, ---------- path_or_buf : str or file handle, default None File path or object, if None is provided the result is returned as - a string. + a string. If a file object is passed it should be opened with + `newline=''`, disabling universal newlines. .. versionchanged:: 0.24.0 From 2485251c9133fcc82f69f6d2ce9bc97cf9bec6fa Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Sat, 9 Mar 2019 20:21:14 -0600 Subject: [PATCH 2/5] no universal newline for wrapped compression file obj --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/io/common.py | 2 +- pandas/tests/frame/test_to_csv.py | 11 +++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 2c6d1e01ed89b..0f603515c61cc 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -32,6 +32,7 @@ Fixed Regressions - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) +- Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compression (:issue:`25311`) .. 
_whatsnew_0242.enhancements: diff --git a/pandas/io/common.py b/pandas/io/common.py index ad054d77b3bc8..c1cacf39c5b08 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -434,7 +434,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, if (compat.PY3 and is_text and (compression or isinstance(f, need_text_wrapping))): from io import TextIOWrapper - f = TextIOWrapper(f, encoding=encoding) + f = TextIOWrapper(f, encoding=encoding, newline='') handles.append(f) if memory_map and hasattr(f, 'fileno'): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 54a8712a9c645..f5be2664021b6 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -4,6 +4,7 @@ import csv import os +import gzip import numpy as np import pytest @@ -1221,3 +1222,13 @@ def test_multi_index_header(self): '1,5,6,7,8'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected + + def test_gz_lineend(self): + df = pd.DataFrame({'a': [1, 2]}) + expected_rows = ['a', '1', '2'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + with ensure_clean('__test_gz_lineend.csv.gz') as path: + df.to_csv(path, index=False) + result = gzip.open(path, mode='rt', newline='').read() + + assert result == expected From 61d07cc56aad83f6d3babda74d5f153669a26c0c Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Sat, 9 Mar 2019 20:22:46 -0600 Subject: [PATCH 3/5] fixup --- pandas/tests/frame/test_to_csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index f5be2664021b6..a5765d82f276a 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -1224,6 +1224,7 @@ def test_multi_index_header(self): assert result == expected def test_gz_lineend(self): + # GH 25311 df = pd.DataFrame({'a': [1, 2]}) expected_rows = ['a', '1', '2'] expected = tm.convert_rows_list_to_csv_str(expected_rows) From 
0d732666f4b78e457fcbf982e07e69ff8758377c Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Sun, 10 Mar 2019 20:37:39 -0500 Subject: [PATCH 4/5] CLN: using compression helper --- pandas/tests/frame/test_to_csv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index a5765d82f276a..5a5d54591ec2b 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -1230,6 +1230,7 @@ def test_gz_lineend(self): expected = tm.convert_rows_list_to_csv_str(expected_rows) with ensure_clean('__test_gz_lineend.csv.gz') as path: df.to_csv(path, index=False) - result = gzip.open(path, mode='rt', newline='').read() + with tm.decompress_file(path, compression='gzip') as f: + result = f.read().decode('utf-8') assert result == expected From 76e6f70037f22fc404bd7f5580c2203bc740e95b Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Sun, 10 Mar 2019 21:20:41 -0500 Subject: [PATCH 5/5] unused import --- pandas/tests/frame/test_to_csv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 5a5d54591ec2b..59bf3d00f979c 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -4,7 +4,6 @@ import csv import os -import gzip import numpy as np import pytest