Skip to content

Commit

Permalink
API: Default to_* methods to compression='infer' (pandas-dev#22011)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhimmel authored and victor committed Sep 30, 2018
1 parent 96f7cda commit bc40588
Show file tree
Hide file tree
Showing 10 changed files with 180 additions and 125 deletions.
2 changes: 1 addition & 1 deletion doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``
Set to ``None`` for no decompression.

.. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.

.. versionchanged:: 0.24.0 'infer' option added and set to default.
thousands : str, default ``None``
Thousands separator.
decimal : str, default ``'.'``
Expand Down
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,8 @@ Other Enhancements
- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
- :func:`~DataFrame.to_csv` and :func:`~DataFrame.to_json` now support ``compression='infer'`` to infer compression based on filename (:issue:`15008`)
- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`).
The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`)
- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)

Expand Down
8 changes: 6 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1715,7 +1715,7 @@ def to_panel(self):

def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
columns=None, header=True, index=True, index_label=None,
mode='w', encoding=None, compression=None, quoting=None,
mode='w', encoding=None, compression='infer', quoting=None,
quotechar='"', line_terminator='\n', chunksize=None,
tupleize_cols=None, date_format=None, doublequote=True,
escapechar=None, decimal='.'):
Expand Down Expand Up @@ -1750,10 +1750,14 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
encoding : string, optional
A string representing the encoding to use in the output file,
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None},
default 'infer'
If 'infer' and `path_or_buf` is path-like, then detect compression
from the following extensions: '.gz', '.bz2', '.zip' or '.xz'
(otherwise no compression).
.. versionchanged:: 0.24.0
'infer' option added and set to default
line_terminator : string, default ``'\n'``
The newline character or character sequence to use in the output
file
Expand Down
9 changes: 5 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1933,7 +1933,7 @@ def _repr_latex_(self):

def to_json(self, path_or_buf=None, orient=None, date_format=None,
double_precision=10, force_ascii=True, date_unit='ms',
default_handler=None, lines=False, compression=None,
default_handler=None, lines=False, compression='infer',
index=True):
"""
Convert the object to a JSON string.
Expand Down Expand Up @@ -1999,13 +1999,14 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
like.
.. versionadded:: 0.19.0
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None},
default 'infer'
A string representing the compression to use in the output file,
only used when the first argument is a filename.
.. versionadded:: 0.21.0
.. versionchanged:: 0.24.0
'infer' option added and set to default
index : boolean, default True
Whether to include the index values in the JSON string. Not
including the index (``index=False``) is only supported when
Expand Down
11 changes: 7 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3767,7 +3767,7 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,

def to_csv(self, path=None, index=True, sep=",", na_rep='',
float_format=None, header=False, index_label=None,
mode='w', encoding=None, compression=None, date_format=None,
mode='w', encoding=None, compression='infer', date_format=None,
decimal='.'):
"""
Write Series to a comma-separated values (csv) file
Expand Down Expand Up @@ -3795,10 +3795,13 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
encoding : string, optional
a string representing the encoding to use if the contents are
non-ascii, for python versions prior to 3
compression : string, optional
compression : None or string, default 'infer'
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
Allowed values are None, 'gzip', 'bz2', 'zip', 'xz', and 'infer'.
This input is only used when the first argument is a filename.
.. versionchanged:: 0.24.0
'infer' option added and set to default
date_format: string, default None
Format string for datetime objects.
decimal: string, default '.'
Expand Down
41 changes: 21 additions & 20 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,21 @@
from pandas.core.dtypes.generic import (
ABCMultiIndex, ABCPeriodIndex, ABCDatetimeIndex, ABCIndexClass)

from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user,
_stringify_path)
from pandas.io.common import (
_expand_user,
_get_handle,
_infer_compression,
_stringify_path,
UnicodeWriter,
)


class CSVFormatter(object):

def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
float_format=None, cols=None, header=True, index=True,
index_label=None, mode='w', nanRep=None, encoding=None,
compression=None, quoting=None, line_terminator='\n',
compression='infer', quoting=None, line_terminator='\n',
chunksize=None, tupleize_cols=False, quotechar='"',
date_format=None, doublequote=True, escapechar=None,
decimal='.'):
Expand All @@ -50,8 +55,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
self.index = index
self.index_label = index_label
self.mode = mode
if encoding is None:
encoding = 'ascii' if compat.PY2 else 'utf-8'
self.encoding = encoding
self.compression = compression
self.compression = _infer_compression(self.path_or_buf, compression)

if quoting is None:
quoting = csvlib.QUOTE_MINIMAL
Expand Down Expand Up @@ -124,16 +131,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
self.nlevels = 0

def save(self):
# create the writer & save
if self.encoding is None:
if compat.PY2:
encoding = 'ascii'
else:
encoding = 'utf-8'
else:
encoding = self.encoding

# GH 21227 internal compression is not used when file-like passed.
"""
Create the writer & save
"""
# GH21227 internal compression is not used when file-like passed.
if self.compression and hasattr(self.path_or_buf, 'write'):
msg = ("compression has no effect when passing file-like "
"object as input.")
Expand All @@ -147,15 +148,15 @@ def save(self):
if is_zip:
# zipfile doesn't support writing string to archive. uses string
# buffer to receive csv writing and dump into zip compression
# file handle. GH 21241, 21118
# file handle. GH21241, GH21118
f = StringIO()
close = False
elif hasattr(self.path_or_buf, 'write'):
f = self.path_or_buf
close = False
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
encoding=self.encoding,
compression=self.compression)
close = True

Expand All @@ -165,23 +166,23 @@ def save(self):
doublequote=self.doublequote,
escapechar=self.escapechar,
quotechar=self.quotechar)
if encoding == 'ascii':
if self.encoding == 'ascii':
self.writer = csvlib.writer(f, **writer_kwargs)
else:
writer_kwargs['encoding'] = encoding
writer_kwargs['encoding'] = self.encoding
self.writer = UnicodeWriter(f, **writer_kwargs)

self._save()

finally:
if is_zip:
# GH 17778 handles zip compression separately.
# GH17778 handles zip compression separately.
buf = f.getvalue()
if hasattr(self.path_or_buf, 'write'):
self.path_or_buf.write(buf)
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
encoding=self.encoding,
compression=self.compression)
f.write(buf)
close = True
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/json/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
# interface to/from
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
double_precision=10, force_ascii=True, date_unit='ms',
default_handler=None, lines=False, compression=None,
default_handler=None, lines=False, compression='infer',
index=True):

if not index and orient not in ['split', 'table']:
Expand Down
61 changes: 31 additions & 30 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
"""
Tests for the pandas.io.common functionalities
Tests for the pandas.io.common functionalities
"""
import mmap
import pytest
import os
from os.path import isabs

import pytest

import pandas as pd
import pandas.util.testing as tm
import pandas.io.common as icom
import pandas.util._test_decorators as td

from pandas.io import common
from pandas.compat import is_platform_windows, StringIO, FileNotFoundError

from pandas import read_csv, concat
import pandas.util.testing as tm
from pandas.compat import (
is_platform_windows,
StringIO,
FileNotFoundError,
)


class CustomFSPath(object):
Expand Down Expand Up @@ -55,36 +56,36 @@ class TestCommonIOCapabilities(object):

def test_expand_user(self):
filename = '~/sometest'
expanded_name = common._expand_user(filename)
expanded_name = icom._expand_user(filename)

assert expanded_name != filename
assert isabs(expanded_name)
assert os.path.isabs(expanded_name)
assert os.path.expanduser(filename) == expanded_name

def test_expand_user_normal_path(self):
filename = '/somefolder/sometest'
expanded_name = common._expand_user(filename)
expanded_name = icom._expand_user(filename)

assert expanded_name == filename
assert os.path.expanduser(filename) == expanded_name

@td.skip_if_no('pathlib')
def test_stringify_path_pathlib(self):
rel_path = common._stringify_path(Path('.'))
rel_path = icom._stringify_path(Path('.'))
assert rel_path == '.'
redundant_path = common._stringify_path(Path('foo//bar'))
redundant_path = icom._stringify_path(Path('foo//bar'))
assert redundant_path == os.path.join('foo', 'bar')

@td.skip_if_no('py.path')
def test_stringify_path_localpath(self):
path = os.path.join('foo', 'bar')
abs_path = os.path.abspath(path)
lpath = LocalPath(path)
assert common._stringify_path(lpath) == abs_path
assert icom._stringify_path(lpath) == abs_path

def test_stringify_path_fspath(self):
p = CustomFSPath('foo/bar.csv')
result = common._stringify_path(p)
result = icom._stringify_path(p)
assert result == 'foo/bar.csv'

@pytest.mark.parametrize('extension,expected', [
Expand All @@ -97,36 +98,36 @@ def test_stringify_path_fspath(self):
@pytest.mark.parametrize('path_type', path_types)
def test_infer_compression_from_path(self, extension, expected, path_type):
path = path_type('foo/bar.csv' + extension)
compression = common._infer_compression(path, compression='infer')
compression = icom._infer_compression(path, compression='infer')
assert compression == expected

def test_get_filepath_or_buffer_with_path(self):
filename = '~/sometest'
filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer(
filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
filename)
assert filepath_or_buffer != filename
assert isabs(filepath_or_buffer)
assert os.path.isabs(filepath_or_buffer)
assert os.path.expanduser(filename) == filepath_or_buffer
assert not should_close

def test_get_filepath_or_buffer_with_buffer(self):
input_buffer = StringIO()
filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer(
filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
input_buffer)
assert filepath_or_buffer == input_buffer
assert not should_close

def test_iterator(self):
reader = read_csv(StringIO(self.data1), chunksize=1)
result = concat(reader, ignore_index=True)
expected = read_csv(StringIO(self.data1))
reader = pd.read_csv(StringIO(self.data1), chunksize=1)
result = pd.concat(reader, ignore_index=True)
expected = pd.read_csv(StringIO(self.data1))
tm.assert_frame_equal(result, expected)

# GH12153
it = read_csv(StringIO(self.data1), chunksize=1)
it = pd.read_csv(StringIO(self.data1), chunksize=1)
first = next(it)
tm.assert_frame_equal(first, expected.iloc[[0]])
tm.assert_frame_equal(concat(it), expected.iloc[1:])
tm.assert_frame_equal(pd.concat(it), expected.iloc[1:])

@pytest.mark.parametrize('reader, module, error_class, fn_ext', [
(pd.read_csv, 'os', FileNotFoundError, 'csv'),
Expand Down Expand Up @@ -246,18 +247,18 @@ def test_constructor_bad_file(self, mmap_file):
msg = "[Errno 22]"
err = mmap.error

tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file)
tm.assert_raises_regex(err, msg, icom.MMapWrapper, non_file)

target = open(mmap_file, 'r')
target.close()

msg = "I/O operation on closed file"
tm.assert_raises_regex(
ValueError, msg, common.MMapWrapper, target)
ValueError, msg, icom.MMapWrapper, target)

def test_get_attr(self, mmap_file):
with open(mmap_file, 'r') as target:
wrapper = common.MMapWrapper(target)
wrapper = icom.MMapWrapper(target)

attrs = dir(wrapper.mmap)
attrs = [attr for attr in attrs
Expand All @@ -271,7 +272,7 @@ def test_get_attr(self, mmap_file):

def test_next(self, mmap_file):
with open(mmap_file, 'r') as target:
wrapper = common.MMapWrapper(target)
wrapper = icom.MMapWrapper(target)
lines = target.readlines()

for line in lines:
Expand All @@ -285,4 +286,4 @@ def test_unknown_engine(self):
df = tm.makeDataFrame()
df.to_csv(path)
with tm.assert_raises_regex(ValueError, 'Unknown engine'):
read_csv(path, engine='pyt')
pd.read_csv(path, engine='pyt')
Loading

0 comments on commit bc40588

Please sign in to comment.