Skip to content

Commit

Permalink
BUG: Don't warn if default conflicts with dialect (#23775)
Browse files Browse the repository at this point in the history
xref gh-23761.
  • Loading branch information
gfyoung authored and jreback committed Nov 19, 2018
1 parent 3d6d873 commit 2946745
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 30 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1385,6 +1385,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
- Bug in :func:`read_csv()` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`)
- Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`)
- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
- Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`)
- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
Expand Down
47 changes: 37 additions & 10 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,24 @@ def parser_f(filepath_or_buffer,
if sep is False:
sep = default_sep

# gh-23761
#
# When a dialect is passed, it overrides any of the overlapping
# parameters passed in directly. We don't want to warn if the
# default parameters were passed in (since it probably means
# that the user didn't pass them in explicitly in the first place).
#
# "delimiter" is the annoying corner case because we alias it to
# "sep" before doing comparison to the dialect values later on.
# Thus, we need a flag to indicate that we need to "override"
# the comparison to dialect values by checking if default values
# for BOTH "delimiter" and "sep" were provided.
if dialect is not None:
sep_override = delimiter is None and sep == default_sep
kwds = dict(sep_override=sep_override)
else:
kwds = dict()

# Alias sep -> delimiter.
if delimiter is None:
delimiter = sep
Expand All @@ -647,7 +665,7 @@ def parser_f(filepath_or_buffer,
engine = 'c'
engine_specified = False

kwds = dict(delimiter=delimiter,
kwds.update(delimiter=delimiter,
engine=engine,
dialect=dialect,
compression=compression,
Expand Down Expand Up @@ -769,18 +787,27 @@ def __init__(self, f, engine=None, **kwds):
except AttributeError:
raise ValueError("Invalid dialect '{dialect}' provided"
.format(dialect=kwds['dialect']))
provided = kwds.get(param, _parser_defaults[param])
parser_default = _parser_defaults[param]
provided = kwds.get(param, parser_default)

# Messages for conflicting values between the dialect instance
# and the actual parameters provided.
# Messages for conflicting values between the dialect
# instance and the actual parameters provided.
conflict_msgs = []

if dialect_val != provided:
conflict_msgs.append((
"Conflicting values for '{param}': '{val}' was "
"provided, but the dialect specifies '{diaval}'. "
"Using the dialect-specified value.".format(
param=param, val=provided, diaval=dialect_val)))
# Don't warn if the default parameter was passed in,
# even if it conflicts with the dialect (gh-23761).
if provided != parser_default and provided != dialect_val:
msg = ("Conflicting values for '{param}': '{val}' was "
"provided, but the dialect specifies '{diaval}'. "
"Using the dialect-specified value.".format(
param=param, val=provided, diaval=dialect_val))

# Annoying corner case for not warning about
# conflicts between dialect and delimiter parameter.
# Refer to the outer "_read_" function for more info.
if not (param == "delimiter" and
kwds.pop("sep_override", False)):
conflict_msgs.append(msg)

if conflict_msgs:
warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
Expand Down
84 changes: 64 additions & 20 deletions pandas/tests/io/parser/test_dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
import pandas.util.testing as tm


@pytest.fixture
def custom_dialect():
dialect_name = "weird"
dialect_kwargs = dict(doublequote=False, escapechar="~", delimiter=":",
skipinitialspace=False, quotechar="~", quoting=3)
return dialect_name, dialect_kwargs


def test_dialect(all_parsers):
parser = all_parsers
data = """\
Expand All @@ -26,10 +34,7 @@ def test_dialect(all_parsers):

dia = csv.excel()
dia.quoting = csv.QUOTE_NONE

# Conflicting dialect quoting.
with tm.assert_produces_warning(ParserWarning):
df = parser.read_csv(StringIO(data), dialect=dia)
df = parser.read_csv(StringIO(data), dialect=dia)

data = """\
label1,label2,label3
Expand All @@ -53,14 +58,10 @@ def test_dialect_str(all_parsers):
"fruit": ["apple", "pear"],
"vegetable": ["broccoli", "tomato"]
})
csv.register_dialect(dialect_name, delimiter=":")

# Conflicting dialect delimiter.
with tm.assert_produces_warning(ParserWarning):
with tm.with_csv_dialect(dialect_name, delimiter=":"):
df = parser.read_csv(StringIO(data), dialect=dialect_name)

tm.assert_frame_equal(df, exp)
csv.unregister_dialect(dialect_name)
tm.assert_frame_equal(df, exp)


def test_invalid_dialect(all_parsers):
Expand All @@ -75,17 +76,60 @@ class InvalidDialect(object):
parser.read_csv(StringIO(data), dialect=InvalidDialect)


@pytest.mark.parametrize("delimiter", [",", "."])
def test_dialect_conflict(all_parsers, delimiter):
data = "a,b\n1,2"
dialect = "excel"
@pytest.mark.parametrize("arg", [None, "doublequote", "escapechar",
"skipinitialspace", "quotechar", "quoting"])
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect,
arg, value):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers

expected = DataFrame({"a": [1], "b": [2]})
data = "a:b\n1:2"

warning_klass = None
kwds = dict()

# arg=None tests when we pass in the dialect without any other arguments.
if arg is not None:
if "value" == "dialect": # No conflict --> no warning.
kwds[arg] = dialect_kwargs[arg]
elif "value" == "default": # Default --> no warning.
from pandas.io.parsers import _parser_defaults
kwds[arg] = _parser_defaults[arg]
else: # Non-default + conflict with dialect --> warning.
warning_klass = ParserWarning
kwds[arg] = "blah"

with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
with tm.assert_produces_warning(warning_klass):
result = parser.read_csv(StringIO(data),
dialect=dialect_name, **kwds)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs,warning_klass", [
(dict(sep=","), None), # sep is default --> sep_override=True
(dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False
(dict(delimiter=":"), None), # No conflict
(dict(delimiter=None), None), # Default arguments --> sep_override=True
(dict(delimiter=","), ParserWarning), # Conflict
(dict(delimiter="."), ParserWarning), # Conflict
], ids=["sep-override-true", "sep-override-false",
"delimiter-no-conflict", "delimiter-default-arg",
"delimiter-conflict", "delimiter-conflict2"])
def test_dialect_conflict_delimiter(all_parsers, custom_dialect,
kwargs, warning_klass):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers

expected = DataFrame({"a": [1], "b": [2]})
warning_klass = None if delimiter == "," else ParserWarning
data = "a:b\n1:2"

with tm.assert_produces_warning(warning_klass):
result = parser.read_csv(StringIO(data),
delimiter=delimiter,
dialect=dialect)
tm.assert_frame_equal(result, expected)
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
with tm.assert_produces_warning(warning_klass):
result = parser.read_csv(StringIO(data),
dialect=dialect_name, **kwargs)
tm.assert_frame_equal(result, expected)
31 changes: 31 additions & 0 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2835,6 +2835,37 @@ def __exit__(self, exc_type, exc_value, traceback):
np.random.set_state(self.start_state)


@contextmanager
def with_csv_dialect(name, **kwargs):
"""
Context manager to temporarily register a CSV dialect for parsing CSV.
Parameters
----------
name : str
The name of the dialect.
kwargs : mapping
The parameters for the dialect.
Raises
------
ValueError : the name of the dialect conflicts with a builtin one.
See Also
--------
csv : Python's CSV library.
"""
import csv
_BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"}

if name in _BUILTIN_DIALECTS:
raise ValueError("Cannot override builtin dialect.")

csv.register_dialect(name, **kwargs)
yield
csv.unregister_dialect(name)


@contextmanager
def use_numexpr(use, min_elements=None):
from pandas.core.computation import expressions as expr
Expand Down

0 comments on commit 2946745

Please sign in to comment.