diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b9abf9293079f..c00503e080864 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1385,6 +1385,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :func:`read_csv()` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`) - Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`) - Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) +- Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`) - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) - Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) - Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f76e0d874fdf3..a1f02165f8d3d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -632,6 +632,24 @@ def parser_f(filepath_or_buffer, if sep is False: sep = default_sep + # gh-23761 + # + # When a dialect is passed, it overrides any of the overlapping + # parameters passed in directly. We don't want to warn if the + # default parameters were passed in (since it probably means + # that the user didn't pass them in explicitly in the first place). + # + # "delimiter" is the annoying corner case because we alias it to + # "sep" before doing comparison to the dialect values later on. + # Thus, we need a flag to indicate that we need to "override" + # the comparison to dialect values by checking if default values + # for BOTH "delimiter" and "sep" were provided. + if dialect is not None: + sep_override = delimiter is None and sep == default_sep + kwds = dict(sep_override=sep_override) + else: + kwds = dict() + # Alias sep -> delimiter. if delimiter is None: delimiter = sep @@ -647,7 +665,7 @@ def parser_f(filepath_or_buffer, engine = 'c' engine_specified = False - kwds = dict(delimiter=delimiter, + kwds.update(delimiter=delimiter, engine=engine, dialect=dialect, compression=compression, @@ -769,18 +787,27 @@ def __init__(self, f, engine=None, **kwds): except AttributeError: raise ValueError("Invalid dialect '{dialect}' provided" .format(dialect=kwds['dialect'])) - provided = kwds.get(param, _parser_defaults[param]) + parser_default = _parser_defaults[param] + provided = kwds.get(param, parser_default) - # Messages for conflicting values between the dialect instance - # and the actual parameters provided. + # Messages for conflicting values between the dialect + # instance and the actual parameters provided. conflict_msgs = [] - if dialect_val != provided: - conflict_msgs.append(( - "Conflicting values for '{param}': '{val}' was " - "provided, but the dialect specifies '{diaval}'. " - "Using the dialect-specified value.".format( - param=param, val=provided, diaval=dialect_val))) + # Don't warn if the default parameter was passed in, + # even if it conflicts with the dialect (gh-23761). + if provided != parser_default and provided != dialect_val: + msg = ("Conflicting values for '{param}': '{val}' was " + "provided, but the dialect specifies '{diaval}'. " + "Using the dialect-specified value.".format( + param=param, val=provided, diaval=dialect_val)) + + # Annoying corner case for not warning about + # conflicts between dialect and delimiter parameter. + # Refer to the outer "_read_" function for more info. + if not (param == "delimiter" and + kwds.pop("sep_override", False)): + conflict_msgs.append(msg) if conflict_msgs: warnings.warn('\n\n'.join(conflict_msgs), ParserWarning, diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index b005bf4f2d212..5392f793b361c 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -16,6 +16,14 @@ import pandas.util.testing as tm +@pytest.fixture +def custom_dialect(): + dialect_name = "weird" + dialect_kwargs = dict(doublequote=False, escapechar="~", delimiter=":", + skipinitialspace=False, quotechar="~", quoting=3) + return dialect_name, dialect_kwargs + + def test_dialect(all_parsers): parser = all_parsers data = """\ @@ -26,10 +34,7 @@ def test_dialect(all_parsers): dia = csv.excel() dia.quoting = csv.QUOTE_NONE - - # Conflicting dialect quoting. - with tm.assert_produces_warning(ParserWarning): - df = parser.read_csv(StringIO(data), dialect=dia) + df = parser.read_csv(StringIO(data), dialect=dia) data = """\ label1,label2,label3 @@ -53,14 +58,10 @@ def test_dialect_str(all_parsers): "fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"] }) - csv.register_dialect(dialect_name, delimiter=":") - # Conflicting dialect delimiter. - with tm.assert_produces_warning(ParserWarning): + with tm.with_csv_dialect(dialect_name, delimiter=":"): df = parser.read_csv(StringIO(data), dialect=dialect_name) - - tm.assert_frame_equal(df, exp) - csv.unregister_dialect(dialect_name) + tm.assert_frame_equal(df, exp) def test_invalid_dialect(all_parsers): @@ -75,17 +76,60 @@ class InvalidDialect(object): parser.read_csv(StringIO(data), dialect=InvalidDialect) -@pytest.mark.parametrize("delimiter", [",", "."]) -def test_dialect_conflict(all_parsers, delimiter): - data = "a,b\n1,2" - dialect = "excel" +@pytest.mark.parametrize("arg", [None, "doublequote", "escapechar", + "skipinitialspace", "quotechar", "quoting"]) +@pytest.mark.parametrize("value", ["dialect", "default", "other"]) +def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, + arg, value): + # see gh-23761. + dialect_name, dialect_kwargs = custom_dialect + parser = all_parsers + + expected = DataFrame({"a": [1], "b": [2]}) + data = "a:b\n1:2" + + warning_klass = None + kwds = dict() + + # arg=None tests when we pass in the dialect without any other arguments. + if arg is not None: + if "value" == "dialect": # No conflict --> no warning. + kwds[arg] = dialect_kwargs[arg] + elif "value" == "default": # Default --> no warning. + from pandas.io.parsers import _parser_defaults + kwds[arg] = _parser_defaults[arg] + else: # Non-default + conflict with dialect --> warning. + warning_klass = ParserWarning + kwds[arg] = "blah" + + with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + with tm.assert_produces_warning(warning_klass): + result = parser.read_csv(StringIO(data), + dialect=dialect_name, **kwds) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs,warning_klass", [ + (dict(sep=","), None), # sep is default --> sep_override=True + (dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False + (dict(delimiter=":"), None), # No conflict + (dict(delimiter=None), None), # Default arguments --> sep_override=True + (dict(delimiter=","), ParserWarning), # Conflict + (dict(delimiter="."), ParserWarning), # Conflict +], ids=["sep-override-true", "sep-override-false", + "delimiter-no-conflict", "delimiter-default-arg", + "delimiter-conflict", "delimiter-conflict2"]) +def test_dialect_conflict_delimiter(all_parsers, custom_dialect, + kwargs, warning_klass): + # see gh-23761. + dialect_name, dialect_kwargs = custom_dialect parser = all_parsers expected = DataFrame({"a": [1], "b": [2]}) - warning_klass = None if delimiter == "," else ParserWarning + data = "a:b\n1:2" - with tm.assert_produces_warning(warning_klass): - result = parser.read_csv(StringIO(data), - delimiter=delimiter, - dialect=dialect) - tm.assert_frame_equal(result, expected) + with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + with tm.assert_produces_warning(warning_klass): + result = parser.read_csv(StringIO(data), + dialect=dialect_name, **kwargs) + tm.assert_frame_equal(result, expected) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 1dc461e010e37..112b4170d211a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2835,6 +2835,37 @@ def __exit__(self, exc_type, exc_value, traceback): np.random.set_state(self.start_state) +@contextmanager +def with_csv_dialect(name, **kwargs): + """ + Context manager to temporarily register a CSV dialect for parsing CSV. + + Parameters + ---------- + name : str + The name of the dialect. + kwargs : mapping + The parameters for the dialect. + + Raises + ------ + ValueError : the name of the dialect conflicts with a builtin one. + + See Also + -------- + csv : Python's CSV library. + """ + import csv + _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"} + + if name in _BUILTIN_DIALECTS: + raise ValueError("Cannot override builtin dialect.") + + csv.register_dialect(name, **kwargs) + yield + csv.unregister_dialect(name) + + @contextmanager def use_numexpr(use, min_elements=None): from pandas.core.computation import expressions as expr