diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 947bf15a49c7a6..dd993a866d96d1 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -332,6 +332,10 @@ encoding : str, default ``None``
   Encoding to use for UTF when reading/writing (e.g. ``'utf-8'``). `List of
   Python standard encodings
   <https://docs.python.org/3/library/codecs.html#standard-encodings>`_.
+encoding_errors : str, default 'strict'
+  Behavior when the input string cannot be converted according to
+  the encoding's rules (strict, ignore, replace, etc.)
+  See: https://docs.python.org/3/library/codecs.html#error-handlers
 dialect : str or :class:`python:csv.Dialect` instance, default ``None``
   If provided, this parameter will override values (default or not) for the
   following parameters: `delimiter`, `doublequote`, `escapechar`,
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index aeed3668fe774a..1f13703c7d2c84 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -21,7 +21,7 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
--
+- :meth:`DataFrame.to_csv` now accepts an ``encoding_errors`` option (:issue:`27750`).
 -
 
 .. _whatsnew_1000.enhancements.other:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 71d5068e2e0fc4..2219e6c170875a 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3069,6 +3069,7 @@ def to_csv(
         doublequote=True,
         escapechar=None,
         decimal=".",
+        encoding_errors="strict",
     ):
         r"""
         Write object to a comma-separated values (csv) file.
@@ -3151,6 +3152,12 @@ def to_csv(
         decimal : str, default '.'
             Character recognized as decimal separator. E.g. use ',' for
             European data.
+        encoding_errors : str, default 'strict'
+            Behavior when the input string cannot be converted according to
+            the encoding's rules (strict, ignore, replace, etc.)
+            See: https://docs.python.org/3/library/codecs.html#error-handlers
+
+            .. versionadded:: 1.0.0
 
         Returns
         -------
@@ -3197,6 +3204,7 @@ def to_csv(
             doublequote=doublequote,
             escapechar=escapechar,
             decimal=decimal,
+            encoding_errors=encoding_errors,
         )
         formatter.save()
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
index e01e473047b88e..852babe61cddc9 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -310,7 +310,13 @@ def _infer_compression(
 
 
 def _get_handle(
-    path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True
+    path_or_buf,
+    mode,
+    encoding=None,
+    compression=None,
+    memory_map=False,
+    is_text=True,
+    encoding_errors=None,
 ):
     """
     Get file handle for given path/buffer and mode.
@@ -331,6 +337,11 @@ def _get_handle(
     is_text : boolean, default True
         whether file/buffer is in text format (csv, json, etc.), or in binary
         mode (pickle, etc.)
+    encoding_errors : str, optional
+        Error handler passed to ``open`` (strict, ignore, replace, etc.)
+        If None, 'strict' is used when `encoding` is given and 'replace'
+        otherwise, which is what this function did before the option existed.
+        See: https://docs.python.org/3/library/codecs.html#error-handlers
 
     Returns
     -------
@@ -407,10 +418,12 @@ def _get_handle(
     elif is_path:
         if encoding:
             # Encoding
-            f = open(path_or_buf, mode, encoding=encoding, newline="")
+            f = open(
+                path_or_buf, mode, errors=encoding_errors, encoding=encoding, newline=""
+            )
         elif is_text:
             # No explicit encoding
-            f = open(path_or_buf, mode, errors="replace", newline="")
+            f = open(path_or_buf, mode, errors=encoding_errors or "replace", newline="")
         else:
             # Binary mode
             f = open(path_or_buf, mode)
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 60daf311397e80..aa95c99cfc3a9c 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -51,6 +51,7 @@ def __init__(
         doublequote=True,
         escapechar=None,
         decimal=".",
+        encoding_errors="strict",
     ):
 
         self.obj = obj
@@ -93,6 +94,8 @@ def __init__(
 
         self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex)
 
+        self.encoding_errors = encoding_errors
+
         # validate mi options
         if self.has_mi_columns:
             if cols is not None:
@@ -179,6 +182,7 @@ def save(self):
                 self.mode,
                 encoding=self.encoding,
                 compression=self.compression,
+                encoding_errors=self.encoding_errors,
             )
             close = True
 
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index ee236a8253b01a..bec30c5e05da16 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -514,3 +514,50 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer):
             df.to_csv(path, compression=to_compression)
             result = pd.read_csv(path, index_col=0, compression=read_compression)
             tm.assert_frame_equal(result, df)
+
+    @pytest.mark.parametrize("encoding", ["sjis", "cp932"])
+    @pytest.mark.parametrize("encoding_errors", ["strict", "surrogatepass"])
+    def test_to_csv_encoding_errors_raises(self, encoding, encoding_errors):
+        # https://github.com/pandas-dev/pandas/issues/27750
+        # U+070A has no Shift JIS / cp932 representation and neither
+        # handler supplies a substitute for it
+        df = pd.DataFrame({"int": [1, 2, 3], "str": ["abc", "\u070a", "def"]})
+        with tm.ensure_clean("encoding_errors_test.csv") as path:
+            with pytest.raises(UnicodeEncodeError):
+                df.to_csv(
+                    path,
+                    line_terminator="\n",
+                    encoding=encoding,
+                    encoding_errors=encoding_errors,
+                    index=False,
+                )
+
+    @pytest.mark.parametrize("encoding", ["sjis", "cp932"])
+    @pytest.mark.parametrize(
+        "encoding_errors, replacement",
+        [
+            ("replace", "?"),
+            ("ignore", ""),
+            ("xmlcharrefreplace", "&#1802;"),
+            ("backslashreplace", "\\u070a"),
+            ("namereplace", "\\N{SYRIAC CONTRACTION}"),
+        ],
+    )
+    def test_to_csv_encoding_errors_replaces(
+        self, encoding, encoding_errors, replacement
+    ):
+        # https://github.com/pandas-dev/pandas/issues/27750
+        df = pd.DataFrame({"int": [1, 2, 3], "str": ["abc", "\u070a", "def"]})
+        expected = "int,str\n1,abc\n2,{}\n3,def\n".format(replacement)
+        with tm.ensure_clean("encoding_errors_test.csv") as path:
+            df.to_csv(
+                path,
+                line_terminator="\n",
+                encoding=encoding,
+                encoding_errors=encoding_errors,
+                index=False,
+            )
+            # every substitute is plain ASCII, so decoding with the same
+            # codec round-trips regardless of the platform default encoding
+            with open(path, "r", encoding=encoding) as f:
+                assert f.read() == expected