From e81e147be71251979085640b089dd74111141795 Mon Sep 17 00:00:00 2001
From: Pietro Battiston
Date: Thu, 2 Aug 2018 22:32:42 +0200
Subject: [PATCH] API: Deprecate old Series.to_csv signature

closes #19715
---
 doc/source/whatsnew/v0.24.0.txt     |   1 +
 pandas/core/frame.py                | 101 -------------------------
 pandas/core/generic.py              | 109 +++++++++++++++++++++++++++
 pandas/core/series.py               | 110 ++++++++++++++--------------
 pandas/tests/frame/test_to_csv.py   |  17 +++--
 pandas/tests/io/test_compression.py |  37 ++++++----
 pandas/tests/series/test_io.py      |  36 ++++++---
 7 files changed, 229 insertions(+), 182 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 5c15c7b6a742f9..730a4895055c67 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -477,6 +477,7 @@ Deprecations
 - :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`)
 - :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`)
 - :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`)
+- The signature of :meth:`Series.to_csv` has been made consistent with that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, and the ``header`` argument now defaults to ``True`` (:issue:`19715`)
 - :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`)
 - :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index cb251d46489255..f2766f45bee2bf 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1714,107 +1714,6 @@ def to_panel(self):
 
         return self._constructor_expanddim(new_mgr)
 
-    def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
-               columns=None, header=True, index=True, index_label=None,
-               mode='w', encoding=None, compression='infer', quoting=None,
-               quotechar='"', line_terminator='\n', chunksize=None,
-               tupleize_cols=None, date_format=None, doublequote=True,
-               escapechar=None, decimal='.'):
-        r"""Write DataFrame to a comma-separated values (csv) file
-
-        Parameters
-        ----------
-        path_or_buf : string or file handle, default None
-            File path or object, if None is provided the result is returned as
-            a string.
-        sep : character, default ','
-            Field delimiter for the output file.
-        na_rep : string, default ''
-            Missing data representation
-        float_format : string, default None
-            Format string for floating point numbers
-        columns : sequence, optional
-            Columns to write
-        header : boolean or list of string, default True
-            Write out the column names. If a list of strings is given it is
-            assumed to be aliases for the column names
-        index : boolean, default True
-            Write row names (index)
-        index_label : string or sequence, or False, default None
-            Column label for index column(s) if desired. If None is given, and
-            `header` and `index` are True, then the index names are used. A
-            sequence should be given if the DataFrame uses MultiIndex. If
-            False do not print fields for index names. Use index_label=False
-            for easier importing in R
-        mode : str
-            Python write mode, default 'w'
-        encoding : string, optional
-            A string representing the encoding to use in the output file,
-            defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
- compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, - default 'infer' - If 'infer' and `path_or_buf` is path-like, then detect compression - from the following extensions: '.gz', '.bz2', '.zip' or '.xz' - (otherwise no compression). - - .. versionchanged:: 0.24.0 - 'infer' option added and set to default - line_terminator : string, default ``'\n'`` - The newline character or character sequence to use in the output - file - quoting : optional constant from csv module - defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` - then floats are converted to strings and thus csv.QUOTE_NONNUMERIC - will treat them as non-numeric - quotechar : string (length 1), default '\"' - character used to quote fields - doublequote : boolean, default True - Control quoting of `quotechar` inside a field - escapechar : string (length 1), default None - character used to escape `sep` and `quotechar` when appropriate - chunksize : int or None - rows to write at a time - tupleize_cols : boolean, default False - .. deprecated:: 0.21.0 - This argument will be removed and will always write each row - of the multi-index as a separate row in the CSV file. - - Write MultiIndex columns as a list of tuples (if True) or in - the new, expanded format, where each MultiIndex column is a row - in the CSV (if False). - date_format : string, default None - Format string for datetime objects - decimal: string, default '.' - Character recognized as decimal separator. E.g. use ',' for - European data - - """ - - if tupleize_cols is not None: - warnings.warn("The 'tupleize_cols' parameter is deprecated and " - "will be removed in a future version", - FutureWarning, stacklevel=2) - else: - tupleize_cols = False - - from pandas.io.formats.csvs import CSVFormatter - formatter = CSVFormatter(self, path_or_buf, - line_terminator=line_terminator, sep=sep, - encoding=encoding, - compression=compression, quoting=quoting, - na_rep=na_rep, float_format=float_format, - cols=columns, header=header, index=index, - index_label=index_label, mode=mode, - chunksize=chunksize, quotechar=quotechar, - tupleize_cols=tupleize_cols, - date_format=date_format, - doublequote=doublequote, - escapechar=escapechar, decimal=decimal) - formatter.save() - - if path_or_buf is None: - return formatter.path_or_buf.getvalue() - @Appender(_shared_docs['to_excel'] % _shared_doc_kwargs) def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, columns=None, header=True, index=True, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f62605c3427025..52b3f79abf5e88 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9270,6 +9270,115 @@ def first_valid_index(self): def last_valid_index(self): return self._find_valid_index('last') + def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, + columns=None, header=True, index=True, index_label=None, + mode='w', encoding=None, compression='infer', quoting=None, + quotechar='"', line_terminator='\n', chunksize=None, + tupleize_cols=None, date_format=None, doublequote=True, + escapechar=None, decimal='.'): + r"""Write object to a comma-separated values (csv) file + + Parameters + ---------- + path_or_buf : string or file handle, default None + File path or object, if None is provided the result is returned as + a string. + .. versionchanged:: 0.24.0 + Was previously named "path" for Series. + sep : character, default ',' + Field delimiter for the output file. 
+ na_rep : string, default '' + Missing data representation + float_format : string, default None + Format string for floating point numbers + columns : sequence, optional + Columns to write + header : boolean or list of string, default True + Write out the column names. If a list of strings is given it is + assumed to be aliases for the column names + .. versionchanged:: 0.24.0 + Previously defaulted to False for Series. + index : boolean, default True + Write row names (index) + index_label : string or sequence, or False, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the object uses MultiIndex. If + False do not print fields for index names. Use index_label=False + for easier importing in R + mode : str + Python write mode, default 'w' + encoding : string, optional + A string representing the encoding to use in the output file, + defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, + default 'infer' + If 'infer' and `path_or_buf` is path-like, then detect compression + from the following extensions: '.gz', '.bz2', '.zip' or '.xz' + (otherwise no compression). + + .. versionchanged:: 0.24.0 + 'infer' option added and set to default + line_terminator : string, default ``'\n'`` + The newline character or character sequence to use in the output + file + quoting : optional constant from csv module + defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` + then floats are converted to strings and thus csv.QUOTE_NONNUMERIC + will treat them as non-numeric + quotechar : string (length 1), default '\"' + character used to quote fields + doublequote : boolean, default True + Control quoting of `quotechar` inside a field + escapechar : string (length 1), default None + character used to escape `sep` and `quotechar` when appropriate + chunksize : int or None + rows to write at a time + tupleize_cols : boolean, default False + .. deprecated:: 0.21.0 + This argument will be removed and will always write each row + of the multi-index as a separate row in the CSV file. + + Write MultiIndex columns as a list of tuples (if True) or in + the new, expanded format, where each MultiIndex column is a row + in the CSV (if False). + date_format : string, default None + Format string for datetime objects + decimal: string, default '.' + Character recognized as decimal separator. E.g. use ',' for + European data + + .. versionchanged:: 0.24.0 + The order of arguments for Series was changed. 
+ """ + + df = self if isinstance(self, ABCDataFrame) else self.to_frame() + + if tupleize_cols is not None: + warnings.warn("The 'tupleize_cols' parameter is deprecated and " + "will be removed in a future version", + FutureWarning, stacklevel=2) + else: + tupleize_cols = False + + from pandas.io.formats.csvs import CSVFormatter + formatter = CSVFormatter(df, path_or_buf, + line_terminator=line_terminator, sep=sep, + encoding=encoding, + compression=compression, quoting=quoting, + na_rep=na_rep, float_format=float_format, + cols=columns, header=header, index=index, + index_label=index_label, mode=mode, + chunksize=chunksize, quotechar=quotechar, + tupleize_cols=tupleize_cols, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, decimal=decimal) + formatter.save() + + if path_or_buf is None: + return formatter.path_or_buf.getvalue() + def _doc_parms(cls): """Return a tuple of the doc parms.""" diff --git a/pandas/core/series.py b/pandas/core/series.py index 21dea15772cc07..bfba6367616e2a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -17,6 +17,7 @@ from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.common import ( is_categorical_dtype, + is_string_like, is_bool, is_integer, is_integer_dtype, is_float_dtype, @@ -3765,59 +3766,62 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, return result - def to_csv(self, path=None, index=True, sep=",", na_rep='', - float_format=None, header=False, index_label=None, - mode='w', encoding=None, compression='infer', date_format=None, - decimal='.'): - """ - Write Series to a comma-separated values (csv) file - - Parameters - ---------- - path : string or file handle, default None - File path or object, if None is provided the result is returned as - a string. - na_rep : string, default '' - Missing data representation - float_format : string, default None - Format string for floating point numbers - header : boolean, default False - Write out series name - index : boolean, default True - Write row names (index) - index_label : string or sequence, default None - Column label for index column(s) if desired. If None is given, and - `header` and `index` are True, then the index names are used. A - sequence should be given if the DataFrame uses MultiIndex. - mode : Python write mode, default 'w' - sep : character, default "," - Field delimiter for the output file. - encoding : string, optional - a string representing the encoding to use if the contents are - non-ascii, for python versions prior to 3 - compression : None or string, default 'infer' - A string representing the compression to use in the output file. - Allowed values are None, 'gzip', 'bz2', 'zip', 'xz', and 'infer'. - This input is only used when the first argument is a filename. - - .. versionchanged:: 0.24.0 - 'infer' option added and set to default - date_format: string, default None - Format string for datetime objects. - decimal: string, default '.' - Character recognized as decimal separator. E.g. 
use ',' for - European data - """ - from pandas.core.frame import DataFrame - df = DataFrame(self) - # result is only a string if no path provided, otherwise None - result = df.to_csv(path, index=index, sep=sep, na_rep=na_rep, - float_format=float_format, header=header, - index_label=index_label, mode=mode, - encoding=encoding, compression=compression, - date_format=date_format, decimal=decimal) - if path is None: - return result + @Appender(generic.NDFrame.to_csv.__doc__) + def to_csv(self, *args, **kwargs): + + names = ["path_or_buf", "sep", "na_rep", "float_format", "columns", + "header", "index", "index_label", "mode", "encoding", + "compression", "quoting", "quotechar", "line_terminator", + "chunksize", "tupleize_cols", "date_format", "doublequote", + "escapechar", "decimal"] + + old_names = ["path_or_buf", "index", "sep", "na_rep", "float_format", + "header", "index_label", "mode", "encoding", + "compression", "date_format", "decimal"] + + if "path" in kwargs: + warnings.warn("The signature of `Series.to_csv` was aligned " + "to that of `DataFrame.to_csv`, and argument " + "'path' will be renamed to 'path_or_buf'.", + FutureWarning, stacklevel=2) + kwargs["path_or_buf"] = kwargs.pop("path") + + if len(args) > 1: + # Either "index" (old signature) or "sep" (new signature) is being + # passed as second argument (while the first is the same) + maybe_sep = args[1] + + if not (is_string_like(maybe_sep) and len(maybe_sep) == 1): + # old signature + warnings.warn("The signature of `Series.to_csv` was aligned " + "to that of `DataFrame.to_csv`. Note that the " + "order of arguments changed, and the new one " + "has 'sep' in first place, for which \"{}\" is " + "not a valid value. The old order will cease to " + "be supported in a future version. Please refer " + "to the documentation for `DataFrame.to_csv` " + "when updating your function " + "calls.".format(maybe_sep), + FutureWarning, stacklevel=2) + names = old_names + + pos_args = dict(zip(names[:len(args)], args)) + + for key in pos_args: + if key in kwargs: + raise ValueError("Argument given by name ('{}') and position " + "({})".format(key, names.index(key))) + kwargs[key] = pos_args[key] + + if kwargs.get("header", None) is None: + warnings.warn("The signature of `Series.to_csv` was aligned " + "to that of `DataFrame.to_csv`, and argument " + "'header' will change its default value from False " + "to True: please pass an explicit value to suppress " + "this warning.", FutureWarning, + stacklevel=2) + kwargs["header"] = False # Backwards compatibility. + return self.to_frame().to_csv(**kwargs) @Appender(generic._shared_docs['to_excel'] % _shared_doc_kwargs) def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 9e3b606f319738..e1c3c29ef2846d 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -893,22 +893,27 @@ def test_to_csv_line_terminators(self): def test_to_csv_from_csv_categorical(self): - # CSV with categoricals should result in the same output as when one - # would add a "normal" Series/DataFrame. - s = Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) - s2 = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) + # CSV with categoricals should result in the same output + # as when one would add a "normal" Series/DataFrame. 
+ s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) + s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) res = StringIO() - s.to_csv(res) + + s.to_csv(res, header=False) exp = StringIO() - s2.to_csv(exp) + + s2.to_csv(exp, header=False) assert res.getvalue() == exp.getvalue() df = DataFrame({"s": s}) df2 = DataFrame({"s": s2}) + res = StringIO() df.to_csv(res) + exp = StringIO() df2.to_csv(exp) + assert res.getvalue() == exp.getvalue() def test_to_csv_path_is_none(self): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 76788ced44e846..4db7e0e869798a 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,4 +1,5 @@ import os +import warnings import pytest @@ -6,6 +7,13 @@ import pandas.io.common as icom import pandas.util.testing as tm +def catch_to_csv_depr(): + # Catching warnings because Series.to_csv has + # been deprecated. Remove this context when + # Series.to_csv has been aligned. + + return warnings.catch_warnings(record=True) + @pytest.mark.parametrize('obj', [ pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], @@ -15,11 +23,12 @@ @pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) def test_compression_size(obj, method, compression_only): with tm.ensure_clean() as path: - getattr(obj, method)(path, compression=compression_only) - compressed_size = os.path.getsize(path) - getattr(obj, method)(path, compression=None) - uncompressed_size = os.path.getsize(path) - assert uncompressed_size > compressed_size + with catch_to_csv_depr(): + getattr(obj, method)(path, compression=compression_only) + compressed_size = os.path.getsize(path) + getattr(obj, method)(path, compression=None) + uncompressed_size = os.path.getsize(path) + assert uncompressed_size > compressed_size @pytest.mark.parametrize('obj', [ @@ -31,16 +40,18 @@ def test_compression_size(obj, method, compression_only): def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: f, handles = icom._get_handle(path, 'w', compression=compression_only) - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed - compressed_size = os.path.getsize(path) + with catch_to_csv_depr(): + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed + compressed_size = os.path.getsize(path) with tm.ensure_clean() as path: f, handles = icom._get_handle(path, 'w', compression=None) - with f: - getattr(obj, method)(f) - assert not f.closed + with catch_to_csv_depr(): + with f: + getattr(obj, method)(f) + assert not f.closed assert f.closed uncompressed_size = os.path.getsize(path) assert uncompressed_size > compressed_size diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 814d794d45c184..cbf9bff06ad34c 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -37,7 +37,7 @@ def read_csv(self, path, **kwargs): def test_from_csv_deprecation(self): # see gh-17812 with ensure_clean() as path: - self.ts.to_csv(path) + self.ts.to_csv(path, header=False) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -45,10 +45,28 @@ def test_from_csv_deprecation(self): depr_ts = Series.from_csv(path) assert_series_equal(depr_ts, ts) + @pytest.mark.parametrize("arg", ["path", "header", "both"]) + def test_to_csv_deprecation(self, arg): + # see gh-19715 + with ensure_clean() as path: + if arg == "path": + kwargs = dict(path=path, header=False) + elif arg == "header": + kwargs = 
dict(path_or_buf=path) + else: # Both discrepancies match. + kwargs = dict(path=path) + + with tm.assert_produces_warning(FutureWarning): + self.ts.to_csv(**kwargs) + + # Make sure roundtrip still works. + ts = self.read_csv(path) + assert_series_equal(self.ts, ts, check_names=False) + def test_from_csv(self): with ensure_clean() as path: - self.ts.to_csv(path) + self.ts.to_csv(path, header=False) ts = self.read_csv(path) assert_series_equal(self.ts, ts, check_names=False) @@ -65,7 +83,7 @@ def test_from_csv(self): ts_h = self.read_csv(path, header=0) assert ts_h.name == "ts" - self.series.to_csv(path) + self.series.to_csv(path, header=False) series = self.read_csv(path) assert_series_equal(self.series, series, check_names=False) @@ -92,13 +110,13 @@ def test_to_csv(self): import io with ensure_clean() as path: - self.ts.to_csv(path) + self.ts.to_csv(path, header=False) with io.open(path, newline=None) as f: lines = f.readlines() assert (lines[1] != '\n') - self.ts.to_csv(path, index=False) + self.ts.to_csv(path, index=False, header=False) arr = np.loadtxt(path) assert_almost_equal(arr, self.ts.values) @@ -106,7 +124,7 @@ def test_to_csv_unicode_index(self): buf = StringIO() s = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")]) - s.to_csv(buf, encoding="UTF-8") + s.to_csv(buf, encoding="UTF-8", header=False) buf.seek(0) s2 = self.read_csv(buf, index_col=0, encoding="UTF-8") @@ -116,7 +134,7 @@ def test_to_csv_float_format(self): with ensure_clean() as filename: ser = Series([0.123456, 0.234567, 0.567567]) - ser.to_csv(filename, float_format="%.2f") + ser.to_csv(filename, float_format="%.2f", header=False) rs = self.read_csv(filename) xp = Series([0.12, 0.23, 0.57]) @@ -128,14 +146,14 @@ def test_to_csv_list_entries(self): split = s.str.split(r'\s+and\s+') buf = StringIO() - split.to_csv(buf) + split.to_csv(buf, header=False) def test_to_csv_path_is_none(self): # GH 8215 # Series.to_csv() was returning None, inconsistent with # DataFrame.to_csv() which returned string s = Series([1, 2, 3]) - csv_str = s.to_csv(path=None) + csv_str = s.to_csv(path_or_buf=None, header=False) assert isinstance(csv_str, str) @pytest.mark.parametrize('s,encoding', [
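
For illustration only (not part of the patch): a minimal sketch, assuming a pandas build with this change applied, of how the deprecation shim described in the whatsnew entry behaves. The Series and the variable names are made up for the example; the behavior shown (FutureWarning for the old 'path' keyword, the old positional order, and the implicit header default) follows directly from the new Series.to_csv wrapper above.

    import warnings

    import pandas as pd

    s = pd.Series([1, 2, 3], name="x")

    # New-style call, matching DataFrame.to_csv: 'path_or_buf' comes first
    # and 'header' is passed explicitly, so no warning is expected.
    csv_new = s.to_csv(path_or_buf=None, sep=",", header=True)

    # Old-style call: the 'path' keyword and the implicit header=False
    # default should each raise a FutureWarning under this patch.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        csv_old = s.to_csv(path=None)
        assert any(issubclass(w.category, FutureWarning) for w in caught)

    # The old positional order (index as the second argument) is detected,
    # warned about, and still honoured for backwards compatibility.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        s.to_csv(None, True, header=False)  # old order: (path, index, ...)
        assert any(issubclass(w.category, FutureWarning) for w in caught)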