Preserve whitespace by default in read_fwf() by adding options:

* 'keep_whitespace' (default=True)
* 'whitespace_chars' (default=[space] and [tab] chars)

See:
pandas-dev#49832 (comment)
https://stackoverflow.com/questions/72235501/python-pandas-read-fwf-strips-white-space
https://stackoverflow.com/questions/57012437/pandas-read-fwf-removes-white-space

* changes in pandas/io/parsers/readers.py:
    _fwf_defaults
    read_fwf()
* pandas/io/parsers/python_parser.py:
    FixedWidthReader
        __init__
        __next__
    FixedWidthFieldParser
        __init__
        _make_reader

Signed-off-by: Ronald Barnes <ron@ronaldbarnes.ca>
RonaldBarnes · Jan 27, 2023 · 997e2e9 · 997e2e9
1 parent 4ce3757
commit 997e2e9
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 4 deletions.
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -1181,10 +1181,13 @@ def __init__(
         comment: str | None,
         skiprows: set[int] | None = None,
         infer_nrows: int = 100,
+        keep_whitespace: bool | None = True,
+        whitespace_chars: str | None = " \t",
     ) -> None:
         self.f = f
         self.buffer: Iterator | None = None
-        self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
+        self.keep_whitespace = keep_whitespace
+        self.whitespace_chars = whitespace_chars
         self.comment = comment
         if colspecs == "infer":
             self.colspecs = self.detect_colspecs(
@@ -1211,6 +1214,36 @@ def __init__(
                     "2 element tuple or list of integers"
                 )
 
+        if not isinstance(self.keep_whitespace, bool):
+            raise TypeError(
+                "keep_whitespace must be type bool (True or False), "
+                f"input was type {type(self.keep_whitespace).__name__}: "
+                f'"{self.keep_whitespace}"'
+            )
+        if delimiter:
+            ## Delimiters in fixed-width files removed:
+            ## use colspecs, widths, or read_table()
+            import warnings
+
+            ## See link regarding fixing anti-patterns & unexpected default behaviour:
+            ## https://github.com/pandas-dev/pandas/pull/49832#discussion_r1030615937
+            ##
+            ## Deprecation warnings ignored by default, show them:
+            warnings.simplefilter("always")
+            warnings.formatwarning = (
+                lambda msg, cat, file, line, args1: f"NOTICE:\n{msg}\n\n"
+                f'{cat}\nFile "{file}", line {line} '
+                "in FixedWidthReader.__init__\n"
+            )
+            warnings.warn(
+                (
+                    "Delimiters are deprecated in fixed-width files "
+                    + "- use colspecs or widths\n"
+                    + "See keep_whitespace in read_fwf(), also see read_table()."
+                ),
+                DeprecationWarning,
+            )
+
     def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
         """
         Read rows from self.f, skipping as specified.
@@ -1283,7 +1316,14 @@ def __next__(self) -> list[str]:
         else:
             line = next(self.f)  # type: ignore[arg-type]
         # Note: 'colspecs' is a sequence of half-open intervals.
-        return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
+        line = line.rstrip("\r\n")
+        if self.keep_whitespace:
+            return [line[from_:to] for (from_, to) in self.colspecs]
+        else:
+            return [
+                line[from_:to].strip(self.whitespace_chars)
+                for (from_, to) in self.colspecs
+            ]
 
 
 class FixedWidthFieldParser(PythonParser):
@@ -1296,6 +1336,8 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
         # Support iterators, convert to a list.
         self.colspecs = kwds.pop("colspecs")
         self.infer_nrows = kwds.pop("infer_nrows")
+        self.keep_whitespace = kwds.pop("keep_whitespace", True)
+        self.whitespace_chars = kwds.pop("whitespace_chars", " \t")
         PythonParser.__init__(self, f, **kwds)
 
     def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
@@ -1306,6 +1348,8 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
             self.comment,
             self.skiprows,
             self.infer_nrows,
+            self.keep_whitespace,
+            self.whitespace_chars,
         )
 
     def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -438,7 +438,13 @@
     "float_precision": None,
 }
 
-_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
+_fwf_defaults = {
+    "colspecs": "infer",
+    "infer_nrows": 100,
+    "widths": None,
+    "keep_whitespace": True,
+    "whitespace_chars": " \t",
+}
 
 _c_unsupported = {"skipfooter"}
 _python_unsupported = {"low_memory", "float_precision"}
@@ -1235,6 +1241,8 @@ def read_fwf(
     widths: Sequence[int] | None = None,
     infer_nrows: int = 100,
     use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
+    keep_whitespace: bool | None = True,
+    whitespace_chars: str | None = " \t",
     **kwds,
 ) -> DataFrame | TextFileReader:
     r"""
@@ -1273,6 +1281,14 @@ def read_fwf(
 
         .. versionadded:: 2.0
 
+    keep_whitespace : bool, default True
+        Preserve or strip whitespace from fields.
+    whitespace_chars : str, default [space] & [tab]
+        If stripping whitespace, allows user to specify which
+        characters to strip (can be any characters).
+
+        .. versionadded:: 2.0
+
     **kwds : optional
         Optional keyword arguments can be passed to ``TextFileReader``.
 
@@ -1284,12 +1300,13 @@ def read_fwf(
 
     See Also
     --------
+    read_table : Read a table of fixed-width columns into a DataFrame.
     DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
     read_csv : Read a comma-separated values (csv) file into DataFrame.
 
     Examples
     --------
-    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
+    >>> pd.read_fwf('data.dat')  # doctest: +SKIP
     """
     # Check input arguments.
     if colspecs is None and widths is None:
@@ -1336,6 +1353,8 @@ def read_fwf(
     kwds["infer_nrows"] = infer_nrows
     kwds["engine"] = "python-fwf"
     kwds["use_nullable_dtypes"] = use_nullable_dtypes
+    kwds["keep_whitespace"] = keep_whitespace
+    kwds["whitespace_chars"] = whitespace_chars
     return _read(filepath_or_buffer, kwds)