Skip to content

Commit

Permalink
Preserve whitespace by default in read_fwf() by adding options:
Browse files Browse the repository at this point in the history
	* 'keep_whitespace' (default=True)
	* 'whitespace_chars' (default=[space] and [tab] chars)

See:
	pandas-dev#49832 (comment)
	https://stackoverflow.com/questions/72235501/python-pandas-read-fwf-strips-white-space
	https://stackoverflow.com/questions/57012437/pandas-read-fwf-removes-white-space

* changes in pandas/io/parsers/readers.py:
		_fwf_defaults (dict of default option values)
		read_fwf()
* changes in pandas/io/parsers/python_parser.py:
		FixedWidthReader
			__init__
			__next__
		FixedWidthFieldParser
			__init__
			_make_reader

Signed-off-by: Ronald Barnes <ron@ronaldbarnes.ca>
  • Loading branch information
RonaldBarnes committed Jan 27, 2023
1 parent 4ce3757 commit 997e2e9
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 4 deletions.
48 changes: 46 additions & 2 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1181,10 +1181,13 @@ def __init__(
comment: str | None,
skiprows: set[int] | None = None,
infer_nrows: int = 100,
keep_whitespace: bool | None = True,
whitespace_chars: str | None = " \t",
) -> None:
self.f = f
self.buffer: Iterator | None = None
self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
self.keep_whitespace = keep_whitespace
self.whitespace_chars = whitespace_chars
self.comment = comment
if colspecs == "infer":
self.colspecs = self.detect_colspecs(
Expand All @@ -1211,6 +1214,36 @@ def __init__(
"2 element tuple or list of integers"
)

if not isinstance(self.keep_whitespace, bool):
raise TypeError(
"keep_whitespace must be type bool (True or False), "
f"input was type {type(self.keep_whitespace).__name__}: "
f'"{self.keep_whitespace}"'
)
if delimiter:
## Delimiters in fixed-width files removed:
## use colspecs, widths, or read_table()
import warnings

## See link regarding fixing anti-patterns & unexpected default behaviour:
## https://github.com/pandas-dev/pandas/pull/49832#discussion_r1030615937
##
## Deprecation warnings ignored by default, show them:
warnings.simplefilter("always")
warnings.formatwarning = (
lambda msg, cat, file, line, args1: f"NOTICE:\n{msg}\n\n"
f'{cat}\nFile "{file}", line {line} '
"in FixedWidthReader.__init__\n"
)
warnings.warn(
(
"Delimiters are deprecated in fixed-width files "
+ "- use colspecs or widths\n"
+ "See keep_whitespace in read_fwf(), also see read_table()."
),
DeprecationWarning,
)

def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
"""
Read rows from self.f, skipping as specified.
Expand Down Expand Up @@ -1283,7 +1316,14 @@ def __next__(self) -> list[str]:
else:
line = next(self.f) # type: ignore[arg-type]
# Note: 'colspecs' is a sequence of half-open intervals.
return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
line = line.rstrip("\r\n")
if self.keep_whitespace:
return [line[from_:to] for (from_, to) in self.colspecs]
else:
return [
line[from_:to].strip(self.whitespace_chars)
for (from_, to) in self.colspecs
]


class FixedWidthFieldParser(PythonParser):
Expand All @@ -1296,6 +1336,8 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
# Support iterators, convert to a list.
self.colspecs = kwds.pop("colspecs")
self.infer_nrows = kwds.pop("infer_nrows")
self.keep_whitespace = kwds.pop("keep_whitespace", True)
self.whitespace_chars = kwds.pop("whitespace_chars", " \t")
PythonParser.__init__(self, f, **kwds)

def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
Expand All @@ -1306,6 +1348,8 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
self.comment,
self.skiprows,
self.infer_nrows,
self.keep_whitespace,
self.whitespace_chars,
)

def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
Expand Down
23 changes: 21 additions & 2 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,13 @@
"float_precision": None,
}

_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
# Default values for the fixed-width reader options. The keep_whitespace and
# whitespace_chars entries match the defaults declared in the read_fwf()
# signature.
_fwf_defaults = {
"colspecs": "infer",
"infer_nrows": 100,
"widths": None,
# With keep_whitespace=True each field is returned exactly as sliced.
"keep_whitespace": True,
# Characters stripped from both ends of a field when keep_whitespace is False.
"whitespace_chars": " \t",
}

_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
Expand Down Expand Up @@ -1235,6 +1241,8 @@ def read_fwf(
widths: Sequence[int] | None = None,
infer_nrows: int = 100,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
keep_whitespace: bool | None = True,
whitespace_chars: str | None = " \t",
**kwds,
) -> DataFrame | TextFileReader:
r"""
Expand Down Expand Up @@ -1273,6 +1281,14 @@ def read_fwf(
.. versionadded:: 2.0
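keep_whitespace : bool, default True
If True, return each field exactly as it appears within its column
range, including leading and trailing whitespace. If False, strip the
characters given in ``whitespace_chars`` from both ends of each field.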
whitespace_chars : str, default ``" \t"`` (space and tab)
Characters to strip from both ends of each field. Only used when
``keep_whitespace`` is False. Any characters may be given, not only
whitespace.
.. versionadded:: 2.0
**kwds : optional
Optional keyword arguments can be passed to ``TextFileReader``.
Expand All @@ -1284,12 +1300,13 @@ def read_fwf(
See Also
--------
read_table : Read general delimited file into DataFrame.
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
Examples
--------
>>> pd.read_fwf('data.csv') # doctest: +SKIP
>>> pd.read_fwf('data.dat') # doctest: +SKIP
"""
# Check input arguments.
if colspecs is None and widths is None:
Expand Down Expand Up @@ -1336,6 +1353,8 @@ def read_fwf(
kwds["infer_nrows"] = infer_nrows
kwds["engine"] = "python-fwf"
kwds["use_nullable_dtypes"] = use_nullable_dtypes
kwds["keep_whitespace"] = keep_whitespace
kwds["whitespace_chars"] = whitespace_chars
return _read(filepath_or_buffer, kwds)


Expand Down

0 comments on commit 997e2e9

Please sign in to comment.