Skip to content

Commit

Permalink
ENH: Added keep_whitespace and whitespace_chars to read_fwf, allowing
Browse files Browse the repository at this point in the history

more control over handling of whitespace in fields and removing the
requirement to specify a `delimiter` in order to preserve whitespace. (pandas-dev#51569)

Signed-off-by: Ronald Barnes <ron@ronaldbarnes.ca>
  • Loading branch information
RonaldBarnes committed Feb 23, 2023
1 parent c4caed6 commit 25bf583
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 3 deletions.
50 changes: 49 additions & 1 deletion pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
self.decimal = kwds["decimal"]

self.comment = kwds["comment"]
## GH51569
self.keep_whitespace = kwds.get("keep_whitespace")
self.whitespace_chars = kwds.get("whitespace_chars")

# Set self.data to something that can read lines.
if isinstance(f, list):
Expand Down Expand Up @@ -1180,11 +1183,20 @@ def __init__(
comment: str | None,
skiprows: set[int] | None = None,
infer_nrows: int = 100,
## GH51569
keep_whitespace: bool | tuple[bool, bool] = (False, False),
whitespace_chars: str = " \t",
) -> None:
self.f = f
self.buffer: Iterator | None = None
self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
self.comment = comment
self.keep_whitespace = keep_whitespace
## Backwards compatibility means supporting delimiter:
if delimiter:
whitespace_chars = whitespace_chars + delimiter
self.whitespace_chars = whitespace_chars

if colspecs == "infer":
self.colspecs = self.detect_colspecs(
infer_nrows=infer_nrows, skiprows=skiprows
Expand All @@ -1210,6 +1222,33 @@ def __init__(
"2 element tuple or list of integers"
)

## GH51569
## Accept boolean, but convert to tuple(bool,bool) for (left,right) of fields:
if isinstance(self.keep_whitespace, bool):
self.keep_whitespace = (keep_whitespace, keep_whitespace)
## Ensure tuple is (bool,bool):
if (
isinstance(self.keep_whitespace, tuple)
and len(self.keep_whitespace) == 2
and isinstance(self.keep_whitespace[0], bool)
and isinstance(self.keep_whitespace[1], bool)
):
# Define custom lstrip & rstrip *once*, at __init__:
if self.keep_whitespace[0] is True:
self.ltrim = lambda x: x
else:
self.ltrim = lambda x: x.lstrip(self.whitespace_chars)
if self.keep_whitespace[1] is True:
self.rtrim = lambda x: x
else:
self.rtrim = lambda x: x.rstrip(self.whitespace_chars)
else:
raise ValueError(
"'keep_whitespace' must be a bool or tuple(bool,bool)."
f"\nReceived '{type(self.keep_whitespace).__name__}': "
f"'{self.keep_whitespace}'."
)

def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
"""
Read rows from self.f, skipping as specified.
Expand Down Expand Up @@ -1281,8 +1320,14 @@ def __next__(self) -> list[str]:
line = next(self.f) # type: ignore[arg-type]
else:
line = next(self.f) # type: ignore[arg-type]

line = line.rstrip("\r\n")

# Note: 'colspecs' is a sequence of half-open intervals.
return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
return [self.ltrim(self.rtrim(line[from_:to])) for (from_, to) in self.colspecs]


# return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]


class FixedWidthFieldParser(PythonParser):
Expand All @@ -1305,6 +1350,9 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
self.comment,
self.skiprows,
self.infer_nrows,
## GH51569
self.keep_whitespace,
self.whitespace_chars,
)

def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
Expand Down
27 changes: 25 additions & 2 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,13 @@
"float_precision": None,
}

_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
_fwf_defaults = {
"colspecs": "infer",
"infer_nrows": 100,
"widths": None,
"keep_whitespace": (False, False),
"whitespace_chars": " \t",
}

_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
Expand Down Expand Up @@ -1235,10 +1241,13 @@ def read_fwf(
widths: Sequence[int] | None = None,
infer_nrows: int = 100,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
## GH51569
keep_whitespace: bool | tuple[bool, bool] = (False, False),
whitespace_chars: str = " \t",
**kwds,
) -> DataFrame | TextFileReader:
r"""
Read a table of fixed-width formatted lines into DataFrame.
Read a file of fixed-width lines into DataFrame.
Also supports optionally iterating or breaking of the file
into chunks.
Expand Down Expand Up @@ -1266,6 +1275,8 @@ def read_fwf(
infer_nrows : int, default 100
The number of rows to consider when letting the parser determine the
`colspecs`.
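delimiter : str, default ``' '`` and ``'\t'`` characters
Characters treated as column / field separators when inferring colspecs.
For backwards compatibility, any characters given here are also added to
``whitespace_chars`` and stripped from fields wherever whitespace is not kept.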
use_nullable_dtypes : bool = False
Whether or not to use nullable dtypes as default when reading data. If
set to True, nullable dtypes are used for all dtypes that have a nullable
Expand All @@ -1283,6 +1294,14 @@ def read_fwf(
.. versionadded:: 2.0
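keep_whitespace : bool, or tuple (bool,bool), default (False,False)
Whether to keep whitespace at the start and end of each field / column.
A single bool applies to both sides; a tuple is interpreted as
``(keep_left, keep_right)``.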
whitespace_chars : str, default ``' '`` and ``'\t'`` characters
The characters stripped from each field / column on any side for which
``keep_whitespace`` is False.
.. versionadded:: 2.0
**kwds : optional
Optional keyword arguments can be passed to ``TextFileReader``.
Expand All @@ -1294,6 +1313,7 @@ def read_fwf(
See Also
--------
read_table : Read data from table (i.e. columns with delimiting spaces).
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
Expand Down Expand Up @@ -1346,6 +1366,9 @@ def read_fwf(
kwds["infer_nrows"] = infer_nrows
kwds["engine"] = "python-fwf"
kwds["use_nullable_dtypes"] = use_nullable_dtypes
## GH51569
kwds["keep_whitespace"] = keep_whitespace
kwds["whitespace_chars"] = whitespace_chars
return _read(filepath_or_buffer, kwds)


Expand Down

0 comments on commit 25bf583

Please sign in to comment.