Skip to content

Commit

Permalink
Add support for string 'nan', 'inf' & '-inf' values while type-cast…
Browse files Browse the repository at this point in the history
…ing to `float` (#9613)

Fixes: #7488 

This PR adds support for strings that are `nan`, `inf` & `-inf`, and their upper/lower-case variations, when type-casting from a string column to a `float` dtype.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - https://github.com/brandon-b-miller

URL: #9613
  • Loading branch information
galipremsagar authored Nov 8, 2021
1 parent a7a8250 commit 9a60296
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 5 deletions.
81 changes: 77 additions & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,69 @@ def str_to_boolean(column: StringColumn):
cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta,
}

# Each row pairs one target spelling with the source spellings that are
# rewritten to it. The two public lists below are derived from this table so
# they stay index-aligned: entry i of `_NAN_INF_VARIATIONS` is replaced by
# entry i of `_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS`.
# NOTE(review): "NaN", "Inf" and "-Inf" are deliberately absent from the
# source spellings -- presumably libcudf already accepts them; confirm.
_NAN_INF_SPELLING_GROUPS = (
    ("NaN", ("nan", "NAN", "Nan", "naN", "nAN", "NAn", "nAn")),
    ("-Inf", ("-inf", "-INF", "-InF", "-inF", "-iNF", "-INf", "-iNf")),
    (
        "Inf",
        ("+inf", "+INF", "+InF", "+inF", "+iNF", "+INf", "+Inf", "+iNf"),
    ),
    ("Inf", ("inf", "INF", "InF", "inF", "iNF", "INf", "iNf")),
)

# Flattened source spellings, in table order.
_NAN_INF_VARIATIONS = [
    spelling
    for _, spellings in _NAN_INF_SPELLING_GROUPS
    for spelling in spellings
]

# Target spelling repeated once per source spelling, in the same order.
_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [
    canonical
    for canonical, spellings in _NAN_INF_SPELLING_GROUPS
    for _ in spellings
]


def _is_supported_regex_flags(flags):
return flags == 0 or (
Expand Down Expand Up @@ -5243,21 +5306,31 @@ def as_numerical_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.NumericalColumn":
out_dtype = cudf.dtype(dtype)

string_col = self
if out_dtype.kind in {"i", "u"}:
if not libstrings.is_integer(self).all():
if not libstrings.is_integer(string_col).all():
raise ValueError(
"Could not convert strings to integer "
"type due to presence of non-integer values."
)
elif out_dtype.kind == "f":
if not libstrings.is_float(self).all():
# TODO: Replace this `replace` call with a
# case-insensitive method once the following
# issue is fixed: https://github.com/rapidsai/cudf/issues/5217
old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS)
new_values = cudf.core.column.as_column(
_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS
)
string_col = libcudf.replace.replace(
string_col, old_values, new_values
)
if not libstrings.is_float(string_col).all():
raise ValueError(
"Could not convert strings to float "
"type due to presence of non-floating values."
)

result_col = _str_to_numeric_typecast_functions[out_dtype](self)
result_col = _str_to_numeric_typecast_functions[out_dtype](string_col)
return result_col

def _as_datetime_or_timedelta_column(self, dtype, format):
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,8 @@ def _proc_inf_strings(col):
"""Convert "inf/infinity" strings into "Inf", the native string
representing infinity in libcudf
"""
# TODO: This can be handled by libcudf in the
# future; see StringColumn.as_numerical_column
col = libstrings.replace_multi(
col, as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]),
)
Expand Down
14 changes: 13 additions & 1 deletion python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,19 @@ def test_string_astype(dtype):
):
data = ["1", "2", "3", "4", "5"]
elif dtype.startswith("float"):
data = ["1.0", "2.0", "3.0", "4.0", "5.0"]
data = [
"1.0",
"2.0",
"3.0",
"4.0",
None,
"5.0",
"nan",
"-INF",
"NaN",
"inF",
"NAn",
]
elif dtype.startswith("bool"):
data = ["True", "False", "True", "False", "False"]
elif dtype.startswith("datetime64"):
Expand Down

0 comments on commit 9a60296

Please sign in to comment.