Add support for string 'nan', 'inf' & '-inf' values while type-cast…

…ing to `float` (#9613) Fixes: #7488 This PR adds support for the strings `nan`, `inf` & `-inf` and their upper/lower-case variations when type-casting a string column to a `float` dtype. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - David Wendt (https://github.com/davidwendt) - https://github.com/brandon-b-miller URL: #9613
rapidsai · Nov 8, 2021 · 9a60296 · 9a60296
1 parent a7a8250
commit 9a60296
Show file tree

Hide file tree

Showing 3 changed files with 92 additions and 5 deletions.
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -97,6 +97,69 @@ def str_to_boolean(column: StringColumn):
     cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta,
 }
 
+_NAN_INF_VARIATIONS = [
+    "nan",
+    "NAN",
+    "Nan",
+    "naN",
+    "nAN",
+    "NAn",
+    "nAn",
+    "-inf",
+    "-INF",
+    "-InF",
+    "-inF",
+    "-iNF",
+    "-INf",
+    "-iNf",
+    "+inf",
+    "+INF",
+    "+InF",
+    "+inF",
+    "+iNF",
+    "+INf",
+    "+Inf",
+    "+iNf",
+    "inf",
+    "INF",
+    "InF",
+    "inF",
+    "iNF",
+    "INf",
+    "iNf",
+]
+_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [
+    "NaN",
+    "NaN",
+    "NaN",
+    "NaN",
+    "NaN",
+    "NaN",
+    "NaN",
+    "-Inf",
+    "-Inf",
+    "-Inf",
+    "-Inf",
+    "-Inf",
+    "-Inf",
+    "-Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+    "Inf",
+]
+
 
 def _is_supported_regex_flags(flags):
     return flags == 0 or (
@@ -5243,21 +5306,31 @@ def as_numerical_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.NumericalColumn":
         out_dtype = cudf.dtype(dtype)
-
+        string_col = self
         if out_dtype.kind in {"i", "u"}:
-            if not libstrings.is_integer(self).all():
+            if not libstrings.is_integer(string_col).all():
                 raise ValueError(
                     "Could not convert strings to integer "
                     "type due to presence of non-integer values."
                 )
         elif out_dtype.kind == "f":
-            if not libstrings.is_float(self).all():
+            # TODO: Replace this `replace` call with a
+            # case-insensitive method once the following
+            # issue is fixed: https://github.com/rapidsai/cudf/issues/5217
+            old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS)
+            new_values = cudf.core.column.as_column(
+                _LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS
+            )
+            string_col = libcudf.replace.replace(
+                string_col, old_values, new_values
+            )
+            if not libstrings.is_float(string_col).all():
                 raise ValueError(
                     "Could not convert strings to float "
                     "type due to presence of non-floating values."
                 )
 
-        result_col = _str_to_numeric_typecast_functions[out_dtype](self)
+        result_col = _str_to_numeric_typecast_functions[out_dtype](string_col)
         return result_col
 
     def _as_datetime_or_timedelta_column(self, dtype, format):

diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
@@ -244,6 +244,8 @@ def _proc_inf_strings(col):
     """Convert "inf/infinity" strings into "Inf", the native string
     representing infinity in libcudf
     """
+    # TODO: This can be handled by libcudf in the
+    # future; see StringColumn.as_numerical_column
     col = libstrings.replace_multi(
         col, as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]),
     )

diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
@@ -183,7 +183,19 @@ def test_string_astype(dtype):
     ):
         data = ["1", "2", "3", "4", "5"]
     elif dtype.startswith("float"):
-        data = ["1.0", "2.0", "3.0", "4.0", "5.0"]
+        data = [
+            "1.0",
+            "2.0",
+            "3.0",
+            "4.0",
+            None,
+            "5.0",
+            "nan",
+            "-INF",
+            "NaN",
+            "inF",
+            "NAn",
+        ]
     elif dtype.startswith("bool"):
         data = ["True", "False", "True", "False", "False"]
     elif dtype.startswith("datetime64"):