Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update check for inf/nan strings in libcudf float conversion to ignore case #9694

Merged
merged 14 commits into from
Nov 30, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 49 additions & 12 deletions cpp/include/cudf/strings/string.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,40 @@ inline __device__ bool is_integer(string_view const& d_str)
thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; });
}

/**
* @brief Returns true if input contains the not-a-number string.
*
* The following are valid for this function: "NAN" and "NaN"
* @param d_str input string
* @return true if input is as valid NaN string.
*/
inline __device__ bool is_nan_str(string_view const& d_str)
{
auto const ptr = d_str.data();
return (d_str.size_bytes() == 3) && (ptr[0] == 'N' || ptr[0] == 'n') &&
(ptr[1] == 'A' || ptr[1] == 'a') && (ptr[2] == 'N' || ptr[2] == 'n');
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
}

/**
* @brief Returns true if input contains the infinity string.
*
* The following are valid for this function: "INF", "INFINITY", and "Inf"
* @param d_str input string
* @return true if input is as valid Inf string.
*/
inline __device__ bool is_inf_str(string_view const& d_str)
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
{
auto const ptr = d_str.data();
auto const size = d_str.size_bytes();
if ((size < 3) || (size > 8) || (ptr[0] != 'I' && ptr[0] != 'i') ||
(ptr[1] != 'N' && ptr[1] != 'n') || (ptr[2] != 'F' && ptr[2] != 'f')) {
return false;
}
return (size == 3) || (size == 8 && (ptr[3] == 'I' || ptr[3] == 'i') &&
(ptr[4] == 'N' || ptr[4] == 'n') && (ptr[5] == 'I' || ptr[5] == 'i') &&
(ptr[6] == 'T' || ptr[6] == 't') && (ptr[7] == 'Y' || ptr[7] == 'y'));
}

/**
* @brief Returns `true` if all characters in the string
* are valid for conversion to a float type.
Expand All @@ -65,38 +99,41 @@ inline __device__ bool is_integer(string_view const& d_str)
* An empty string returns `false`.
* No bounds checking is performed to verify if the value would fit
* within a specific float type.
* The following strings are also allowed "NaN", "Inf" and, "-Inf"
* and will return true.
* The following strings are also allowed and will return true:
* "NaN", "NAN", "Inf", "INF", "INFINITY"
*
* @param d_str String to check.
* @return true if string has valid float characters
*/
inline __device__ bool is_float(string_view const& d_str)
{
if (d_str.empty()) return false;
// strings allowed by the converter
if (d_str.compare("NaN", 3) == 0) return true;
if (d_str.compare("Inf", 3) == 0) return true;
if (d_str.compare("-Inf", 4) == 0) return true;
bool decimal_found = false;
bool exponent_found = false;
size_type bytes = d_str.size_bytes();
const char* data = d_str.data();
// sign character allowed at the beginning of the string
size_type chidx = (*data == '-' || *data == '+') ? 1 : 0;
bool result = chidx < bytes;
size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0;

// check for nan and infinity strings
if (data[ch_idx] > '9') {
auto const inf_nan = string_view(data + ch_idx, bytes - ch_idx);
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
if (is_nan_str(inf_nan) || is_inf_str(inf_nan)) return true;
}

bool result = ch_idx < bytes;
// check for float chars [0-9] and a single decimal '.'
// and scientific notation [eE][+-][0-9]
for (; chidx < bytes; ++chidx) {
auto chr = data[chidx];
for (; ch_idx < bytes; ++ch_idx) {
auto chr = data[ch_idx];
if (chr >= '0' && chr <= '9') continue;
if (!decimal_found && chr == '.') {
decimal_found = true; // no more decimals
continue;
}
if (!exponent_found && (chr == 'e' || chr == 'E')) {
if (chidx + 1 < bytes) chr = data[chidx + 1];
if (chr == '-' || chr == '+') ++chidx;
if (ch_idx + 1 < bytes) chr = data[ch_idx + 1];
if (chr == '-' || chr == '+') ++ch_idx;
decimal_found = true; // no decimal allowed in exponent
exponent_found = true; // no more exponents
continue;
Expand Down
13 changes: 8 additions & 5 deletions cpp/src/strings/convert/convert_floats.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ namespace {
* @brief This function converts the given string into a
* floating point double value.
*
* This will also map strings containing "NaN", "Inf" and "-Inf"
* This will also map strings containing "NaN", "Inf", etc.
* to the appropriate float values.
*
* This function will also handle scientific notation format.
Expand All @@ -55,16 +55,19 @@ __device__ inline double stod(string_view const& d_str)
const char* in_ptr = d_str.data();
const char* end = in_ptr + d_str.size_bytes();
if (end == in_ptr) return 0.0;
// special strings
if (d_str.compare("NaN", 3) == 0) return std::numeric_limits<double>::quiet_NaN();
if (d_str.compare("Inf", 3) == 0) return std::numeric_limits<double>::infinity();
if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits<double>::infinity();
double sign{1.0};
if (*in_ptr == '-' || *in_ptr == '+') {
sign = (*in_ptr == '-' ? -1 : 1);
++in_ptr;
}

// special strings: NaN, Inf
if (*in_ptr > '9') {
auto const inf_nan = string_view(in_ptr, static_cast<size_type>(thrust::distance(in_ptr, end)));
if (string::is_nan_str(inf_nan)) return std::numeric_limits<double>::quiet_NaN();
if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits<double>::infinity();
}

// Parse and store the mantissa as much as we can,
// until we are about to exceed the limit of uint64_t
constexpr uint64_t max_holding = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;
Expand Down
51 changes: 13 additions & 38 deletions cpp/tests/strings/floats_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,32 +58,20 @@ TEST_F(StringsConvertTest, IsFloat)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1);

cudf::test::strings_column_wrapper strings2(
{"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"});
{"-34", "9.8", "1234567890", "-917.2e5", "INF", "NAN", "-Inf", "INFINITY"});
results = cudf::strings::is_float(cudf::strings_column_view(strings2));
cudf::test::fixed_width_column_wrapper<bool> expected2({1, 1, 1, 1, 1, 1});
cudf::test::fixed_width_column_wrapper<bool> expected2({1, 1, 1, 1, 1, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
}

TEST_F(StringsConvertTest, ToFloats32)
{
std::vector<const char*> h_strings{"1234",
nullptr,
"-876",
"543.2",
"-0.12",
".25",
"-.002",
"",
"-0.0",
"1.2e4",
"NaN",
"abc123",
"123abc",
"456e",
"-1.78e+5",
"-122.33644782123456789",
"12e+309",
"3.4028236E38"};
std::vector<const char*> h_strings{
"1234", nullptr, "-876", "543.2",
"-0.12", ".25", "-.002", "",
"-0.0", "1.2e4", "NAN", "abc123",
"123abc", "456e", "-1.78e+5", "-122.33644782123456789",
"12e+309", "3.4028236E38", "INF", "Infinity"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
Expand Down Expand Up @@ -135,24 +123,11 @@ TEST_F(StringsConvertTest, FromFloats32)

TEST_F(StringsConvertTest, ToFloats64)
{
std::vector<const char*> h_strings{"1234",
nullptr,
"-876",
"543.2",
"-0.12",
".25",
"-.002",
"",
"-0.0",
"1.28e256",
"NaN",
"abc123",
"123abc",
"456e",
"-1.78e+5",
"-122.33644782",
"12e+309",
"1.7976931348623159E308"};
std::vector<const char*> h_strings{
"1234", nullptr, "-876", "543.2", "-0.12", ".25",
"-.002", "", "-0.0", "1.28e256", "NaN", "abc123",
"123abc", "456e", "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308",
"-Inf", "-INFINITY"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
Expand Down
73 changes: 0 additions & 73 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,69 +97,6 @@ def str_to_boolean(column: StringColumn):
cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta,
}

_NAN_INF_VARIATIONS = [
"nan",
"NAN",
"Nan",
"naN",
"nAN",
"NAn",
"nAn",
"-inf",
"-INF",
"-InF",
"-inF",
"-iNF",
"-INf",
"-iNf",
"+inf",
"+INF",
"+InF",
"+inF",
"+iNF",
"+INf",
"+Inf",
"+iNf",
"inf",
"INF",
"InF",
"inF",
"iNF",
"INf",
"iNf",
]
_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"-Inf",
"-Inf",
"-Inf",
"-Inf",
"-Inf",
"-Inf",
"-Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
]


def _is_supported_regex_flags(flags):
return flags == 0 or (
Expand Down Expand Up @@ -5309,16 +5246,6 @@ def as_numerical_column(
"type due to presence of non-integer values."
)
elif out_dtype.kind == "f":
# TODO: Replace this `replace` call with a
# case-insensitive method once following
# issue is fixed: https://github.com/rapidsai/cudf/issues/5217
old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS)
new_values = cudf.core.column.as_column(
_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS
)
string_col = libcudf.replace.replace(
string_col, old_values, new_values
)
if not libstrings.is_float(string_col).all():
raise ValueError(
"Could not convert strings to float "
Expand Down