Skip to content

Commit

Permalink
Change IPv4 convert APIs to support UINT32 instead of INT64 (#16489)
Browse files Browse the repository at this point in the history
Changes the integer type for `cudf::strings::ipv4_to_integers` and `cudf::strings::integers_to_ipv4` to use UINT32 types instead of INT64. The INT64 type was originally chosen because libcudf did not support unsigned types at the time.
This is a breaking change since the basic input/output type is changed.

Closes #16324

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - https://github.com/brandon-b-miller
  - Karthikeyan (https://github.com/karthikeyann)

URL: #16489
  • Loading branch information
davidwendt authored Aug 8, 2024
1 parent a94512a commit cc75b05
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 23 deletions.
11 changes: 3 additions & 8 deletions cpp/include/cudf/strings/convert/convert_ipv4.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,12 @@ namespace strings {
* No checking is done on the format. If a string is not in IPv4 format, the resulting
* integer is undefined.
*
* The resulting 32-bit integer is placed in an int64_t to avoid setting the sign-bit
* in an int32_t type. This could be changed if cudf supported a UINT32 type in the future.
*
* Any null entries will result in corresponding null entries in the output column.
*
* @param input Strings instance for this operation
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New INT64 column converted from strings
* @return New UINT32 column converted from strings
*/
std::unique_ptr<column> ipv4_to_integers(
strings_column_view const& input,
Expand All @@ -68,13 +65,11 @@ std::unique_ptr<column> ipv4_to_integers(
* Each input integer is dissected into four integers by dividing the input into 8-bit sections.
* These sub-integers are then converted into [0-9] characters and placed between '.' characters.
*
* No checking is done on the input integer value. Only the lower 32-bits are used.
*
* Any null entries will result in corresponding null entries in the output column.
*
* @throw cudf::logic_error if the input column is not INT64 type.
* @throw cudf::logic_error if the input column is not UINT32 type.
*
* @param integers Integer (INT64) column to convert
* @param integers Integer (UINT32) column to convert
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
Expand Down
14 changes: 7 additions & 7 deletions cpp/src/strings/convert/convert_ipv4.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ namespace {
struct ipv4_to_integers_fn {
column_device_view const d_strings;

__device__ int64_t operator()(size_type idx)
__device__ uint32_t operator()(size_type idx)
{
if (d_strings.is_null(idx)) return 0;
string_view d_str = d_strings.element<string_view>(idx);
Expand All @@ -66,7 +66,7 @@ struct ipv4_to_integers_fn {
}
}
uint32_t result = (ipvals[0] << 24) + (ipvals[1] << 16) + (ipvals[2] << 8) + ipvals[3];
return static_cast<int64_t>(result);
return result;
}
};

Expand All @@ -79,18 +79,18 @@ std::unique_ptr<column> ipv4_to_integers(strings_column_view const& input,
{
size_type strings_count = input.size();
if (strings_count == 0) {
return make_numeric_column(data_type{type_id::INT64}, 0, mask_state::UNALLOCATED, stream);
return make_numeric_column(data_type{type_id::UINT32}, 0, mask_state::UNALLOCATED, stream);
}

auto strings_column = column_device_view::create(input.parent(), stream);
// create output column copying the strings' null-mask
auto results = make_numeric_column(data_type{type_id::INT64},
auto results = make_numeric_column(data_type{type_id::UINT32},
strings_count,
cudf::detail::copy_bitmask(input.parent(), stream, mr),
input.null_count(),
stream,
mr);
auto d_results = results->mutable_view().data<int64_t>();
auto d_results = results->mutable_view().data<uint32_t>();
// fill output column with ipv4 integers
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
Expand Down Expand Up @@ -135,7 +135,7 @@ struct integers_to_ipv4_fn {
return;
}

auto const ip_number = d_column.element<int64_t>(idx);
auto const ip_number = d_column.element<uint32_t>(idx);

char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
int shift_bits = 24;
Expand Down Expand Up @@ -165,7 +165,7 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
{
if (integers.is_empty()) return make_empty_column(type_id::STRING);

CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type");
CUDF_EXPECTS(integers.type().id() == type_id::UINT32, "Input column must be UINT32 type");

auto d_column = column_device_view::create(integers, stream);
auto [offsets_column, chars] =
Expand Down
8 changes: 4 additions & 4 deletions cpp/tests/strings/ipv4_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ TEST_F(StringsConvertTest, IPv4ToIntegers)
auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::ipv4_to_integers(strings_view);

std::vector<int64_t> h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521};
cudf::test::fixed_width_column_wrapper<int64_t> expected(
std::vector<uint32_t> h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521};
cudf::test::fixed_width_column_wrapper<uint32_t> expected(
h_expected.cbegin(),
h_expected.cend(),
thrust::make_transform_iterator(h_strings.begin(),
Expand All @@ -59,8 +59,8 @@ TEST_F(StringsConvertTest, IntegersToIPv4)
thrust::make_transform_iterator(h_strings.begin(),
[](auto const str) { return str != nullptr; }));

std::vector<int64_t> h_column{3232235521, 167772161, 0, 0, 700055553, 700776449};
cudf::test::fixed_width_column_wrapper<int64_t> column(
std::vector<uint32_t> h_column{3232235521, 167772161, 0, 0, 700055553, 700776449};
cudf::test::fixed_width_column_wrapper<uint32_t> column(
h_column.cbegin(),
h_column.cend(),
thrust::make_transform_iterator(h_strings.begin(),
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,8 @@ def normalize_binop_value(
return NotImplemented

def int2ip(self) -> "cudf.core.column.StringColumn":
if self.dtype != cudf.dtype("int64"):
raise TypeError("Only int64 type can be converted to ip")
if self.dtype != cudf.dtype("uint32"):
raise TypeError("Only uint32 type can be converted to ip")

return libcudf.string_casting.int2ip(self)

Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2672,7 +2672,9 @@ def test_string_ip4_to_int():


def test_string_int_to_ipv4():
gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449])
gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype(
"uint32"
)
expected = cudf.Series(
["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"]
)
Expand Down Expand Up @@ -2718,7 +2720,7 @@ def test_string_isipv4():


@pytest.mark.parametrize(
"dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int64", "uint64"}))
"dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"uint32"}))
)
def test_string_int_to_ipv4_dtype_fail(dtype):
gsr = cudf.Series([1, 2, 3, 4, 5]).astype(dtype)
Expand Down

0 comments on commit cc75b05

Please sign in to comment.