diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 51e3783bac5..f662d8a9e91 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -302,6 +302,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) * @param[in] dtypes The data type of the column * @param[out] columns The output column data * @param[out] valids The bitmaps indicating whether column fields are valid + * @param[out] valid_counts The number of valid fields in each column */ __global__ void __launch_bounds__(csvparse_block_dim) convert_csv_to_cudf(cudf::io::parse_options_view options, @@ -310,7 +311,8 @@ __global__ void __launch_bounds__(csvparse_block_dim) device_span row_offsets, device_span dtypes, device_span columns, - device_span valids) + device_span valids, + device_span valid_counts) { auto const raw_csv = data.data(); // thread IDs range per block, so also need the block id. @@ -318,8 +320,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) long const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); long const rec_id_next = rec_id + 1; - // we can have more threads than data, make sure we are not past the end of - // the data + // we can have more threads than data, make sure we are not past the end of the data if (rec_id_next >= row_offsets.size()) return; auto field_start = raw_csv + row_offsets[rec_id]; @@ -370,6 +371,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) column_flags[col] & column_parse::as_hexadecimal)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valids[actual_col], rec_id); + atomicAdd(&valid_counts[actual_col], 1); } } } else if (dtypes[actual_col].id() == cudf::type_id::STRING) { @@ -803,14 +805,15 @@ std::vector detect_column_types( return detail::make_std_vector_sync(d_stats, stream); } -void __host__ decode_row_column_data(cudf::io::parse_options_view const& options, - device_span data, - device_span column_flags, - device_span row_offsets, - device_span dtypes, - device_span columns, - device_span valids, - rmm::cuda_stream_view stream) +void decode_row_column_data(cudf::io::parse_options_view const& options, + device_span data, + device_span column_flags, + device_span row_offsets, + device_span dtypes, + device_span columns, + device_span valids, + device_span valid_counts, + rmm::cuda_stream_view stream) { // Calculate actual block count to use based on records count auto const block_size = csvparse_block_dim; @@ -818,7 +821,7 @@ void __host__ decode_row_column_data(cudf::io::parse_options_view const& options auto const grid_size = (num_rows + block_size - 1) / block_size; convert_csv_to_cudf<<>>( - options, data, column_flags, row_offsets, dtypes, columns, valids); + options, data, column_flags, row_offsets, dtypes, columns, valids, valid_counts); } uint32_t __host__ gather_row_offsets(const parse_options_view& options, diff --git a/cpp/src/io/csv/csv_gpu.hpp b/cpp/src/io/csv/csv_gpu.hpp index 4b6c0b10cc3..cbaa5d87e9a 100644 --- a/cpp/src/io/csv/csv_gpu.hpp +++ b/cpp/src/io/csv/csv_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -217,6 +217,7 @@ std::vector detect_column_types( * @param[in] dtypes List of dtype corresponding to each column * @param[out] columns Device memory output of column data * @param[out] valids Device memory output of column valids bitmap data + * @param[out] valid_counts Device memory output of the number of valid fields in each column * @param[in] stream CUDA stream to use, default 0 */ void decode_row_column_data(cudf::io::parse_options_view const& options, @@ -226,6 +227,7 @@ void decode_row_column_data(cudf::io::parse_options_view const& options, device_span dtypes, device_span columns, device_span valids, + device_span valid_counts, rmm::cuda_stream_view stream); } // namespace gpu diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 2da5b8f09db..9e3c1f62a07 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include @@ -580,8 +580,7 @@ std::vector decode_data(parse_options const& parse_opts, if (column_flags[col] & column_parse::enabled) { auto out_buffer = column_buffer(column_types[active_col], num_records, true, stream, mr); - out_buffer.name = column_names[col]; - out_buffer.null_count() = UNKNOWN_NULL_COUNT; + out_buffer.name = column_names[col]; out_buffers.emplace_back(std::move(out_buffer)); active_col++; } @@ -595,6 +594,9 @@ std::vector decode_data(parse_options const& parse_opts, h_valid[i] = out_buffers[i].null_mask(); } + auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async( + num_active_columns, stream, rmm::mr::get_current_device_resource()); + cudf::io::csv::gpu::decode_row_column_data( parse_opts.view(), data, @@ -603,8 +605,14 @@ std::vector decode_data(parse_options const& parse_opts, make_device_uvector_async(column_types, stream, rmm::mr::get_current_device_resource()), make_device_uvector_async(h_data, stream, rmm::mr::get_current_device_resource()), make_device_uvector_async(h_valid, stream, rmm::mr::get_current_device_resource()), + d_valid_counts, stream); + auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); + for (int i = 0; i < num_active_columns; ++i) { + out_buffers[i].null_count() = num_records - h_valid_counts[i]; + } + return out_buffers; } @@ -859,7 +867,7 @@ table_with_metadata read_csv(cudf::io::datasource* source, const std::string dblquotechar(2, parse_opts.quotechar); std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); out_columns.emplace_back( - cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr)); + cudf::strings::detail::replace(col->view(), dblquotechar, quotechar, -1, stream, mr)); } else { out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream)); } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 7c357e777a3..607fe4bd8c6 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -169,7 +169,8 @@ void check_float_column(cudf::column_view const& col_lhs, CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUIVALENT(col_lhs, (wrapper{data.begin(), data.end(), validity})); - CUDF_EXPECTS(col_lhs.null_count() == 0, "All elements should be valid"); + CUDF_EXPECTS(col_lhs.null_count() == 0 and col_rhs.null_count() == 0, + "All elements should be valid"); EXPECT_THAT(cudf::test::to_host(col_lhs).first, ::testing::Pointwise(FloatNearPointwise(tol), data)); } @@ -2464,4 +2465,19 @@ TEST_F(CsvReaderTest, BlankLineAfterFirstRow) } } +TEST_F(CsvReaderTest, NullCount) +{ + std::string buffer = "0,,\n1,1.,\n2,,\n3,,\n4,4.,\n5,5.,\n6,6.,\n7,7.,\n"; + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) + .header(-1); + const auto result = cudf::io::read_csv(in_opts); + const auto result_view = result.tbl->view(); + + EXPECT_EQ(result_view.num_rows(), 8); + EXPECT_EQ(result_view.column(0).null_count(), 0); + EXPECT_EQ(result_view.column(1).null_count(), 3); + EXPECT_EQ(result_view.column(2).null_count(), 8); +} + CUDF_TEST_PROGRAM_MAIN()