From c7976892cfa47875a6a9d0ae911f8ece12aaa1cb Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 3 May 2024 23:22:56 +0000 Subject: [PATCH 1/9] bug fix --- cpp/src/io/json/read_json.cu | 39 ++++++++++---------- cpp/tests/io/json_chunked_reader.cpp | 53 +++++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 89c301ec055..4cda239a38d 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -72,7 +72,6 @@ device_span ingest_raw_input(device_span buffer, // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line // delimiter. auto constexpr num_delimiter_chars = 1; - auto const num_extra_delimiters = num_delimiter_chars * (sources.size() - 1); if (compression == compression_type::NONE) { std::vector delimiter_map{}; @@ -89,36 +88,36 @@ device_span ingest_raw_input(device_span buffer, std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); size_t start_source = std::distance(prefsum_source_sizes.begin(), upper); - auto remaining_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset); + auto total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset); range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0; - for (size_t i = start_source; i < sources.size() && remaining_bytes_to_read; i++) { + for (size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { if (sources[i]->is_empty()) continue; - auto data_size = std::min(sources[i]->size() - range_offset, remaining_bytes_to_read); - auto destination = reinterpret_cast(buffer.data()) + bytes_read; + auto data_size = + std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); + auto destination = reinterpret_cast(buffer.data()) + bytes_read + + (num_delimiter_chars * delimiter_map.size()); if (sources[i]->is_device_read_preferred(data_size)) { bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); } else { h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); auto const& h_buffer = h_buffers.back(); CUDF_CUDA_TRY(cudaMemcpyAsync( - destination, h_buffer->data(), h_buffer->size(), cudaMemcpyDefault, stream.value())); + destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); bytes_read += h_buffer->size(); } range_offset = 0; - remaining_bytes_to_read -= bytes_read; - delimiter_map.push_back(bytes_read); - bytes_read += num_delimiter_chars; + delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); } - // In the case where all sources are empty, bytes_read is zero - if (bytes_read) bytes_read -= num_delimiter_chars; + // In the case where all sources are empty, we needn't insert a delimiter + if (!delimiter_map.empty()) delimiter_map.pop_back(); // If this is a multi-file source, we scatter the JSON line delimiters between files if (sources.size() > 1) { - static_assert(num_delimiter_chars == 1, - "Currently only single-character delimiters are supported"); + CUDF_EXPECTS(num_delimiter_chars == 1, + "Currently only single-character delimiters are supported"); auto const delimiter_source = thrust::make_constant_iterator('\n'); auto const d_delimiter_map = cudf::detail::make_device_uvector_async( - host_span{delimiter_map.data(), delimiter_map.size() - 1}, + host_span{delimiter_map.data(), delimiter_map.size()}, stream, rmm::mr::get_current_device_resource()); thrust::scatter(rmm::exec_policy_nosync(stream), @@ -128,7 +127,7 @@ device_span ingest_raw_input(device_span buffer, buffer.data()); } stream.synchronize(); - return buffer.first(bytes_read); + return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); } // TODO: allow byte range reading from multiple compressed files. auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset); @@ -151,9 +150,7 @@ size_type find_first_delimiter_in_chunk(host_span buffer(total_source_size, stream); ingest_raw_input(buffer, sources, @@ -195,8 +192,7 @@ datasource::owning_buffer> get_record_range_raw_input( CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, "Invalid offsetting"); auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; - chunk_size = - should_load_all_sources ? total_source_size - chunk_offset + num_extra_delimiters : chunk_size; + chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; // Some magic numbers constexpr int num_subchunks = 10; // per chunk_size @@ -217,7 +213,8 @@ datasource::owning_buffer> get_record_range_raw_input( size_t const buffer_size = reader_compression != compression_type::NONE ? total_source_size * estimated_compression_ratio + header_size - : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk); + : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) + + num_extra_delimiters; rmm::device_uvector buffer(buffer_size, stream); device_span bufspan(buffer); diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json_chunked_reader.cpp index ef69ee5239d..1a292a47587 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json_chunked_reader.cpp @@ -24,11 +24,19 @@ #include +#include +#include +#include + /** * @brief Base test fixture for JSON reader tests */ struct JsonReaderTest : public cudf::test::BaseFixture {}; +cudf::test::TempDirTestEnvironment* const temp_env = + static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + // function to extract first delimiter in the string in each chunk, // collate together and form byte_range for each chunk, // parse separately. @@ -87,7 +95,7 @@ std::vector skeleton_for_parellel_chunk_reader( return tables; } -TEST_F(JsonReaderTest, ByteRange) +TEST_F(JsonReaderTest, ByteRange_SingleFile) { std::string const json_string = R"( { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } @@ -126,3 +134,46 @@ TEST_F(JsonReaderTest, ByteRange) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view()); } } + +TEST_F(JsonReaderTest, ReadCompleteFiles) +{ + std::string const json_string = R"( + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; + auto filename = temp_env->get_temp_dir() + "ParseInRangeIntegers.json"; + { + std::ofstream outfile(filename, std::ofstream::out); + outfile << json_string; + } + + constexpr int num_sources = 5; + std::vector filepaths(num_sources, filename); + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepaths}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + std::vector part_tables; + for (auto filepath : filepaths) { + cudf::io::json_reader_options part_in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); + + part_tables.push_back(cudf::io::read_json(part_in_options)); + } + + auto part_table_views = std::vector(part_tables.size()); + std::transform(part_tables.begin(), part_tables.end(), part_table_views.begin(), [](auto& table) { + return table.tbl->view(); + }); + + auto expected_result = cudf::concatenate(part_table_views); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result.tbl->view(), expected_result->view()); +} From 468ff9f7536a9d48a2afe495024f825dd1033b0c Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 6 May 2024 18:45:00 +0000 Subject: [PATCH 2/9] tests fix --- cpp/src/io/json/read_json.cu | 4 +++- cpp/tests/io/json_chunked_reader.cpp | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 4cda239a38d..fb8779e4967 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -150,7 +150,9 @@ size_type find_first_delimiter_in_chunk(host_span buffer(total_source_size, stream); ingest_raw_input(buffer, sources, diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json_chunked_reader.cpp index 1a292a47587..8e1622cc0d3 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json_chunked_reader.cpp @@ -85,7 +85,7 @@ std::vector skeleton_for_parellel_chunk_reader( std::vector tables; // Process each chunk in parallel. for (auto const& [chunk_start, chunk_end] : record_ranges) { - if (chunk_start == -1 or chunk_end == -1) continue; + if (chunk_start == -1 or chunk_end == -1 or (size_t) chunk_start >= total_source_size) continue; reader_opts_chunk.set_byte_range_offset(chunk_start); reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); tables.push_back(read_json(sources, reader_opts_chunk, stream, mr)); @@ -95,7 +95,7 @@ std::vector skeleton_for_parellel_chunk_reader( return tables; } -TEST_F(JsonReaderTest, ByteRange_SingleFile) +TEST_F(JsonReaderTest, ByteRange_SingleSource) { std::string const json_string = R"( { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } From 3fc31bb3bb63286b0a51f9790f8a000cadb77d2d Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 6 May 2024 20:38:53 +0000 Subject: [PATCH 3/9] multisource byte range reading test --- cpp/src/io/json/read_json.cu | 28 +++++++-------- cpp/tests/io/json_chunked_reader.cpp | 53 +++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index fb8779e4967..a27e48b5086 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -100,10 +100,12 @@ device_span ingest_raw_input(device_span buffer, bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); } else { h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); - auto const& h_buffer = h_buffers.back(); - CUDF_CUDA_TRY(cudaMemcpyAsync( - destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); - bytes_read += h_buffer->size(); + CUDF_CUDA_TRY(cudaMemcpyAsync(destination, + h_buffers[h_buffers.size() - 1]->data(), + h_buffers[h_buffers.size() - 1]->size(), + cudaMemcpyHostToDevice, + stream.value())); + bytes_read += h_buffers[h_buffers.size() - 1]->size(); } range_offset = 0; delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); @@ -150,17 +152,15 @@ size_type find_first_delimiter_in_chunk(host_span buffer(total_source_size, stream); - ingest_raw_input(buffer, - sources, - reader_opts.get_compression(), - reader_opts.get_byte_range_offset(), - reader_opts.get_byte_range_size(), - stream); - return find_first_delimiter(buffer, delimiter, stream); + auto readbufspan = ingest_raw_input(buffer, + sources, + reader_opts.get_compression(), + reader_opts.get_byte_range_offset(), + reader_opts.get_byte_range_size(), + stream); + return find_first_delimiter(readbufspan, '\n', stream); } /** diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json_chunked_reader.cpp index 8e1622cc0d3..79b9c3234ab 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json_chunked_reader.cpp @@ -49,7 +49,6 @@ std::vector skeleton_for_parellel_chunk_reader( { using namespace cudf::io::json::detail; using cudf::size_type; - // assuming single source. size_t total_source_size = 0; for (auto const& source : sources) { total_source_size += source->size(); @@ -177,3 +176,55 @@ TEST_F(JsonReaderTest, ReadCompleteFiles) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result.tbl->view(), expected_result->view()); } + +TEST_F(JsonReaderTest, ByteRange_MultiSource) +{ + std::string const json_string = R"( + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; + auto filename = temp_env->get_temp_dir() + "ParseInRangeIntegers.json"; + { + std::ofstream outfile(filename, std::ofstream::out); + outfile << json_string; + } + + constexpr int num_sources = 5; + std::vector filepaths(num_sources, filename); + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepaths}) + .lines(true) + .compression(cudf::io::compression_type::NONE) + .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); + + // Read full test data via existing, nested JSON lines reader + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + + auto file_paths = json_lines_options.get_source().filepaths(); + std::vector> datasources; + for (auto& fp : file_paths) { + datasources.emplace_back(cudf::io::datasource::create(fp)); + } + + // Test for different chunk sizes + for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500, 1000, 2000}) { + auto const tables = skeleton_for_parellel_chunk_reader(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + + auto table_views = std::vector(tables.size()); + std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { + return table.tbl->view(); + }); + auto result = cudf::concatenate(table_views); + + // Verify that the data read via chunked reader matches the data read via nested JSON reader + // cannot use EQUAL due to concatenate removing null mask + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view()); + } +} From ddc05a00edceff11d1c6f97868cce37711e9218b Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 7 May 2024 18:40:32 +0000 Subject: [PATCH 4/9] pr reviews --- cpp/src/io/json/read_json.cu | 2 +- cpp/tests/io/json_chunked_reader.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index a27e48b5086..2344b4cd8d9 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -115,7 +115,7 @@ device_span ingest_raw_input(device_span buffer, // If this is a multi-file source, we scatter the JSON line delimiters between files if (sources.size() > 1) { - CUDF_EXPECTS(num_delimiter_chars == 1, + static_assert(num_delimiter_chars == 1, "Currently only single-character delimiters are supported"); auto const delimiter_source = thrust::make_constant_iterator('\n'); auto const d_delimiter_map = cudf::detail::make_device_uvector_async( diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json_chunked_reader.cpp index 79b9c3234ab..7c416ad8751 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json_chunked_reader.cpp @@ -84,7 +84,7 @@ std::vector skeleton_for_parellel_chunk_reader( std::vector tables; // Process each chunk in parallel. for (auto const& [chunk_start, chunk_end] : record_ranges) { - if (chunk_start == -1 or chunk_end == -1 or (size_t) chunk_start >= total_source_size) continue; + if (chunk_start == -1 or chunk_end == -1 or static_cast(chunk_start) >= total_source_size) continue; reader_opts_chunk.set_byte_range_offset(chunk_start); reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); tables.push_back(read_json(sources, reader_opts_chunk, stream, mr)); From 423c33537ff97e2cf354235515c7957c5dc28b19 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 7 May 2024 18:45:45 +0000 Subject: [PATCH 5/9] formatting --- cpp/src/io/json/read_json.cu | 2 +- cpp/tests/io/json_chunked_reader.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 2344b4cd8d9..46a0b0111bb 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -116,7 +116,7 @@ device_span ingest_raw_input(device_span buffer, // If this is a multi-file source, we scatter the JSON line delimiters between files if (sources.size() > 1) { static_assert(num_delimiter_chars == 1, - "Currently only single-character delimiters are supported"); + "Currently only single-character delimiters are supported"); auto const delimiter_source = thrust::make_constant_iterator('\n'); auto const d_delimiter_map = cudf::detail::make_device_uvector_async( host_span{delimiter_map.data(), delimiter_map.size()}, diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json_chunked_reader.cpp index 7c416ad8751..7482cb1b70d 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json_chunked_reader.cpp @@ -84,7 +84,9 @@ std::vector skeleton_for_parellel_chunk_reader( std::vector tables; // Process each chunk in parallel. for (auto const& [chunk_start, chunk_end] : record_ranges) { - if (chunk_start == -1 or chunk_end == -1 or static_cast(chunk_start) >= total_source_size) continue; + if (chunk_start == -1 or chunk_end == -1 or + static_cast(chunk_start) >= total_source_size) + continue; reader_opts_chunk.set_byte_range_offset(chunk_start); reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); tables.push_back(read_json(sources, reader_opts_chunk, stream, mr)); From e7432c99dd8c7d9670cbaab19d8081d16c37f9e6 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 9 May 2024 22:42:07 +0000 Subject: [PATCH 6/9] addressing PR reviews; reducing cpu memory footprint --- cpp/src/io/json/read_json.cu | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 46a0b0111bb..448d86511e5 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -76,7 +76,6 @@ device_span ingest_raw_input(device_span buffer, if (compression == compression_type::NONE) { std::vector delimiter_map{}; std::vector prefsum_source_sizes(sources.size()); - std::vector> h_buffers; delimiter_map.reserve(sources.size()); size_t bytes_read = 0; std::transform_inclusive_scan(sources.begin(), @@ -99,13 +98,10 @@ device_span ingest_raw_input(device_span buffer, if (sources[i]->is_device_read_preferred(data_size)) { bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); } else { - h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); - CUDF_CUDA_TRY(cudaMemcpyAsync(destination, - h_buffers[h_buffers.size() - 1]->data(), - h_buffers[h_buffers.size() - 1]->size(), - cudaMemcpyHostToDevice, - stream.value())); - bytes_read += h_buffers[h_buffers.size() - 1]->size(); + auto h_buffer = sources[i]->host_read(range_offset, data_size); + CUDF_CUDA_TRY(cudaMemcpyAsync( + destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); + bytes_read += h_buffer->size(); } range_offset = 0; delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); From ec133d56719e8bd2b177ff6a8bac8b9e639050a7 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 9 May 2024 23:36:22 +0000 Subject: [PATCH 7/9] more comment fixing --- cpp/src/io/json/read_json.cu | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 448d86511e5..f248668d532 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -50,7 +50,10 @@ size_t sources_size(host_span> const sources, } /** - * @brief Read from array of data sources into RMM buffer + * @brief Read from array of data sources into RMM buffer. The size of the returned device span + can be larger than the number of bytes requested from the list of sources when + the range to be read spans across multiple sources. This is due to the delimiter + characters inserted after the end of each accessed source. * * @param buffer Device span buffer to which data is read * @param sources Array of data sources @@ -106,7 +109,7 @@ device_span ingest_raw_input(device_span buffer, range_offset = 0; delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); } - // In the case where all sources are empty, we needn't insert a delimiter + // Removing delimiter inserted after last non-empty source is read if (!delimiter_map.empty()) delimiter_map.pop_back(); // If this is a multi-file source, we scatter the JSON line delimiters between files From 36eda824a01987d3bc1fc410506e4bc3941df5d7 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 10 May 2024 18:44:24 +0000 Subject: [PATCH 8/9] addressing PR reviews --- cpp/src/io/json/read_json.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 28daaead9ae..4cccea54278 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -79,6 +79,7 @@ device_span ingest_raw_input(device_span buffer, if (compression == compression_type::NONE) { std::vector delimiter_map{}; std::vector prefsum_source_sizes(sources.size()); + std::vector> h_buffers; delimiter_map.reserve(sources.size()); size_t bytes_read = 0; std::transform_inclusive_scan(sources.begin(), @@ -101,7 +102,8 @@ device_span ingest_raw_input(device_span buffer, if (sources[i]->is_device_read_preferred(data_size)) { bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); } else { - auto h_buffer = sources[i]->host_read(range_offset, data_size); + h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); + auto const& h_buffer = h_buffers.back(); CUDF_CUDA_TRY(cudaMemcpyAsync( destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); bytes_read += h_buffer->size(); From dec523765c1b2e2ab8208e89e73b39fc93570975 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 10 May 2024 20:55:02 +0000 Subject: [PATCH 9/9] pr reviews --- cpp/src/io/json/read_json.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 4cccea54278..ea52dce020e 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -91,7 +91,8 @@ device_span ingest_raw_input(device_span buffer, std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); size_t start_source = std::distance(prefsum_source_sizes.begin(), upper); - auto total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset); + auto const total_bytes_to_read = + std::min(range_size, prefsum_source_sizes.back() - range_offset); range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0; for (size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { if (sources[i]->is_empty()) continue; @@ -112,7 +113,7 @@ device_span ingest_raw_input(device_span buffer, delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); } // Removing delimiter inserted after last non-empty source is read - if (!delimiter_map.empty()) delimiter_map.pop_back(); + if (!delimiter_map.empty()) { delimiter_map.pop_back(); } // If this is a multi-file source, we scatter the JSON line delimiters between files if (sources.size() > 1) { @@ -120,9 +121,7 @@ device_span ingest_raw_input(device_span buffer, "Currently only single-character delimiters are supported"); auto const delimiter_source = thrust::make_constant_iterator('\n'); auto const d_delimiter_map = cudf::detail::make_device_uvector_async( - host_span{delimiter_map.data(), delimiter_map.size()}, - stream, - rmm::mr::get_current_device_resource()); + delimiter_map, stream, rmm::mr::get_current_device_resource()); thrust::scatter(rmm::exec_policy_nosync(stream), delimiter_source, delimiter_source + d_delimiter_map.size(),