From 632f43223825b4fe4f5694cf8a66492a47830a71 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 8 Oct 2024 09:59:06 -0400 Subject: [PATCH 1/6] Use std::optional for host types --- cpp/src/io/parquet/parquet.hpp | 26 +++++++++++------------ cpp/src/io/parquet/predicate_pushdown.cpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 7c985643887..ac72ff8fd8d 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -258,17 +258,17 @@ struct SchemaElement { */ struct Statistics { // deprecated max value in signed comparison order - cuda::std::optional> max; + std::optional> max; // deprecated min value in signed comparison order - cuda::std::optional> min; + std::optional> min; // count of null values in the column cuda::std::optional null_count; // count of distinct values occurring cuda::std::optional distinct_count; // max value for column determined by ColumnOrder - cuda::std::optional> max_value; + std::optional> max_value; // min value for column determined by ColumnOrder - cuda::std::optional> min_value; + std::optional> min_value; // If true, max_value is the actual maximum value for a column cuda::std::optional is_max_value_exact; // If true, min_value is the actual minimum value for a column @@ -290,14 +290,14 @@ struct SizeStatistics { * * This value should not be written if max_repetition_level is 0. */ - cuda::std::optional> repetition_level_histogram; + std::optional> repetition_level_histogram; /** * Same as repetition_level_histogram except for definition levels. * * This value should not be written if max_definition_level is 0 or 1. */ - cuda::std::optional> definition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -318,7 +318,7 @@ struct OffsetIndex { std::vector page_locations; // per-page size info. see description of the same field in SizeStatistics. only present for // columns with a BYTE_ARRAY physical type. - cuda::std::optional> unencoded_byte_array_data_bytes; + std::optional> unencoded_byte_array_data_bytes; }; /** @@ -330,10 +330,10 @@ struct ColumnIndex { std::vector> max_values; // upper bound for values in each page BoundaryOrder boundary_order = BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - cuda::std::optional> null_counts; // Optional count of null values per page + std::optional> null_counts; // Optional count of null values per page // Repetition/definition level histograms for the column chunk - cuda::std::optional> repetition_level_histogram; - cuda::std::optional> definition_level_histogram; + std::optional> repetition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -383,7 +383,7 @@ struct ColumnChunkMetaData { Statistics statistics; // Set of all encodings used for pages in this column chunk. This information can be used to // determine if all data pages are dictionary encoded for example. - cuda::std::optional> encoding_stats; + std::optional> encoding_stats; // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. @@ -429,7 +429,7 @@ struct RowGroup { int64_t num_rows = 0; // If set, specifies a sort ordering of the rows in this RowGroup. // The sorting columns can be a subset of all the columns. - cuda::std::optional> sorting_columns; + std::optional> sorting_columns; // Byte offset from beginning of file to first page (data or dictionary) in this row group cuda::std::optional file_offset; // Total byte size of all compressed (and potentially encrypted) column data in this row group @@ -460,7 +460,7 @@ struct FileMetaData { std::vector row_groups; std::vector key_value_metadata; std::string created_by = ""; - cuda::std::optional> column_orders; + std::optional> column_orders; }; /** diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index b90ca36c8c7..35a1392cea0 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -152,7 +152,7 @@ struct stats_caster { } void set_index(size_type index, - cuda::std::optional> const& binary_value, + std::optional> const& binary_value, Type const type) { if (binary_value.has_value()) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ec05f35d405..9859b36c7cc 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -186,7 +186,7 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - cuda::std::optional> column_orders = cuda::std::nullopt; + std::optional> column_orders = std::nullopt; }; namespace { From 5a81d95c313fc979b42672cf869b3027735233c0 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 8 Oct 2024 10:27:13 -0400 Subject: [PATCH 2/6] Use std::optional for host types --- cpp/src/io/parquet/compact_protocol_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b978799b8bc..f66996e8e07 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -440,7 +440,7 @@ class parquet_field_optional : public parquet_field { cuda::std::optional& val; public: - parquet_field_optional(int f, cuda::std::optional& v) : parquet_field(f), val(v) {} + parquet_field_optional(int f, std::optional& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { From 48f74823560eb281ed222aa0ef14b850b8e3113c Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 8 Oct 2024 10:45:15 -0400 Subject: [PATCH 3/6] Use std::optional for host types --- .../io/parquet/compact_protocol_reader.cpp | 6 ++-- cpp/src/io/parquet/parquet.hpp | 36 +++++++++---------- cpp/src/io/parquet/parquet_gpu.hpp | 4 +-- cpp/src/io/parquet/predicate_pushdown.cpp | 4 +-- cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/src/io/parquet/reader_impl_chunking.cu | 6 ++-- cpp/src/io/parquet/reader_impl_helpers.cpp | 6 ++-- cpp/src/io/parquet/writer_impl.cu | 6 ++-- 8 files changed, 34 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index f66996e8e07..bf95349f12f 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -308,10 +308,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - cuda::std::optional& val; // union structs are always wrapped in std::optional + std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, cuda::std::optional& v) + parquet_field_union_struct(int f, E& ev, std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } @@ -437,7 +437,7 @@ class parquet_field_struct_blob : public parquet_field { */ template class parquet_field_optional : public parquet_field { - cuda::std::optional& val; + std::optional& val; public: parquet_field_optional(int f, std::optional& v) : parquet_field(f), val(v) {} diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index ac72ff8fd8d..753a51112a7 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,8 +20,6 @@ #include -#include - #include #include #include @@ -94,10 +92,10 @@ struct LogicalType { BSON }; Type type; - cuda::std::optional decimal_type; - cuda::std::optional time_type; - cuda::std::optional timestamp_type; - cuda::std::optional int_type; + std::optional decimal_type; + std::optional time_type; + std::optional timestamp_type; + std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -178,21 +176,21 @@ struct SchemaElement { // 5: nested fields int32_t num_children = 0; // 6: DEPRECATED: record the original type before conversion to parquet type - cuda::std::optional converted_type; + std::optional converted_type; // 7: DEPRECATED: record the scale for DECIMAL converted type int32_t decimal_scale = 0; // 8: DEPRECATED: record the precision for DECIMAL converted type int32_t decimal_precision = 0; // 9: save field_id from original schema - cuda::std::optional field_id; + std::optional field_id; // 10: replaces converted type - cuda::std::optional logical_type; + std::optional logical_type; // extra cudf specific fields bool output_as_byte_array = false; // cudf type determined from arrow:schema - cuda::std::optional arrow_type; + std::optional arrow_type; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -262,17 +260,17 @@ struct Statistics { // deprecated min value in signed comparison order std::optional> min; // count of null values in the column - cuda::std::optional null_count; + std::optional null_count; // count of distinct values occurring - cuda::std::optional distinct_count; + std::optional distinct_count; // max value for column determined by ColumnOrder std::optional> max_value; // min value for column determined by ColumnOrder std::optional> min_value; // If true, max_value is the actual maximum value for a column - cuda::std::optional is_max_value_exact; + std::optional is_max_value_exact; // If true, min_value is the actual minimum value for a column - cuda::std::optional is_min_value_exact; + std::optional is_min_value_exact; }; /** @@ -281,7 +279,7 @@ struct Statistics { struct SizeStatistics { // Number of variable-width bytes stored for the page/chunk. Should not be set for anything // but the BYTE_ARRAY physical type. - cuda::std::optional unencoded_byte_array_data_bytes; + std::optional unencoded_byte_array_data_bytes; /** * When present, there is expected to be one element corresponding to each * repetition (i.e. size=max repetition_level+1) where each element @@ -387,7 +385,7 @@ struct ColumnChunkMetaData { // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. - cuda::std::optional size_statistics; + std::optional size_statistics; }; /** @@ -431,11 +429,11 @@ struct RowGroup { // The sorting columns can be a subset of all the columns. std::optional> sorting_columns; // Byte offset from beginning of file to first page (data or dictionary) in this row group - cuda::std::optional file_offset; + std::optional file_offset; // Total byte size of all compressed (and potentially encrypted) column data in this row group - cuda::std::optional total_compressed_size; + std::optional total_compressed_size; // Row group ordinal in the file - cuda::std::optional ordinal; + std::optional ordinal; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a8ba3a969ce..3d31c9087ab 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -395,7 +395,7 @@ struct ColumnChunkDesc { uint8_t def_level_bits_, uint8_t rep_level_bits_, Compression codec_, - cuda::std::optional logical_type_, + std::optional logical_type_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, @@ -446,7 +446,7 @@ struct ColumnChunkDesc { void** column_data_base{}; // base pointers of column data void** column_string_base{}; // base pointers of column string data Compression codec{}; // compressed codec enum - cuda::std::optional logical_type{}; // logical type + std::optional logical_type{}; // logical type int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 35a1392cea0..f0a0bc0b51b 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -234,8 +234,8 @@ struct stats_caster { max.set_index(stats_idx, max_value, colchunk.meta_data.type); } else { // Marking it null, if column present in row group - min.set_index(stats_idx, cuda::std::nullopt, {}); - max.set_index(stats_idx, cuda::std::nullopt, {}); + min.set_index(stats_idx, std::nullopt, {}); + max.set_index(stats_idx, std::nullopt, {}); } stats_idx++; } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 1b69ccb7742..f0865c715bc 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -38,7 +38,7 @@ namespace { // be treated as a string. Currently the only logical type that has special handling is DECIMAL. // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which // for now would also be treated as a string). -inline bool is_treat_fixed_length_as_string(cuda::std::optional const& logical_type) +inline bool is_treat_fixed_length_as_string(std::optional const& logical_type) { if (!logical_type.has_value()) { return true; } return logical_type->type != LogicalType::DECIMAL; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index c588fedb85c..772be2909df 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -371,11 +371,11 @@ int64_t find_next_split(int64_t cur_pos, * * @return A tuple of Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple> conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - cuda::std::optional logical_type) + std::optional logical_type) { int32_t const clock_rate = is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0; @@ -386,7 +386,7 @@ int64_t find_next_split(int64_t cur_pos, // if decimal but not outputting as float or decimal, then convert to no logical type if (column_type_id != type_id::FLOAT64 and not cudf::is_fixed_point(data_type{column_type_id})) { - return std::make_tuple(clock_rate, cuda::std::nullopt); + return std::make_tuple(clock_rate, std::nullopt); } } diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 6d566b5815e..a6562d33de2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf; namespace { -cuda::std::optional converted_to_logical_type(SchemaElement const& schema) +std::optional converted_to_logical_type(SchemaElement const& schema) { if (schema.converted_type.has_value()) { switch (schema.converted_type.value()) { @@ -66,7 +66,7 @@ cuda::std::optional converted_to_logical_type(SchemaElement const& default: return LogicalType{LogicalType::UNDEFINED}; } } - return cuda::std::nullopt; + return std::nullopt; } } // namespace @@ -246,7 +246,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = cuda::std::nullopt; + struct_elem.converted_type = std::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 9859b36c7cc..190f13eb688 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -472,7 +472,7 @@ struct leaf_schema_fn { std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value @@ -750,7 +750,7 @@ std::vector construct_parquet_schema_tree( col_schema.type = Type::BYTE_ARRAY; } - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -2795,7 +2795,7 @@ std::unique_ptr> writer::merge_row_group_metadata( // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 for (auto& se : md.schema) { if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { - se.logical_type = cuda::std::nullopt; + se.logical_type = std::nullopt; } } From fcc51e4bb7381af4e31e1b2d24067676336ecf4e Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 8 Oct 2024 11:11:30 -0400 Subject: [PATCH 4/6] Correct style issues found by CI --- cpp/src/io/parquet/parquet.hpp | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 753a51112a7..2851ef67a65 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -327,7 +327,7 @@ struct ColumnIndex { std::vector> min_values; // lower bound for values in each page std::vector> max_values; // upper bound for values in each page BoundaryOrder boundary_order = - BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered + BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered std::optional> null_counts; // Optional count of null values per page // Repetition/definition level histograms for the column chunk std::optional> repetition_level_histogram; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 3d31c9087ab..4f6d41a97da 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -441,11 +441,11 @@ struct ColumnChunkDesc { int32_t num_data_pages{}; // number of data pages int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - Compression codec{}; // compressed codec enum + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + Compression codec{}; // compressed codec enum std::optional logical_type{}; // logical type int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) From a7faf5bba408dc67115681296a38b9a9a2e99776 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 8 Oct 2024 14:51:12 -0400 Subject: [PATCH 5/6] Refactor as requested in review --- cpp/src/io/parquet/reader_impl_chunking.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 772be2909df..27312a4da89 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -386,11 +386,11 @@ int64_t find_next_split(int64_t cur_pos, // if decimal but not outputting as float or decimal, then convert to no logical type if (column_type_id != type_id::FLOAT64 and not cudf::is_fixed_point(data_type{column_type_id})) { - return std::make_tuple(clock_rate, std::nullopt); + return {clock_rate, std::nullopt}; } } - return std::make_tuple(clock_rate, std::move(logical_type)); + return {clock_rate, std::move(logical_type)}; } /** From f603431ae0a814d16e5b2648cf8bf1ed7787df94 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 9 Oct 2024 11:20:52 -0400 Subject: [PATCH 6/6] Fix compile issues in tests --- cpp/tests/io/parquet_common.cpp | 2 +- cpp/tests/io/parquet_common.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index 6141a40bc95..a1b8677eac8 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2) int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype) + std::optional const& ctype) { auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index bd1579eaa1b..c90b81ed27a 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -172,7 +172,7 @@ std::pair create_parquet_typed_with_stats(std::string int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype); + std::optional const& ctype); void expect_compression_stats_empty(std::shared_ptr stats);