Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use std::optional for host types #17015

Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/compact_protocol_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,10 +308,10 @@ class parquet_field_struct : public parquet_field {
template <typename E, typename T>
class parquet_field_union_struct : public parquet_field {
E& enum_val;
cuda::std::optional<T>& val; // union structs are always wrapped in std::optional
std::optional<T>& val; // union structs are always wrapped in std::optional

public:
parquet_field_union_struct(int f, E& ev, cuda::std::optional<T>& v)
parquet_field_union_struct(int f, E& ev, std::optional<T>& v)
: parquet_field(f), enum_val(ev), val(v)
{
}
Expand Down Expand Up @@ -437,10 +437,10 @@ class parquet_field_struct_blob : public parquet_field {
*/
template <typename T, typename FieldFunctor>
class parquet_field_optional : public parquet_field {
cuda::std::optional<T>& val;
std::optional<T>& val;

public:
parquet_field_optional(int f, cuda::std::optional<T>& v) : parquet_field(f), val(v) {}
parquet_field_optional(int f, std::optional<T>& v) : parquet_field(f), val(v) {}

inline void operator()(CompactProtocolReader* cpr, int field_type)
{
Expand Down
64 changes: 31 additions & 33 deletions cpp/src/io/parquet/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@

#include <cudf/types.hpp>

#include <cuda/std/optional>

#include <cstdint>
#include <optional>
#include <string>
Expand Down Expand Up @@ -94,10 +92,10 @@ struct LogicalType {
BSON
};
Type type;
cuda::std::optional<DecimalType> decimal_type;
cuda::std::optional<TimeType> time_type;
cuda::std::optional<TimestampType> timestamp_type;
cuda::std::optional<IntType> int_type;
std::optional<DecimalType> decimal_type;
std::optional<TimeType> time_type;
std::optional<TimestampType> timestamp_type;
std::optional<IntType> int_type;

LogicalType(Type tp = UNDEFINED) : type(tp) {}
LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {}
Expand Down Expand Up @@ -178,21 +176,21 @@ struct SchemaElement {
// 5: nested fields
int32_t num_children = 0;
// 6: DEPRECATED: record the original type before conversion to parquet type
cuda::std::optional<ConvertedType> converted_type;
std::optional<ConvertedType> converted_type;
// 7: DEPRECATED: record the scale for DECIMAL converted type
int32_t decimal_scale = 0;
// 8: DEPRECATED: record the precision for DECIMAL converted type
int32_t decimal_precision = 0;
// 9: save field_id from original schema
cuda::std::optional<int32_t> field_id;
std::optional<int32_t> field_id;
// 10: replaces converted type
cuda::std::optional<LogicalType> logical_type;
std::optional<LogicalType> logical_type;

// extra cudf specific fields
bool output_as_byte_array = false;

// cudf type determined from arrow:schema
cuda::std::optional<type_id> arrow_type;
std::optional<type_id> arrow_type;

// The following fields are filled in later during schema initialization
int max_definition_level = 0;
Expand Down Expand Up @@ -258,21 +256,21 @@ struct SchemaElement {
*/
struct Statistics {
// deprecated max value in signed comparison order
cuda::std::optional<std::vector<uint8_t>> max;
std::optional<std::vector<uint8_t>> max;
// deprecated min value in signed comparison order
cuda::std::optional<std::vector<uint8_t>> min;
std::optional<std::vector<uint8_t>> min;
// count of null values in the column
cuda::std::optional<int64_t> null_count;
std::optional<int64_t> null_count;
// count of distinct values occurring
cuda::std::optional<int64_t> distinct_count;
std::optional<int64_t> distinct_count;
// max value for column determined by ColumnOrder
cuda::std::optional<std::vector<uint8_t>> max_value;
std::optional<std::vector<uint8_t>> max_value;
// min value for column determined by ColumnOrder
cuda::std::optional<std::vector<uint8_t>> min_value;
std::optional<std::vector<uint8_t>> min_value;
// If true, max_value is the actual maximum value for a column
cuda::std::optional<bool> is_max_value_exact;
std::optional<bool> is_max_value_exact;
// If true, min_value is the actual minimum value for a column
cuda::std::optional<bool> is_min_value_exact;
std::optional<bool> is_min_value_exact;
};

/**
Expand All @@ -281,7 +279,7 @@ struct Statistics {
struct SizeStatistics {
// Number of variable-width bytes stored for the page/chunk. Should not be set for anything
// but the BYTE_ARRAY physical type.
cuda::std::optional<int64_t> unencoded_byte_array_data_bytes;
std::optional<int64_t> unencoded_byte_array_data_bytes;
/**
* When present, there is expected to be one element corresponding to each
* repetition (i.e. size=max repetition_level+1) where each element
Expand All @@ -290,14 +288,14 @@ struct SizeStatistics {
*
* This value should not be written if max_repetition_level is 0.
*/
cuda::std::optional<std::vector<int64_t>> repetition_level_histogram;
std::optional<std::vector<int64_t>> repetition_level_histogram;

/**
* Same as repetition_level_histogram except for definition levels.
*
* This value should not be written if max_definition_level is 0 or 1.
*/
cuda::std::optional<std::vector<int64_t>> definition_level_histogram;
std::optional<std::vector<int64_t>> definition_level_histogram;
};

/**
Expand All @@ -318,7 +316,7 @@ struct OffsetIndex {
std::vector<PageLocation> page_locations;
// per-page size info. see description of the same field in SizeStatistics. only present for
// columns with a BYTE_ARRAY physical type.
cuda::std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
};

/**
Expand All @@ -329,11 +327,11 @@ struct ColumnIndex {
std::vector<std::vector<uint8_t>> min_values; // lower bound for values in each page
std::vector<std::vector<uint8_t>> max_values; // upper bound for values in each page
BoundaryOrder boundary_order =
BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered
cuda::std::optional<std::vector<int64_t>> null_counts; // Optional count of null values per page
BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered
std::optional<std::vector<int64_t>> null_counts; // Optional count of null values per page
// Repetition/definition level histograms for the column chunk
cuda::std::optional<std::vector<int64_t>> repetition_level_histogram;
cuda::std::optional<std::vector<int64_t>> definition_level_histogram;
std::optional<std::vector<int64_t>> repetition_level_histogram;
std::optional<std::vector<int64_t>> definition_level_histogram;
};

/**
Expand Down Expand Up @@ -383,11 +381,11 @@ struct ColumnChunkMetaData {
Statistics statistics;
// Set of all encodings used for pages in this column chunk. This information can be used to
// determine if all data pages are dictionary encoded for example.
cuda::std::optional<std::vector<PageEncodingStats>> encoding_stats;
std::optional<std::vector<PageEncodingStats>> encoding_stats;
// Optional statistics to help estimate total memory when converted to in-memory representations.
// The histograms contained in these statistics can also be useful in some cases for more
// fine-grained nullability/list length filter pushdown.
cuda::std::optional<SizeStatistics> size_statistics;
std::optional<SizeStatistics> size_statistics;
};

/**
Expand Down Expand Up @@ -429,13 +427,13 @@ struct RowGroup {
int64_t num_rows = 0;
// If set, specifies a sort ordering of the rows in this RowGroup.
// The sorting columns can be a subset of all the columns.
cuda::std::optional<std::vector<SortingColumn>> sorting_columns;
std::optional<std::vector<SortingColumn>> sorting_columns;
// Byte offset from beginning of file to first page (data or dictionary) in this row group
cuda::std::optional<int64_t> file_offset;
std::optional<int64_t> file_offset;
// Total byte size of all compressed (and potentially encrypted) column data in this row group
cuda::std::optional<int64_t> total_compressed_size;
std::optional<int64_t> total_compressed_size;
// Row group ordinal in the file
cuda::std::optional<int16_t> ordinal;
std::optional<int16_t> ordinal;
};

/**
Expand All @@ -460,7 +458,7 @@ struct FileMetaData {
std::vector<RowGroup> row_groups;
std::vector<KeyValue> key_value_metadata;
std::string created_by = "";
cuda::std::optional<std::vector<ColumnOrder>> column_orders;
std::optional<std::vector<ColumnOrder>> column_orders;
};

/**
Expand Down
14 changes: 7 additions & 7 deletions cpp/src/io/parquet/parquet_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ struct ColumnChunkDesc {
uint8_t def_level_bits_,
uint8_t rep_level_bits_,
Compression codec_,
cuda::std::optional<LogicalType> logical_type_,
std::optional<LogicalType> logical_type_,
int32_t ts_clock_rate_,
int32_t src_col_index_,
int32_t src_col_schema_,
Expand Down Expand Up @@ -441,12 +441,12 @@ struct ColumnChunkDesc {
int32_t num_data_pages{}; // number of data pages
int32_t num_dict_pages{}; // number of dictionary pages
PageInfo const* dict_page{};
string_index_pair* str_dict_index{}; // index for string dictionary
bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column
void** column_data_base{}; // base pointers of column data
void** column_string_base{}; // base pointers of column string data
Compression codec{}; // compressed codec enum
cuda::std::optional<LogicalType> logical_type{}; // logical type
string_index_pair* str_dict_index{}; // index for string dictionary
bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column
void** column_data_base{}; // base pointers of column data
void** column_string_base{}; // base pointers of column string data
Compression codec{}; // compressed codec enum
std::optional<LogicalType> logical_type{}; // logical type
int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)

int32_t src_col_index{}; // my input column index
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/parquet/predicate_pushdown.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ struct stats_caster {
}

void set_index(size_type index,
cuda::std::optional<std::vector<uint8_t>> const& binary_value,
std::optional<std::vector<uint8_t>> const& binary_value,
Type const type)
{
if (binary_value.has_value()) {
Expand Down Expand Up @@ -234,8 +234,8 @@ struct stats_caster {
max.set_index(stats_idx, max_value, colchunk.meta_data.type);
} else {
// Marking it null, if column present in row group
min.set_index(stats_idx, cuda::std::nullopt, {});
max.set_index(stats_idx, cuda::std::nullopt, {});
min.set_index(stats_idx, std::nullopt, {});
max.set_index(stats_idx, std::nullopt, {});
}
stats_idx++;
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/reader_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ namespace {
// be treated as a string. Currently the only logical type that has special handling is DECIMAL.
// Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which
// for now would also be treated as a string).
inline bool is_treat_fixed_length_as_string(cuda::std::optional<LogicalType> const& logical_type)
inline bool is_treat_fixed_length_as_string(std::optional<LogicalType> const& logical_type)
{
if (!logical_type.has_value()) { return true; }
return logical_type->type != LogicalType::DECIMAL;
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/reader_impl_chunking.cu
Original file line number Diff line number Diff line change
Expand Up @@ -371,11 +371,11 @@ int64_t find_next_split(int64_t cur_pos,
*
* @return A tuple of Parquet clock rate and Parquet decimal type.
*/
[[nodiscard]] std::tuple<int32_t, cuda::std::optional<LogicalType>> conversion_info(
[[nodiscard]] std::tuple<int32_t, std::optional<LogicalType>> conversion_info(
type_id column_type_id,
type_id timestamp_type_id,
Type physical,
cuda::std::optional<LogicalType> logical_type)
std::optional<LogicalType> logical_type)
{
int32_t const clock_rate =
is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0;
Expand All @@ -386,11 +386,11 @@ int64_t find_next_split(int64_t cur_pos,
// if decimal but not outputting as float or decimal, then convert to no logical type
if (column_type_id != type_id::FLOAT64 and
not cudf::is_fixed_point(data_type{column_type_id})) {
return std::make_tuple(clock_rate, cuda::std::nullopt);
return {clock_rate, std::nullopt};
}
}

return std::make_tuple(clock_rate, std::move(logical_type));
return {clock_rate, std::move(logical_type)};
}

/**
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/parquet/reader_impl_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf;

namespace {

cuda::std::optional<LogicalType> converted_to_logical_type(SchemaElement const& schema)
std::optional<LogicalType> converted_to_logical_type(SchemaElement const& schema)
{
if (schema.converted_type.has_value()) {
switch (schema.converted_type.value()) {
Expand Down Expand Up @@ -66,7 +66,7 @@ cuda::std::optional<LogicalType> converted_to_logical_type(SchemaElement const&
default: return LogicalType{LogicalType::UNDEFINED};
}
}
return cuda::std::nullopt;
return std::nullopt;
}

} // namespace
Expand Down Expand Up @@ -246,7 +246,7 @@ void metadata::sanitize_schema()
struct_elem.repetition_type = REQUIRED;
struct_elem.num_children = schema_elem.num_children;
struct_elem.type = UNDEFINED_TYPE;
struct_elem.converted_type = cuda::std::nullopt;
struct_elem.converted_type = std::nullopt;

// swap children
struct_elem.children_idx = std::move(schema_elem.children_idx);
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ struct aggregate_writer_metadata {
std::vector<std::vector<uint8_t>> column_indexes;
};
std::vector<per_file_metadata> files;
cuda::std::optional<std::vector<ColumnOrder>> column_orders = cuda::std::nullopt;
std::optional<std::vector<ColumnOrder>> column_orders = std::nullopt;
};

namespace {
Expand Down Expand Up @@ -472,7 +472,7 @@ struct leaf_schema_fn {
std::enable_if_t<std::is_same_v<T, cudf::timestamp_ns>, void> operator()()
{
col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
col_schema.converted_type = cuda::std::nullopt;
col_schema.converted_type = std::nullopt;
col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
if (timestamp_is_int96) {
col_schema.ts_scale = -1000; // negative value indicates division by absolute value
Expand Down Expand Up @@ -750,7 +750,7 @@ std::vector<schema_tree_node> construct_parquet_schema_tree(
col_schema.type = Type::BYTE_ARRAY;
}

col_schema.converted_type = cuda::std::nullopt;
col_schema.converted_type = std::nullopt;
col_schema.stats_dtype = statistics_dtype::dtype_byte_array;
col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED;
col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
Expand Down Expand Up @@ -2795,7 +2795,7 @@ std::unique_ptr<std::vector<uint8_t>> writer::merge_row_group_metadata(
// See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615
for (auto& se : md.schema) {
if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) {
se.logical_type = cuda::std::nullopt;
se.logical_type = std::nullopt;
}
}

Expand Down
Loading