From 4e75bbb0bc3ebcadc1102eb6261d3d4e8d059397 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 15 Dec 2023 15:12:07 -0800 Subject: [PATCH 1/7] use enum class --- .../io/parquet/compact_protocol_reader.cpp | 107 ++++++++++-------- .../io/parquet/compact_protocol_writer.cpp | 41 +++---- .../io/parquet/compact_protocol_writer.hpp | 9 +- cpp/src/io/parquet/page_enc.cu | 47 ++++---- cpp/src/io/parquet/page_hdr.cu | 32 +++--- cpp/src/io/parquet/parquet_common.hpp | 27 ++--- 6 files changed, 146 insertions(+), 117 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 55848802f12..a448921e763 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -45,28 +45,36 @@ class parquet_field { std::string field_type_string(FieldType type) { switch (type) { - case ST_FLD_TRUE: return "bool(true)"; - case ST_FLD_FALSE: return "bool(false)"; - case ST_FLD_BYTE: return "int8"; - case ST_FLD_I16: return "int16"; - case ST_FLD_I32: return "int32"; - case ST_FLD_I64: return "int64"; - case ST_FLD_DOUBLE: return "double"; - case ST_FLD_BINARY: return "binary"; - case ST_FLD_STRUCT: return "struct"; - case ST_FLD_LIST: return "list"; - case ST_FLD_SET: return "set"; - default: return "unknown(" + std::to_string(type) + ")"; + case FieldType::BOOLEAN_TRUE: return "bool(true)"; + case FieldType::BOOLEAN_FALSE: return "bool(false)"; + case FieldType::BYTE: return "int8"; + case FieldType::I16: return "int16"; + case FieldType::I32: return "int32"; + case FieldType::I64: return "int64"; + case FieldType::DOUBLE: return "double"; + case FieldType::BINARY: return "binary"; + case FieldType::STRUCT: return "struct"; + case FieldType::LIST: return "list"; + case FieldType::SET: return "set"; + default: return "unknown(" + std::to_string(static_cast(type)) + ")"; } } void assert_field_type(int type, FieldType expected) { - CUDF_EXPECTS(type == expected, + CUDF_EXPECTS(type == static_cast(expected), "expected " + field_type_string(expected) + " field, got " + field_type_string(static_cast(type)) + " field instead"); } +void assert_bool_field_type(int type) +{ + CUDF_EXPECTS(type == static_cast(FieldType::BOOLEAN_TRUE) || + type == static_cast(FieldType::BOOLEAN_FALSE), + "expected bool field, got " + field_type_string(static_cast(type)) + + " field instead"); +} + /** * @brief Abstract base class for list functors. */ @@ -86,7 +94,7 @@ class parquet_field_list : public parquet_field { public: inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_LIST); + assert_field_type(field_type, FieldType::LIST); auto const [t, n] = cpr->get_listh(); assert_field_type(t, EXPECTED_ELEM_TYPE); val.resize(n); @@ -111,8 +119,8 @@ class parquet_field_bool : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_TRUE || field_type == ST_FLD_FALSE, "expected bool field"); - val = field_type == ST_FLD_TRUE; + assert_bool_field_type(field_type); + val = field_type == static_cast(FieldType::BOOLEAN_TRUE); } }; @@ -122,14 +130,13 @@ class parquet_field_bool : public parquet_field { * @return True if field types mismatch or if the process of reading a * bool fails */ -struct parquet_field_bool_list : public parquet_field_list { +struct parquet_field_bool_list : public parquet_field_list { parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const current_byte = cpr->getb(); - CUDF_EXPECTS(current_byte == ST_FLD_TRUE || current_byte == ST_FLD_FALSE, - "expected bool field"); - this->val[i] = current_byte == ST_FLD_TRUE; + assert_bool_field_type(current_byte); + this->val[i] = current_byte == static_cast(FieldType::BOOLEAN_TRUE); }; bind_read_func(read_value); } @@ -162,9 +169,9 @@ class parquet_field_int : public parquet_field { } }; -using parquet_field_int8 = parquet_field_int; -using parquet_field_int32 = parquet_field_int; -using parquet_field_int64 = parquet_field_int; +using parquet_field_int8 = parquet_field_int; +using parquet_field_int32 = parquet_field_int; +using parquet_field_int64 = parquet_field_int; /** * @brief Functor to read a vector of integers from CompactProtocolReader @@ -183,7 +190,7 @@ struct parquet_field_int_list : public parquet_field_list { } }; -using parquet_field_int64_list = parquet_field_int_list; +using parquet_field_int64_list = parquet_field_int_list; /** * @brief Functor to read a string from CompactProtocolReader @@ -199,7 +206,7 @@ class parquet_field_string : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_BINARY); + assert_field_type(field_type, FieldType::BINARY); auto const n = cpr->get_u32(); CUDF_EXPECTS(n < static_cast(cpr->m_end - cpr->m_cur), "string length mismatch"); @@ -214,7 +221,7 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list { +struct parquet_field_string_list : public parquet_field_list { parquet_field_string_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { @@ -241,7 +248,7 @@ class parquet_field_enum : public parquet_field { parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_I32); + assert_field_type(field_type, FieldType::I32); val = static_cast(cpr->get_i32()); } }; @@ -253,8 +260,9 @@ class parquet_field_enum : public parquet_field { * enum fails */ template -struct parquet_field_enum_list : public parquet_field_list { - parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v) +struct parquet_field_enum_list : public parquet_field_list { + parquet_field_enum_list(int f, std::vector& v) + : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { this->val[i] = static_cast(cpr->get_i32()); @@ -278,7 +286,7 @@ class parquet_field_struct : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_STRUCT); + assert_field_type(field_type, FieldType::STRUCT); cpr->read(&val); } }; @@ -324,7 +332,7 @@ class parquet_field_union_enumerator : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_STRUCT); + assert_field_type(field_type, FieldType::STRUCT); cpr->skip_struct_field(field_type); val = static_cast(field()); } @@ -337,8 +345,9 @@ class parquet_field_union_enumerator : public parquet_field { * struct fails */ template -struct parquet_field_struct_list : public parquet_field_list { - parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v) +struct parquet_field_struct_list : public parquet_field_list { + parquet_field_struct_list(int f, std::vector& v) + : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { cpr->read(&this->val[i]); @@ -361,7 +370,7 @@ class parquet_field_binary : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_BINARY); + assert_field_type(field_type, FieldType::BINARY); auto const n = cpr->get_u32(); CUDF_EXPECTS(n <= static_cast(cpr->m_end - cpr->m_cur), "binary length mismatch"); @@ -377,7 +386,8 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list : public parquet_field_list, ST_FLD_BINARY> { +struct parquet_field_binary_list + : public parquet_field_list, FieldType::BINARY> { parquet_field_binary_list(int f, std::vector>& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { @@ -404,7 +414,7 @@ class parquet_field_struct_blob : public parquet_field { parquet_field_struct_blob(int f, std::vector& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_STRUCT); + assert_field_type(field_type, FieldType::STRUCT); uint8_t const* const start = cpr->m_cur; cpr->skip_struct_field(field_type); if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } @@ -439,24 +449,25 @@ class parquet_field_optional : public parquet_field { */ void CompactProtocolReader::skip_struct_field(int t, int depth) { - switch (t) { - case ST_FLD_TRUE: - case ST_FLD_FALSE: break; - case ST_FLD_I16: - case ST_FLD_I32: - case ST_FLD_I64: get_u64(); break; - case ST_FLD_BYTE: skip_bytes(1); break; - case ST_FLD_DOUBLE: skip_bytes(8); break; - case ST_FLD_BINARY: skip_bytes(get_u32()); break; - case ST_FLD_LIST: [[fallthrough]]; - case ST_FLD_SET: { + auto const t_enum = static_cast(t); + switch (t_enum) { + case FieldType::BOOLEAN_TRUE: + case FieldType::BOOLEAN_FALSE: break; + case FieldType::I16: + case FieldType::I32: + case FieldType::I64: get_u64(); break; + case FieldType::BYTE: skip_bytes(1); break; + case FieldType::DOUBLE: skip_bytes(8); break; + case FieldType::BINARY: skip_bytes(get_u32()); break; + case FieldType::LIST: [[fallthrough]]; + case FieldType::SET: { auto const [t, n] = get_listh(); CUDF_EXPECTS(depth <= 10, "struct nesting too deep"); for (uint32_t i = 0; i < n; i++) { skip_struct_field(t, depth + 1); } } break; - case ST_FLD_STRUCT: + case FieldType::STRUCT: for (;;) { int const c = getb(); t = c & 0xf; diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index f857b75f707..80778462db2 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -268,39 +268,40 @@ uint32_t CompactProtocolFieldWriter::put_int(int64_t v) return put_uint(((v ^ -s) << 1) + s); } -void CompactProtocolFieldWriter::put_field_header(int f, int cur, int t) +void CompactProtocolFieldWriter::put_field_header(int f, int cur, FieldType t) { if (f > cur && f <= cur + 15) - put_byte(((f - cur) << 4) | t); + put_packed_type_byte(f - cur, t); else { - put_byte(t); + put_byte(static_cast(t)); put_int(f); } } inline void CompactProtocolFieldWriter::field_bool(int field, bool b) { - put_field_header(field, current_field_value, b ? ST_FLD_TRUE : ST_FLD_FALSE); + put_field_header( + field, current_field_value, b ? FieldType::BOOLEAN_TRUE : FieldType::BOOLEAN_FALSE); current_field_value = field; } inline void CompactProtocolFieldWriter::field_int8(int field, int8_t val) { - put_field_header(field, current_field_value, ST_FLD_BYTE); + put_field_header(field, current_field_value, FieldType::BYTE); put_byte(val); current_field_value = field; } inline void CompactProtocolFieldWriter::field_int(int field, int32_t val) { - put_field_header(field, current_field_value, ST_FLD_I32); + put_field_header(field, current_field_value, FieldType::I32); put_int(val); current_field_value = field; } inline void CompactProtocolFieldWriter::field_int(int field, int64_t val) { - put_field_header(field, current_field_value, ST_FLD_I64); + put_field_header(field, current_field_value, FieldType::I64); put_int(val); current_field_value = field; } @@ -309,8 +310,8 @@ template <> inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte(static_cast((std::min(val.size(), 0xfUL) << 4) | ST_FLD_I64)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::I64); if (val.size() >= 0xfUL) { put_uint(val.size()); } for (auto const v : val) { put_int(v); @@ -321,8 +322,8 @@ inline void CompactProtocolFieldWriter::field_int_list(int field, template inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte(static_cast((std::min(val.size(), 0xfUL) << 4) | ST_FLD_I32)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::I32); if (val.size() >= 0xfUL) { put_uint(val.size()); } for (auto const& v : val) { put_int(static_cast(v)); @@ -333,7 +334,7 @@ inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) { - put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_field_header(field, current_field_value, FieldType::STRUCT); if constexpr (not std::is_empty_v) { writer.write(val); // write the struct if it's not empty } else { @@ -344,7 +345,7 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) inline void CompactProtocolFieldWriter::field_empty_struct(int field) { - put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_field_header(field, current_field_value, FieldType::STRUCT); put_byte(0); // add a stop field current_field_value = field; } @@ -352,8 +353,8 @@ inline void CompactProtocolFieldWriter::field_empty_struct(int field) template inline void CompactProtocolFieldWriter::field_struct_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_STRUCT)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::STRUCT); if (val.size() >= 0xf) put_uint(val.size()); for (auto& v : val) { writer.write(v); @@ -370,7 +371,7 @@ inline size_t CompactProtocolFieldWriter::value() inline void CompactProtocolFieldWriter::field_struct_blob(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_field_header(field, current_field_value, FieldType::STRUCT); put_byte(val.data(), static_cast(val.size())); put_byte(0); current_field_value = field; @@ -378,7 +379,7 @@ inline void CompactProtocolFieldWriter::field_struct_blob(int field, inline void CompactProtocolFieldWriter::field_binary(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_BINARY); + put_field_header(field, current_field_value, FieldType::BINARY); put_uint(val.size()); put_byte(val.data(), static_cast(val.size())); current_field_value = field; @@ -386,7 +387,7 @@ inline void CompactProtocolFieldWriter::field_binary(int field, std::vector(val.data()), static_cast(val.size())); @@ -396,8 +397,8 @@ inline void CompactProtocolFieldWriter::field_string(int field, std::string cons inline void CompactProtocolFieldWriter::field_string_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_BINARY)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::BINARY); if (val.size() >= 0xf) put_uint(val.size()); for (auto& v : val) { put_uint(v.size()); diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index a2ed0f1f4dc..dbfb1a191ba 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -78,7 +78,14 @@ class CompactProtocolFieldWriter { uint32_t put_int(int64_t v); - void put_field_header(int f, int cur, int t); + template + void put_packed_type_byte(T high_bits, FieldType t) + { + uint8_t const clamped_high_bits = std::min(std::max(high_bits, t{0}), T{0xf}); + put_byte((clamped_high_bits << 4) | static_cast(t)); + } + + void put_field_header(int f, int cur, FieldType t); inline void field_bool(int field, bool b); diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 7b0eabdcfd4..6a0cac64be9 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -2063,13 +2063,14 @@ inline __device__ uint8_t* cpw_put_int64(uint8_t* p, int64_t v) return cpw_put_uint64(p, (v ^ -s) * 2 + s); } -inline __device__ uint8_t* cpw_put_fldh(uint8_t* p, int f, int cur, int t) +inline __device__ uint8_t* cpw_put_fldh(uint8_t* p, int f, int cur, FieldType t) { + auto const t_num = static_cast(t); if (f > cur && f <= cur + 15) { - *p++ = ((f - cur) << 4) | t; + *p++ = ((f - cur) << 4) | t_num; return p; } else { - *p++ = t; + *p++ = t_num; return cpw_put_int32(p, f); } } @@ -2087,7 +2088,7 @@ class header_encoder { inline __device__ void field_struct_begin(int field) { current_header_ptr = - cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_STRUCT); + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::STRUCT); current_field_index = 0; } @@ -2097,11 +2098,13 @@ class header_encoder { current_field_index = field; } - inline __device__ void field_list_begin(int field, size_t len, int type) + inline __device__ void field_list_begin(int field, size_t len, FieldType type) { - current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_LIST); + current_header_ptr = + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::LIST); + auto const t_num = static_cast(type); current_header_ptr = cpw_put_uint8( - current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | type)); + current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | t_num)); if (len >= 0xf) { current_header_ptr = cpw_put_uint32(current_header_ptr, len); } current_field_index = 0; } @@ -2110,7 +2113,9 @@ class header_encoder { inline __device__ void put_bool(bool value) { - current_header_ptr = cpw_put_uint8(current_header_ptr, value ? ST_FLD_TRUE : ST_FLD_FALSE); + auto const type_byte = + static_cast(value ? FieldType::BOOLEAN_TRUE : FieldType::BOOLEAN_FALSE); + current_header_ptr = cpw_put_uint8(current_header_ptr, type_byte); } inline __device__ void put_binary(void const* value, uint32_t length) @@ -2128,15 +2133,18 @@ class header_encoder { inline __device__ void field_bool(int field, bool value) { - current_header_ptr = cpw_put_fldh( - current_header_ptr, field, current_field_index, value ? ST_FLD_TRUE : ST_FLD_FALSE); + current_header_ptr = cpw_put_fldh(current_header_ptr, + field, + current_field_index, + value ? FieldType::BOOLEAN_TRUE : FieldType::BOOLEAN_FALSE); current_field_index = field; } template inline __device__ void field_int32(int field, T value) { - current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_I32); + current_header_ptr = + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::I32); current_header_ptr = cpw_put_int32(current_header_ptr, static_cast(value)); current_field_index = field; } @@ -2144,7 +2152,8 @@ class header_encoder { template inline __device__ void field_int64(int field, T value) { - current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_I64); + current_header_ptr = + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::I64); current_header_ptr = cpw_put_int64(current_header_ptr, static_cast(value)); current_field_index = field; } @@ -2152,7 +2161,7 @@ class header_encoder { inline __device__ void field_binary(int field, void const* value, uint32_t length) { current_header_ptr = - cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_BINARY); + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::BINARY); current_header_ptr = cpw_put_uint32(current_header_ptr, length); memcpy(current_header_ptr, value, length); current_header_ptr += length; @@ -2724,13 +2733,13 @@ __global__ void __launch_bounds__(1) : align8(ck_g->column_index_blob + ck_g->column_index_size - column_index_truncate_length); // null_pages - encoder.field_list_begin(1, num_data_pages, ST_FLD_TRUE); + encoder.field_list_begin(1, num_data_pages, FieldType::BOOLEAN_TRUE); for (uint32_t page = first_data_page; page < num_pages; page++) { encoder.put_bool(column_stats[pageidx + page].non_nulls == 0); } encoder.field_list_end(1); // min_values - encoder.field_list_begin(2, num_data_pages, ST_FLD_BINARY); + encoder.field_list_begin(2, num_data_pages, FieldType::BINARY); for (uint32_t page = first_data_page; page < num_pages; page++) { auto const [min_ptr, min_size] = get_extremum(&column_stats[pageidx + page].min_value, col_g.stats_dtype, @@ -2741,7 +2750,7 @@ __global__ void __launch_bounds__(1) } encoder.field_list_end(2); // max_values - encoder.field_list_begin(3, num_data_pages, ST_FLD_BINARY); + encoder.field_list_begin(3, num_data_pages, FieldType::BINARY); for (uint32_t page = first_data_page; page < num_pages; page++) { auto const [max_ptr, max_size] = get_extremum(&column_stats[pageidx + page].max_value, col_g.stats_dtype, @@ -2758,7 +2767,7 @@ __global__ void __launch_bounds__(1) col_g.converted_type, num_pages - first_data_page)); // null_counts - encoder.field_list_begin(5, num_data_pages, ST_FLD_I64); + encoder.field_list_begin(5, num_data_pages, FieldType::I64); for (uint32_t page = first_data_page; page < num_pages; page++) { encoder.put_int64(column_stats[pageidx + page].null_count); } @@ -2774,7 +2783,7 @@ __global__ void __launch_bounds__(1) // optionally encode histograms and sum var_bytes. if (cd->max_rep_level > REP_LVL_HIST_CUTOFF) { - encoder.field_list_begin(6, num_data_pages * (cd->max_rep_level + 1), ST_FLD_I64); + encoder.field_list_begin(6, num_data_pages * (cd->max_rep_level + 1), FieldType::I64); thrust::for_each(thrust::seq, page_start, page_end, [&] __device__(auto const& page) { for (int i = 0; i < cd->max_rep_level + 1; i++) { encoder.put_int64(page.rep_histogram[i]); @@ -2785,7 +2794,7 @@ __global__ void __launch_bounds__(1) } if (cd->max_def_level > DEF_LVL_HIST_CUTOFF) { - encoder.field_list_begin(7, num_data_pages * (cd->max_def_level + 1), ST_FLD_I64); + encoder.field_list_begin(7, num_data_pages * (cd->max_def_level + 1), FieldType::I64); thrust::for_each(thrust::seq, page_start, page_end, [&] __device__(auto const& page) { for (int i = 0; i < cd->max_def_level + 1; i++) { encoder.put_int64(page.def_histogram[i]); diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 114e47aa507..17cb1347435 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -114,28 +114,28 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) field_type = c & 0xf; if (!(c & 0xf0)) get_i32(bs); } - switch (field_type) { - case ST_FLD_TRUE: - case ST_FLD_FALSE: break; - case ST_FLD_I16: - case ST_FLD_I32: - case ST_FLD_I64: get_u32(bs); break; - case ST_FLD_BYTE: skip_bytes(bs, 1); break; - case ST_FLD_DOUBLE: skip_bytes(bs, 8); break; - case ST_FLD_BINARY: skip_bytes(bs, get_u32(bs)); break; - case ST_FLD_LIST: - case ST_FLD_SET: { // NOTE: skipping a list of lists is not handled + switch (static_cast(field_type)) { + case FieldType::BOOLEAN_TRUE: + case FieldType::BOOLEAN_FALSE: break; + case FieldType::I16: + case FieldType::I32: + case FieldType::I64: get_u32(bs); break; + case FieldType::BYTE: skip_bytes(bs, 1); break; + case FieldType::DOUBLE: skip_bytes(bs, 8); break; + case FieldType::BINARY: skip_bytes(bs, get_u32(bs)); break; + case FieldType::LIST: + case FieldType::SET: { // NOTE: skipping a list of lists is not handled auto const c = getb(bs); int n = c >> 4; if (n == 0xf) { n = get_u32(bs); } field_type = c & 0xf; - if (field_type == ST_FLD_STRUCT) { + if (static_cast(field_type) == FieldType::STRUCT) { struct_depth += n; } else { rep_cnt = n; } } break; - case ST_FLD_STRUCT: struct_depth++; break; + case FieldType::STRUCT: struct_depth++; break; } } while (rep_cnt || struct_depth); } @@ -178,7 +178,7 @@ struct ParquetFieldInt32 { inline __device__ bool operator()(byte_stream_s* bs, int field_type) { val = get_i32(bs); - return (field_type != ST_FLD_I32); + return (static_cast(field_type) != FieldType::I32); } }; @@ -197,7 +197,7 @@ struct ParquetFieldEnum { inline __device__ bool operator()(byte_stream_s* bs, int field_type) { val = static_cast(get_i32(bs)); - return (field_type != ST_FLD_I32); + return (static_cast(field_type) != FieldType::I32); } }; @@ -216,7 +216,7 @@ struct ParquetFieldStruct { inline __device__ bool operator()(byte_stream_s* bs, int field_type) { - return ((field_type != ST_FLD_STRUCT) || !op(bs)); + return ((static_cast(field_type) != FieldType::STRUCT) || !op(bs)); } }; diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 50736197eb9..8732de7efc9 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -140,19 +140,20 @@ enum BoundaryOrder { /** * @brief Thrift compact protocol struct field types */ -enum FieldType { - ST_FLD_TRUE = 1, - ST_FLD_FALSE = 2, - ST_FLD_BYTE = 3, - ST_FLD_I16 = 4, - ST_FLD_I32 = 5, - ST_FLD_I64 = 6, - ST_FLD_DOUBLE = 7, - ST_FLD_BINARY = 8, - ST_FLD_LIST = 9, - ST_FLD_SET = 10, - ST_FLD_MAP = 11, - ST_FLD_STRUCT = 12, +enum class FieldType : uint8_t { + BOOLEAN_TRUE = 1, + BOOLEAN_FALSE = 2, + BYTE = 3, + I16 = 4, + I32 = 5, + I64 = 6, + DOUBLE = 7, + BINARY = 8, + LIST = 9, + SET = 10, + MAP = 11, + STRUCT = 12, + UUID = 13, }; } // namespace cudf::io::parquet::detail From b09d49e8ce74d5ea04a2c1d4ca71c11f1c9be2e8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 18 Dec 2023 09:44:04 -0800 Subject: [PATCH 2/7] typo + I8 rename --- cpp/src/io/parquet/compact_protocol_reader.cpp | 6 +++--- cpp/src/io/parquet/compact_protocol_writer.cpp | 2 +- cpp/src/io/parquet/compact_protocol_writer.hpp | 2 +- cpp/src/io/parquet/page_hdr.cu | 2 +- cpp/src/io/parquet/parquet_common.hpp | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index a448921e763..c765be83a42 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -47,7 +47,7 @@ std::string field_type_string(FieldType type) switch (type) { case FieldType::BOOLEAN_TRUE: return "bool(true)"; case FieldType::BOOLEAN_FALSE: return "bool(false)"; - case FieldType::BYTE: return "int8"; + case FieldType::I8: return "int8"; case FieldType::I16: return "int16"; case FieldType::I32: return "int32"; case FieldType::I64: return "int64"; @@ -169,7 +169,7 @@ class parquet_field_int : public parquet_field { } }; -using parquet_field_int8 = parquet_field_int; +using parquet_field_int8 = parquet_field_int; using parquet_field_int32 = parquet_field_int; using parquet_field_int64 = parquet_field_int; @@ -456,7 +456,7 @@ void CompactProtocolReader::skip_struct_field(int t, int depth) case FieldType::I16: case FieldType::I32: case FieldType::I64: get_u64(); break; - case FieldType::BYTE: skip_bytes(1); break; + case FieldType::I8: skip_bytes(1); break; case FieldType::DOUBLE: skip_bytes(8); break; case FieldType::BINARY: skip_bytes(get_u32()); break; case FieldType::LIST: [[fallthrough]]; diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 80778462db2..730e175dcab 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -287,7 +287,7 @@ inline void CompactProtocolFieldWriter::field_bool(int field, bool b) inline void CompactProtocolFieldWriter::field_int8(int field, int8_t val) { - put_field_header(field, current_field_value, FieldType::BYTE); + put_field_header(field, current_field_value, FieldType::I8); put_byte(val); current_field_value = field; } diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index dbfb1a191ba..d4d515a4603 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -81,7 +81,7 @@ class CompactProtocolFieldWriter { template void put_packed_type_byte(T high_bits, FieldType t) { - uint8_t const clamped_high_bits = std::min(std::max(high_bits, t{0}), T{0xf}); + uint8_t const clamped_high_bits = std::min(std::max(high_bits, T{0}), T{0xf}); put_byte((clamped_high_bits << 4) | static_cast(t)); } diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 17cb1347435..f5eb47e89ff 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -120,7 +120,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) case FieldType::I16: case FieldType::I32: case FieldType::I64: get_u32(bs); break; - case FieldType::BYTE: skip_bytes(bs, 1); break; + case FieldType::I8: skip_bytes(bs, 1); break; case FieldType::DOUBLE: skip_bytes(bs, 8); break; case FieldType::BINARY: skip_bytes(bs, get_u32(bs)); break; case FieldType::LIST: diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 8732de7efc9..56926dbb2bd 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -143,7 +143,7 @@ enum BoundaryOrder { enum class FieldType : uint8_t { BOOLEAN_TRUE = 1, BOOLEAN_FALSE = 2, - BYTE = 3, + I8 = 3, I16 = 4, I32 = 5, I64 = 6, From 5e9ab1fbd10e717096f424b7eb051cbae16a89d9 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 20 Dec 2023 10:17:00 -0800 Subject: [PATCH 3/7] fewer casts Co-authored-by: Ed Seidl --- cpp/src/io/parquet/compact_protocol_reader.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index c765be83a42..e56fadd9bb5 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -69,9 +69,10 @@ void assert_field_type(int type, FieldType expected) void assert_bool_field_type(int type) { - CUDF_EXPECTS(type == static_cast(FieldType::BOOLEAN_TRUE) || - type == static_cast(FieldType::BOOLEAN_FALSE), - "expected bool field, got " + field_type_string(static_cast(type)) + + auto const field_type = static_cast(type); + CUDF_EXPECTS(field_type == FieldType::BOOLEAN_TRUE || + field_type == FieldType::BOOLEAN_FALSE, + "expected bool field, got " + field_type_string(field_type) + " field instead"); } From 0a1bfab14d223cae274f83213f6a3be3d85e946c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 20 Dec 2023 10:38:20 -0800 Subject: [PATCH 4/7] fix up switch in field_type_string --- cpp/src/io/parquet/compact_protocol_reader.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index c765be83a42..91c002dd9e7 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -53,9 +53,11 @@ std::string field_type_string(FieldType type) case FieldType::I64: return "int64"; case FieldType::DOUBLE: return "double"; case FieldType::BINARY: return "binary"; - case FieldType::STRUCT: return "struct"; case FieldType::LIST: return "list"; case FieldType::SET: return "set"; + case FieldType::MAP: return "map"; + case FieldType::STRUCT: return "struct"; + case FieldType::UUID: return "UUID"; default: return "unknown(" + std::to_string(static_cast(type)) + ")"; } } From b71011636eb9f91c70fe9461450c7848d3d2cb9a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 20 Dec 2023 10:40:44 -0800 Subject: [PATCH 5/7] style --- cpp/src/io/parquet/compact_protocol_reader.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index c5ef0fcbcad..344b3c91e35 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -72,10 +72,8 @@ void assert_field_type(int type, FieldType expected) void assert_bool_field_type(int type) { auto const field_type = static_cast(type); - CUDF_EXPECTS(field_type == FieldType::BOOLEAN_TRUE || - field_type == FieldType::BOOLEAN_FALSE, - "expected bool field, got " + field_type_string(field_type) + - " field instead"); + CUDF_EXPECTS(field_type == FieldType::BOOLEAN_TRUE || field_type == FieldType::BOOLEAN_FALSE, + "expected bool field, got " + field_type_string(field_type) + " field instead"); } /** From d2e9e8c0dab8c111eb2e4efa1fc97277a4e25df7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 4 Jan 2024 14:54:36 -0800 Subject: [PATCH 6/7] style --- cpp/src/io/parquet/compact_protocol_reader.cpp | 2 +- cpp/src/io/parquet/compact_protocol_writer.cpp | 2 +- cpp/src/io/parquet/compact_protocol_writer.hpp | 2 +- cpp/src/io/parquet/page_enc.cu | 2 +- cpp/src/io/parquet/page_hdr.cu | 2 +- cpp/src/io/parquet/parquet_common.hpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 344b3c91e35..f161fd69678 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 730e175dcab..d610ec6c546 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index d4d515a4603..2ed7c078f8b 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 6a0cac64be9..d37831110be 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index f5eb47e89ff..f1c849f8abc 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 56926dbb2bd..a680e44f360 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From bf71d0fb77937d22b4d756bab3b11eabda7e6a77 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 9 Jan 2024 12:58:02 -0800 Subject: [PATCH 7/7] oops --- cpp/src/io/parquet/compact_protocol_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index f161fd69678..d39d832c18c 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -460,7 +460,7 @@ void CompactProtocolReader::skip_struct_field(int t, int depth) case FieldType::I8: skip_bytes(1); break; case FieldType::DOUBLE: skip_bytes(8); break; case FieldType::BINARY: skip_bytes(get_u32()); break; - case FieldType::LIST: [[fallthrough]]; + case FieldType::LIST: case FieldType::SET: { auto const [t, n] = get_listh(); CUDF_EXPECTS(depth <= 10, "struct nesting too deep");