diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index c011b4132a8..55848802f12 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -16,6 +16,8 @@ #include "compact_protocol_reader.hpp" +#include + #include #include #include @@ -40,14 +42,38 @@ class parquet_field { int field() const { return _field_val; } }; +std::string field_type_string(FieldType type) +{ + switch (type) { + case ST_FLD_TRUE: return "bool(true)"; + case ST_FLD_FALSE: return "bool(false)"; + case ST_FLD_BYTE: return "int8"; + case ST_FLD_I16: return "int16"; + case ST_FLD_I32: return "int32"; + case ST_FLD_I64: return "int64"; + case ST_FLD_DOUBLE: return "double"; + case ST_FLD_BINARY: return "binary"; + case ST_FLD_STRUCT: return "struct"; + case ST_FLD_LIST: return "list"; + case ST_FLD_SET: return "set"; + default: return "unknown(" + std::to_string(type) + ")"; + } +} + +void assert_field_type(int type, FieldType expected) +{ + CUDF_EXPECTS(type == expected, + "expected " + field_type_string(expected) + " field, got " + + field_type_string(static_cast(type)) + " field instead"); +} + /** * @brief Abstract base class for list functors. */ -template +template class parquet_field_list : public parquet_field { private: - using read_func_type = std::function; - FieldType _expected_type; + using read_func_type = std::function; read_func_type _read_value; protected: @@ -55,22 +81,18 @@ class parquet_field_list : public parquet_field { void bind_read_func(read_func_type fn) { _read_value = fn; } - parquet_field_list(int f, std::vector& v, FieldType t) - : parquet_field(f), _expected_type(t), val(v) - { - } + parquet_field_list(int f, std::vector& v) : parquet_field(f), val(v) {} public: - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_LIST) { return true; } + assert_field_type(field_type, ST_FLD_LIST); auto const [t, n] = cpr->get_listh(); - if (t != _expected_type) { return true; } + assert_field_type(t, EXPECTED_ELEM_TYPE); val.resize(n); for (uint32_t i = 0; i < n; i++) { - if (_read_value(i, cpr)) { return true; } + _read_value(i, cpr); } - return false; } }; @@ -87,11 +109,10 @@ class parquet_field_bool : public parquet_field { public: parquet_field_bool(int f, bool& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) { return true; } + CUDF_EXPECTS(field_type == ST_FLD_TRUE || field_type == ST_FLD_FALSE, "expected bool field"); val = field_type == ST_FLD_TRUE; - return false; } }; @@ -101,14 +122,14 @@ class parquet_field_bool : public parquet_field { * @return True if field types mismatch or if the process of reading a * bool fails */ -struct parquet_field_bool_list : public parquet_field_list { - parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_TRUE) +struct parquet_field_bool_list : public parquet_field_list { + parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const current_byte = cpr->getb(); - if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) { return true; } + CUDF_EXPECTS(current_byte == ST_FLD_TRUE || current_byte == ST_FLD_FALSE, + "expected bool field"); this->val[i] = current_byte == ST_FLD_TRUE; - return false; }; bind_read_func(read_value); } @@ -121,7 +142,7 @@ struct parquet_field_bool_list : public parquet_field_list { * * @return True if there is a type mismatch */ -template +template class parquet_field_int : public parquet_field { static constexpr bool is_byte = std::is_same_v; @@ -130,14 +151,14 @@ class parquet_field_int : public parquet_field { public: parquet_field_int(int f, T& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { + assert_field_type(field_type, EXPECTED_TYPE); if constexpr (is_byte) { val = cpr->getb(); } else { val = cpr->get_zigzag(); } - return (field_type != EXPECTED_TYPE); } }; @@ -152,12 +173,11 @@ using parquet_field_int64 = parquet_field_int; * integer fails */ template -struct parquet_field_int_list : public parquet_field_list { - parquet_field_int_list(int f, std::vector& v) : parquet_field_list(f, v, EXPECTED_TYPE) +struct parquet_field_int_list : public parquet_field_list { + parquet_field_int_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { this->val[i] = cpr->get_zigzag(); - return false; }; this->bind_read_func(read_value); } @@ -177,17 +197,14 @@ class parquet_field_string : public parquet_field { public: parquet_field_string(int f, std::string& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_BINARY) { return true; } + assert_field_type(field_type, ST_FLD_BINARY); auto const n = cpr->get_u32(); - if (n < static_cast(cpr->m_end - cpr->m_cur)) { - val.assign(reinterpret_cast(cpr->m_cur), n); - cpr->m_cur += n; - return false; - } else { - return true; - } + CUDF_EXPECTS(n < static_cast(cpr->m_end - cpr->m_cur), "string length mismatch"); + + val.assign(reinterpret_cast(cpr->m_cur), n); + cpr->m_cur += n; } }; @@ -197,19 +214,15 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list { - parquet_field_string_list(int f, std::vector& v) - : parquet_field_list(f, v, ST_FLD_BINARY) +struct parquet_field_string_list : public parquet_field_list { + parquet_field_string_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const l = cpr->get_u32(); - if (l < static_cast(cpr->m_end - cpr->m_cur)) { - this->val[i].assign(reinterpret_cast(cpr->m_cur), l); - cpr->m_cur += l; - } else { - return true; - } - return false; + CUDF_EXPECTS(l < static_cast(cpr->m_end - cpr->m_cur), "string length mismatch"); + + this->val[i].assign(reinterpret_cast(cpr->m_cur), l); + cpr->m_cur += l; }; bind_read_func(read_value); } @@ -226,10 +239,10 @@ class parquet_field_enum : public parquet_field { public: parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { + assert_field_type(field_type, ST_FLD_I32); val = static_cast(cpr->get_i32()); - return (field_type != ST_FLD_I32); } }; @@ -240,12 +253,11 @@ class parquet_field_enum : public parquet_field { * enum fails */ template -struct parquet_field_enum_list : public parquet_field_list { - parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_I32) +struct parquet_field_enum_list : public parquet_field_list { + parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { this->val[i] = static_cast(cpr->get_i32()); - return false; }; this->bind_read_func(read_value); } @@ -264,9 +276,10 @@ class parquet_field_struct : public parquet_field { public: parquet_field_struct(int f, T& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); + assert_field_type(field_type, ST_FLD_STRUCT); + cpr->read(&val); } }; @@ -286,15 +299,12 @@ class parquet_field_union_struct : public parquet_field { { } - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { T v; - bool const res = parquet_field_struct{field(), v}(cpr, field_type); - if (!res) { - val = v; - enum_val = static_cast(field()); - } - return res; + parquet_field_struct{field(), v}(cpr, field_type); + val = v; + enum_val = static_cast(field()); } }; @@ -312,12 +322,11 @@ class parquet_field_union_enumerator : public parquet_field { public: parquet_field_union_enumerator(int f, E& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_STRUCT) { return true; } + assert_field_type(field_type, ST_FLD_STRUCT); cpr->skip_struct_field(field_type); val = static_cast(field()); - return false; } }; @@ -328,12 +337,11 @@ class parquet_field_union_enumerator : public parquet_field { * struct fails */ template -struct parquet_field_struct_list : public parquet_field_list { - parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_STRUCT) +struct parquet_field_struct_list : public parquet_field_list { + parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { - if (not cpr->read(&this->val[i])) { return true; } - return false; + cpr->read(&this->val[i]); }; this->bind_read_func(read_value); } @@ -351,18 +359,15 @@ class parquet_field_binary : public parquet_field { public: parquet_field_binary(int f, std::vector& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_BINARY) { return true; } + assert_field_type(field_type, ST_FLD_BINARY); auto const n = cpr->get_u32(); - if (n <= static_cast(cpr->m_end - cpr->m_cur)) { - val.resize(n); - val.assign(cpr->m_cur, cpr->m_cur + n); - cpr->m_cur += n; - return false; - } else { - return true; - } + CUDF_EXPECTS(n <= static_cast(cpr->m_end - cpr->m_cur), "binary length mismatch"); + + val.resize(n); + val.assign(cpr->m_cur, cpr->m_cur + n); + cpr->m_cur += n; } }; @@ -372,20 +377,16 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list : public parquet_field_list> { - parquet_field_binary_list(int f, std::vector>& v) - : parquet_field_list(f, v, ST_FLD_BINARY) +struct parquet_field_binary_list : public parquet_field_list, ST_FLD_BINARY> { + parquet_field_binary_list(int f, std::vector>& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const l = cpr->get_u32(); - if (l <= static_cast(cpr->m_end - cpr->m_cur)) { - val[i].resize(l); - val[i].assign(cpr->m_cur, cpr->m_cur + l); - cpr->m_cur += l; - } else { - return true; - } - return false; + CUDF_EXPECTS(l <= static_cast(cpr->m_end - cpr->m_cur), "binary length mismatch"); + + val[i].resize(l); + val[i].assign(cpr->m_cur, cpr->m_cur + l); + cpr->m_cur += l; }; bind_read_func(read_value); } @@ -401,13 +402,12 @@ class parquet_field_struct_blob : public parquet_field { public: parquet_field_struct_blob(int f, std::vector& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_STRUCT) { return true; } + assert_field_type(field_type, ST_FLD_STRUCT); uint8_t const* const start = cpr->m_cur; cpr->skip_struct_field(field_type); if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } - return false; } }; @@ -421,12 +421,11 @@ class parquet_field_optional : public parquet_field { public: parquet_field_optional(int f, thrust::optional& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { T v; - bool const res = FieldFunctor{field(), v}(cpr, field_type); - if (!res) { val = v; } - return res; + FieldFunctor{field(), v}(cpr, field_type); + val = v; } }; @@ -438,7 +437,7 @@ class parquet_field_optional : public parquet_field { * * @return True if the struct type is recognized, false otherwise */ -bool CompactProtocolReader::skip_struct_field(int t, int depth) +void CompactProtocolReader::skip_struct_field(int t, int depth) { switch (t) { case ST_FLD_TRUE: @@ -452,7 +451,7 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) case ST_FLD_LIST: [[fallthrough]]; case ST_FLD_SET: { auto const [t, n] = get_listh(); - if (depth > 10) { return false; } + CUDF_EXPECTS(depth <= 10, "struct nesting too deep"); for (uint32_t i = 0; i < n; i++) { skip_struct_field(t, depth + 1); } @@ -463,7 +462,7 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) t = c & 0xf; if (c == 0) { break; } // end of struct if ((c & 0xf0) == 0) { get_i16(); } // field id is not a delta - if (depth > 10) { return false; } + CUDF_EXPECTS(depth <= 10, "struct nesting too deep"); skip_struct_field(t, depth + 1); } break; @@ -471,21 +470,20 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) // printf("unsupported skip for type %d\n", t); break; } - return true; } template struct FunctionSwitchImpl { template - static inline bool run(CompactProtocolReader* cpr, + static inline void run(CompactProtocolReader* cpr, int field_type, int const& field, std::tuple& ops) { if (field == std::get(ops).field()) { - return std::get(ops)(cpr, field_type); + std::get(ops)(cpr, field_type); } else { - return FunctionSwitchImpl::run(cpr, field_type, field, ops); + FunctionSwitchImpl::run(cpr, field_type, field, ops); } } }; @@ -493,38 +491,35 @@ struct FunctionSwitchImpl { template <> struct FunctionSwitchImpl<0> { template - static inline bool run(CompactProtocolReader* cpr, + static inline void run(CompactProtocolReader* cpr, int field_type, int const& field, std::tuple& ops) { if (field == std::get<0>(ops).field()) { - return std::get<0>(ops)(cpr, field_type); + std::get<0>(ops)(cpr, field_type); } else { cpr->skip_struct_field(field_type); - return false; } } }; template -inline bool function_builder(CompactProtocolReader* cpr, std::tuple& op) +inline void function_builder(CompactProtocolReader* cpr, std::tuple& op) { constexpr int index = std::tuple_size>::value - 1; int field = 0; while (true) { int const current_byte = cpr->getb(); if (!current_byte) { break; } - int const field_delta = current_byte >> 4; - int const field_type = current_byte & 0xf; - field = field_delta ? field + field_delta : cpr->get_i16(); - bool const exit_function = FunctionSwitchImpl::run(cpr, field_type, field, op); - if (exit_function) { return false; } + int const field_delta = current_byte >> 4; + int const field_type = current_byte & 0xf; + field = field_delta ? field + field_delta : cpr->get_i16(); + FunctionSwitchImpl::run(cpr, field_type, field, op); } - return true; } -bool CompactProtocolReader::read(FileMetaData* f) +void CompactProtocolReader::read(FileMetaData* f) { using optional_list_column_order = parquet_field_optional, parquet_field_struct_list>; @@ -535,10 +530,10 @@ bool CompactProtocolReader::read(FileMetaData* f) parquet_field_struct_list(5, f->key_value_metadata), parquet_field_string(6, f->created_by), optional_list_column_order(7, f->column_orders)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(SchemaElement* s) +void CompactProtocolReader::read(SchemaElement* s) { using optional_converted_type = parquet_field_optional>; @@ -554,10 +549,10 @@ bool CompactProtocolReader::read(SchemaElement* s) parquet_field_int32(8, s->decimal_precision), parquet_field_optional(9, s->field_id), optional_logical_type(10, s->logical_type)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(LogicalType* l) +void CompactProtocolReader::read(LogicalType* l) { auto op = std::make_tuple( parquet_field_union_enumerator(1, l->type), @@ -572,52 +567,52 @@ bool CompactProtocolReader::read(LogicalType* l) parquet_field_union_enumerator(11, l->type), parquet_field_union_enumerator(12, l->type), parquet_field_union_enumerator(13, l->type)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(DecimalType* d) +void CompactProtocolReader::read(DecimalType* d) { auto op = std::make_tuple(parquet_field_int32(1, d->scale), parquet_field_int32(2, d->precision)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(TimeType* t) +void CompactProtocolReader::read(TimeType* t) { auto op = std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(TimestampType* t) +void CompactProtocolReader::read(TimestampType* t) { auto op = std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(TimeUnit* u) +void CompactProtocolReader::read(TimeUnit* u) { auto op = std::make_tuple(parquet_field_union_enumerator(1, u->type), parquet_field_union_enumerator(2, u->type), parquet_field_union_enumerator(3, u->type)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(IntType* i) +void CompactProtocolReader::read(IntType* i) { auto op = std::make_tuple(parquet_field_int8(1, i->bitWidth), parquet_field_bool(2, i->isSigned)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(RowGroup* r) +void CompactProtocolReader::read(RowGroup* r) { auto op = std::make_tuple(parquet_field_struct_list(1, r->columns), parquet_field_int64(2, r->total_byte_size), parquet_field_int64(3, r->num_rows)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(ColumnChunk* c) +void CompactProtocolReader::read(ColumnChunk* c) { auto op = std::make_tuple(parquet_field_string(1, c->file_path), parquet_field_int64(2, c->file_offset), @@ -626,10 +621,10 @@ bool CompactProtocolReader::read(ColumnChunk* c) parquet_field_int32(5, c->offset_index_length), parquet_field_int64(6, c->column_index_offset), parquet_field_int32(7, c->column_index_length)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(ColumnChunkMetaData* c) +void CompactProtocolReader::read(ColumnChunkMetaData* c) { using optional_size_statistics = parquet_field_optional>; @@ -645,10 +640,10 @@ bool CompactProtocolReader::read(ColumnChunkMetaData* c) parquet_field_int64(11, c->dictionary_page_offset), parquet_field_struct(12, c->statistics), optional_size_statistics(16, c->size_statistics)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(PageHeader* p) +void CompactProtocolReader::read(PageHeader* p) { auto op = std::make_tuple(parquet_field_enum(1, p->type), parquet_field_int32(2, p->uncompressed_page_size), @@ -656,26 +651,26 @@ bool CompactProtocolReader::read(PageHeader* p) parquet_field_struct(5, p->data_page_header), parquet_field_struct(7, p->dictionary_page_header), parquet_field_struct(8, p->data_page_header_v2)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(DataPageHeader* d) +void CompactProtocolReader::read(DataPageHeader* d) { auto op = std::make_tuple(parquet_field_int32(1, d->num_values), parquet_field_enum(2, d->encoding), parquet_field_enum(3, d->definition_level_encoding), parquet_field_enum(4, d->repetition_level_encoding)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(DictionaryPageHeader* d) +void CompactProtocolReader::read(DictionaryPageHeader* d) { auto op = std::make_tuple(parquet_field_int32(1, d->num_values), parquet_field_enum(2, d->encoding)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(DataPageHeaderV2* d) +void CompactProtocolReader::read(DataPageHeaderV2* d) { auto op = std::make_tuple(parquet_field_int32(1, d->num_values), parquet_field_int32(2, d->num_nulls), @@ -684,33 +679,33 @@ bool CompactProtocolReader::read(DataPageHeaderV2* d) parquet_field_int32(5, d->definition_levels_byte_length), parquet_field_int32(6, d->repetition_levels_byte_length), parquet_field_bool(7, d->is_compressed)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(KeyValue* k) +void CompactProtocolReader::read(KeyValue* k) { auto op = std::make_tuple(parquet_field_string(1, k->key), parquet_field_string(2, k->value)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(PageLocation* p) +void CompactProtocolReader::read(PageLocation* p) { auto op = std::make_tuple(parquet_field_int64(1, p->offset), parquet_field_int32(2, p->compressed_page_size), parquet_field_int64(3, p->first_row_index)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(OffsetIndex* o) +void CompactProtocolReader::read(OffsetIndex* o) { using optional_list_i64 = parquet_field_optional, parquet_field_int64_list>; auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations), optional_list_i64(2, o->unencoded_byte_array_data_bytes)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(SizeStatistics* s) +void CompactProtocolReader::read(SizeStatistics* s) { using optional_i64 = parquet_field_optional; using optional_list_i64 = parquet_field_optional, parquet_field_int64_list>; @@ -718,10 +713,10 @@ bool CompactProtocolReader::read(SizeStatistics* s) auto op = std::make_tuple(optional_i64(1, s->unencoded_byte_array_data_bytes), optional_list_i64(2, s->repetition_level_histogram), optional_list_i64(3, s->definition_level_histogram)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(ColumnIndex* c) +void CompactProtocolReader::read(ColumnIndex* c) { using optional_list_i64 = parquet_field_optional, parquet_field_int64_list>; @@ -732,10 +727,10 @@ bool CompactProtocolReader::read(ColumnIndex* c) optional_list_i64(5, c->null_counts), optional_list_i64(6, c->repetition_level_histogram), optional_list_i64(7, c->definition_level_histogram)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(Statistics* s) +void CompactProtocolReader::read(Statistics* s) { using optional_binary = parquet_field_optional, parquet_field_binary>; using optional_int64 = parquet_field_optional; @@ -746,13 +741,13 @@ bool CompactProtocolReader::read(Statistics* s) optional_int64(4, s->distinct_count), optional_binary(5, s->max_value), optional_binary(6, s->min_value)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(ColumnOrder* c) +void CompactProtocolReader::read(ColumnOrder* c) { auto op = std::make_tuple(parquet_field_union_enumerator(1, c->type)); - return function_builder(this, op); + function_builder(this, op); } /** diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index bd4fa7f01ca..f244df07176 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -94,32 +94,32 @@ class CompactProtocolReader { return {t, sz}; } - bool skip_struct_field(int t, int depth = 0); + void skip_struct_field(int t, int depth = 0); public: // Generate Thrift structure parsing routines - bool read(FileMetaData* f); - bool read(SchemaElement* s); - bool read(LogicalType* l); - bool read(DecimalType* d); - bool read(TimeType* t); - bool read(TimeUnit* u); - bool read(TimestampType* t); - bool read(IntType* t); - bool read(RowGroup* r); - bool read(ColumnChunk* c); - bool read(ColumnChunkMetaData* c); - bool read(PageHeader* p); - bool read(DataPageHeader* d); - bool read(DictionaryPageHeader* d); - bool read(DataPageHeaderV2* d); - bool read(KeyValue* k); - bool read(PageLocation* p); - bool read(OffsetIndex* o); - bool read(SizeStatistics* s); - bool read(ColumnIndex* c); - bool read(Statistics* s); - bool read(ColumnOrder* c); + void read(FileMetaData* f); + void read(SchemaElement* s); + void read(LogicalType* l); + void read(DecimalType* d); + void read(TimeType* t); + void read(TimeUnit* u); + void read(TimestampType* t); + void read(IntType* t); + void read(RowGroup* r); + void read(ColumnChunk* c); + void read(ColumnChunkMetaData* c); + void read(PageHeader* p); + void read(DataPageHeader* d); + void read(DictionaryPageHeader* d); + void read(DataPageHeaderV2* d); + void read(KeyValue* k); + void read(PageLocation* p); + void read(OffsetIndex* o); + void read(SizeStatistics* s); + void read(ColumnIndex* c); + void read(Statistics* s); + void read(ColumnOrder* c); public: static int NumRequiredBits(uint32_t max_level) noexcept diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 55146534781..ef51f373b24 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -264,7 +264,7 @@ metadata::metadata(datasource* source) auto const buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); CompactProtocolReader cp(buffer->data(), ender->footer_len); - CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); + cp.read(this); CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); // loop through the column chunks and read column and offset indexes @@ -275,7 +275,7 @@ metadata::metadata(datasource* source) source->host_read(col.column_index_offset, col.column_index_length); cp.init(col_idx_buf->data(), col_idx_buf->size()); ColumnIndex ci; - CUDF_EXPECTS(cp.read(&ci), "Cannot parse column index"); + cp.read(&ci); col.column_index = std::move(ci); } if (col.offset_index_length > 0 && col.offset_index_offset > 0) { @@ -283,7 +283,7 @@ metadata::metadata(datasource* source) source->host_read(col.offset_index_offset, col.offset_index_length); cp.init(off_idx_buf->data(), off_idx_buf->size()); OffsetIndex oi; - CUDF_EXPECTS(cp.read(&oi), "Cannot parse offset index"); + cp.read(&oi); col.offset_index = std::move(oi); } } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index e90e1af8a2b..39a4a89af92 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -225,9 +225,7 @@ void read_footer(std::unique_ptr const& source, source->host_read(len - ender->footer_len - ender_len, ender->footer_len); cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); - // returns true on success - bool res = cp.read(file_meta_data); - ASSERT_TRUE(res); + cp.read(file_meta_data); } // returns the number of bits used for dictionary encoding data at the given page location. @@ -242,8 +240,7 @@ int read_dict_bits(std::unique_ptr const& source, cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); - bool res = cp.read(&page_hdr); - CUDF_EXPECTS(res, "Cannot parse page header"); + cp.read(&page_hdr); // cp should be pointing at the start of page data now. the first byte // should be the encoding bit size @@ -263,8 +260,7 @@ cudf::io::parquet::detail::ColumnIndex read_column_index( cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); - bool res = cp.read(&colidx); - CUDF_EXPECTS(res, "Cannot parse column index"); + cp.read(&colidx); return colidx; } @@ -281,8 +277,7 @@ cudf::io::parquet::detail::OffsetIndex read_offset_index( cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); - bool res = cp.read(&offidx); - CUDF_EXPECTS(res, "Cannot parse offset index"); + cp.read(&offidx); return offidx; } @@ -306,8 +301,7 @@ cudf::io::parquet::detail::PageHeader read_page_header( cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); - bool res = cp.read(&page_hdr); - CUDF_EXPECTS(res, "Cannot parse page header"); + cp.read(&page_hdr); return page_hdr; }