From 1e41dca5c7dd9cca1d9d4485e5ebdfcc3fd8d306 Mon Sep 17 00:00:00 2001 From: ricki-epsilla Date: Mon, 18 Mar 2024 22:49:59 +0800 Subject: [PATCH] set list support; --- engine/db/catalog/basic_meta_impl.cpp | 26 +++++++++++ engine/db/catalog/meta_types.hpp | 6 +++ engine/db/table_segment_mvp.cpp | 62 ++++++++++++++++++++++++--- engine/db/table_segment_mvp.hpp | 4 ++ 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/engine/db/catalog/basic_meta_impl.cpp b/engine/db/catalog/basic_meta_impl.cpp index 8015876..cad9488 100644 --- a/engine/db/catalog/basic_meta_impl.cpp +++ b/engine/db/catalog/basic_meta_impl.cpp @@ -31,6 +31,7 @@ constexpr const char* METRIC_TYPE = "metric_type"; constexpr const char* INDICES = "indices"; constexpr const char* MODEL = "model"; constexpr const char* DIMENSIONS = "dimensions"; +constexpr const char* ELEMENT_TYPE = "element_type"; constexpr const char* DB_CATALOG_FILE_NAME = "catalog"; @@ -49,6 +50,10 @@ Status LoadFieldSchemaFromJson(const vectordb::Json& json, meta::FieldSchema& fi field_schema.vector_dimension_ = json.GetInt(VECTOR_DIMENSION); field_schema.metric_type_ = static_cast(json.GetInt(METRIC_TYPE)); } + // Only set and list fields have element_type. + if (field_schema.field_type_ == meta::FieldType::SET || field_schema.field_type_ == meta::FieldType::LIST) { + field_schema.element_type = static_cast(json.GetInt(ELEMENT_TYPE)); + } return Status::OK(); } @@ -113,6 +118,10 @@ void DumpFieldSchemaToJson(const meta::FieldSchema& field_schema, vectordb::Json json.SetInt(VECTOR_DIMENSION, field_schema.vector_dimension_); json.SetInt(METRIC_TYPE, static_cast(field_schema.metric_type_)); } + // Only set and list fields have element_type. + if (field_schema.field_type_ == meta::FieldType::SET || field_schema.field_type_ == meta::FieldType::LIST) { + json.SetInt(ELEMENT_TYPE, static_cast(field_schema.element_type)); + } } // Convert a TableSchema to a Json object @@ -364,6 +373,23 @@ Status ValidateSchema(TableSchema& table_schema, std::vector &em } } + // 8. Set and list fields must have element type and should be supported data type. + if (field.field_type_ == FieldType::SET || field.field_type_ == FieldType::LIST) { + if (field.element_type == FieldType::UNKNOWN) { + return Status(DB_UNEXPECTED_ERROR, "Element type of " + field.name_ + " is not valid."); + } + if (field.element_type != FieldType::INT1 && + field.element_type != FieldType::INT2 && + field.element_type != FieldType::INT4 && + field.element_type != FieldType::INT8 && + field.element_type != FieldType::FLOAT && + field.element_type != FieldType::DOUBLE && + field.element_type != FieldType::STRING && + field.element_type != FieldType::BOOL) { + return Status(DB_UNEXPECTED_ERROR, "Element type of " + field.name_ + " is not supported."); + } + } + if (has_primary_key && field.is_primary_key_) { return Status(DB_UNEXPECTED_ERROR, "Cannot have more than 1 primary key fields."); } diff --git a/engine/db/catalog/meta_types.hpp b/engine/db/catalog/meta_types.hpp index e8b0fbe..2e09c17 100644 --- a/engine/db/catalog/meta_types.hpp +++ b/engine/db/catalog/meta_types.hpp @@ -37,6 +37,9 @@ enum class FieldType { SPARSE_VECTOR_FLOAT = 50, SPARSE_VECTOR_DOUBLE = 51, + SET = 60, + LIST = 61, + UNKNOWN = 999, }; @@ -55,6 +58,7 @@ struct FieldSchema { FieldType field_type_ = FieldType::INT4; size_t vector_dimension_ = DEFAULT_VECTOR_DIMENSION; MetricType metric_type_ = MetricType::EUCLIDEAN; + FieldType element_type = FieldType::INT4; }; struct AutoEmbedding { @@ -101,6 +105,8 @@ static const std::unordered_map fieldTypeMap = { {"VECTOR_DOUBLE", FieldType::VECTOR_DOUBLE}, {"SPARSE_VECTOR_FLOAT", FieldType::SPARSE_VECTOR_FLOAT}, {"SPARSE_VECTOR_DOUBLE", FieldType::SPARSE_VECTOR_DOUBLE}, + {"SET", FieldType::SET}, + {"LIST", FieldType::LIST}, {"UNKNOWN", FieldType::UNKNOWN}}; static const std::unordered_map metricTypeMap = { diff --git a/engine/db/table_segment_mvp.cpp b/engine/db/table_segment_mvp.cpp index b109251..5e69bd5 100644 --- a/engine/db/table_segment_mvp.cpp +++ b/engine/db/table_segment_mvp.cpp @@ -40,6 +40,11 @@ constexpr size_t FieldTypeSizeMVP(meta::FieldType type) { // like the length of the string or the dimension of the vector. You might want // to handle these cases differently. return 0; + case meta::FieldType::SET: + case meta::FieldType::LIST: + // For these types, we can't determine the size without additional information + // like the length of the set or the list. + return 0; case meta::FieldType::UNKNOWN: default: // Unknown type @@ -52,13 +57,15 @@ Status TableSegmentMVP::Init(meta::TableSchema& table_schema, int64_t size_limit primitive_offset_ = 0; schema = table_schema; - // Get how many primitive, vectors, and variable-length attributes (string, sparse vectors). + // Get how many primitive, vectors, and variable-length attributes (string, sparse vectors, set, list). for (auto& field_schema : table_schema.fields_) { auto current_total_vec_num = dense_vector_num_ + sparse_vector_num_; if (field_schema.field_type_ == meta::FieldType::STRING || field_schema.field_type_ == meta::FieldType::JSON || field_schema.field_type_ == meta::FieldType::SPARSE_VECTOR_DOUBLE || - field_schema.field_type_ == meta::FieldType::SPARSE_VECTOR_FLOAT) { + field_schema.field_type_ == meta::FieldType::SPARSE_VECTOR_FLOAT || + field_schema.field_type_ == meta::FieldType::SET || + field_schema.field_type_ == meta::FieldType::LIST) { field_id_mem_offset_map_[field_schema.id_] = var_len_attr_num_; field_name_mem_offset_map_[field_schema.name_] = var_len_attr_num_; if (field_schema.field_type_ == meta::FieldType::SPARSE_VECTOR_DOUBLE || @@ -242,6 +249,47 @@ TableSegmentMVP::TableSegmentMVP(meta::TableSchema& table_schema, const std::str file.read(reinterpret_cast(v->data()), dataLen); var_len_attr_table_[attrIdx][recordIdx] = std::move(v); break; + case meta::FieldType::SET: + case meta::FieldType::LIST: { + // set or list contains elements of primitive data types + switch (field.element_type) { + case meta::FieldType::INT1: + case meta::FieldType::INT2: + case meta::FieldType::INT4: + case meta::FieldType::INT8: { + std::vector values(dataLen / sizeof(int64_t)); + file.read(reinterpret_cast(values.data()), dataLen); + var_len_attr_table_[attrIdx][recordIdx] = std::move(values); + break; + } + case meta::FieldType::FLOAT: { + std::vector values(dataLen / sizeof(float)); + file.read(reinterpret_cast(values.data()), dataLen); + var_len_attr_table_[attrIdx][recordIdx] = std::move(values); + break; + } + case meta::FieldType::DOUBLE: { + std::vector values(dataLen / sizeof(double)); + file.read(reinterpret_cast(values.data()), dataLen); + var_len_attr_table_[attrIdx][recordIdx] = std::move(values); + break; + } + case meta::FieldType::STRING: { + std::string str(dataLen, '\0'); + file.read(&str[0], dataLen); + var_len_attr_table_[attrIdx][recordIdx] = str; + break; + } + case meta::FieldType::BOOL: { + std::vector values(dataLen / sizeof(bool)); + file.read(reinterpret_cast(values.data()), dataLen); + var_len_attr_table_[attrIdx][recordIdx] = std::move(values); + break; + } + default: + break; + } + } } } } @@ -619,7 +667,7 @@ Status TableSegmentMVP::Insert(meta::TableSchema& table_schema, Json& records, i } else { // std::cerr << "primary key [" << value << "] already exists, skipping." << std::endl; skipped_entry++; - goto LOOP_END; + goto LOOP_END; } } } else { @@ -635,7 +683,7 @@ Status TableSegmentMVP::Insert(meta::TableSchema& table_schema, Json& records, i } else { // std::cerr << "primary key [" << value << "] already exists, skipping." << std::endl; skipped_entry++; - goto LOOP_END; + goto LOOP_END; } } break; @@ -651,7 +699,7 @@ Status TableSegmentMVP::Insert(meta::TableSchema& table_schema, Json& records, i } else { // std::cerr << "primary key [" << value << "] already exists, skipping." << std::endl; skipped_entry++; - goto LOOP_END; + goto LOOP_END; } } break; @@ -667,7 +715,7 @@ Status TableSegmentMVP::Insert(meta::TableSchema& table_schema, Json& records, i } else { // std::cerr << "primary key [" << value << "] already exists, skipping." << std::endl; skipped_entry++; - goto LOOP_END; + goto LOOP_END; } } break; @@ -683,7 +731,7 @@ Status TableSegmentMVP::Insert(meta::TableSchema& table_schema, Json& records, i } else { // std::cerr << "primary key [" << value << "] already exists, skipping." << std::endl; skipped_entry++; - goto LOOP_END; + goto LOOP_END; } } break; diff --git a/engine/db/table_segment_mvp.hpp b/engine/db/table_segment_mvp.hpp index 343302d..f7f1177 100644 --- a/engine/db/table_segment_mvp.hpp +++ b/engine/db/table_segment_mvp.hpp @@ -78,6 +78,10 @@ class TableSegmentMVP { int64_t dense_vector_num_; char* attribute_table_; // The attribute table in memory (exclude vector attributes and string attributes). std::vector var_len_attr_table_; // The variable length attribute table in memory. + + // TODO: + // vector for container attribute table + // std::vector> string_tables_; // Hold the string attributes. std::vector vector_dims_; float** vector_tables_; // The vector attribute tables. Each vector attribute has its own vector table.