Skip to content

Commit

Permalink
Support json & array types (#23408)
Browse files Browse the repository at this point in the history
Signed-off-by: yah01 <yang.cen@zilliz.com>
Co-authored-by: yah01 <yang.cen@zilliz.com>
  • Loading branch information
jiaoew1991 and yah01 authored Apr 20, 2023
1 parent 2afc982 commit 967a97b
Show file tree
Hide file tree
Showing 55 changed files with 2,735 additions and 2,227 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ require (
github.com/golang/protobuf v1.5.3
github.com/klauspost/compress v1.14.4
github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230411174625-2c86533465fb
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230416064425-aec3e83865b2
github.com/milvus-io/milvus/pkg v0.0.0-00010101000000-000000000000
github.com/minio/minio-go/v7 v7.0.17
github.com/panjf2000/ants/v2 v2.7.2
Expand Down
6 changes: 2 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -572,10 +572,8 @@ github.com/microcosm-cc/bluemonday v1.0.2/go.mod h1:iVP4YcDBq+n/5fb23BhYFvIMq/le
github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230322065753-aa8a66130217 h1:58lCM3+oh3ZuCemnOE3V2VdaPnIL+LS7eoEyrFfrxOM=
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230322065753-aa8a66130217/go.mod h1:148qnlmZ0Fdm1Fq+Mj/OW2uDoEP25g3mjh0vMGtkgmk=
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230411174625-2c86533465fb h1:gMSlJbBbfI6IZ6vktimD94/ASaLYFNXiX2xhXqVeFxA=
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230411174625-2c86533465fb/go.mod h1:148qnlmZ0Fdm1Fq+Mj/OW2uDoEP25g3mjh0vMGtkgmk=
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230416064425-aec3e83865b2 h1:G5uN68X/7eoCfHUkNvkbNueFhHuohCZG94te+ApLAOY=
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230416064425-aec3e83865b2/go.mod h1:148qnlmZ0Fdm1Fq+Mj/OW2uDoEP25g3mjh0vMGtkgmk=
github.com/milvus-io/pulsar-client-go v0.6.10 h1:eqpJjU+/QX0iIhEo3nhOqMNXL+TyInAs1IAHZCrCM/A=
github.com/milvus-io/pulsar-client-go v0.6.10/go.mod h1:lQqCkgwDF8YFYjKA+zOheTk1tev2B+bKj5j7+nm8M1w=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=
Expand Down
37 changes: 31 additions & 6 deletions internal/core/src/common/Column.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <sys/mman.h>

#include <cstddef>
#include <ostream>
#include <string_view>
#include <type_traits>
#include <vector>
Expand All @@ -26,9 +27,13 @@
#include "common/Utils.h"
#include "exceptions/EasyAssert.h"
#include "fmt/core.h"
#include "log/Log.h"
#include "nlohmann/json.hpp"

namespace milvus::segcore {

// Expands to `(info)->scalars().<field>_data().data()`.
// `info` is wrapped in parentheses so that any pointer-valued expression
// (e.g. `&obj` or `ptr + i`), not just a plain identifier, binds to `->`
// as a whole.
#define FIELD_DATA(info, field) ((info)->scalars().field##_data().data())

struct Entry {
char* data;
uint32_t length;
Expand Down Expand Up @@ -103,11 +108,17 @@ class VariableColumn : public ColumnBase {
using ViewType =
std::conditional_t<std::is_same_v<T, std::string>, std::string_view, T>;

template <typename Ctor>
VariableColumn(int64_t segment_id,
const FieldMeta& field_meta,
const LoadFieldDataInfo& info) {
const LoadFieldDataInfo& info,
Ctor&& ctor) {
auto begin = info.field_data->scalars().string_data().data().begin();
auto end = info.field_data->scalars().string_data().data().end();
if constexpr (std::is_same_v<T, nlohmann::json>) {
begin = info.field_data->scalars().json_data().data().begin();
end = info.field_data->scalars().json_data().data().end();
}

indices_.reserve(info.row_count);
while (begin != end) {
Expand All @@ -117,7 +128,7 @@ class VariableColumn : public ColumnBase {
}

data_ = static_cast<char*>(CreateMap(segment_id, field_meta, info));
construct_views();
construct_views(std::forward<Ctor>(ctor));
}

VariableColumn(VariableColumn&& field) noexcept
Expand Down Expand Up @@ -145,14 +156,28 @@ class VariableColumn : public ColumnBase {
}

protected:
// Builds one element of `views_` per stored value. `ctor` is called with a
// pointer to the start of the value inside `data_` and its byte length: the
// gap between consecutive offsets in `indices_`, with the last value running
// from `indices_.back()` to `size_`.
template <typename Ctor>
void
construct_views() {
construct_views(Ctor ctor) {
views_.reserve(indices_.size());
for (size_t i = 0; i < indices_.size() - 1; i++) {
views_.emplace_back(data_ + indices_[i],
indices_[i + 1] - indices_[i]);
views_.emplace_back(
ctor(data_ + indices_[i], indices_[i + 1] - indices_[i]));
}
views_.emplace_back(
ctor(data_ + indices_.back(), size_ - indices_.back()));

// As the json objects are stored entirely in memory,
// the raw mapped data is not needed anymore.
if constexpr (std::is_same_v<T, nlohmann::json>) {
if (munmap(data_, size_)) {
// munmap returns non-zero on failure; the asserted condition must be
// false here, otherwise the assertion can never fire and the failure
// is silently ignored.
AssertInfo(
false,
fmt::format(
"failed to unmap json field after deserialized, err={}",
strerror(errno)));
}
}
views_.emplace_back(data_ + indices_.back(), size_ - indices_.back());
}

private:
Expand Down
56 changes: 37 additions & 19 deletions internal/core/src/common/FieldMeta.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ datatype_name(DataType data_type) {
return "double";
case DataType::VARCHAR:
return "varChar";
case DataType::ARRAY:
return "array";
case DataType::JSON:
return "json";
case DataType::VECTOR_FLOAT:
return "vector_float";
case DataType::VECTOR_BINARY: {
Expand Down Expand Up @@ -105,11 +109,24 @@ datatype_is_string(DataType datatype) {
}
}

// Reports whether `datatype` is one of the binary-encoded types,
// i.e. ARRAY or JSON; every other type yields false.
inline bool
datatype_is_binary(DataType datatype) {
    const bool is_array = datatype == DataType::ARRAY;
    const bool is_json = datatype == DataType::JSON;
    return is_array || is_json;
}

inline bool
datatype_is_variable(DataType datatype) {
switch (datatype) {
case DataType::VARCHAR:
case DataType::STRING:
case DataType::ARRAY:
case DataType::JSON:
return true;
default:
return false;
Expand Down Expand Up @@ -152,7 +169,7 @@ class FieldMeta {

// Constructs field metadata from a name, id and data type.
// Asserts that the given type is not a vector type.
FieldMeta(const FieldName& name, FieldId id, DataType type)
: name_(name), id_(id), type_(type) {
Assert(!is_vector());
Assert(!datatype_is_vector(type_));
}

FieldMeta(const FieldName& name,
Expand All @@ -163,7 +180,7 @@ class FieldMeta {
id_(id),
type_(type),
string_info_(StringInfo{max_length}) {
Assert(is_string());
Assert(datatype_is_string(type_));
}

FieldMeta(const FieldName& name,
Expand All @@ -175,39 +192,26 @@ class FieldMeta {
id_(id),
type_(type),
vector_info_(VectorInfo{dim, metric_type}) {
Assert(is_vector());
}

bool
is_vector() const {
Assert(type_ != DataType::NONE);
return type_ == DataType::VECTOR_BINARY ||
type_ == DataType::VECTOR_FLOAT;
}

bool
is_string() const {
Assert(type_ != DataType::NONE);
return type_ == DataType::VARCHAR || type_ == DataType::STRING;
Assert(datatype_is_vector(type_));
}

// Returns the dimension recorded in vector_info_.
// Asserts that the field has a vector type and that vector_info_ is set.
int64_t
get_dim() const {
Assert(is_vector());
Assert(datatype_is_vector(type_));
Assert(vector_info_.has_value());
return vector_info_->dim_;
}

// Returns the max_length recorded in string_info_.
// Asserts that the field has a string type and that string_info_ is set.
int64_t
get_max_len() const {
Assert(is_string());
Assert(datatype_is_string(type_));
Assert(string_info_.has_value());
return string_info_->max_length;
}

// Returns the metric type recorded in vector_info_ (itself an optional).
// Asserts that the field has a vector type and that vector_info_ is set.
std::optional<knowhere::MetricType>
get_metric_type() const {
Assert(is_vector());
Assert(datatype_is_vector(type_));
Assert(vector_info_.has_value());
return vector_info_->metric_type_;
}
Expand All @@ -227,12 +231,26 @@ class FieldMeta {
return type_;
}

// Whether this field's type is a vector type, as decided by
// datatype_is_vector().
bool
is_vector() const {
return datatype_is_vector(type_);
}

// Whether this field's type is a string type, as decided by
// datatype_is_string().
bool
is_string() const {
return datatype_is_string(type_);
}

size_t
get_sizeof() const {
static const size_t ARRAY_SIZE = 128;
static const size_t JSON_SIZE = 512;
if (is_vector()) {
return datatype_sizeof(type_, get_dim());
} else if (is_string()) {
return string_info_->max_length;
} else if (datatype_is_variable(type_)) {
return type_ == DataType::ARRAY ? ARRAY_SIZE : JSON_SIZE;
} else {
return datatype_sizeof(type_);
}
Expand Down
2 changes: 2 additions & 0 deletions internal/core/src/common/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ enum class DataType {

STRING = 20,
VARCHAR = 21,
ARRAY = 22,
JSON = 23,

VECTOR_BINARY = 100,
VECTOR_FLOAT = 101,
Expand Down
30 changes: 22 additions & 8 deletions internal/core/src/common/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <google/protobuf/text_format.h>
#include <sys/mman.h>

#include <cstring>
#include <filesystem>
#include <memory>
#include <string>
Expand All @@ -25,6 +26,7 @@
#include "common/Consts.h"
#include "common/FieldMeta.h"
#include "common/LoadInfo.h"
#include "common/Types.h"
#include "config/ConfigChunkManager.h"
#include "exceptions/EasyAssert.h"
#include "knowhere/dataset.h"
Expand Down Expand Up @@ -192,17 +194,19 @@ GetDataSize(const FieldMeta& field, size_t row_count, const DataArray* data) {
switch (data_type) {
case DataType::VARCHAR:
case DataType::STRING: {
auto begin = data->scalars().string_data().data().begin();
auto end = data->scalars().string_data().data().end();

ssize_t size{0};
while (begin != end) {
size += begin->size();
begin++;
ssize_t size{};
for (auto& data : data->scalars().string_data().data()) {
size += data.size();
}
return size;
}
case DataType::JSON: {
ssize_t size{};
for (auto& data : data->scalars().json_data().data()) {
size += data.size();
}
return size;
}

default:
PanicInfo(fmt::format("not supported data type {}",
datatype_name(data_type)));
Expand Down Expand Up @@ -260,6 +264,16 @@ FillField(DataType data_type,
}
return dst;
}

case DataType::JSON: {
char* dest = reinterpret_cast<char*>(dst);
for (auto& data : data->scalars().json_data().data()) {
memcpy(dest, data.data(), data.size());
dest += data.size();
}
return dst;
}

case DataType::VECTOR_FLOAT:
return memcpy(
dst, data->vectors().float_vector().data().data(), size);
Expand Down
Empty file modified internal/core/src/pb/common.pb.cc
100644 → 100755
Empty file.
Empty file modified internal/core/src/pb/common.pb.h
100644 → 100755
Empty file.
Empty file modified internal/core/src/pb/index_cgo_msg.pb.cc
100644 → 100755
Empty file.
Empty file modified internal/core/src/pb/index_cgo_msg.pb.h
100644 → 100755
Empty file.
Empty file modified internal/core/src/pb/plan.pb.cc
100644 → 100755
Empty file.
Empty file modified internal/core/src/pb/plan.pb.h
100644 → 100755
Empty file.
Loading

0 comments on commit 967a97b

Please sign in to comment.