Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix CAST(JSON as ROW(ARRAY)) #9447

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 53 additions & 7 deletions velox/functions/prestosql/tests/JsonCastTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,52 @@ TEST_F(JsonCastTest, orderOfKeys) {
testCast(data, map);
}

// Casts JSON objects to a single-column ROW whose only child is an array of
// 64-bit integers.
TEST_F(JsonCastTest, toRowOfArray) {
  // Every input object also carries a "c1" key that has no counterpart in the
  // expected row below. The inputs cover a regular array, an empty array, and
  // an array containing null elements.
  auto input = makeFlatVector<std::string>(
      {
          R"({"c0": [1, 2, 3], "c1": 1.2})",
          R"({"c0": [], "c1": 1.3})",
          R"({"c0": [10, null, 20, null], "c1": 1.4})",
      },
      JSON());

  // Expected contents of the array column, one JSON literal per input row.
  auto expectedArrays = makeArrayVectorFromJson<int64_t>({
      "[1, 2, 3]",
      "[]",
      "[10, null, 20, null]",
  });

  testCast(input, makeRowVector({expectedArrays}));
}

// Exercises JSON objects in which the same key appears more than once, first
// through the throwing check and then through the try_cast check.
TEST_F(JsonCastTest, toRowDuplicateKey) {
  std::vector<std::optional<std::string>> jsonRows = {
      // No repeated key.
      R"({"c0": 1, "c1": 1.1})",
      // "c0" and "C0" differ only in letter case.
      R"({"c0": 2, "c1": 1.2, "C0": 45})",
      // "c0" appears twice verbatim.
      R"({"c0": 3, "c1": 1.3, "c0": 55})",
      // "c2" is an extra key; nothing is repeated.
      R"({"c0": 4, "c1": 1.4, "c2": 65})",
  };

  // Throwing check: the error text passed here names the repeated field.
  testThrow<std::string>(
      JSON(),
      ROW({"c0", "c1"}, {INTEGER(), REAL()}),
      jsonRows,
      "Duplicate field: c0");

  // try_cast check: rows 1 and 2 (the ones with a repeated key) are expected
  // to be null; rows 0 and 3 keep their values.
  auto input = makeNullableFlatVector<std::string>(jsonRows, JSON());

  auto expectedRows = makeRowVector({
      makeFlatVector<int32_t>({1, 0, 0, 4}),
      makeFlatVector<float>({1.1, 0.0, 0.0, 1.4}),
  });
  for (const auto nullRow : {1, 2}) {
    expectedRows->setNull(nullRow, true);
  }

  testCast(input, expectedRows, true /*try_cast*/);
}

TEST_F(JsonCastTest, toRow) {
// Test casting to ROW from JSON arrays.
auto array = makeNullableFlatVector<JsonNativeType>(
Expand All @@ -1053,7 +1099,7 @@ TEST_F(JsonCastTest, toRow) {
auto map = makeNullableFlatVector<JsonNativeType>(
{R"({"c0":123,"c1":"abc","c2":true})"_sv,
R"({"c1":"abc","c2":true,"c0":123})"_sv,
R"({"c0":123,"c2":true,"c0":456})"_sv,
R"({"c10":123,"c2":true,"c0":456})"_sv,
R"({"c3":123,"c4":"abc","c2":false})"_sv,
R"({"c0":null,"c2":false})"_sv,
R"({"c0":null,"c2":null,"c1":null})"_sv},
Expand All @@ -1074,17 +1120,17 @@ TEST_F(JsonCastTest, toRow) {

// Use a mix of lower case and upper case JSON keys.
map = makeNullableFlatVector<JsonNativeType>(
{R"({"c0":123,"c1":"abc","c2":true})"_sv,
R"({"c1":"abc","c2":true,"c0":123})"_sv,
R"({"c0":123,"c2":true,"c0":456})"_sv,
R"({"c3":123,"c4":"abc","c2":false})"_sv,
{R"({"C0":123,"C1":"abc","C2":true})"_sv,
R"({"c1":"abc","C2":true,"c0":123})"_sv,
R"({"C10":123,"C2":true,"c0":456})"_sv,
R"({"c3":123,"C4":"abc","c2":false})"_sv,
R"({"c0":null,"c2":false})"_sv,
R"({"c0":null,"c2":null,"c1":null})"_sv},
R"({"c0":null,"c2":null,"C1":null})"_sv},
JSON());
testCast(map, makeRowVector({child4, child5, child6}));

// Use a mix of lower case and upper case field names in target ROW type.
testCast(map, makeRowVector({child4, child5, child6}));
testCast(map, makeRowVector({"c0", "C1", "C2"}, {child4, child5, child6}));

// Test casting to ROW from JSON null.
auto null = makeNullableFlatVector<JsonNativeType>({"null"_sv}, JSON());
Expand Down
48 changes: 31 additions & 17 deletions velox/functions/prestosql/types/JsonType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -856,30 +856,44 @@ struct CastFromJsonTypedImpl {
}
} else {
SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object());
folly::F14FastMap<std::string, simdjson::ondemand::value> lowerCaseKeys(
object.count_fields());

// TODO Populate this mapping once, not per-row.
// Mapping from lower-case field names of the target RowType to their
// indices.
folly::F14FastMap<std::string, int32_t> fieldIndices;
const auto size = rowType.size();
for (auto i = 0; i < size; ++i) {
auto key = rowType.nameOf(i);
boost::algorithm::to_lower(key);
fieldIndices[key] = i;
}

std::string key;
for (auto fieldResult : object) {
SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldResult);
if (!field.value().is_null()) {
SIMDJSON_ASSIGN_OR_RAISE(key, field.unescaped_key(true));
boost::algorithm::to_lower(key);
lowerCaseKeys[key] = field.value();

auto it = fieldIndices.find(key);
if (it != fieldIndices.end()) {
const auto index = it->second;

VELOX_USER_CHECK_GE(index, 0, "Duplicate field: {}", key);
it->second = -1;

SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH(
CastFromJsonTypedImpl<simdjson::ondemand::value>::apply,
rowType.childAt(index)->kind(),
field.value(),
writerTyped.get_writer_at(index)));
}
}
}
for (column_index_t numFields = rowType.size(), i = 0; i < numFields;
++i) {
key = rowType.nameOf(i);
boost::algorithm::to_lower(key);
auto it = lowerCaseKeys.find(key);
if (it == lowerCaseKeys.end()) {
writerTyped.set_null_at(i);
} else {
SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH(
CastFromJsonTypedImpl<simdjson::ondemand::value>::apply,
rowType.childAt(i)->kind(),
it->second,
writerTyped.get_writer_at(i)));

for (const auto& [key, index] : fieldIndices) {
if (index >= 0) {
writerTyped.set_null_at(index);
}
}
}
Expand Down Expand Up @@ -1038,7 +1052,7 @@ class JsonCastOperator : public exec::CastOperator {
maxSize = std::max(maxSize, input.size());
});
paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING);
rows.applyToSelected([&](auto row) {
context.applyToSelectedNoThrow(rows, [&](auto row) {
writer.setOffset(row);
if (inputVector->isNullAt(row)) {
writer.commitNull();
Expand Down
Loading