diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index 15fd0b48527c53..0a7bedf5bfd6dd 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -104,6 +104,29 @@ namespace doris::variant_util { +namespace { + +bool keep_empty_key_sparse(std::string_view path, TabletSchema::PathsSetInfo& paths_set_info) { /* Returns true and records "" in paths_set_info.sparse_path_set when `path` is the empty string; returns false and leaves paths_set_info untouched otherwise. Call sites `continue` on true, so the empty path is never turned into a subcolumn. */ + if (!path.empty()) { + return false; + } + // An empty JSON key is encoded as an empty relative subpath. It cannot be + // materialized as an extracted column because that path collides with the + // VARIANT root, so keep it in the sparse column. + paths_set_info.sparse_path_set.emplace(""); + return true; +} + +bool keep_empty_key_sparse(const PathInData& path, TabletSchema::PathsSetInfo& paths_set_info) { /* PathInData overload: returns false when path.empty() is true or when path.get_path() is a non-empty string. Only a path that is not empty() but whose get_path() is "" is recorded in sparse_path_set (return value true). */ + if (path.empty() || !path.get_path().empty()) { + return false; + } + paths_set_info.sparse_path_set.emplace(""); + return true; +} + +} // namespace + inline void append_escaped_regex_char(std::string* regex_output, char ch) { switch (ch) { case '.': @@ -998,6 +1021,9 @@ void VariantCompactionUtil::get_subpaths(int32_t max_subcolumns_count, // Select top N paths as subcolumns, remaining paths as sparse columns for (const auto& [size, path] : paths_with_sizes) { + if (keep_empty_key_sparse(path, paths_set_info)) { /* skipped before the size check below, so the empty path does not take a subcolumn slot */ + continue; + } if (paths_set_info.sub_path_set.size() < max_subcolumns_count) { paths_set_info.sub_path_set.emplace(path); } else { @@ -1010,6 +1036,9 @@ void VariantCompactionUtil::get_subpaths(int32_t max_subcolumns_count, } else { // Apply all paths as subcolumns for (const auto& [path, _] : stats) { + if (keep_empty_key_sparse(path, paths_set_info)) { /* the empty path goes to sparse_path_set even in the apply-all branch */ + continue; + } paths_set_info.sub_path_set.emplace(path); } } @@ -1137,6 +1166,9 @@ Status VariantCompactionUtil::get_compaction_typed_columns( return Status::OK(); } for (const auto& path : typed_paths) { + if (keep_empty_key_sparse(path, paths_set_info)) { + continue; + } TabletSchema::SubColumnInfo sub_column_info; if
(generate_sub_column_info(*target, parent_column->unique_id(), path, &sub_column_info)) { inherit_column_attributes(*parent_column, sub_column_info.column); @@ -1193,6 +1225,9 @@ void VariantCompactionUtil::get_compaction_subcolumns_from_subpaths( const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id()); // append subcolumns for (const auto& subpath : sorted_subpaths) { + if (keep_empty_key_sparse(std::string_view(subpath.data, subpath.size), paths_set_info)) { /* subpath is viewed through its data/size pair; an empty one is recorded as sparse and gets no column */ + continue; + } auto column_name = parent_column->name_lower_case() + "." + subpath.to_string(); auto column_path = PathInData(column_name); @@ -1255,6 +1290,9 @@ void VariantCompactionUtil::get_compaction_subcolumns_from_data_types( for (const auto& [path, data_types] : path_to_data_types) { // Typed paths are materialized by get_compaction_typed_columns(); this helper only // materializes regular subcolumns inferred from rowset data types. + if (keep_empty_key_sparse(path, paths_set_info)) { /* evaluated before the data_types/typed/nested filter that follows */ + continue; + } if (data_types.empty() || path.empty() || path.get_is_typed() || path.has_nested_part()) { continue; } diff --git a/be/test/exec/common/schema_util_test.cpp b/be/test/exec/common/schema_util_test.cpp index c5d83e7ce64053..c383a70116dade 100644 --- a/be/test/exec/common/schema_util_test.cpp +++ b/be/test/exec/common/schema_util_test.cpp @@ -449,6 +449,35 @@ TEST_F(SchemaUtilTest, get_subpaths_equal_to_max) { uid_to_paths_set_info[1].sub_path_set.end()); } +TEST_F(SchemaUtilTest, get_subpaths_keeps_empty_path_sparse) { /* The "" entry has the largest stat (1000) yet must end up in sparse_path_set and never in sub_path_set. */ + variant_util::PathToNoneNullValues path_stats = { + {"", 1000}, {"path1", 900}, {"path2", 800}, {"path3", 700}}; + + TabletSchema::PathsSetInfo limited_paths; + variant_util::VariantCompactionUtil::get_subpaths(2, path_stats, limited_paths); + EXPECT_TRUE(limited_paths.sparse_path_set.contains("")); + EXPECT_FALSE(limited_paths.sub_path_set.contains("")); + EXPECT_TRUE(limited_paths.sub_path_set.contains("path1")); +
EXPECT_TRUE(limited_paths.sub_path_set.contains("path2")); + EXPECT_TRUE(limited_paths.sparse_path_set.contains("path3")); + + TabletSchema::PathsSetInfo exact_limit_paths; /* limit 4: with "" kept sparse, path1..path3 are all expected as subcolumns */ + variant_util::VariantCompactionUtil::get_subpaths(4, path_stats, exact_limit_paths); + EXPECT_TRUE(exact_limit_paths.sparse_path_set.contains("")); + EXPECT_FALSE(exact_limit_paths.sub_path_set.contains("")); + EXPECT_TRUE(exact_limit_paths.sub_path_set.contains("path1")); + EXPECT_TRUE(exact_limit_paths.sub_path_set.contains("path2")); + EXPECT_TRUE(exact_limit_paths.sub_path_set.contains("path3")); + + TabletSchema::PathsSetInfo unlimited_paths; /* limit 0: same expectations as limit 4 — NOTE(review): presumably 0 means no subcolumn limit; confirm against get_subpaths */ + variant_util::VariantCompactionUtil::get_subpaths(0, path_stats, unlimited_paths); + EXPECT_TRUE(unlimited_paths.sparse_path_set.contains("")); + EXPECT_FALSE(unlimited_paths.sub_path_set.contains("")); + EXPECT_TRUE(unlimited_paths.sub_path_set.contains("path1")); + EXPECT_TRUE(unlimited_paths.sub_path_set.contains("path2")); + EXPECT_TRUE(unlimited_paths.sub_path_set.contains("path3")); +} + TEST_F(SchemaUtilTest, get_subpaths_multiple_variants) { TabletSchema schema; TabletColumn variant1; @@ -1455,6 +1484,7 @@ TEST_F(SchemaUtilTest, get_compaction_typed_columns) { schema->append_column(variant); std::unordered_set typed_paths; + typed_paths.insert(""); /* empty typed path: the existing asserts still expect exactly one output column and one typed path */ typed_paths.insert("profile.id.name"); TabletSchemaSPtr output_schema = std::make_shared(); TabletColumnPtr parent_column = std::make_shared(variant); @@ -1465,6 +1495,8 @@ TEST_F(SchemaUtilTest, get_compaction_typed_columns) { EXPECT_EQ(output_schema->num_columns(), 1); EXPECT_EQ(output_schema->column(0).type(), FieldType::OLAP_FIELD_TYPE_INT); EXPECT_EQ(paths_set_info.typed_path_set.size(), 1); + EXPECT_TRUE(paths_set_info.sparse_path_set.contains("")); + EXPECT_FALSE(paths_set_info.typed_path_set.contains("")); typed_paths.insert("abc"); EXPECT_FALSE(variant_util::VariantCompactionUtil::get_compaction_typed_columns( @@ -1532,6 +1564,7 @@ TEST_F(SchemaUtilTest,
get_compaction_subcolumns_from_subpaths) { TabletColumnPtr parent_column = std::make_shared(variant); TabletSchema::PathsSetInfo paths_set_info; + paths_set_info.sub_path_set.insert(""); /* seed an empty subpath: the column count asserted later stays at 2 and "" is expected in sparse_path_set */ paths_set_info.sub_path_set.insert("a"); paths_set_info.sub_path_set.insert("b"); doris::variant_util::PathToDataTypes path_to_data_types; @@ -1541,6 +1574,7 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns_from_subpaths) { variant_util::VariantCompactionUtil::get_compaction_subcolumns_from_subpaths( paths_set_info, parent_column, schema, path_to_data_types, sparse_paths, output_schema); EXPECT_EQ(output_schema->num_columns(), 2); + EXPECT_TRUE(paths_set_info.sparse_path_set.contains("")); for (const auto& column : output_schema->columns()) { EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_VARIANT); } @@ -1704,6 +1738,7 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns_from_data_types) { path_to_data_types[PathInData("b")] = {std::make_shared()}; // -> STRING path_to_data_types[PathInData("typed", true)] = {std::make_shared()}; path_to_data_types[PathInData("shared")] = {std::make_shared()}; + path_to_data_types[PathInData("")] = {std::make_shared()}; /* path built from an empty string: expected in sparse_path_set, not in sub_path_set */ TabletSchemaSPtr output_schema = std::make_shared(); TabletSchema::PathsSetInfo paths_set_info; @@ -1756,6 +1791,21 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns_from_data_types) { EXPECT_TRUE(paths_set_info.sub_path_set.contains("b")); EXPECT_TRUE(paths_set_info.sub_path_set.contains("shared")); EXPECT_FALSE(paths_set_info.sub_path_set.contains("typed")); + EXPECT_TRUE(paths_set_info.sparse_path_set.contains("")); + EXPECT_FALSE(paths_set_info.sub_path_set.contains("")); + + doris::variant_util::PathToDataTypes root_path_to_data_types; /* second scenario: the only key is a default-constructed PathInData() */ + root_path_to_data_types[PathInData()] = {std::make_shared()}; + TabletSchemaSPtr root_output_schema = std::make_shared(); + TabletSchema::PathsSetInfo root_paths_set_info; + + variant_util::VariantCompactionUtil::get_compaction_subcolumns_from_data_types( + root_paths_set_info,
parent_column, target, root_path_to_data_types, + root_output_schema); + + EXPECT_EQ(root_output_schema->num_columns(), 0); /* a default-constructed PathInData() key produces no column and is not recorded as the "" sparse path */ + EXPECT_FALSE(root_paths_set_info.sparse_path_set.contains("")); + EXPECT_FALSE(root_paths_set_info.sub_path_set.contains("")); } // Test has_different_structure_in_same_path function indirectly through check_variant_has_no_ambiguous_paths diff --git a/regression-test/data/variant_p0/test_variant_empty_key_sparse_bucket.out b/regression-test/data/variant_p0/test_variant_empty_key_sparse_bucket.out new file mode 100644 index 00000000000000..f28067adecb156 --- /dev/null +++ b/regression-test/data/variant_p0/test_variant_empty_key_sparse_bucket.out @@ -0,0 +1,15 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !empty_key_values -- +\N +\N +\N +\N + + +1234566 +16 +8888888 +UPPER CASE +dkdkdkdkdkd +ooaoaaaaaaa +xmxxmmmmmm diff --git a/regression-test/suites/variant_p0/test_variant_empty_key_sparse_bucket.groovy b/regression-test/suites/variant_p0/test_variant_empty_key_sparse_bucket.groovy new file mode 100644 index 00000000000000..391b9f20b2c914 --- /dev/null +++ b/regression-test/suites/variant_p0/test_variant_empty_key_sparse_bucket.groovy @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_variant_empty_key_sparse_bucket", "nonConcurrent") { /* Inserts VARIANT rows, several of which use an empty string as the JSON key, runs cumulative compaction, then reads the empty-key value back from every row. */ + sql "SET default_variant_max_subcolumns_count = 0" + sql "SET default_variant_enable_doc_mode = false" + sql "SET use_v3_storage_format = false" + sql "SET default_variant_enable_typed_paths_to_sparse = false" + sql "SET default_variant_sparse_hash_shard_count = 3" + sql "SET enable_rewrite_element_at_to_slot = true" + + sql "DROP TABLE IF EXISTS test_variant_empty_key_sparse_bucket" + sql """ + CREATE TABLE test_variant_empty_key_sparse_bucket ( + k bigint, + v variant + ) + DUPLICATE KEY(`k`) + DISTRIBUTED BY HASH(k) BUCKETS 1 + properties("replication_num" = "1", "disable_auto_compaction" = "true"); + """ + + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (1, '{"中文" : "中文", "\\\\u4E2C\\\\u6587": "unicode"}')""" + sql "TRUNCATE TABLE test_variant_empty_key_sparse_bucket" /* the row inserted above is discarded here */ + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (3, '{"": ""}')""" + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (4, '{"!@#^&*()": "11111"}')""" + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (5, '{"123": "456", "789": "012"}')""" + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (7, '{"AA": "UPPER CASE", "aa": "lower case"}')""" + sql "ALTER TABLE test_variant_empty_key_sparse_bucket RENAME COLUMN v Tags" /* rows are written both before and after the rename; the final query addresses the column as Tags */ + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (1, '{"tag_key1" : 123456}')""" + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (7, '{"": "UPPER CASE"}')""" + sql """ + INSERT INTO test_variant_empty_key_sparse_bucket VALUES + (7, '{"":16,"OpenCapStatus":0,"AccStatus":1,"AccTimeSum":481,"LowVoltage":0,"TowedStatus":0,"EncryptLng":117.23572361077638,"deviceId":"A1100614808888"}') + """ + sql """INSERT INTO
test_variant_empty_key_sparse_bucket VALUES (7, '{"": "dkdkdkdkdkd"}')""" + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (7, '{"": "xmxxmmmmmm"}')""" + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (7, '{"": "ooaoaaaaaaa"}')""" + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (7, '{"": 1234566}')""" + sql """INSERT INTO test_variant_empty_key_sparse_bucket VALUES (7, '{"": 8888888}')""" + + trigger_and_wait_compaction("test_variant_empty_key_sparse_bucket", "cumulative") /* compaction is requested explicitly and completes before the query below runs */ + + qt_empty_key_values """ + SELECT cast(Tags[''] as text) + FROM test_variant_empty_key_sparse_bucket + ORDER BY cast(Tags[''] as string) + """ +}