diff --git a/base/base/EnumReflection.h b/base/base/EnumReflection.h
new file mode 100644
index 0000000000..0d1f8ae0a4
--- /dev/null
+++ b/base/base/EnumReflection.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <magic_enum.hpp>
+#include <fmt/format.h>
+
+template <class T> concept is_enum = std::is_enum_v<T>;
+
+namespace detail
+{
+template <is_enum E, class F, size_t ...I>
+constexpr void static_for(F && f, std::index_sequence<I...>)
+{
+    (std::forward<F>(f)(std::integral_constant<E, magic_enum::enum_value<E>(I)>()) , ...);
+}
+}
+
+/**
+ * Iterate over enum values in compile-time (compile-time switch/case, loop unrolling).
+ *
+ * @example static_for<E>([](auto enum_value) { return template_func<enum_value>(); })
+ * ^ enum_value can be used as a template parameter
+ */
+template <is_enum E, class F>
+constexpr void static_for(F && f)
+{
+    constexpr size_t count = magic_enum::enum_count<E>();
+    detail::static_for<E>(std::forward<F>(f), std::make_index_sequence<count>());
+}
+
+/// Enable printing enum values as strings via fmt + magic_enum
+template <is_enum T>
+struct fmt::formatter<T> : fmt::formatter<std::string_view>
+{
+    constexpr auto format(T value, auto & format_context)
+    {
+        return formatter<std::string_view>::format(magic_enum::enum_name(value), format_context);
+    }
+};
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 8841abfca7..f6dacd9b06 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -328,6 +328,10 @@ if (USE_SIMDJSON)
     add_subdirectory (simdjson-cmake)
 endif()
 
+if (USE_RAPIDJSON)
+    add_subdirectory (rapidjson-cmake)
+endif()
+
 if (USE_BREAKPAD)
     add_subdirectory(breakpad-cmake)
 endif()
diff --git a/contrib/rapidjson-cmake/CMakeLists.txt b/contrib/rapidjson-cmake/CMakeLists.txt
new file mode 100644
index 0000000000..e6b7fa25ee
--- /dev/null
+++ b/contrib/rapidjson-cmake/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(RAPIDJSON_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/rapidjson/include")
+
+add_library(rapidjson INTERFACE)
+target_include_directories(rapidjson BEFORE INTERFACE "${RAPIDJSON_INCLUDE_DIR}")
diff --git a/src/Analyzers/ExprAnalyzer.cpp b/src/Analyzers/ExprAnalyzer.cpp
index 2713629cb6..faf20597a2 100644
--- a/src/Analyzers/ExprAnalyzer.cpp
+++ b/src/Analyzers/ExprAnalyzer.cpp
@@ -655,7 +655,7 @@ void ExprAnalyzerVisitor::processSubqueryArgsWithCoercion(ASTPtr & lhs_ast, ASTP
 {
     DataTypePtr super_type = nullptr;
     if (enable_implicit_type_conversion)
-        super_type = getLeastSupertype({lhs_type, rhs_type}, allow_extended_conversion);
+        super_type = getLeastSupertype(DataTypes{lhs_type, rhs_type}, allow_extended_conversion);
     if (!super_type)
         throw Exception("Incompatible types for IN predicate", ErrorCodes::TYPE_MISMATCH);
     if (!lhs_type->equals(*super_type))
diff --git a/src/Analyzers/QueryAnalyzer.cpp b/src/Analyzers/QueryAnalyzer.cpp
index 628bad15f0..c1a7dd153e 100644
--- a/src/Analyzers/QueryAnalyzer.cpp
+++ b/src/Analyzers/QueryAnalyzer.cpp
@@ -869,7 +869,7 @@ ScopePtr QueryAnalyzerVisitor::analyzeJoinUsing(ASTTableJoin & table_join, Scope
 {
     try
     {
-        output_type = getLeastSupertype({left_type, right_type}, allow_extended_conversion);
+        output_type = getLeastSupertype(DataTypes{left_type, right_type}, allow_extended_conversion);
     }
     catch (DB::Exception & ex)
     {
@@ -961,7 +961,7 @@ ScopePtr QueryAnalyzerVisitor::analyzeJoinUsing(ASTTableJoin & table_join, Scope
 {
     try
     {
-        output_type = getLeastSupertype({left_type, right_type}, allow_extended_conversion);
+        output_type = getLeastSupertype(DataTypes{left_type, right_type}, allow_extended_conversion);
     }
     catch (DB::Exception & ex)
     {
@@ -1159,7 +1159,7 @@ ScopePtr QueryAnalyzerVisitor::analyzeJoinOn(ASTTableJoin & table_join, ScopePtr
 {
     try
     {
-        super_type = getLeastSupertype({left_type, right_type}, allow_extended_conversion);
+        super_type = getLeastSupertype(DataTypes{left_type, right_type}, allow_extended_conversion);
     }
     catch (DB::Exception & ex)
     {
diff --git a/src/Analyzers/QueryRewriter.cpp b/src/Analyzers/QueryRewriter.cpp
index eec9163eaa..0f88dc15a2 100644
--- a/src/Analyzers/QueryRewriter.cpp
+++ b/src/Analyzers/QueryRewriter.cpp
@@ -44,6 +44,7 @@
 #include
 #include
 #include
+#include
 
 namespace DB
 {
@@ -433,7 +434,10 @@ namespace
     }
 
     StoragePtr storage = joined_tables.getLeftTableStorage();
-    rewrite_context.result.emplace(source_columns, storage, storage ? storage->getInMemoryMetadataPtr() : nullptr);
+    StorageSnapshotPtr storage_snapshot = nullptr;
+    if (storage)
+        storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context);
+    rewrite_context.result.emplace(source_columns, storage, storage_snapshot);
     auto & result = *(rewrite_context.result);
 
     if (tables_with_columns.size() > 1)
@@ -486,8 +490,15 @@ namespace
         for (const auto & col : result.analyzed_join->columnsFromJoinedTable())
             all_source_columns_set.insert(col.name);
     }
-    normalizeNameAndAliases(node, result.aliases, all_source_columns_set, settings, context, result.storage,
-        result.metadata_snapshot, graphviz_index);
+    normalizeNameAndAliases(
+        node,
+        result.aliases,
+        all_source_columns_set,
+        settings,
+        context,
+        result.storage,
+        result.storage_snapshot ? result.storage_snapshot->metadata : nullptr,
+        graphviz_index);
 }
 
 // 5. Call `TreeOptimizer` since some optimizations will change the query result
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index bec838c1a0..b06fab1387 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -613,6 +613,14 @@ if (USE_LIBPQXX)
     dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${LIBPQXX_INCLUDE_DIR})
 endif()
 
+if (USE_SIMDJSON)
+    dbms_target_link_libraries(PUBLIC simdjson)
+endif()
+
+if (USE_RAPIDJSON)
+    dbms_target_link_libraries(PUBLIC rapidjson)
+endif()
+
 if(USE_CPP_JIEBA)
     dbms_target_link_libraries(PUBLIC ${CPP_JIEBA_LIBRARY})
 endif()
@@ -621,7 +629,6 @@ if (USE_TSQUERY)
     dbms_target_link_libraries(PUBLIC ${TSQUERY_LIBRARY})
 endif()
 
-
 if (TARGET ch_contrib::ulid)
     dbms_target_link_libraries (PUBLIC ch_contrib::ulid)
 endif()
@@ -639,6 +646,7 @@ if (USE_NLP)
 endif()
 
 dbms_target_include_directories (SYSTEM BEFORE PUBLIC ${PARALLEL_HASHMAP_INCLUDE_DIR})
+dbms_target_include_directories (SYSTEM BEFORE PUBLIC ${MAGIC_ENUM_INCLUDE_DIR})
 
 include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake")
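For context on the `base/base/EnumReflection.h` helpers added above, a minimal usage sketch (the `Stage` enum and the print loop are hypothetical, assuming the repo's bundled magic_enum and fmt):

```cpp
#include <base/EnumReflection.h>
#include <fmt/format.h>

// Hypothetical enum, used only for this illustration.
enum class Stage { Parse, Analyze, Execute };

int main()
{
    // The lambda body is instantiated once per enumerator at compile time.
    static_for<Stage>([](auto enum_value)
    {
        constexpr Stage stage = decltype(enum_value)::value; // usable as a template parameter
        // The fmt::formatter specialization above prints the enumerator's name.
        fmt::print("stage {} = {}\n", static_cast<int>(stage), stage);
    });
}
```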
diff --git a/src/Catalog/Catalog.cpp b/src/Catalog/Catalog.cpp
index 6627642971..9d9f47444a 100644
--- a/src/Catalog/Catalog.cpp
+++ b/src/Catalog/Catalog.cpp
@@ -13,7 +13,13 @@
  * limitations under the License.
  */
 
+#include
 #include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -33,6 +39,11 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
 #include
 // #include
 // #include
@@ -1208,6 +1219,22 @@
             ProfileEvents::SetWorkerGroupForTableFailed);
     }
 
+    void Catalog::initStorageObjectSchema(StoragePtr & res)
+    {
+        // Load the dynamic object column schema
+        if (res && hasDynamicSubcolumns(res->getInMemoryMetadata().getColumns()))
+        {
+            auto cnch_table = std::dynamic_pointer_cast(res);
+
+            if (cnch_table)
+            {
+                auto assembled_schema = tryGetTableObjectAssembledSchema(res->getStorageUUID());
+                auto partial_schemas = tryGetTableObjectPartialSchemas(res->getStorageUUID());
+                cnch_table->resetObjectSchemas(assembled_schema, partial_schemas);
+            }
+        }
+    }
+
     StoragePtr Catalog::getTable(const Context & query_context, const String & database, const String & name, const TxnTimestamp & ts)
     {
         StoragePtr res = nullptr;
@@ -1245,6 +1272,7 @@
 
         res = createTableFromDataModel(query_context, *table);
 
+        initStorageObjectSchema(res);
         /// TODO: (zuochuang.zema, guanzhe.andy) handle TimeTravel
         if (auto * cnch_merge_tree = dynamic_cast(res.get()))
         {
@@ -1313,6 +1341,9 @@
         if (!table)
             return;
         res = createTableFromDataModel(query_context, *table);
+
+        initStorageObjectSchema(res);
+
         /// Try insert the storage into cache.
         if (res && cache_manager)
         {
@@ -6241,6 +6272,208 @@
         d.set_definition(create_query);
     }
 
+    void Catalog::appendObjectPartialSchema(
+        const StoragePtr & table, const TxnTimestamp & txn_id, const MutableMergeTreeDataPartsCNCHVector & parts)
+    {
+        // Per-txn partial schema; may cover multiple object columns.
+        auto cnch_table = std::dynamic_pointer_cast(table);
+        if (!cnch_table)
+            return;
+
+        auto subcolumns_limit = cnch_table->getSettings()->json_subcolumns_threshold;
+
+        // Check schema compatibility and merge the part schemas.
+        auto partial_schema = DB::getConcreteObjectColumns(
+            parts.begin(), parts.end(), table->getInMemoryMetadata().columns, [](const auto & part) { return part->getColumns(); });
+
+        // Compare with the existing schema to check whether an insert is needed.
+        // Attention: this comparison scans the existing partial schemas from the meta store and may consume significant meta store resources.
+        // If it causes a meta store performance regression, just remove this comparison.
+        auto assembled_schema = tryGetTableObjectAssembledSchema(table->getStorageUUID());
+        auto existed_partial_schemas = tryGetTableObjectPartialSchemas(table->getStorageUUID());
+        std::vector<TxnTimestamp> existed_partial_schema_txnids;
+        existed_partial_schema_txnids.reserve(existed_partial_schemas.size());
+        std::for_each(
+            existed_partial_schemas.begin(),
+            existed_partial_schemas.end(),
+            [&existed_partial_schema_txnids](const auto & existed_partial_schema) {
+                existed_partial_schema_txnids.emplace_back(existed_partial_schema.first);
+            });
+        auto committed_partial_schema_txnids = filterUncommittedObjectPartialSchemas(existed_partial_schema_txnids);
+        std::vector<ColumnsDescription> committed_partial_schema_list;
+        committed_partial_schema_list.reserve(committed_partial_schema_txnids.size() + 2);
+        std::for_each(
+            committed_partial_schema_txnids.begin(),
+            committed_partial_schema_txnids.end(),
+            [&committed_partial_schema_list, &existed_partial_schemas](const auto & txn_id) {
+                committed_partial_schema_list.emplace_back(existed_partial_schemas[txn_id]);
+            });
+
+        committed_partial_schema_list.emplace_back(assembled_schema);
+
+        auto existed_assembled_schema = DB::getConcreteObjectColumns(
+            committed_partial_schema_list.begin(),
+            committed_partial_schema_list.end(),
+            cnch_table->getInMemoryMetadata().getColumns(),
+            [](const auto & partial_schema_) { return partial_schema_; });
+
+        committed_partial_schema_list.emplace_back(partial_schema);
+        auto new_assembled_schema = DB::getConcreteObjectColumns(
+            committed_partial_schema_list.begin(),
+            committed_partial_schema_list.end(),
+            cnch_table->getInMemoryMetadata().getColumns(),
+            [](const auto & partial_schema_) { return partial_schema_; });
+
+        if (new_assembled_schema != existed_assembled_schema)
+        {
+            DB::limitObjectSubcolumns(new_assembled_schema, subcolumns_limit);
+
+            meta_proxy->appendObjectPartialSchema(
+                name_space, UUIDHelpers::UUIDToString(table->getStorageUUID()), txn_id.toUInt64(), partial_schema.toString());
+            cnch_table->appendObjectPartialSchema(txn_id, partial_schema);
+
+            LOG_DEBUG(
+                log,
+                "Append dynamic object partial schema [TxnTimestamp:{}, Partial Schema:{}]",
+                txn_id.toString(),
+                partial_schema.toString());
+        }
+    }
+
+    ObjectAssembledSchema Catalog::tryGetTableObjectAssembledSchema(const UUID & table_uuid) const
+    {
+        auto serialized_assembled_schema = meta_proxy->getObjectAssembledSchema(name_space, UUIDHelpers::UUIDToString(table_uuid));
+
+        if (serialized_assembled_schema.empty())
+            return ColumnsDescription();
+        return ColumnsDescription::parse(serialized_assembled_schema);
+    }
+
+    ObjectPartialSchemas Catalog::tryGetTableObjectPartialSchemas(const UUID & table_uuid, const int & limit_size) const
+    {
+        auto serialized_partial_schemas
+            = meta_proxy->scanObjectPartialSchemas(name_space, UUIDHelpers::UUIDToString(table_uuid), limit_size);
+        ObjectPartialSchemas partial_schemas;
+        partial_schemas.reserve(serialized_partial_schemas.size());
+        std::for_each(
+            serialized_partial_schemas.begin(),
+            serialized_partial_schemas.end(),
+            [&partial_schemas](std::pair<String, String> serialized_partial_schema) {
+                partial_schemas.emplace(
+                    std::stoll(serialized_partial_schema.first),
+                    ColumnsDescription::parse(serialized_partial_schema.second));
+            });
+
+        return partial_schemas;
+    }
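The two functions above rely on `ColumnsDescription::toString()`/`ColumnsDescription::parse()` as the wire format for schemas in the meta store. A minimal round-trip sketch of that contract (assumes the ClickHouse `ColumnsDescription` API; the function name is hypothetical):

```cpp
#include <cassert>
#include <Storages/ColumnsDescription.h>

using namespace DB;

// Round-trip sketch: schemas cross the KV meta store as serialized strings.
void roundTripExample(const ColumnsDescription & partial_schema)
{
    String serialized = partial_schema.toString();            // value stored under the schema key
    ColumnsDescription restored = ColumnsDescription::parse(serialized);
    assert(restored == partial_schema);                       // parse() inverts toString()
}
```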
+    bool Catalog::resetObjectAssembledSchemaAndPurgePartialSchemas(
+        const UUID & table_uuid,
+        const ObjectAssembledSchema & old_assembled_schema,
+        const ObjectAssembledSchema & new_assembled_schema,
+        const std::vector<TxnTimestamp> & partial_schema_txnids)
+    {
+        return meta_proxy->resetObjectAssembledSchemaAndPurgePartialSchemas(
+            name_space,
+            UUIDHelpers::UUIDToString(table_uuid),
+            old_assembled_schema.empty() ? "" : old_assembled_schema.toString(),
+            new_assembled_schema.toString(),
+            partial_schema_txnids);
+    }
+
+    std::vector<TxnTimestamp> Catalog::filterUncommittedObjectPartialSchemas(std::vector<TxnTimestamp> & unfiltered_partial_schema_txnids)
+    {
+        std::vector<TxnTimestamp> committed_partial_schema_txnids;
+        std::unordered_map unfiltered_partial_schema_txnid_map;
+        unfiltered_partial_schema_txnid_map.reserve(unfiltered_partial_schema_txnids.size());
+        std::for_each(
+            unfiltered_partial_schema_txnids.begin(),
+            unfiltered_partial_schema_txnids.end(),
+            [&unfiltered_partial_schema_txnid_map](const auto & txn_id) { unfiltered_partial_schema_txnid_map[txn_id] = txn_id; });
+
+        // Query the partial schema statuses in the meta store.
+        auto partial_schema_statuses = batchGetObjectPartialSchemaStatuses(unfiltered_partial_schema_txnids);
+        std::for_each(
+            partial_schema_statuses.begin(),
+            partial_schema_statuses.end(),
+            [&committed_partial_schema_txnids, &unfiltered_partial_schema_txnid_map](const auto & partial_schema_status_pair) {
+                if (partial_schema_status_pair.second == ObjectPartialSchemaStatus::Finished)
+                {
+                    committed_partial_schema_txnids.emplace_back(partial_schema_status_pair.first);
+                    unfiltered_partial_schema_txnid_map.erase(partial_schema_status_pair.first);
+                }
+            });
+
+        // Query the remaining partial schemas by their corresponding txn record statuses.
+        unfiltered_partial_schema_txnids.clear();
+        std::transform(
+            unfiltered_partial_schema_txnid_map.begin(),
+            unfiltered_partial_schema_txnid_map.end(),
+            std::back_inserter(unfiltered_partial_schema_txnids),
+            [](const auto & txn_id_pair) { return txn_id_pair.first; });
+        auto txn_record_statuses = getTransactionRecords(unfiltered_partial_schema_txnids, 10000);
+
+        std::for_each(
+            txn_record_statuses.begin(), txn_record_statuses.end(), [&committed_partial_schema_txnids](TransactionRecord txn_record) {
+                auto txn_id = txn_record.txnID();
+                auto status = txn_record.status();
+                if (status == CnchTransactionStatus::Finished)
+                    committed_partial_schema_txnids.emplace_back(txn_id);
+            });
+
+        return committed_partial_schema_txnids;
+    }
+
+    ObjectPartialSchemaStatuses
+    Catalog::batchGetObjectPartialSchemaStatuses(const std::vector<TxnTimestamp> & txn_ids, const int & batch_size)
+    {
+        ObjectPartialSchemaStatuses partial_schema_statuses;
+        size_t total_txn_size = txn_ids.size();
+
+        partial_schema_statuses.reserve(total_txn_size);
+
+        auto fetch_records_in_batch = [&](size_t begin, size_t end) {
+            auto statuses_in_metastore = meta_proxy->batchGetObjectPartialSchemaStatuses(
+                name_space, std::vector<TxnTimestamp>(txn_ids.begin() + begin, txn_ids.begin() + end));
+
+            for (const auto & serialized_partial_schema_status : statuses_in_metastore)
+            {
+                auto txn_id = serialized_partial_schema_status.first;
+                auto status = ObjectSchemas::deserializeObjectPartialSchemaStatus(serialized_partial_schema_status.second);
+                partial_schema_statuses.emplace(txn_id, status);
+            }
+        };
+
+        if (batch_size > 0)
+        {
+            size_t batch_count{0};
+            while (batch_count + batch_size < total_txn_size)
+            {
+                fetch_records_in_batch(batch_count, batch_count + batch_size);
+                batch_count += batch_size;
+            }
+            fetch_records_in_batch(batch_count, total_txn_size);
+        }
+        else
+            fetch_records_in_batch(0, total_txn_size);
+
+        return partial_schema_statuses;
+    }
+
+    void Catalog::batchDeleteObjectPartialSchemaStatus(const std::vector<TxnTimestamp> & txn_ids)
+    {
+        meta_proxy->batchDeletePartialSchemaStatus(name_space, txn_ids);
+    }
+
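The assembly above folds committed partial schemas and the previous assembled schema into a new assembled schema via `getConcreteObjectColumns`. A self-contained toy of that merge shape (real code computes a least common type per JSON subcolumn; this sketch just unions keys, all names hypothetical):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy stand-in for a schema: subcolumn name -> type name.
using ToySchema = std::map<std::string, std::string>;

// Merge partial schemas into one assembled schema. The real
// getConcreteObjectColumns computes a least common type per subcolumn;
// here we simply union the keys (first writer wins).
ToySchema assemble(const std::vector<ToySchema> & partials)
{
    ToySchema assembled;
    for (const auto & partial : partials)
        for (const auto & [name, type] : partial)
            assembled.emplace(name, type);
    return assembled;
}

int main()
{
    ToySchema txn1{{"json.user.id", "Int64"}};
    ToySchema txn2{{"json.user.name", "String"}};
    for (const auto & [k, v] : assemble({txn1, txn2}))
        std::cout << k << " : " << v << '\n';   // union of both partials
}
```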
+    void Catalog::commitObjectPartialSchema(const TxnTimestamp & txn_id)
+    {
+        meta_proxy->updateObjectPartialSchemaStatus(name_space, txn_id, ObjectPartialSchemaStatus::Finished);
+    }
+
+    void Catalog::abortObjectPartialSchema(const TxnTimestamp & txn_id)
+    {
+        meta_proxy->updateObjectPartialSchemaStatus(name_space, txn_id, ObjectPartialSchemaStatus::Aborted);
+    }
+
     std::unordered_map> Catalog::loadPartitionMetricsSnapshotFromMetastore(const String & table_uuid)
     {
diff --git a/src/Catalog/Catalog.h b/src/Catalog/Catalog.h
index 39d43feccf..85f3e4fa51 100644
--- a/src/Catalog/Catalog.h
+++ b/src/Catalog/Catalog.h
@@ -16,6 +16,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -34,12 +35,15 @@
 #include
 #include
 #include
+#include "common/types.h"
+#include
 #include
 #include
 #include
-#include
 #include
-#include "Catalog/IMetastore.h"
+#include "Storages/IStorage_fwd.h"
+#include
+#include
 // #include
 
 namespace DB::ErrorCodes
@@ -755,6 +759,25 @@ public:
 
     DeleteBitmapMetaPtrVector listDetachedDeleteBitmaps(const MergeTreeMetaBase & storage, const AttachFilter & filter);
 
+    // Append a partial object column schema within a txn.
+    void
+    appendObjectPartialSchema(const StoragePtr & table, const TxnTimestamp & txn_id, const MutableMergeTreeDataPartsCNCHVector & parts);
+    ObjectAssembledSchema tryGetTableObjectAssembledSchema(const UUID & table_uuid) const;
+    std::vector<TxnTimestamp> filterUncommittedObjectPartialSchemas(std::vector<TxnTimestamp> & unfiltered_partial_schema_txnids);
+    // @param limit_size -1 means no limit: read all available partial schemas.
+    ObjectPartialSchemas tryGetTableObjectPartialSchemas(const UUID & table_uuid, const int & limit_size = -1) const;
+    bool resetObjectAssembledSchemaAndPurgePartialSchemas(
+        const UUID & table_uuid,
+        const ObjectAssembledSchema & old_assembled_schema,
+        const ObjectAssembledSchema & new_assembled_schema,
+        const std::vector<TxnTimestamp> & partial_schema_txnids);
+
+    ObjectPartialSchemaStatuses batchGetObjectPartialSchemaStatuses(const std::vector<TxnTimestamp> & txn_ids, const int & batch_size = 10000);
+    void batchDeleteObjectPartialSchemaStatus(const std::vector<TxnTimestamp> & txn_ids);
+    void commitObjectPartialSchema(const TxnTimestamp & txn_id);
+    void abortObjectPartialSchema(const TxnTimestamp & txn_id);
+    void initStorageObjectSchema(StoragePtr & res);
+
     // Access Entities
     std::optional tryGetAccessEntity(EntityType type, const String & name);
     std::vector getAllAccessEntities(EntityType type);
@@ -762,6 +785,7 @@ public:
     void dropAccessEntity(EntityType type, const UUID & uuid, const String & name);
     void putAccessEntity(EntityType type, AccessEntityModel & new_access_entity, AccessEntityModel & old_access_entity, bool replace_if_exists = true);
 
+
 private:
     Poco::Logger * log = &Poco::Logger::get("Catalog");
     Context & context;
diff --git a/src/Catalog/MetastoreProxy.cpp b/src/Catalog/MetastoreProxy.cpp
index 8df66b31fe..77737ec840 100644
--- a/src/Catalog/MetastoreProxy.cpp
+++ b/src/Catalog/MetastoreProxy.cpp
@@ -26,10 +26,12 @@
 #include
 #include
 #include
-#include "common/types.h"
+#include
 #include
-#include "Catalog/MetastoreByteKVImpl.h"
-#include "Interpreters/executeQuery.h"
+#include
+#include
+#include
+#include
 
 namespace DB::ErrorCodes
 {
@@ -1585,6 +1587,11 @@ void MetastoreProxy::setBGJobStatus(const String & name_space, const String & uu
         metastore_ptr->put(
             dedupWorkerBGJobStatusKey(name_space, uuid),
             String{BGJobStatusInCatalog::serializeToChar(status)}
         );
+    else if (type == CnchBGThreadType::ObjectSchemaAssemble)
+        metastore_ptr->put(
+            objectSchemaAssembleBGJobStatusKey(name_space, uuid),
+            String{BGJobStatusInCatalog::serializeToChar(status)}
+        );
     else
         throw Exception(String{"persistent status is not supported for "} + toString(type), ErrorCodes::LOGICAL_ERROR);
 }
@@ -1604,6 +1611,8 @@ std::optional MetastoreProxy::getBGJobStatus(const String &
         metastore_ptr->get(mmysqlBGJobStatusKey(name_space, uuid), status_store_data);
     else if (type == CnchBGThreadType::DedupWorker)
         metastore_ptr->get(dedupWorkerBGJobStatusKey(name_space, uuid), status_store_data);
+    else if (type == CnchBGThreadType::ObjectSchemaAssemble)
+        metastore_ptr->get(objectSchemaAssembleBGJobStatusKey(name_space, uuid), status_store_data);
     else
         throw Exception(String{"persistent status is not supported for "} + toString(type), ErrorCodes::LOGICAL_ERROR);
 
@@ -1638,6 +1647,8 @@ std::unordered_map MetastoreProxy::getBGJobStatuses(co
         return metastore_ptr->getByPrefix(allMmysqlBGJobStatusKeyPrefix(name_space));
     else if (type == CnchBGThreadType::DedupWorker)
         return metastore_ptr->getByPrefix(allDedupWorkerBGJobStatusKeyPrefix(name_space));
+    else if (type == CnchBGThreadType::ObjectSchemaAssemble)
+        return metastore_ptr->getByPrefix(allObjectSchemaAssembleBGJobStatusKeyPrefix(name_space));
     else
         throw Exception(String{"persistent status is not supported for "} + toString(type), ErrorCodes::LOGICAL_ERROR);
 };
@@ -1679,6 +1690,9 @@ void MetastoreProxy::dropBGJobStatus(const String & name_space, const String & u
         case CnchBGThreadType::DedupWorker:
             metastore_ptr->drop(dedupWorkerBGJobStatusKey(name_space, uuid));
             break;
+        case CnchBGThreadType::ObjectSchemaAssemble:
+            metastore_ptr->drop(objectSchemaAssembleBGJobStatusKey(name_space, uuid));
+            break;
         default:
             throw Exception(String{"persistent status is not supported for "} + toString(type), ErrorCodes::LOGICAL_ERROR);
     }
@@ -2710,6 +2724,139 @@ IMetaStore::IteratorPtr MetastoreProxy::getItemsInTrash(const String & name_spac
     return metastore_ptr->getByPrefix(trashItemsPrefix(name_space, table_uuid), limit);
 }
 
+String MetastoreProxy::extractTxnIDFromPartialSchemaKey(const String & partial_schema_key)
+{
+    auto pos = partial_schema_key.find_last_of('_');
+    return partial_schema_key.substr(pos + 1, String::npos);
+}
+
+void MetastoreProxy::appendObjectPartialSchema(
+    const String & name_space, const String & table_uuid, const UInt64 & txn_id, const SerializedObjectSchema & partial_schema)
+{
+    BatchCommitRequest batch_write;
+    batch_write.AddPut(SinglePutRequest(partialSchemaKey(name_space, table_uuid, txn_id), partial_schema));
+    batch_write.AddPut(SinglePutRequest(
+        partialSchemaStatusKey(name_space, txn_id),
+        ObjectSchemas::serializeObjectPartialSchemaStatus(ObjectPartialSchemaStatus::Running)));
+
+    BatchCommitResponse store_response;
+    metastore_ptr->batchWrite(batch_write, store_response);
+}
+
+SerializedObjectSchema MetastoreProxy::getObjectPartialSchema(const String & name_space, const String & table_uuid, const UInt64 & txn_id)
+{
+    SerializedObjectSchema partial_schema;
+    metastore_ptr->get(partialSchemaKey(name_space, table_uuid, txn_id), partial_schema);
+    if (partial_schema.empty())
+        return "";
+
+    return partial_schema;
+}
+SerializedObjectSchemas MetastoreProxy::scanObjectPartialSchemas(const String & name_space, const String & table_uuid, const UInt64 & limit_size)
+{
+    auto scan_prefix = partialSchemaPrefix(name_space, table_uuid);
+    UInt64 scan_limit = limit_size <= 0 ? 0 : limit_size;
+
+    auto scan_iterator = metastore_ptr->getByPrefix(scan_prefix, scan_limit);
+    SerializedObjectSchemas serialized_object_schemas;
+    while (scan_iterator->next())
+    {
+        auto key = scan_iterator->key();
+        serialized_object_schemas.emplace(extractTxnIDFromPartialSchemaKey(key), scan_iterator->value());
+    }
+
+    return serialized_object_schemas;
+}
+
+SerializedObjectSchema MetastoreProxy::getObjectAssembledSchema(const String & name_space, const String & table_uuid)
+{
+    SerializedObjectSchema assembled_schema;
+    metastore_ptr->get(assembledSchemaKey(name_space, table_uuid), assembled_schema);
+    if (assembled_schema.empty())
+        return "";
+
+    return assembled_schema;
+}
+
+bool MetastoreProxy::resetObjectAssembledSchemaAndPurgePartialSchemas(
+    const String & name_space,
+    const String & table_uuid,
+    const SerializedObjectSchema & old_assembled_schema,
+    const SerializedObjectSchema & new_assembled_schema,
+    const std::vector<TxnTimestamp> & partial_schema_txnids)
+{
+    Poco::Logger * log = &Poco::Logger::get(__func__);
+
+    BatchCommitRequest batch_write;
+    bool if_not_exists = false;
+    if (old_assembled_schema.empty())
+        if_not_exists = true;
+
+    auto update_request = SinglePutRequest(assembledSchemaKey(name_space, table_uuid), new_assembled_schema, old_assembled_schema);
+    update_request.if_not_exists = if_not_exists;
+    batch_write.AddPut(update_request);
+
+    for (const auto & txn_id : partial_schema_txnids)
+        batch_write.AddDelete(partialSchemaKey(name_space, table_uuid, txn_id.toUInt64()));
+
+    BatchCommitResponse store_response;
+    try
+    {
+        metastore_ptr->batchWrite(batch_write, store_response);
+        return true;
+    }
+    catch (Exception & e)
+    {
+        if (e.code() == ErrorCodes::METASTORE_COMMIT_CAS_FAILURE)
+        {
+            LOG_WARNING(
+                log,
+                fmt::format(
+                    "Object schema refresh CAS put failed with old schema: {} and new schema: {}", old_assembled_schema, new_assembled_schema));
+            return false;
+        }
+        else
+            throw e;
+    }
+}
+
+SerializedObjectSchemaStatuses MetastoreProxy::batchGetObjectPartialSchemaStatuses(const String & name_space, const std::vector<TxnTimestamp> & txn_ids)
+{
+    Strings keys;
+    for (const auto & txn_id : txn_ids)
+        keys.emplace_back(partialSchemaStatusKey(name_space, txn_id.toUInt64()));
+    auto serialized_statuses_in_metastore = metastore_ptr->multiGet(keys);
+    SerializedObjectSchemaStatuses serialized_statuses;
+    serialized_statuses.reserve(serialized_statuses_in_metastore.size());
+    for (size_t i = 0; i < serialized_statuses_in_metastore.size(); i++)
+    {
+        auto txn_id = txn_ids[i];
+        auto status = serialized_statuses_in_metastore[i].first;
+        if (status.empty())
+            serialized_statuses.emplace(txn_id, ObjectSchemas::serializeObjectPartialSchemaStatus(ObjectPartialSchemaStatus::Finished));
+        else
+            serialized_statuses.emplace(txn_id, status);
+    }
+
+    return serialized_statuses;
+}
+
+void MetastoreProxy::batchDeletePartialSchemaStatus(const String & name_space, const std::vector<TxnTimestamp> & txn_ids)
+{
+    BatchCommitRequest batch_delete;
+    for (const auto & txn_id : txn_ids)
+        batch_delete.AddDelete(partialSchemaStatusKey(name_space, txn_id.toUInt64()));
+
+    BatchCommitResponse delete_result;
+    metastore_ptr->batchWrite(batch_delete, delete_result);
+}
+
+void MetastoreProxy::updateObjectPartialSchemaStatus(const String & name_space, const TxnTimestamp & txn_id, const ObjectPartialSchemaStatus & status)
+{
+    metastore_ptr->put(partialSchemaStatusKey(name_space, txn_id), ObjectSchemas::serializeObjectPartialSchemaStatus(status));
+}
+
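`resetObjectAssembledSchemaAndPurgePartialSchemas` returns `false` on a CAS conflict, leaving the caller to re-read and retry on its next round. A toy of that read-merge-CAS pattern, with `std::atomic` standing in for the meta store's compare-and-swap put (the merge function is hypothetical):

```cpp
#include <atomic>

int mergeIn(int assembled, int partial) { return assembled | partial; } // hypothetical merge

// Toy compare-and-swap loop: std::atomic stands in for the CAS put
// on the assembled-schema key.
void refreshAssembled(std::atomic<int> & stored_schema, int partial)
{
    int old_schema = stored_schema.load();
    int new_schema = mergeIn(old_schema, partial);
    // On CAS failure another assembler won; re-read and re-merge,
    // as the background thread does on its next scheduled run.
    while (!stored_schema.compare_exchange_weak(old_schema, new_schema))
        new_schema = mergeIn(old_schema, partial);
}
```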
 IMetaStore::IteratorPtr MetastoreProxy::getAllDeleteBitmaps(const String & name_space, const String & table_uuid)
 {
     return metastore_ptr->getByPrefix(deleteBitmapPrefix(name_space, table_uuid));
 }
diff --git a/src/Catalog/MetastoreProxy.h b/src/Catalog/MetastoreProxy.h
index 7eec8eb9cd..160afdebdb 100644
--- a/src/Catalog/MetastoreProxy.h
+++ b/src/Catalog/MetastoreProxy.h
@@ -22,6 +22,7 @@
 #include
 // #include
 #include
+#include
 #include
 #include
 #include
@@ -36,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -95,6 +97,7 @@ namespace DB::Catalog
 #define PARTGC_BG_JOB_STATUS "PARTGC_BGJS_"
 #define CONSUMER_BG_JOB_STATUS "CONSUMER_BGJS_"
 #define DEDUPWORKER_BG_JOB_STATUS "DEDUPWORKER_BGJS_"
+#define OBJECT_SCHEMA_ASSEMBLE_BG_JOB_STATUS "OBJECT_SCHEMA_ASSEMBLE_BGJS_"
 #define PREALLOCATE_VW "PVW_"
 #define DICTIONARY_STORE_PREFIX "DIC_"
 #define RESOURCE_GROUP_PREFIX "RG_"
@@ -113,6 +116,9 @@ namespace DB::Catalog
 #define MATERIALIZEDMYSQL_PREFIX "MMYSQL_"
 #define MATERIALIZEDMYSQL_BG_JOB_STATUS "MATERIALIZEDMYSQL_BGJS_"
 #define DETACHED_DELETE_BITMAP_PREFIX "DDLB_"
+#define OBJECT_PARTIAL_SCHEMA_PREFIX "PS_"
+#define OBJECT_ASSEMBLED_SCHEMA_PREFIX "AS_"
+#define OBJECT_PARTIAL_SCHEMA_STATUS_PREFIX "PSS_"
 #define PARTITION_PARTS_METRICS_SNAPSHOT_PREFIX "PPS_"
 #define TABLE_TRASHITEMS_METRICS_SNAPSHOT_PREFIX "TTS_"
 #define DICTIONARY_BUCKET_UPDATE_TIME_PREFIX "DBUT_"
@@ -143,6 +149,10 @@ static EntityMetastorePrefix getEntityMetastorePrefix(EntityType type)
     }
 }
 
+using SerializedObjectSchemas = std::unordered_map<String, String>;
+using SerializedObjectSchemaStatuses = std::unordered_map<UInt64, String>;
+using SerializedObjectSchema = String;
+
 static std::shared_ptr getFDBInstance(const String & cluster_config_path)
 {
     /// Notice: A single process can only have one fdb instance
@@ -592,6 +602,16 @@ public:
         return allDedupWorkerBGJobStatusKeyPrefix(name_space) + uuid;
     }
 
+    static std::string allObjectSchemaAssembleBGJobStatusKeyPrefix(const std::string & name_space)
+    {
+        return escapeString(name_space) + '_' + OBJECT_SCHEMA_ASSEMBLE_BG_JOB_STATUS;
+    }
+
+    static std::string objectSchemaAssembleBGJobStatusKey(const std::string & name_space, const std::string & uuid)
+    {
+        return allObjectSchemaAssembleBGJobStatusKeyPrefix(name_space) + uuid;
+    }
+
     static UUID parseUUIDFromBGJobStatusKey(const std::string & key);
 
     static std::string preallocateVW(const std::string & name_space, const std::string & uuid)
@@ -765,6 +785,26 @@ public:
         return escapeString(name_space) + "_" + DATA_ITEM_TRASH_PREFIX + uuid + "_";
     }
 
+    static String partialSchemaPrefix(const String & name_space, const String & table_uuid)
+    {
+        return escapeString(name_space) + "_" + OBJECT_PARTIAL_SCHEMA_PREFIX + table_uuid + "_";
+    }
+
+    static String partialSchemaKey(const String & name_space, const String & table_uuid, const UInt64 & txn_id)
+    {
+        return escapeString(name_space) + "_" + OBJECT_PARTIAL_SCHEMA_PREFIX + table_uuid + "_" + toString(txn_id);
+    }
+
+    static String assembledSchemaKey(const String & name_space, const String & table_uuid)
+    {
+        return escapeString(name_space) + "_" + OBJECT_ASSEMBLED_SCHEMA_PREFIX + table_uuid;
+    }
+
+    static String partialSchemaStatusKey(const String & name_space, const UInt64 & txn_id)
+    {
+        return escapeString(name_space) + "-" + OBJECT_PARTIAL_SCHEMA_STATUS_PREFIX + "_" + toString(txn_id);
+    }
+
     static String partitionPartsMetricsSnapshotPrefix(const String & name_space, const String & table_uuid, const String & partition_id)
     {
         return escapeString(name_space) + "_" + PARTITION_PARTS_METRICS_SNAPSHOT_PREFIX + table_uuid + "_" + partition_id;
@@ -1105,6 +1145,25 @@ public:
      * @param limit Limit the results, disabled by passing 0.
      */
     IMetaStore::IteratorPtr getItemsInTrash(const String & name_space, const String & table_uuid, const size_t & limit);
+
+    // Object column schema related APIs
+    static String extractTxnIDFromPartialSchemaKey(const String & partial_schema_key);
+    void appendObjectPartialSchema(
+        const String & name_space, const String & table_uuid, const UInt64 & txn_id, const SerializedObjectSchema & partial_schema);
+    SerializedObjectSchema getObjectPartialSchema(const String & name_space, const String & table_uuid, const UInt64 & txn_id);
+    SerializedObjectSchemas scanObjectPartialSchemas(const String & name_space, const String & table_uuid, const UInt64 & limit_size);
+    SerializedObjectSchema getObjectAssembledSchema(const String & name_space, const String & table_uuid);
+    bool resetObjectAssembledSchemaAndPurgePartialSchemas(
+        const String & name_space,
+        const String & table_uuid,
+        const SerializedObjectSchema & old_assembled_schema,
+        const SerializedObjectSchema & new_assembled_schema,
+        const std::vector<TxnTimestamp> & partial_schema_txnids);
+
+    SerializedObjectSchemaStatuses batchGetObjectPartialSchemaStatuses(const String & name_space, const std::vector<TxnTimestamp> & txn_ids);
+    void batchDeletePartialSchemaStatus(const String & name_space, const std::vector<TxnTimestamp> & txn_ids);
+    void updateObjectPartialSchemaStatus(const String & name_space, const TxnTimestamp & txn_id, const ObjectPartialSchemaStatus & status);
+
     IMetaStore::IteratorPtr getAllDeleteBitmaps(const String & name_space, const String & table_uuid);
 
     /**
diff --git a/src/CloudServices/CnchBGThreadCommon.h b/src/CloudServices/CnchBGThreadCommon.h
index f22a26846b..bd3685e795 100644
--- a/src/CloudServices/CnchBGThreadCommon.h
+++ b/src/CloudServices/CnchBGThreadCommon.h
@@ -35,9 +35,10 @@ namespace CnchBGThread
         DedupWorker = 5,
         Clustering = 6,
         MaterializedMySQL = 7,
+        ObjectSchemaAssemble = 8,
 
         ServerMinType = PartGC,
-        ServerMaxType = MaterializedMySQL,
+        ServerMaxType = ObjectSchemaAssemble,
 
         GlobalGC = 20,
         /// reserve several entries
         TxnGC = 21,
@@ -96,6 +97,8 @@ constexpr auto toString(CnchBGThreadType type)
             return "TxnGCThread";
         case CnchBGThreadType::ResourceReport:
             return "ResourceReport";
+        case CnchBGThreadType::ObjectSchemaAssemble:
+            return "ObjectSchemaAssembleThread";
         case CnchBGThreadType::MemoryBuffer:
             return "MemoryBuffer";
         case CnchBGThreadType::MaterializedMySQL:
diff --git a/src/CloudServices/CnchBGThreadsMap.cpp b/src/CloudServices/CnchBGThreadsMap.cpp
index 8cc3ba2094..59b9516e30 100644
--- a/src/CloudServices/CnchBGThreadsMap.cpp
+++ b/src/CloudServices/CnchBGThreadsMap.cpp
@@ -23,6 +23,8 @@
 #include
 #include
 #include
+#include
+
 #include
 #include
 
@@ -71,6 +73,10 @@ CnchBGThreadPtr CnchBGThreadsMap::createThread(const StorageID & storage_id)
     {
         return std::make_shared(getContext(), storage_id);
     }
+    else if (type == CnchBGThreadType::ObjectSchemaAssemble)
+    {
+        return std::make_shared<CnchObjectColumnSchemaAssembleThread>(getContext(), storage_id);
+    }
     else if (type == CnchBGThreadType::MaterializedMySQL)
     {
         return std::make_shared(getContext(), storage_id);
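The key helpers above concatenate namespace, prefix macro, table UUID, and txn id. A sketch of the resulting key shapes (sample values hypothetical; `escapeString` assumed to be the identity for a plain namespace like "default"):

```cpp
#include <cstdint>
#include <iostream>
#include <string>

// Reproduces the key shapes built by partialSchemaKey / assembledSchemaKey /
// partialSchemaStatusKey above.
int main()
{
    std::string ns = "default", uuid = "123e4567-e89b-12d3-a456-426614174000";
    std::uint64_t txn = 42;

    std::cout << ns + "_PS_" + uuid + "_" + std::to_string(txn) << '\n'; // partialSchemaKey
    std::cout << ns + "_AS_" + uuid << '\n';                             // assembledSchemaKey
    std::cout << ns + "-PSS_" + "_" + std::to_string(txn) << '\n';       // partialSchemaStatusKey
}
```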
diff --git a/src/CloudServices/CnchObjectColumnSchemaAssembleThread.cpp b/src/CloudServices/CnchObjectColumnSchemaAssembleThread.cpp
new file mode 100644
index 0000000000..4f34214a0a
--- /dev/null
+++ b/src/CloudServices/CnchObjectColumnSchemaAssembleThread.cpp
@@ -0,0 +1,138 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+CnchObjectColumnSchemaAssembleThread::CnchObjectColumnSchemaAssembleThread(ContextPtr context_, const StorageID & id_)
+    : ICnchBGThread(context_, CnchBGThreadType::ObjectSchemaAssemble, id_)
+{
+}
+
+void CnchObjectColumnSchemaAssembleThread::runImpl()
+{
+    try
+    {
+        auto storage = getStorageFromCatalog();
+        auto table_uuid = storage->getStorageUUID();
+        auto & table = checkAndGetCnchTable(storage);
+        auto storage_settings = table.getSettings();
+        auto database_name = storage->getDatabaseName();
+        auto table_name = storage->getTableName();
+
+        if (storage->is_dropped)
+        {
+            LOG_DEBUG(log, "Table was dropped, wait for removing...");
+            scheduled_task->scheduleAfter(10 * 1000);
+            return;
+        }
+
+        if (storage->supportsDynamicSubcolumns() && hasDynamicSubcolumns(storage->getInMemoryMetadata().getColumns()))
+        {
+            LOG_INFO(log, "{}.{} Start assembling partial schemas", database_name, table_name);
+            auto catalog = getContext()->getCnchCatalog();
+            auto [current_topology_version, current_topology] = getContext()->getCnchTopologyMaster()->getCurrentTopologyVersion();
+
+            // Step 1: scan object partial schemas and drop uncommitted ones
+            auto old_assembled_schema = catalog->tryGetTableObjectAssembledSchema(table_uuid);
+            auto partial_schemas
+                = catalog->tryGetTableObjectPartialSchemas(table_uuid, storage_settings->json_partial_schema_assemble_batch_size);
+            LOG_DEBUG(log, "{}.{} Before assemble. Assembled schema: {}", database_name, table_name, old_assembled_schema.toString());
+
+            if (partial_schemas.empty())
+            {
+                LOG_INFO(log, "{}.{} no need to refresh dynamic object column schema.", database_name, table_name);
+                scheduled_task->scheduleAfter(50 * 1000);
+                return;
+            }
+
+            std::vector<TxnTimestamp> unfiltered_partial_schema_txnids;
+            unfiltered_partial_schema_txnids.reserve(partial_schemas.size());
+            for (const auto & [txn_id, partial_schema] : partial_schemas)
+            {
+                LOG_DEBUG(
+                    log,
+                    "{}.{} Before assemble. Partial schema: [{}->{}]",
+                    database_name,
+                    table_name,
+                    txn_id.toString(),
+                    partial_schema.toString());
+
+                unfiltered_partial_schema_txnids.emplace_back(txn_id);
+            }
+
+            auto committed_partial_schema_txnids = catalog->filterUncommittedObjectPartialSchemas(unfiltered_partial_schema_txnids);
+
+            // Step 2: assemble partial schemas into the assembled schema
+            std::vector<String> partial_schema_txn_ids_for_print;
+            std::vector<ColumnsDescription> schemas_ready_to_assemble;
+            partial_schema_txn_ids_for_print.reserve(committed_partial_schema_txnids.size());
+            schemas_ready_to_assemble.reserve(committed_partial_schema_txnids.size() + 1);
+            for (auto & txn_id : committed_partial_schema_txnids)
+            {
+                auto partial_schema = partial_schemas[txn_id];
+                schemas_ready_to_assemble.emplace_back(partial_schema);
+                partial_schema_txn_ids_for_print.emplace_back(txn_id.toString());
+            }
+            schemas_ready_to_assemble.emplace_back(old_assembled_schema);
+            auto new_assembled_schema = DB::getConcreteObjectColumns(
+                schemas_ready_to_assemble.begin(),
+                schemas_ready_to_assemble.end(),
+                storage->getInMemoryMetadata().getColumns(),
+                [](const auto & schema) { return schema; });
+
+            // Step 3: update the assembled schema and delete partial schemas in the meta store
+            // TODO: @lianwenlong consider purge failures and check the lease
+            auto cas_put_result = catalog->resetObjectAssembledSchemaAndPurgePartialSchemas(
+                table_uuid, old_assembled_schema, new_assembled_schema, committed_partial_schema_txnids);
+
+            LOG_DEBUG(
+                log,
+                "{}.{} After assemble. Assembled schema: {}, deleted txn ids: {}, result: {}",
+                database_name,
+                table_name,
+                new_assembled_schema.toString(),
+                fmt::join(partial_schema_txn_ids_for_print, ","),
+                std::to_string(cas_put_result));
+
+            if (cas_put_result)
+            {
+                // Step 4: update the assembled schema and delete partial schemas in the storage cache
+                if (auto cache_manager = getContext()->getPartCacheManager())
+                {
+                    if (auto storage_in_cache = cache_manager->getStorageFromCache(table_uuid, current_topology_version))
+                    {
+                        auto & table_in_cache = checkAndGetCnchTable(storage_in_cache);
+                        table_in_cache.refreshAssembledSchema(new_assembled_schema, committed_partial_schema_txnids);
+                    }
+                }
+
+                // Step 5: clean up partial schema statuses in the meta store
+                catalog->batchDeleteObjectPartialSchemaStatus(committed_partial_schema_txnids);
+                // Step 6: TODO: @lianwenlong roll back aborted partial schemas from the meta store and storage cache
+            }
+
+            LOG_INFO(
+                log, "{}.{} Finished assembling partial schemas with result: {}", database_name, table_name, std::to_string(cas_put_result));
+        }
+    }
+    catch (...)
+    {
+        tryLogCurrentException(log, __PRETTY_FUNCTION__);
+    }
+
+    scheduled_task->scheduleAfter(50 * 1000);
+}
+
+} // namespace DB
diff --git a/src/CloudServices/CnchObjectColumnSchemaAssembleThread.h b/src/CloudServices/CnchObjectColumnSchemaAssembleThread.h
new file mode 100644
index 0000000000..44cf2b3ca6
--- /dev/null
+++ b/src/CloudServices/CnchObjectColumnSchemaAssembleThread.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include
+#include
+
+namespace DB
+{
+class CnchObjectColumnSchemaAssembleThread : public ICnchBGThread
+{
+public:
+    CnchObjectColumnSchemaAssembleThread(ContextPtr context_, const StorageID & id_);
+
+private:
+    void runImpl() override;
+};
+} // namespace DB
diff --git a/src/CloudServices/CnchServerResource.cpp b/src/CloudServices/CnchServerResource.cpp
index 77222cca82..dee58f65c3 100644
--- a/src/CloudServices/CnchServerResource.cpp
+++ b/src/CloudServices/CnchServerResource.cpp
@@ -27,9 +27,10 @@
 #include
 #include
 #include
-#include "Interpreters/Context_fwd.h"
-#include "Storages/Hive/HiveFile/IHiveFile.h"
-#include "Storages/Hive/StorageCnchHive.h"
+#include
+#include
+#include
+#include
 
 #include
 #include
@@ -63,6 +64,7 @@ AssignedResource::AssignedResource(AssignedResource && resource)
     part_names = resource.part_names; // don't call move here
     resource.sent_create_query = true;
+    object_columns = resource.object_columns;
 }
 
 void AssignedResource::addDataParts(const ServerDataPartsVector & parts)
@@ -470,6 +472,7 @@ void CnchServerResource::allocateResource(
             worker_resource.create_table_query = resource.create_table_query;
             worker_resource.worker_table_name = resource.worker_table_name;
             worker_resource.bucket_numbers = assigned_bucket_numbers;
+            worker_resource.object_columns = resource.object_columns;
         }
     }
 }
diff --git a/src/CloudServices/CnchServerResource.h b/src/CloudServices/CnchServerResource.h
index b52cbdef98..971b7a13d0 100644
--- a/src/CloudServices/CnchServerResource.h
+++ b/src/CloudServices/CnchServerResource.h
@@ -17,6 +17,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -86,6 +88,8 @@ struct AssignedResource
 
     std::unordered_set part_names;
 
+    ColumnsDescription object_columns;
+
     void addDataParts(const ServerDataPartsVector & parts);
     void addDataParts(const FileDataPartsCNCHVector & parts);
     void addDataParts(const HiveFiles & parts);
@@ -141,6 +145,13 @@ public:
         assigned_resource.bucket_numbers = required_bucket_numbers;
     }
 
+    void setResourceReplicated(const UUID & storage_id, bool replicated)
+    {
+        std::lock_guard lock(mutex);
+        auto & assigned_resource = assigned_table_resource.at(storage_id);
+        assigned_resource.replicated = replicated;
+    }
+
     /// Send resource to worker
     void sendResource(const ContextPtr & context, const HostWithPorts & worker);
     /// allocate and send resource to worker_group
@@ -152,6 +163,14 @@ public:
     void sendResources(const ContextPtr & context, WorkerAction act);
     void cleanResource();
 
+    void addDynamicObjectSchema(const UUID & storage_id, const ColumnsDescription & object_columns_)
+    {
+        std::lock_guard lock(mutex);
+        auto & assigned_resource = assigned_table_resource.at(storage_id);
+
+        assigned_resource.object_columns = object_columns_;
+    }
+
     void setSendMutations(bool send_mutations_) { send_mutations = send_mutations_; }
 
 private:
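The one-line fix in `AssignedResource`'s move constructor above illustrates a classic pitfall: a user-defined move constructor does not automatically pick up members added later. A self-contained toy (names hypothetical):

```cpp
#include <cassert>
#include <string>
#include <unordered_set>
#include <utility>

struct Resource
{
    std::unordered_set<std::string> part_names;
    std::string object_columns; // member added later, like AssignedResource::object_columns

    Resource() = default;

    // User-defined move constructor: every member must be handled explicitly,
    // otherwise new members are silently left default-constructed.
    Resource(Resource && other)
        : part_names(other.part_names) // deliberate copy, mirroring "don't call move here"
        , object_columns(std::move(other.object_columns))
    {
    }
};

int main()
{
    Resource r;
    r.object_columns = "json.user.id Int64";
    Resource moved(std::move(r));
    assert(moved.object_columns == "json.user.id Int64"); // fails if the member is forgotten
}
```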
diff --git a/src/CloudServices/CnchServerServiceImpl.cpp b/src/CloudServices/CnchServerServiceImpl.cpp
index 545d3bb4b7..c213c55661 100644
--- a/src/CloudServices/CnchServerServiceImpl.cpp
+++ b/src/CloudServices/CnchServerServiceImpl.cpp
@@ -26,12 +26,13 @@
 #include
 #include
 #include
-#include "Common/tests/gtest_global_context.h"
+#include
+#include
 #include
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
diff --git a/src/CloudServices/CnchWorkerClient.cpp b/src/CloudServices/CnchWorkerClient.cpp
index 17f2bd5bce..3ca602a642 100644
--- a/src/CloudServices/CnchWorkerClient.cpp
+++ b/src/CloudServices/CnchWorkerClient.cpp
@@ -22,8 +22,9 @@
 #include
 #include
 #include
-#include "Storages/Hive/HiveFile/IHiveFile.h"
-#include "Storages/Hive/StorageCnchHive.h"
+#include
+#include
+#include
 #include
 #include
 #include
@@ -76,6 +77,12 @@ void CnchWorkerClient::submitManipulationTask(
         params.mutation_commands->writeText(write_buf);
     }
 
+    if (hasDynamicSubcolumns(storage.getInMemoryMetadata().columns))
+    {
+        request.set_dynamic_object_column_schema(
+            storage.getStorageSnapshot(storage.getInMemoryMetadataPtr(), nullptr)->object_columns.toString());
+    }
+
     stub->submitManipulationTask(&cntl, &request, &response, nullptr);
     assertController(cntl);
 
@@ -322,7 +329,10 @@ brpc::CallId CnchWorkerClient::sendResources(
     for (const auto & resource : resources_to_send)
     {
         if (!resource.sent_create_query)
+        {
             request.add_create_queries(resource.create_table_query);
+            request.add_dynamic_object_column_schema(resource.object_columns.toString());
+        }
 
         /// parts
         auto & table_data_parts = *request.mutable_data_parts()->Add();
diff --git a/src/CloudServices/CnchWorkerResource.cpp b/src/CloudServices/CnchWorkerResource.cpp
index 4787c3167a..b101932f3d 100644
--- a/src/CloudServices/CnchWorkerResource.cpp
+++ b/src/CloudServices/CnchWorkerResource.cpp
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 
+#include
 #include
 #include
 
@@ -26,6 +27,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -43,7 +45,7 @@ namespace ErrorCodes
     extern const int TABLE_ALREADY_EXISTS;
 }
 
-void CnchWorkerResource::executeCreateQuery(ContextMutablePtr context, const String & create_query, bool skip_if_exists)
+void CnchWorkerResource::executeCreateQuery(ContextMutablePtr context, const String & create_query, bool skip_if_exists, const ColumnsDescription & object_columns)
 {
     LOG_DEBUG(&Poco::Logger::get("WorkerResource"), "start creating cloud table {}", create_query);
     const char * begin = create_query.data();
@@ -139,6 +141,9 @@ void CnchWorkerResource::executeCreateQuery(ContextMutablePtr context, const Str
     StoragePtr res = StorageFactory::instance().get(ast_create_query, "", context, context->getGlobalContext(), columns, constraints, foreign_keys, unique_not_enforced, false);
     res->startup();
 
+    if (auto cloud_table = std::dynamic_pointer_cast(res))
+        cloud_table->resetObjectColumns(object_columns);
+
     {
         auto lock = getLock();
         cloud_tables.emplace(std::make_pair(tenant_db, table_name), res);
diff --git a/src/CloudServices/CnchWorkerResource.h b/src/CloudServices/CnchWorkerResource.h
index 250fd62547..c4b422dacf 100644
--- a/src/CloudServices/CnchWorkerResource.h
+++ b/src/CloudServices/CnchWorkerResource.h
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -33,7 +34,7 @@ class CloudTablesBlockSource;
 class CnchWorkerResource
 {
 public:
-    void executeCreateQuery(ContextMutablePtr context, const String & create_query, bool skip_if_exists = false);
+    void executeCreateQuery(ContextMutablePtr context, const String & create_query, bool skip_if_exists = false, const ColumnsDescription & object_columns = {});
     StoragePtr getTable(const StorageID & table_id) const;
     DatabasePtr getDatabase(const String & database_name) const;
     bool isCnchTableInWorker(const StorageID & table_id) const;
diff --git a/src/CloudServices/CnchWorkerServiceImpl.cpp b/src/CloudServices/CnchWorkerServiceImpl.cpp
index 46527c0146..bc8649fa0f 100644
--- a/src/CloudServices/CnchWorkerServiceImpl.cpp
+++ b/src/CloudServices/CnchWorkerServiceImpl.cpp
@@ -48,8 +48,9 @@
 #include
 #include
 #include
-#include "Common/Configurations.h"
-#include "Common/Exception.h"
+#include
+#include
+#include
 
 #if USE_RDKAFKA
 #    include
@@ -184,6 +185,8 @@ void CnchWorkerServiceImpl::submitManipulationTask(
         auto * data = dynamic_cast(storage.get());
         if (!data)
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table {} is not CloudMergeTree", storage->getStorageID().getNameForLogs());
+        if (request->has_dynamic_object_column_schema())
+            data->resetObjectColumns(ColumnsDescription::parse(request->dynamic_object_column_schema()));
 
         auto params = ManipulationTaskParams(storage);
         params.type = static_cast(request->type());
@@ -632,8 +635,14 @@ void CnchWorkerServiceImpl::sendResources(
     {
         /// create a copy of session_context to avoid modify settings in SessionResource
         auto context_for_create = Context::createCopy(query_context);
-        for (const auto & create_query : request->create_queries())
-            worker_resource->executeCreateQuery(context_for_create, create_query, true);
+        for (int i = 0; i < request->create_queries_size(); i++)
+        {
+            auto create_query = request->create_queries().at(i);
+            auto object_columns = request->dynamic_object_column_schema().at(i);
+
+            worker_resource->executeCreateQuery(context_for_create, create_query, false, ColumnsDescription::parse(object_columns));
+        }
+
        LOG_DEBUG(log, "Successfully created {} queries for Session: {}", request->create_queries_size(), request->txn_id());
     }
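`sendResources` above appends `create_queries` and `dynamic_object_column_schema` together so the two repeated fields stay index-aligned, and the worker reads both by the same index `i`. A toy sketch of that parallel-arrays contract (plain vectors stand in for the protobuf repeated fields):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Toy stand-in for the two parallel repeated fields in the RPC request.
struct SendResourcesRequest
{
    std::vector<std::string> create_queries;
    std::vector<std::string> dynamic_object_column_schema;
};

int main()
{
    SendResourcesRequest request;
    // Always append the two fields together, as the client does, so that
    // index i of one corresponds to index i of the other on the worker side.
    request.create_queries.push_back("CREATE TABLE t ...");
    request.dynamic_object_column_schema.push_back("serialized ColumnsDescription");

    assert(request.create_queries.size() == request.dynamic_object_column_schema.size());
    for (size_t i = 0; i < request.create_queries.size(); ++i)
    {
        // worker side: executeCreateQuery(create_queries[i], ...,
        //                                 parse(dynamic_object_column_schema[i]))
    }
}
```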
diff --git a/src/Columns/ColumnAggregateFunction.cpp b/src/Columns/ColumnAggregateFunction.cpp
index 78377c9462..865a21849e 100644
--- a/src/Columns/ColumnAggregateFunction.cpp
+++ b/src/Columns/ColumnAggregateFunction.cpp
@@ -164,16 +164,16 @@ MutableColumnPtr ColumnAggregateFunction::convertToValues(MutableColumnPtr colum
     /// If there are references to states in final column, we must hold their ownership
     /// by holding arenas and source.
 
-    auto callback = [&](auto & subcolumn)
+    auto callback = [&](IColumn & subcolumn)
     {
-        if (auto * aggregate_subcolumn = typeid_cast<ColumnAggregateFunction *>(subcolumn.get()))
+        if (auto * aggregate_subcolumn = typeid_cast<ColumnAggregateFunction *>(&subcolumn))
         {
             aggregate_subcolumn->foreign_arenas = concatArenas(column_aggregate_func.foreign_arenas, column_aggregate_func.my_arena);
             aggregate_subcolumn->src = column_aggregate_func.getPtr();
         }
     };
 
-    callback(res);
+    callback(*res);
     res->forEachSubcolumnRecursively(callback);
 
     for (auto * val : data)
@@ -349,7 +349,7 @@ ColumnPtr ColumnAggregateFunction::filter(const Filter & filter, ssize_t result_
 {
     size_t size = data.size();
     if (size != filter.size())
-        throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
+        throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filter.size(), size);
 
     if (size == 0)
         return cloneEmpty();
@@ -371,7 +371,6 @@ ColumnPtr ColumnAggregateFunction::filter(const Filter & filter, ssize_t result_
     return res;
 }
 
-
 ColumnPtr ColumnAggregateFunction::permute(const Permutation & perm, size_t limit) const
 {
     size_t size = data.size();
diff --git a/src/Columns/ColumnAggregateFunction.h b/src/Columns/ColumnAggregateFunction.h
index 7b06c687b2..9b9b4b9ee2 100644
--- a/src/Columns/ColumnAggregateFunction.h
+++ b/src/Columns/ColumnAggregateFunction.h
@@ -228,6 +228,21 @@ public:
         throw Exception("Method hasEqualValues is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
     }
 
+    double getRatioOfDefaultRows(double) const override
+    {
+        return 0.0;
+    }
+
+    UInt64 getNumberOfDefaultRows() const override
+    {
+        return 0;
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getIndicesOfNonDefaultRows is not supported for ColumnAggregateFunction");
+    }
+
     void getPermutation(PermutationSortDirection direction, PermutationSortStability stability,
                     size_t limit, int nan_direction_hint, Permutation & res) const override;
diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp
index 17def6afd1..d178c44e1c 100644
--- a/src/Columns/ColumnArray.cpp
+++ b/src/Columns/ColumnArray.cpp
@@ -629,7 +629,7 @@ ColumnPtr ColumnArray::filterString(const Filter & filt, ssize_t result_size_hin
 {
     size_t col_size = getOffsets().size();
     if (col_size != filt.size())
-        throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
+        throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), col_size);
 
     if (0 == col_size)
         return ColumnArray::create(data);
@@ -697,7 +697,7 @@ ColumnPtr ColumnArray::filterGeneric(const Filter & filt, ssize_t result_size_hi
 {
     size_t size = getOffsets().size();
     if (size != filt.size())
-        throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
+        throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
 
     if (size == 0)
         return ColumnArray::create(data);
@@ -963,6 +963,20 @@ ColumnPtr ColumnArray::compress() const
     });
 }
 
+double ColumnArray::getRatioOfDefaultRows(double sample_ratio) const
+{
+    return getRatioOfDefaultRowsImpl<ColumnArray>(sample_ratio);
+}
+
+UInt64 ColumnArray::getNumberOfDefaultRows() const
+{
+    return getNumberOfDefaultRowsImpl<ColumnArray>();
+}
+
+void ColumnArray::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
+{
+    return getIndicesOfNonDefaultRowsImpl<ColumnArray>(indices, from, limit);
+}
 
 ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
 {
diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h
index 3b3c94b41c..0b0f4c0a4a 100644
--- a/src/Columns/ColumnArray.h
+++ b/src/Columns/ColumnArray.h
@@ -172,17 +172,17 @@ public:
 
     ColumnPtr compress() const override;
 
-    void forEachSubcolumn(ColumnCallback callback) override
+    void forEachSubcolumn(MutableColumnCallback callback) override
     {
         callback(offsets);
         callback(data);
     }
 
-    void forEachSubcolumnRecursively(ColumnCallback callback) override
+    void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override
     {
-        callback(offsets);
+        callback(*offsets);
         offsets->forEachSubcolumnRecursively(callback);
-        callback(data);
+        callback(*data);
         data->forEachSubcolumnRecursively(callback);
     }
 
@@ -193,6 +193,10 @@ public:
         return false;
     }
 
+    double getRatioOfDefaultRows(double sample_ratio) const override;
+    UInt64 getNumberOfDefaultRows() const override;
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
+
     bool isCollationSupported() const override { return getData().isCollationSupported(); }
 
     size_t ALWAYS_INLINE offsetAt(ssize_t i) const { return getOffsets()[i - 1]; }
diff --git a/src/Columns/ColumnBitMap64.h b/src/Columns/ColumnBitMap64.h
index b21b2f67b8..6f5c6378ca 100644
--- a/src/Columns/ColumnBitMap64.h
+++ b/src/Columns/ColumnBitMap64.h
@@ -257,6 +257,21 @@ public:
 
     ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
 
+    double getRatioOfDefaultRows(double) const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getRatioOfDefaultRows is not supported for {}", getName());
+    }
+
+    UInt64 getNumberOfDefaultRows() const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getNumberOfDefaultRows is not supported for {}", getName());
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getIndicesOfNonDefaultRows is not supported for {}", getName());
+    }
+
     ColumnPtr permute(const Permutation & perm, size_t limit) const override;
 
     ColumnPtr index(const IColumn & indexes, size_t limit) const override;
diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h
index d1a3cc596f..78152a451a 100644
--- a/src/Columns/ColumnCompressed.h
+++ b/src/Columns/ColumnCompressed.h
@@ -132,6 +132,9 @@ public:
     void gather(ColumnGathererStream &) override { throwMustBeDecompressed(); }
     void getExtremes(Field &, Field &) const override { throwMustBeDecompressed(); }
     size_t byteSizeAt(size_t) const override { throwMustBeDecompressed(); }
+    double getRatioOfDefaultRows(double) const override { throwMustBeDecompressed(); }
+    UInt64 getNumberOfDefaultRows() const override { throwMustBeDecompressed(); }
+    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeDecompressed(); }
 
 protected:
     size_t rows;
diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h
index 0672dbb7fb..2a5f006999 100644
--- a/src/Columns/ColumnConst.h
+++ b/src/Columns/ColumnConst.h
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 
 namespace DB
@@ -207,6 +208,7 @@ public:
     }
 
     ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
+
     ColumnPtr replicate(const Offsets & offsets) const override;
     ColumnPtr permute(const Permutation & perm, size_t limit) const override;
     ColumnPtr index(const IColumn & indexes, size_t limit) const override;
@@ -253,7 +255,7 @@ public:
         data->getExtremes(min, max);
     }
 
-    void forEachSubcolumn(ColumnCallback callback) override
+    void forEachSubcolumn(MutableColumnCallback callback) override
     {
         callback(data);
     }
@@ -271,6 +273,27 @@ public:
         return false;
     }
 
+    double getRatioOfDefaultRows(double) const override
+    {
+        return data->isDefaultAt(0) ? 1.0 : 0.0;
+    }
+
+    UInt64 getNumberOfDefaultRows() const override
+    {
+        return data->isDefaultAt(0) ? s : 0;
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
+    {
+        if (!data->isDefaultAt(0))
+        {
+            size_t to = limit && from + limit < size() ? from + limit : size();
+            indices.reserve(indices.size() + to - from);
+            for (size_t i = from; i < to; ++i)
+                indices.push_back(i);
+        }
+    }
+
     bool isNullable() const override { return isColumnNullable(*data); }
     bool onlyNull() const override { return data->isNullAt(0); }
     bool isNumeric() const override { return data->isNumeric(); }
diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h
index 77821e0459..d58a3b0937 100644
--- a/src/Columns/ColumnDecimal.h
+++ b/src/Columns/ColumnDecimal.h
@@ -175,6 +175,7 @@ public:
     bool isDefaultAt(size_t n) const override { return data[n].value == 0; }
 
     ColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override;
+
     ColumnPtr permute(const IColumn::Permutation & perm, size_t limit) const override;
     ColumnPtr index(const IColumn & indexes, size_t limit) const override;
@@ -198,6 +199,21 @@ public:
         return false;
     }
 
+    double getRatioOfDefaultRows(double sample_ratio) const override
+    {
+        return this->template getRatioOfDefaultRowsImpl<Self>(sample_ratio);
+    }
+
+    UInt64 getNumberOfDefaultRows() const override
+    {
+        return this->template getNumberOfDefaultRowsImpl<Self>();
+    }
+
+    void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
+    {
+        return this->template getIndicesOfNonDefaultRowsImpl<Self>(indices, from, limit);
+    }
+
     ColumnPtr compress() const override;
diff --git a/src/Columns/ColumnFixedString.h b/src/Columns/ColumnFixedString.h
index ed3af66cc6..61f7b8c224 100644
--- a/src/Columns/ColumnFixedString.h
+++ b/src/Columns/ColumnFixedString.h
@@ -177,6 +177,7 @@ public:
 
     ColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override;
+
     ColumnPtr permute(const Permutation & perm, size_t limit) const override;
     ColumnPtr index(const IColumn & indexes, size_t limit) const override;
@@ -214,6 +215,21 @@ public:
         return false;
     }
 
+    double getRatioOfDefaultRows(double sample_ratio) const override
+    {
+        return getRatioOfDefaultRowsImpl<ColumnFixedString>(sample_ratio);
+    }
+
+    UInt64 getNumberOfDefaultRows() const override
+    {
+        return getNumberOfDefaultRowsImpl<ColumnFixedString>();
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
+    {
+        return getIndicesOfNonDefaultRowsImpl<ColumnFixedString>(indices, from, limit);
+    }
+
     bool canBeInsideNullable() const override { return true; }
 
     bool isFixedAndContiguous() const override { return true; }
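The default-rows API added across the column classes above reports rows holding the type's default value (0 for numbers, empty for strings); `ColumnConst` answers all-or-nothing from its single value. A self-contained sketch of the semantics over a plain vector (the real implementations also support `sample_ratio` subsampling, omitted here):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Semantics sketch: a "default row" holds the type's default value (0 here).
double ratioOfDefaultRows(const std::vector<int64_t> & data)
{
    size_t defaults = 0;
    for (auto v : data)
        defaults += (v == 0);
    return data.empty() ? 0.0 : static_cast<double>(defaults) / data.size();
}

std::vector<size_t> indicesOfNonDefaultRows(const std::vector<int64_t> & data)
{
    std::vector<size_t> indices;
    for (size_t i = 0; i < data.size(); ++i)
        if (data[i] != 0)
            indices.push_back(i); // mirrors getIndicesOfNonDefaultRowsImpl
    return indices;
}

int main()
{
    std::vector<int64_t> column{0, 7, 0, 0, 3};
    assert(ratioOfDefaultRows(column) == 0.6);
    assert((indicesOfNonDefaultRows(column) == std::vector<size_t>{1, 4}));
}
```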
recursiveRemoveLowCardinality(function->getResultType()); + + return function->getResultType(); +} + ColumnWithTypeAndName ColumnFunction::reduce() const { auto args = function->getArgumentTypes().size(); @@ -203,4 +211,12 @@ ColumnWithTypeAndName ColumnFunction::reduce() const return res; } +const ColumnFunction * checkAndGetShortCircuitArgument(const ColumnPtr & column) +{ + const ColumnFunction * column_function; + if ((column_function = typeid_cast(column.get())) && column_function->isShortCircuitArgument()) + return column_function; + return nullptr; +} + } diff --git a/src/Columns/ColumnFunction.h b/src/Columns/ColumnFunction.h index 6ceea78541..40c75a13e4 100644 --- a/src/Columns/ColumnFunction.h +++ b/src/Columns/ColumnFunction.h @@ -155,6 +155,25 @@ public: throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + DataTypePtr getResultType() const; + + double getRatioOfDefaultRows(double) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getRatioOfDefaultRows is not supported for {}", getName()); + } + + UInt64 getNumberOfDefaultRows() const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getNumberOfDefaultRows is not supported for {}", getName()); + } + + void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getIndicesOfNonDefaultRows is not supported for {}", getName()); + } + + bool isShortCircuitArgument() const { return false; } + private: size_t size_; FunctionBasePtr function; @@ -163,4 +182,6 @@ private: void appendArgument(const ColumnWithTypeAndName & column); }; +const ColumnFunction * checkAndGetShortCircuitArgument(const ColumnPtr & column); + } diff --git a/src/Columns/ColumnLowCardinality.h b/src/Columns/ColumnLowCardinality.h index d73e38e60d..2bb650b34b 100644 --- a/src/Columns/ColumnLowCardinality.h +++ b/src/Columns/ColumnLowCardinality.h @@ -323,7 +323,7 @@ public: return idx.getPositions()->allocatedBytes() + getDictionary().allocatedBytes(); } - void forEachSubcolumn(ColumnCallback callback) override + void forEachSubcolumn(MutableColumnCallback callback) override { if (full_state) { @@ -338,22 +338,22 @@ public: callback(dictionary.getColumnUniquePtr()); } - void forEachSubcolumnRecursively(ColumnCallback callback) override + void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override { if (isFullState()) { - callback(nested_column); + callback(*nested_column); nested_column->forEachSubcolumnRecursively(callback); } - callback(idx.getPositionsPtr()); + callback(*idx.getPositionsPtr()); idx.getPositionsPtr()->forEachSubcolumnRecursively(callback); /// Column doesn't own dictionary if it's shared. 
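+        /// (A shared dictionary may be referenced by several columns at once,
+        /// so it must not be mutated through this recursive callback.)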
if (!dictionary.isShared()) { - callback(dictionary.getColumnUniquePtr()); + callback(*dictionary.getColumnUniquePtr()); dictionary.getColumnUniquePtr()->forEachSubcolumnRecursively(callback); } } @@ -371,6 +371,21 @@ public: return false; } + double getRatioOfDefaultRows(double sample_ratio) const override + { + return getIndexes().getRatioOfDefaultRows(sample_ratio); + } + + UInt64 getNumberOfDefaultRows() const override + { + return getIndexes().getNumberOfDefaultRows(); + } + + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override + { + return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit); + } + bool valuesHaveFixedSize() const override { if (full_state) @@ -482,6 +497,7 @@ public: const ColumnPtr & getPositions() const { return positions; } WrappedPtr & getPositionsPtr() { return positions; } + const WrappedPtr & getPositionsPtr() const { return positions; } size_t getPositionAt(size_t row) const; void insertPosition(UInt64 position); void insertPositionsRange(const IColumn & column, UInt64 offset, UInt64 limit); diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index 339cb51d74..3fc943ae8d 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -383,14 +383,14 @@ void ColumnMap::getExtremes(Field & min, Field & max) const max = std::move(map_max_value); } -void ColumnMap::forEachSubcolumn(ColumnCallback callback) +void ColumnMap::forEachSubcolumn(MutableColumnCallback callback) { nested->forEachSubcolumn(callback); } -void ColumnMap::forEachSubcolumnRecursively(ColumnCallback callback) +void ColumnMap::forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) { - callback(nested); + callback(*nested); nested->forEachSubcolumnRecursively(callback); } @@ -410,6 +410,21 @@ ColumnPtr ColumnMap::compress() const }); } +double ColumnMap::getRatioOfDefaultRows(double sample_ratio) const +{ + return getRatioOfDefaultRowsImpl(sample_ratio); +} + +UInt64 ColumnMap::getNumberOfDefaultRows() const +{ + return getNumberOfDefaultRowsImpl(); +} + +void ColumnMap::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const +{ + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); +} + /** * Generic implementation of get implicit value column based on key value. * TODO: specialize this function for Number type and String type. 
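Taken together, getRatioOfDefaultRows(), getNumberOfDefaultRows() and getIndicesOfNonDefaultRows(), which this patch adds to every column class, form the per-column API behind sparse-column support. A minimal sketch of how a caller might combine them (a hypothetical helper, not part of this patch; it assumes only the IColumn interface shown here):

#include <Columns/IColumn.h>

/// Treat a column as sparse if at least 90% of a 10% sample is default values,
/// and collect the exact positions of the non-default rows.
static bool looksSparse(const DB::IColumn & column, DB::IColumn::Offsets & non_default_indices)
{
    if (column.getRatioOfDefaultRows(0.1) < 0.9)
        return false;

    /// limit == 0 would also mean "up to the end" in the implementations above.
    column.getIndicesOfNonDefaultRows(non_default_indices, 0, column.size());
    return true;
}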
diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h
index 9484205160..736de7be4e 100644
--- a/src/Columns/ColumnMap.h
+++ b/src/Columns/ColumnMap.h
@@ -117,9 +117,12 @@ public:
     size_t byteSizeAt(size_t n) const override;
     size_t allocatedBytes() const override;
     void protect() override;
-    void forEachSubcolumn(ColumnCallback callback) override;
-    void forEachSubcolumnRecursively(ColumnCallback callback) override;
+    void forEachSubcolumn(MutableColumnCallback callback) override;
+    void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
     bool structureEquals(const IColumn & rhs) const override;
+    double getRatioOfDefaultRows(double sample_ratio) const override;
+    UInt64 getNumberOfDefaultRows() const override;
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
 
     /** Access embedded columns */
     const ColumnArray & getNestedColumn() const { return assert_cast<const ColumnArray &>(*nested); }
diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h
index 192d200488..cd585e2fdb 100644
--- a/src/Columns/ColumnNullable.h
+++ b/src/Columns/ColumnNullable.h
@@ -149,17 +149,17 @@ public:
 
     ColumnPtr compress() const override;
 
-    void forEachSubcolumn(ColumnCallback callback) override
+    void forEachSubcolumn(MutableColumnCallback callback) override
     {
         callback(nested_column);
         callback(null_map);
     }
 
-    void forEachSubcolumnRecursively(ColumnCallback callback) override
+    void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override
     {
-        callback(nested_column);
+        callback(*nested_column);
         nested_column->forEachSubcolumnRecursively(callback);
-        callback(null_map);
+        callback(*null_map);
         null_map->forEachSubcolumnRecursively(callback);
     }
 
@@ -170,6 +170,21 @@ public:
         return false;
     }
 
+    double getRatioOfDefaultRows(double sample_ratio) const override
+    {
+        return getRatioOfDefaultRowsImpl<ColumnNullable>(sample_ratio);
+    }
+
+    UInt64 getNumberOfDefaultRows() const override
+    {
+        return getNumberOfDefaultRowsImpl<ColumnNullable>();
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
+    {
+        getIndicesOfNonDefaultRowsImpl<ColumnNullable>(indices, from, limit);
+    }
+
     bool isNullable() const override { return true; }
     bool isFixedAndContiguous() const override { return false; }
     bool valuesHaveFixedSize() const override { return nested_column->valuesHaveFixedSize(); }
diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp
new file mode 100644
index 0000000000..16f53d1d57
--- /dev/null
+++ b/src/Columns/ColumnObject.cpp
@@ -0,0 +1,1095 @@
+#include
+#include
+#include
+#include
+#include
+// #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+    extern const int ILLEGAL_COLUMN;
+    extern const int DUPLICATE_COLUMN;
+    extern const int NUMBER_OF_DIMENSIONS_MISMATCHED;
+    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
+    extern const int ARGUMENT_OUT_OF_BOUND;
+}
+
+namespace
+{
+
+/// Recreates column with default scalar values and keeps sizes of arrays.
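+/// E.g. an Array(UInt8) column [[1, 2], [3]] recreated with scalar_type String
+/// and num_dimensions = 1 becomes [["", ""], [""]]: the offsets are kept,
+/// only the values are replaced with defaults of the new scalar type.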
+ColumnPtr recreateColumnWithDefaultValues(
+    const ColumnPtr & column, const DataTypePtr & scalar_type, size_t num_dimensions)
+{
+    const auto * column_array = checkAndGetColumn<ColumnArray>(column.get());
+    if (column_array && num_dimensions)
+    {
+        return ColumnArray::create(
+            recreateColumnWithDefaultValues(
+                column_array->getDataPtr(), scalar_type, num_dimensions - 1),
+            IColumn::mutate(column_array->getOffsetsPtr()));
+    }
+
+    return createArrayOfType(scalar_type, num_dimensions)->createColumn()->cloneResized(column->size());
+}
+
+/// Replaces NULL fields with the given field or with an empty array.
+class FieldVisitorReplaceNull : public StaticVisitor<Field>
+{
+public:
+    explicit FieldVisitorReplaceNull(
+        const Field & replacement_, size_t num_dimensions_)
+        : replacement(replacement_)
+        , num_dimensions(num_dimensions_)
+    {
+    }
+
+    Field operator()(const Null &) const
+    {
+        return num_dimensions ? Array() : replacement;
+    }
+
+    Field operator()(const Array & x) const
+    {
+        assert(num_dimensions > 0);
+        const size_t size = x.size();
+        Array res(size);
+        for (size_t i = 0; i < size; ++i)
+            res[i] = applyVisitor(FieldVisitorReplaceNull(replacement, num_dimensions - 1), x[i]);
+        return res;
+    }
+
+    template <typename T>
+    Field operator()(const T & x) const { return x; }
+
+private:
+    const Field & replacement;
+    size_t num_dimensions;
+};
+
+/// Visitor that allows getting the type of a scalar field
+/// or the least common type of scalars in an array.
+/// More optimized version of FieldToDataType.
+class FieldVisitorToScalarType : public StaticVisitor<>
+{
+public:
+    using FieldType = Field::Types::Which;
+
+    void operator()(const Array & x)
+    {
+        size_t size = x.size();
+        for (size_t i = 0; i < size; ++i)
+            applyVisitor(*this, x[i]);
+    }
+
+    void operator()(const UInt64 & x)
+    {
+        field_types.insert(FieldType::UInt64);
+        if (x <= std::numeric_limits<UInt8>::max())
+            type_indexes.insert(TypeIndex::UInt8);
+        else if (x <= std::numeric_limits<UInt16>::max())
+            type_indexes.insert(TypeIndex::UInt16);
+        else if (x <= std::numeric_limits<UInt32>::max())
+            type_indexes.insert(TypeIndex::UInt32);
+        else
+            type_indexes.insert(TypeIndex::UInt64);
+    }
+
+    void operator()(const Int64 & x)
+    {
+        field_types.insert(FieldType::Int64);
+        if (x <= std::numeric_limits<Int8>::max() && x >= std::numeric_limits<Int8>::min())
+            type_indexes.insert(TypeIndex::Int8);
+        else if (x <= std::numeric_limits<Int16>::max() && x >= std::numeric_limits<Int16>::min())
+            type_indexes.insert(TypeIndex::Int16);
+        else if (x <= std::numeric_limits<Int32>::max() && x >= std::numeric_limits<Int32>::min())
+            type_indexes.insert(TypeIndex::Int32);
+        else
+            type_indexes.insert(TypeIndex::Int64);
+    }
+
+    [[maybe_unused]] void operator()(const bool &)
+    {
+        field_types.insert(FieldType::UInt64);
+        type_indexes.insert(TypeIndex::UInt8);
+    }
+
+    void operator()(const Null &)
+    {
+        have_nulls = true;
+    }
+
+    template <typename T>
+    void operator()(const T &)
+    {
+        field_types.insert(Field::TypeToEnum<NearestFieldType<T>>::value);
+        type_indexes.insert(TypeToTypeIndex<NearestFieldType<T>>);
+    }
+
+    DataTypePtr getScalarType() const { return getLeastSupertypeOrString(type_indexes); }
+    bool haveNulls() const { return have_nulls; }
+    bool needConvertField() const { return field_types.size() > 1; }
+
+private:
+    TypeIndexSet type_indexes;
+    std::unordered_set<FieldType> field_types;
+    bool have_nulls = false;
+};
+
+}
+
+FieldInfo getFieldInfo(const Field & field)
+{
+    FieldVisitorToScalarType to_scalar_type_visitor;
+    applyVisitor(to_scalar_type_visitor, field);
+    FieldVisitorToNumberOfDimensions to_number_dimension_visitor;
+
+    return
+    {
+        to_scalar_type_visitor.getScalarType(),
+        to_scalar_type_visitor.haveNulls(),
+        to_scalar_type_visitor.needConvertField(),
+        applyVisitor(to_number_dimension_visitor, field),
+        to_number_dimension_visitor.need_fold_dimension
+    };
+}
+
+ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr && data_, bool is_nullable_)
+    : least_common_type(getDataTypeByColumn(*data_))
+    , is_nullable(is_nullable_)
+    , num_rows(data_->size())
+{
+    data.push_back(std::move(data_));
+}
+
+ColumnObject::Subcolumn::Subcolumn(
+    size_t size_, bool is_nullable_)
+    : least_common_type(std::make_shared<DataTypeNothing>())
+    , is_nullable(is_nullable_)
+    , num_of_defaults_in_prefix(size_)
+    , num_rows(size_)
+{
+}
+
+size_t ColumnObject::Subcolumn::size() const
+{
+    return num_rows;
+}
+
+size_t ColumnObject::Subcolumn::byteSize() const
+{
+    size_t res = 0;
+    for (const auto & part : data)
+        res += part->byteSize();
+    return res;
+}
+
+size_t ColumnObject::Subcolumn::allocatedBytes() const
+{
+    size_t res = 0;
+    for (const auto & part : data)
+        res += part->allocatedBytes();
+    return res;
+}
+
+void ColumnObject::Subcolumn::get(size_t n, Field & res) const
+{
+    if (isFinalized())
+    {
+        getFinalizedColumn().get(n, res);
+        return;
+    }
+
+    size_t ind = n;
+    if (ind < num_of_defaults_in_prefix)
+    {
+        res = least_common_type.get()->getDefault();
+        return;
+    }
+
+    ind -= num_of_defaults_in_prefix;
+    for (const auto & part : data)
+    {
+        if (ind < part->size())
+        {
+            part->get(ind, res);
+            res = convertFieldToTypeOrThrow(res, *least_common_type.get());
+            return;
+        }
+
+        ind -= part->size();
+    }
+
+    throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Index ({}) for getting field is out of range", n);
+}
+
+void ColumnObject::Subcolumn::checkTypes() const
+{
+    DataTypes prefix_types;
+    prefix_types.reserve(data.size());
+    for (size_t i = 0; i < data.size(); ++i)
+    {
+        auto current_type = getDataTypeByColumn(*data[i]);
+        prefix_types.push_back(current_type);
+        auto prefix_common_type = getLeastSupertype(prefix_types);
+        if (!prefix_common_type->equals(*current_type))
+            throw Exception(ErrorCodes::LOGICAL_ERROR,
+                "Data type {} of column at position {} cannot represent all columns from i-th prefix",
+                current_type->getName(), i);
+    }
+}
+
+void ColumnObject::Subcolumn::insert(Field field)
+{
+    auto info = DB::getFieldInfo(field);
+    insert(std::move(field), std::move(info));
+}
+
+void ColumnObject::Subcolumn::addNewColumnPart(DataTypePtr type)
+{
+    //@TODO:@ignore serialize sparse
+    // auto serialization = type->getSerialization(ISerialization::Kind::SPARSE);
+    // data.push_back(type->createColumn(*serialization));
+    data.push_back(type->createColumn());
+    least_common_type = LeastCommonType{std::move(type)};
+}
+
+static bool isConversionRequiredBetweenIntegers(const IDataType & lhs, const IDataType & rhs)
+{
+    /// If both types are signed/unsigned integers and the size of the left field
+    /// type is less than the size of the right one, we don't need to convert the
+    /// field, because all integer fields are stored in Int64/UInt64.
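+    /// E.g. a UInt8 value and a UInt16 value are both carried as UInt64 inside
+    /// Field, so widening from UInt8 to UInt16 requires no conversion of stored
+    /// values, while narrowing or mixing signed with unsigned still does.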
+
+    WhichDataType which_lhs(lhs);
+    WhichDataType which_rhs(rhs);
+
+    bool is_native_int = which_lhs.isNativeInt() && which_rhs.isNativeInt();
+    bool is_native_uint = which_lhs.isNativeUInt() && which_rhs.isNativeUInt();
+
+    return (!is_native_int && !is_native_uint)
+        || lhs.getSizeOfValueInMemory() > rhs.getSizeOfValueInMemory();
+}
+
+void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
+{
+    auto base_type = std::move(info.scalar_type);
+
+    if (isNothing(base_type) && info.num_dimensions == 0)
+    {
+        insertDefault();
+        return;
+    }
+
+    auto column_dim = least_common_type.getNumberOfDimensions();
+    auto value_dim = info.num_dimensions;
+
+    if (isNothing(least_common_type.get()))
+        column_dim = value_dim;
+
+    if (isNothing(base_type))
+        value_dim = column_dim;
+
+    if (value_dim != column_dim)
+        throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATCHED,
+            "Dimension of types mismatched between inserted value and column. "
+            "Dimension of value: {}. Dimension of column: {}",
+            value_dim, column_dim);
+
+    if (is_nullable)
+        base_type = makeNullable(base_type);
+
+    if (!is_nullable && info.have_nulls)
+        field = applyVisitor(FieldVisitorReplaceNull(base_type->getDefault(), value_dim), std::move(field));
+
+    bool type_changed = false;
+    const auto & least_common_base_type = least_common_type.getBase();
+
+    if (data.empty())
+    {
+        addNewColumnPart(createArrayOfType(std::move(base_type), value_dim));
+    }
+    else if (!least_common_base_type->equals(*base_type) && !isNothing(base_type))
+    {
+        if (isConversionRequiredBetweenIntegers(*base_type, *least_common_base_type))
+        {
+            base_type = getLeastSupertypeOrString(DataTypes{std::move(base_type), least_common_base_type});
+            type_changed = true;
+            if (!least_common_base_type->equals(*base_type))
+                addNewColumnPart(createArrayOfType(std::move(base_type), value_dim));
+        }
+    }
+
+    if (type_changed || info.need_convert)
+        field = convertFieldToTypeOrThrow(field, *least_common_type.get());
+
+    data.back()->insert(field);
+    ++num_rows;
+}
+
+void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn & src, size_t start, size_t length)
+{
+    assert(start + length <= src.size());
+    size_t end = start + length;
+    num_rows += length;
+
+    if (data.empty())
+    {
+        addNewColumnPart(src.getLeastCommonType());
+    }
+    else if (!least_common_type.get()->equals(*src.getLeastCommonType()))
+    {
+        auto new_least_common_type = getLeastSupertypeOrString(DataTypes{least_common_type.get(), src.getLeastCommonType()});
+        if (!new_least_common_type->equals(*least_common_type.get()))
+            addNewColumnPart(std::move(new_least_common_type));
+    }
+
+    if (end <= src.num_of_defaults_in_prefix)
+    {
+        data.back()->insertManyDefaults(length);
+        return;
+    }
+
+    if (start < src.num_of_defaults_in_prefix)
+        data.back()->insertManyDefaults(src.num_of_defaults_in_prefix - start);
+
+    auto insert_from_part = [&](const auto & column, size_t from, size_t n)
+    {
+        assert(from + n <= column->size());
+        auto column_type = getDataTypeByColumn(*column);
+
+        if (column_type->equals(*least_common_type.get()))
+        {
+            data.back()->insertRangeFrom(*column, from, n);
+            return;
+        }
+
+        /// If we need to insert a large range, there is no sense in cutting off
+        /// part of the column and casting it: casting the whole column and
+        /// inserting from it can be faster. The threshold is just a guess.
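+        /// With the factor of 3 below, the whole part is casted once the
+        /// requested range covers at least a third of it.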
+
+        if (n * 3 >= column->size())
+        {
+            auto casted_column = castColumn({column, column_type, ""}, least_common_type.get());
+            data.back()->insertRangeFrom(*casted_column, from, n);
+            return;
+        }
+
+        auto casted_column = column->cut(from, n);
+        casted_column = castColumn({casted_column, column_type, ""}, least_common_type.get());
+        data.back()->insertRangeFrom(*casted_column, 0, n);
+    };
+
+    size_t pos = 0;
+    size_t processed_rows = src.num_of_defaults_in_prefix;
+
+    /// Find the first part of the column that intersects the range.
+    while (pos < src.data.size() && processed_rows + src.data[pos]->size() < start)
+    {
+        processed_rows += src.data[pos]->size();
+        ++pos;
+    }
+
+    /// Insert from the first part of the column.
+    if (pos < src.data.size() && processed_rows < start)
+    {
+        size_t part_start = start - processed_rows;
+        size_t part_length = std::min(src.data[pos]->size() - part_start, end - start);
+        insert_from_part(src.data[pos], part_start, part_length);
+        processed_rows += src.data[pos]->size();
+        ++pos;
+    }
+
+    /// Insert from the parts of the column in the middle of the range.
+    while (pos < src.data.size() && processed_rows + src.data[pos]->size() < end)
+    {
+        insert_from_part(src.data[pos], 0, src.data[pos]->size());
+        processed_rows += src.data[pos]->size();
+        ++pos;
+    }
+
+    /// Insert from the last part of the column if needed.
+    if (pos < src.data.size() && processed_rows < end)
+    {
+        size_t part_end = end - processed_rows;
+        insert_from_part(src.data[pos], 0, part_end);
+    }
+}
+
+bool ColumnObject::Subcolumn::isFinalized() const
+{
+    return num_of_defaults_in_prefix == 0 &&
+        (data.empty() || (data.size() == 1 && !data[0]->isSparse()));
+}
+
+void ColumnObject::Subcolumn::finalize()
+{
+    if (isFinalized())
+        return;
+
+    if (data.size() == 1 && num_of_defaults_in_prefix == 0)
+    {
+        data[0] = data[0]->convertToFullColumnIfSparse();
+        return;
+    }
+
+    const auto & to_type = least_common_type.get();
+    auto result_column = to_type->createColumn();
+
+    if (num_of_defaults_in_prefix)
+        result_column->insertManyDefaults(num_of_defaults_in_prefix);
+
+    for (auto & part : data)
+    {
+        part = part->convertToFullColumnIfSparse();
+        auto from_type = getDataTypeByColumn(*part);
+        size_t part_size = part->size();
+
+        if (!from_type->equals(*to_type))
+        {
+            auto offsets = ColumnUInt64::create();
+            auto & offsets_data = offsets->getData();
+
+            /// We need to convert only non-default values and then recreate the
+            /// column with defaults of the new type, because default values
+            /// (which represent misses in data) may be inconsistent between types
+            /// (e.g. "0" in UInt64 and empty string in String).
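+            /// E.g. converting a UInt64 part [0, 5, 0, 7] to String must produce
+            /// ["", "5", "", "7"] rather than ["0", "5", "0", "7"].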
+ + part->getIndicesOfNonDefaultRows(offsets_data, 0, part_size); + + if (offsets->size() == part_size) + { + part = castColumn({part, from_type, ""}, to_type); + } + else + { + auto values = part->index(*offsets, offsets->size()); + values = castColumn({values, from_type, ""}, to_type); + part = values->createWithOffsets(offsets_data, to_type->getDefault(), part_size, /*shift=*/ 0); + } + } + + result_column->insertRangeFrom(*part, 0, part_size); + } + + data = { std::move(result_column) }; + num_of_defaults_in_prefix = 0; +} + +void ColumnObject::Subcolumn::insertDefault() +{ + if (data.empty()) + ++num_of_defaults_in_prefix; + else + data.back()->insertDefault(); + + ++num_rows; +} + +void ColumnObject::Subcolumn::insertManyDefaults(size_t length) +{ + if (data.empty()) + num_of_defaults_in_prefix += length; + else + data.back()->insertManyDefaults(length); + + num_rows += length; +} + +void ColumnObject::Subcolumn::popBack(size_t n) +{ + assert(n <= size()); + + num_rows -= n; + size_t num_removed = 0; + for (auto it = data.rbegin(); it != data.rend(); ++it) + { + if (n == 0) + break; + + auto & column = *it; + if (n < column->size()) + { + column->popBack(n); + n = 0; + } + else + { + ++num_removed; + n -= column->size(); + } + } + + data.resize(data.size() - num_removed); + num_of_defaults_in_prefix -= n; +} + +ColumnObject::Subcolumn ColumnObject::Subcolumn::cut(size_t start, size_t length) const +{ + Subcolumn new_subcolumn(0, is_nullable); + new_subcolumn.insertRangeFrom(*this, start, length); + return new_subcolumn; +} + +Field ColumnObject::Subcolumn::getLastField() const +{ + if (data.empty()) + return Field(); + + const auto & last_part = data.back(); + assert(!last_part->empty()); + return (*last_part)[last_part->size() - 1]; +} + +FieldInfo ColumnObject::Subcolumn::getFieldInfo() const +{ + const auto & base_type = least_common_type.getBase(); + return FieldInfo + { + .scalar_type = base_type, + .have_nulls = base_type->isNullable(), + .need_convert = false, + .num_dimensions = least_common_type.getNumberOfDimensions(), + }; +} + +ColumnObject::Subcolumn ColumnObject::Subcolumn::recreateWithDefaultValues(const FieldInfo & field_info) const +{ + auto scalar_type = field_info.scalar_type; + if (is_nullable) + scalar_type = makeNullable(scalar_type); + + Subcolumn new_subcolumn(*this); + new_subcolumn.least_common_type = LeastCommonType{createArrayOfType(scalar_type, field_info.num_dimensions)}; + + for (auto & part : new_subcolumn.data) + part = recreateColumnWithDefaultValues(part, scalar_type, field_info.num_dimensions); + + return new_subcolumn; +} + +IColumn & ColumnObject::Subcolumn::getFinalizedColumn() +{ + assert(isFinalized()); + return *data[0]; +} + +const IColumn & ColumnObject::Subcolumn::getFinalizedColumn() const +{ + assert(isFinalized()); + return *data[0]; +} + +const ColumnPtr & ColumnObject::Subcolumn::getFinalizedColumnPtr() const +{ + assert(isFinalized()); + return data[0]; +} + +ColumnObject::Subcolumn::LeastCommonType::LeastCommonType() + : type(std::make_shared()) + , base_type(type) + , num_dimensions(0) +{ +} + +ColumnObject::Subcolumn::LeastCommonType::LeastCommonType(DataTypePtr type_) + : type(std::move(type_)) + , base_type(getBaseTypeOfArray(type)) + , num_dimensions(DB::getNumberOfDimensions(*type)) +{ +} + +ColumnObject::ColumnObject(bool is_nullable_) + : is_nullable(is_nullable_) + , num_rows(0) +{ +} + +ColumnObject::ColumnObject(Subcolumns && subcolumns_, bool is_nullable_) + : is_nullable(is_nullable_) + , 
subcolumns(std::move(subcolumns_)) + , num_rows(subcolumns.empty() ? 0 : (*subcolumns.begin())->data.size()) + +{ + checkConsistency(); +} + +void ColumnObject::checkConsistency() const +{ + if (subcolumns.empty()) + return; + + for (const auto & leaf : subcolumns) + { + if (num_rows != leaf->data.size()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Sizes of subcolumns are inconsistent in ColumnObject." + " Subcolumn '{}' has {} rows, but expected size is {}", + leaf->path.getPath(), leaf->data.size(), num_rows); + } + } +} + +size_t ColumnObject::size() const +{ +#ifndef NDEBUG + checkConsistency(); +#endif + return num_rows; +} + +size_t ColumnObject::byteSize() const +{ + size_t res = 0; + for (const auto & entry : subcolumns) + res += entry->data.byteSize(); + return res; +} + +size_t ColumnObject::allocatedBytes() const +{ + size_t res = 0; + for (const auto & entry : subcolumns) + res += entry->data.allocatedBytes(); + return res; +} + +void ColumnObject::forEachSubcolumn(MutableColumnCallback callback) +{ + for (const auto & entry : subcolumns) + for (auto & part : entry->data.data) + callback(part); +} + +void ColumnObject::forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) +{ + for (auto & entry : subcolumns) + { + for (auto & part : entry->data.data) + { + callback(*part); + part->forEachSubcolumnRecursively(callback); + } + } +} + +void ColumnObject::insert(const Field & field) +{ + const auto & object = field.get(); + + HashSet inserted_paths; + size_t old_size = size(); + for (const auto & [key_str, value] : object) + { + PathInData key(key_str); + inserted_paths.insert(key_str); + if (!hasSubcolumn(key)) + addSubcolumn(key, old_size); + + auto & subcolumn = getSubcolumn(key); + subcolumn.insert(value); + } + + for (auto & entry : subcolumns) + { + if (!inserted_paths.has(entry->path.getPath())) + { + bool inserted = tryInsertDefaultFromNested(entry); + if (!inserted) + entry->data.insertDefault(); + } + } + + ++num_rows; +} + +void ColumnObject::insertDefault() +{ + for (auto & entry : subcolumns) + entry->data.insertDefault(); + + ++num_rows; +} + +Field ColumnObject::operator[](size_t n) const +{ + Field object; + get(n, object); + return object; +} + +void ColumnObject::get(size_t n, Field & res) const +{ + assert(n < size()); + res = Object(); + auto & object = res.get(); + + for (const auto & entry : subcolumns) + { + auto it = object.try_emplace(entry->path.getPath()).first; + entry->data.get(n, it->second); + } +} + +void ColumnObject::insertFrom(const IColumn & src, size_t n) +{ + insert(src[n]); +} + +void ColumnObject::insertRangeFrom(const IColumn & src, size_t start, size_t length) +{ + const auto & src_object = assert_cast(src); + + for (const auto & entry : src_object.subcolumns) + { + if (!hasSubcolumn(entry->path)) + { + if (entry->path.hasNested()) + addNestedSubcolumn(entry->path, entry->data.getFieldInfo(), num_rows); + else + addSubcolumn(entry->path, num_rows); + } + + auto & subcolumn = getSubcolumn(entry->path); + subcolumn.insertRangeFrom(entry->data, start, length); + } + + for (auto & entry : subcolumns) + { + if (!src_object.hasSubcolumn(entry->path)) + { + bool inserted = tryInsertManyDefaultsFromNested(entry); + if (!inserted) + entry->data.insertManyDefaults(length); + } + } + + num_rows += length; + finalize(); +} + +void ColumnObject::popBack(size_t length) +{ + for (auto & entry : subcolumns) + entry->data.popBack(length); + + num_rows -= length; +} + +template +MutableColumnPtr ColumnObject::applyForSubcolumns(Func && 
func) const +{ + if (!isFinalized()) + { + auto finalized = cloneFinalized(); + auto & finalized_object = assert_cast(*finalized); + return finalized_object.applyForSubcolumns(std::forward(func)); + } + + auto res = ColumnObject::create(is_nullable); + for (const auto & subcolumn : subcolumns) + { + auto new_subcolumn = func(subcolumn->data.getFinalizedColumn()); + res->addSubcolumn(subcolumn->path, new_subcolumn->assumeMutable()); + } + + return res; +} + +ColumnPtr ColumnObject::permute(const Permutation & perm, size_t limit) const +{ + return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.permute(perm, limit); }); +} + +ColumnPtr ColumnObject::filter(const Filter & filter, ssize_t result_size_hint) const +{ + return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.filter(filter, result_size_hint); }); +} + +ColumnPtr ColumnObject::index(const IColumn & indexes, size_t limit) const +{ + return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.index(indexes, limit); }); +} + +ColumnPtr ColumnObject::replicate(const Offsets & offsets) const +{ + return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.replicate(offsets); }); +} + +MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const +{ + if (new_size == 0) + return ColumnObject::create(is_nullable); + + return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.cloneResized(new_size); }); +} + +void ColumnObject::getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation & res) const +{ + res.resize(num_rows); + std::iota(res.begin(), res.end(), 0); +} + +void ColumnObject::compareColumn(const IColumn & rhs, size_t rhs_row_num, + PaddedPODArray * row_indexes, PaddedPODArray & compare_results, + int direction, int nan_direction_hint) const +{ + return doCompareColumn(assert_cast(rhs), rhs_row_num, row_indexes, + compare_results, direction, nan_direction_hint); +} + +void ColumnObject::getExtremes(Field & min, Field & max) const +{ + if (num_rows == 0) + { + min = Object(); + max = Object(); + } + else + { + get(0, min); + get(0, max); + } +} + +MutableColumns ColumnObject::scatter(ColumnIndex num_columns, const Selector & selector) const +{ + return scatterImpl(num_columns, selector); +} + +void ColumnObject::gather(ColumnGathererStream & gatherer) +{ + gatherer.gather(*this); +} + +const ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key) const +{ + if (const auto * node = subcolumns.findLeaf(key)) + return node->data; + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath()); +} + +ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key) +{ + if (const auto * node = subcolumns.findLeaf(key)) + return const_cast(node)->data; + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath()); +} + +bool ColumnObject::hasSubcolumn(const PathInData & key) const +{ + return subcolumns.findLeaf(key) != nullptr; +} + +void ColumnObject::addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn) +{ + size_t new_size = subcolumn->size(); + bool inserted = subcolumns.add(key, Subcolumn(std::move(subcolumn), is_nullable)); + + if (!inserted) + throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath()); + + if (num_rows == 0) + num_rows = new_size; + else if (new_size != num_rows) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, + "Size 
of subcolumn {} ({}) is inconsistent with column size ({})", + key.getPath(), new_size, num_rows); +} + +void ColumnObject::addSubcolumn(const PathInData & key, size_t new_size) +{ + bool inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable)); + if (!inserted) + throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath()); + + if (num_rows == 0) + num_rows = new_size; + else if (new_size != num_rows) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, + "Required size of subcolumn {} ({}) is inconsistent with column size ({})", + key.getPath(), new_size, num_rows); +} + +void ColumnObject::addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size) +{ + if (!key.hasNested()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot add Nested subcolumn, because path doesn't contain Nested"); + + bool inserted = false; + /// We find node that represents the same Nested type as @key. + const auto * nested_node = subcolumns.findBestMatch(key); + + if (nested_node) + { + /// Find any leaf of Nested subcolumn. + const auto * leaf = subcolumns.findLeaf(nested_node, [&](const auto &) { return true; }); + assert(leaf); + + /// Recreate subcolumn with default values and the same sizes of arrays. + auto new_subcolumn = leaf->data.recreateWithDefaultValues(field_info); + + /// It's possible that we have already inserted value from current row + /// to this subcolumn. So, adjust size to expected. + if (new_subcolumn.size() > new_size) + new_subcolumn.popBack(new_subcolumn.size() - new_size); + + assert(new_subcolumn.size() == new_size); + inserted = subcolumns.add(key, new_subcolumn); + } + else + { + /// If node was not found just add subcolumn with empty arrays. + inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable)); + } + + if (!inserted) + throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath()); + + if (num_rows == 0) + num_rows = new_size; + else if (new_size != num_rows) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, + "Required size of subcolumn {} ({}) is inconsistent with column size ({})", + key.getPath(), new_size, num_rows); +} + +const ColumnObject::Subcolumns::Node * ColumnObject::getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const +{ + if (!entry->path.hasNested()) + return nullptr; + + size_t old_size = entry->data.size(); + const auto * current_node = subcolumns.findLeaf(entry->path); + const Subcolumns::Node * leaf = nullptr; + + while (current_node) + { + /// Try to find the first Nested up to the current node. + const auto * node_nested = subcolumns.findParent(current_node, + [](const auto & candidate) { return candidate.isNested(); }); + + if (!node_nested) + break; + + /// Find the leaf with subcolumn that contains values + /// for the last rows. + /// If there are no leaves, skip current node and find + /// the next node up to the current. 
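+        /// (A leaf longer than @entry is one that has already received a value
+        /// for the current row, so its last field carries the array sizes that
+        /// the missing subcolumn must replicate.)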
+        leaf = subcolumns.findLeaf(node_nested,
+            [&](const auto & candidate)
+            {
+                return candidate.data.size() > old_size;
+            });
+
+        if (leaf)
+            break;
+
+        current_node = node_nested->parent;
+    }
+
+    if (leaf && isNothing(leaf->data.getLeastCommonTypeBase()))
+        return nullptr;
+
+    return leaf;
+}
+
+bool ColumnObject::tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const
+{
+    const auto * leaf = getLeafOfTheSameNested(entry);
+    if (!leaf)
+        return false;
+
+    size_t old_size = entry->data.size();
+    auto field_info = entry->data.getFieldInfo();
+
+    /// Cut the needed range from the found leaf
+    /// and replace scalar values with the correct
+    /// default values for the given entry.
+    auto new_subcolumn = leaf->data
+        .cut(old_size, leaf->data.size() - old_size)
+        .recreateWithDefaultValues(field_info);
+
+    entry->data.insertRangeFrom(new_subcolumn, 0, new_subcolumn.size());
+    return true;
+}
+
+bool ColumnObject::tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const
+{
+    const auto * leaf = getLeafOfTheSameNested(entry);
+    if (!leaf)
+        return false;
+
+    auto last_field = leaf->data.getLastField();
+    if (last_field.isNull())
+        return false;
+
+    size_t leaf_num_dimensions = leaf->data.getNumberOfDimensions();
+    size_t entry_num_dimensions = entry->data.getNumberOfDimensions();
+
+    auto default_scalar = entry_num_dimensions > leaf_num_dimensions
+        ? createEmptyArrayField(entry_num_dimensions - leaf_num_dimensions)
+        : entry->data.getLeastCommonTypeBase()->getDefault();
+
+    auto default_field = applyVisitor(FieldVisitorReplaceScalars(default_scalar, leaf_num_dimensions), last_field);
+    entry->data.insert(std::move(default_field));
+    return true;
+}
+
+PathsInData ColumnObject::getKeys() const
+{
+    PathsInData keys;
+    keys.reserve(subcolumns.size());
+    for (const auto & entry : subcolumns)
+        keys.emplace_back(entry->path);
+    return keys;
+}
+
+bool ColumnObject::isFinalized() const
+{
+    return std::all_of(subcolumns.begin(), subcolumns.end(),
+        [](const auto & entry) { return entry->data.isFinalized(); });
+}
+
+void ColumnObject::finalize()
+{
+    size_t old_size = size();
+    Subcolumns new_subcolumns;
+    for (auto && entry : subcolumns)
+    {
+        const auto & least_common_type = entry->data.getLeastCommonType();
+
+        /// Do not add subcolumns, which consist only of NULLs.
+        if (isNothing(getBaseTypeOfArray(least_common_type)))
+            continue;
+
+        entry->data.finalize();
+        new_subcolumns.add(entry->path, entry->data);
+    }
+
+    /// If all subcolumns were skipped add a dummy subcolumn,
+    /// because Tuple type must have at least one element.
+    if (new_subcolumns.empty())
+        new_subcolumns.add(PathInData{COLUMN_NAME_DUMMY}, Subcolumn{ColumnUInt8::create(old_size, 0), is_nullable});
+
+    std::swap(subcolumns, new_subcolumns);
+    checkObjectHasNoAmbiguousPaths(getKeys());
+}
+
+}
diff --git a/src/Columns/ColumnObject.h b/src/Columns/ColumnObject.h
new file mode 100644
index 0000000000..297ef17b23
--- /dev/null
+++ b/src/Columns/ColumnObject.h
@@ -0,0 +1,271 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
+
+/// Info that represents a scalar or array field in a decomposed view.
+/// It allows recreating the field with a different number
+/// of dimensions or nullability.
+struct FieldInfo
+{
+    /// The common type of all scalars in field.
+    DataTypePtr scalar_type;
+
+    /// Do we have NULL scalar in field.
+    bool have_nulls;
+
+    /// If true then we have scalars with different types in array and
+    /// we need to convert scalars to the common type.
+    bool need_convert;
+
+    /// Number of dimensions in array. 0 if field is scalar.
+    size_t num_dimensions;
+
+    /// If true then this field is an array that mixes fields of different
+    /// dimensions and we need to fold them to a common dimension.
+    bool need_fold_dimension;
+};
+
+FieldInfo getFieldInfo(const Field & field);
+
+/** A column that represents object with dynamic set of subcolumns.
+ *  Subcolumns are identified by paths in document and are stored in
+ *  a trie-like structure. ColumnObject is not suitable for writing into tables
+ *  and it should be converted to Tuple with a fixed set of subcolumns before that.
+ */
+class ColumnObject final : public COWHelper<IColumn, ColumnObject>
+{
+public:
+    /** Class that represents one subcolumn.
+     * It stores values in several parts of column
+     * and keeps current common type of all parts.
+     * We add a new column part with a new type, when we insert a field,
+     * which can't be converted to the current common type.
+     * After insertion of all values the subcolumn should be finalized
+     * for writing and other operations.
+     */
+    class Subcolumn
+    {
+    public:
+        Subcolumn() = default;
+        Subcolumn(size_t size_, bool is_nullable_);
+        Subcolumn(MutableColumnPtr && data_, bool is_nullable_);
+
+        size_t size() const;
+        size_t byteSize() const;
+        size_t allocatedBytes() const;
+        void get(size_t n, Field & res) const;
+
+        bool isFinalized() const;
+        const DataTypePtr & getLeastCommonType() const { return least_common_type.get(); }
+        const DataTypePtr & getLeastCommonTypeBase() const { return least_common_type.getBase(); }
+        size_t getNumberOfDimensions() const { return least_common_type.getNumberOfDimensions(); }
+
+        /// Checks the consistency of column's parts stored in @data.
+        void checkTypes() const;
+
+        /// Inserts a field, whose scalars can be arbitrary, but the number of
+        /// dimensions should be consistent with the current common type.
+        void insert(Field field);
+        void insert(Field field, FieldInfo info);
+
+        void insertDefault();
+        void insertManyDefaults(size_t length);
+        void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
+        void popBack(size_t n);
+
+        Subcolumn cut(size_t start, size_t length) const;
+
+        /// Converts all column's parts to the common type and
+        /// creates a single column that stores all values.
+        void finalize();
+
+        /// Returns last inserted field.
+        Field getLastField() const;
+
+        FieldInfo getFieldInfo() const;
+
+        /// Recreates subcolumn with default scalar values and keeps sizes of arrays.
+        /// Used to create columns of type Nested with consistent array sizes.
+        Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
+
+        /// Returns the single column if the subcolumn is finalized.
+        /// Otherwise the behaviour is undefined.
+        IColumn & getFinalizedColumn();
+        const IColumn & getFinalizedColumn() const;
+        const ColumnPtr & getFinalizedColumnPtr() const;
+
+        const std::vector<WrappedPtr> & getData() const { return data; }
+        size_t getNumberOfDefaultsInPrefix() const { return num_of_defaults_in_prefix; }
+
+        friend class ColumnObject;
+
+    private:
+        class LeastCommonType
+        {
+        public:
+            LeastCommonType();
+            explicit LeastCommonType(DataTypePtr type_);
+
+            const DataTypePtr & get() const { return type; }
+            const DataTypePtr & getBase() const { return base_type; }
+            size_t getNumberOfDimensions() const { return num_dimensions; }
+
+        private:
+            DataTypePtr type;
+            DataTypePtr base_type;
+            size_t num_dimensions = 0;
+        };
+
+        void addNewColumnPart(DataTypePtr type);
+
+        /// Current least common type of all values inserted to this subcolumn.
+        LeastCommonType least_common_type;
+
+        /// If true then the common type of subcolumn is Nullable
+        /// and default values are NULLs.
+        bool is_nullable = false;
+
+        /// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes.
+        /// That means that the least common type for the i-th prefix is the type of the i-th part
+        /// and it's the supertype for the types of all parts from 0 to i-1.
+        std::vector<WrappedPtr> data;
+
+        /// Until we insert any non-default field we don't know the further
+        /// least common type and we count the number of defaults in the prefix,
+        /// which will be converted to the default type of the final common type.
+        size_t num_of_defaults_in_prefix = 0;
+
+        size_t num_rows = 0;
+    };
+
+    using Subcolumns = SubcolumnsTree<Subcolumn>;
+
+private:
+    /// If true then all subcolumns are nullable.
+    const bool is_nullable;
+
+    Subcolumns subcolumns;
+    size_t num_rows;
+
+public:
+    static constexpr auto COLUMN_NAME_DUMMY = "_dummy";
+
+    explicit ColumnObject(bool is_nullable_);
+    ColumnObject(Subcolumns && subcolumns_, bool is_nullable_);
+
+    /// Checks that all subcolumns have consistent sizes.
+    void checkConsistency() const;
+
+    bool hasSubcolumn(const PathInData & key) const;
+
+    const Subcolumn & getSubcolumn(const PathInData & key) const;
+    Subcolumn & getSubcolumn(const PathInData & key);
+
+    void incrementNumRows() { ++num_rows; }
+
+    /// Adds a subcolumn from existing IColumn.
+    void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn);
+
+    /// Adds a subcolumn of specific size with default values.
+    void addSubcolumn(const PathInData & key, size_t new_size);
+
+    /// Adds a subcolumn of type Nested of specific size with default values.
+    /// It cares about consistency of sizes of Nested arrays.
+    void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);
+
+    /// Finds a subcolumn from the same Nested type as @entry and inserts
+    /// an array with default values with consistent sizes as in Nested type.
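+    /// E.g. if subcolumn n.x received [1, 2] in the current row while n.y is
+    /// absent from it, n.y receives an array of two default values, so all
+    /// arrays of the Nested stay equally sized.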
+ bool tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const; + bool tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const; + + const Subcolumns & getSubcolumns() const { return subcolumns; } + Subcolumns & getSubcolumns() { return subcolumns; } + PathsInData getKeys() const; + + /// Part of interface + + const char * getFamilyName() const override { return "Object"; } + TypeIndex getDataType() const override { return TypeIndex::Object; } + + size_t size() const override; + size_t byteSize() const override; + size_t allocatedBytes() const override; + void forEachSubcolumn(MutableColumnCallback callback) override; + void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override; + void insert(const Field & field) override; + void insertDefault() override; + void insertFrom(const IColumn & src, size_t n) override; + void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; + void popBack(size_t length) override; + Field operator[](size_t n) const override; + void get(size_t n, Field & res) const override; + + ColumnPtr permute(const Permutation & perm, size_t limit) const override; + ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override; + ColumnPtr index(const IColumn & indexes, size_t limit) const override; + ColumnPtr replicate(const Offsets & offsets) const override; + MutableColumnPtr cloneResized(size_t new_size) const override; + + /// Finalizes all subcolumns. + void finalize() override; + bool isFinalized() const override; + + /// Order of rows in ColumnObject is undefined. + void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation & res) const override; + void compareColumn(const IColumn & rhs, size_t rhs_row_num, + PaddedPODArray * row_indexes, PaddedPODArray & compare_results, + int direction, int nan_direction_hint) const override; + + void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override {} + int compareAt(size_t, size_t, const IColumn &, int) const override { return 0; } + void getExtremes(Field & min, Field & max) const override; + + MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override; + void gather(ColumnGathererStream & gatherer) override; + + /// All other methods throw exception. 
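+    /// (ColumnObject only stages data at insertion time; it is converted to a
+    /// concrete column before any of these operations are needed.)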
+
+    StringRef getDataAt(size_t) const override { throwMustBeConcrete(); }
+    bool isDefaultAt(size_t) const override { throwMustBeConcrete(); }
+    void insertData(const char *, size_t) override { throwMustBeConcrete(); }
+    StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); }
+    const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
+    const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
+    void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
+    void updateWeakHash32(WeakHash32 &) const override { throwMustBeConcrete(); }
+    void updateHashFast(SipHash &) const override { throwMustBeConcrete(); }
+    bool hasEqualValues() const override { throwMustBeConcrete(); }
+    size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }
+    double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); }
+    UInt64 getNumberOfDefaultRows() const override { throwMustBeConcrete(); }
+    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); }
+
+private:
+    [[noreturn]] static void throwMustBeConcrete()
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ColumnObject must be converted to ColumnTuple before use");
+    }
+
+    template <typename Func>
+    MutableColumnPtr applyForSubcolumns(Func && func) const;
+
+    /// It's used to get the shared sizes of Nested arrays to insert correct default values.
+    const Subcolumns::Node * getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const;
+};
+}
diff --git a/src/Columns/ColumnSketchBinary.h b/src/Columns/ColumnSketchBinary.h
index 39fdc98ffe..96a68c0c90 100644
--- a/src/Columns/ColumnSketchBinary.h
+++ b/src/Columns/ColumnSketchBinary.h
@@ -299,6 +299,21 @@ public:
     void validate() const;
 
     bool isCollationSupported() const override { return true; }
+
+    double getRatioOfDefaultRows(double) const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getRatioOfDefaultRows is not supported for {}", getName());
+    }
+
+    UInt64 getNumberOfDefaultRows() const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getNumberOfDefaultRows is not supported for {}", getName());
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getIndicesOfNonDefaultRows is not supported for {}", getName());
+    }
 };
diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp
index 39700e5bd8..cdc250812a 100644
--- a/src/Columns/ColumnString.cpp
+++ b/src/Columns/ColumnString.cpp
@@ -214,7 +214,6 @@ ColumnPtr ColumnString::filter(const Filter & filt, ssize_t result_size_hint) co
     return res;
 }
 
-
 ColumnPtr ColumnString::permute(const Permutation & perm, size_t limit) const
 {
     size_t size = offsets.size();
diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h
index 6c0f108de3..9061bd0700 100644
--- a/src/Columns/ColumnString.h
+++ b/src/Columns/ColumnString.h
@@ -310,6 +310,21 @@ public:
         return typeid(rhs) == typeid(ColumnString);
     }
 
+    double getRatioOfDefaultRows(double sample_ratio) const override
+    {
+        return getRatioOfDefaultRowsImpl<ColumnString>(sample_ratio);
+    }
+
+    UInt64 getNumberOfDefaultRows() const override
+    {
+        return getNumberOfDefaultRowsImpl<ColumnString>();
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
+    {
+        return getIndicesOfNonDefaultRowsImpl<ColumnString>(indices, from, limit);
+    }
+
+    Chars & getChars() { return chars; }
+    const Chars &
getChars() const { return chars; } diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index 3612355f51..1c56f626cc 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -503,18 +503,18 @@ void ColumnTuple::getExtremes(Field & min, Field & max) const max = max_tuple; } -void ColumnTuple::forEachSubcolumn(ColumnCallback callback) +void ColumnTuple::forEachSubcolumn(MutableColumnCallback callback) { for (auto & column : columns) callback(column); } -void ColumnTuple::forEachSubcolumnRecursively(ColumnCallback callback) +void ColumnTuple::forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) { for (auto & column : columns) { - callback(column); + callback(*column); column->forEachSubcolumnRecursively(callback); } } @@ -569,4 +569,19 @@ ColumnPtr ColumnTuple::compress() const }); } +double ColumnTuple::getRatioOfDefaultRows(double sample_ratio) const +{ + return getRatioOfDefaultRowsImpl(sample_ratio); +} + +UInt64 ColumnTuple::getNumberOfDefaultRows() const +{ + return getNumberOfDefaultRowsImpl(); +} + +void ColumnTuple::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const +{ + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); +} + } diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index a9d6415656..82f3b99b3c 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -115,12 +115,14 @@ public: size_t byteSizeAt(size_t n) const override; size_t allocatedBytes() const override; void protect() override; - void forEachSubcolumn(ColumnCallback callback) override; - void forEachSubcolumnRecursively(ColumnCallback callback) override; + void forEachSubcolumn(MutableColumnCallback callback) override; + void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override; bool structureEquals(const IColumn & rhs) const override; bool isCollationSupported() const override; ColumnPtr compress() const override; - + double getRatioOfDefaultRows(double sample_ratio) const override; + UInt64 getNumberOfDefaultRows() const override; + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override; size_t tupleSize() const { return columns.size(); } const IColumn & getColumn(size_t idx) const { return *columns[idx]; } diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index 7425c3d585..5b451a6112 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -37,7 +37,7 @@ #include #include -#include "Columns/ColumnConst.h" +#include namespace DB @@ -128,7 +128,10 @@ public: return column_holder->allocatedBytes() + reverse_index.allocatedBytes() + (nested_null_mask ? 
nested_null_mask->allocatedBytes() : 0); } - void forEachSubcolumn(IColumn::ColumnCallback callback) override + + void forEachSubcolumn(IColumn::ColumnCallback callback) const override { callback(column_holder); } + + void forEachSubcolumn(IColumn::MutableColumnCallback callback) override { callback(column_holder); reverse_index.setColumn(getRawColumnPtr()); @@ -136,6 +139,21 @@ public: nested_column_nullable = ColumnNullable::create(column_holder, nested_null_mask); } + void forEachSubcolumnRecursively(IColumn::RecursiveColumnCallback callback) const override + { + callback(*column_holder); + column_holder->forEachSubcolumnRecursively(callback); + } + + void forEachSubcolumnRecursively(IColumn::RecursiveMutableColumnCallback callback) override + { + callback(*column_holder); + column_holder->forEachSubcolumnRecursively(callback); + reverse_index.setColumn(getRawColumnPtr()); + if (is_nullable) + nested_column_nullable = ColumnNullable::create(column_holder, nested_null_mask); + } + void forEachSubcolumnRecursively(IColumn::ColumnCallback callback) override { callback(column_holder); diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index 2b3db71bda..b8885bfa61 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -356,6 +356,21 @@ public: return typeid(rhs) == typeid(ColumnVector); } + double getRatioOfDefaultRows(double sample_ratio) const override + { + return this->template getRatioOfDefaultRowsImpl(sample_ratio); + } + + UInt64 getNumberOfDefaultRows() const override + { + return this->template getNumberOfDefaultRowsImpl(); + } + + void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override + { + return this->template getIndicesOfNonDefaultRowsImpl(indices, from, limit); + } + ColumnPtr compress() const override; /// Replace elements that match the filter with zeroes. If inverted replaces not matched elements. 
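The IColumn changes that follow add createWithOffsets(), the primitive that Subcolumn::finalize() above uses to rebuild a full column from its non-default values. A worked sketch of its contract (an illustrative helper, not part of this patch):

#include <Columns/IColumn.h>

/// values = [10, 20], offsets = [1, 3], default_field = 0, total_rows = 5, shift = 0
/// yields [0, 10, 0, 20, 0]: row i of `values` lands at position offsets[i],
/// every other position is filled with default_field.
static DB::ColumnPtr expandSparse(
    const DB::IColumn & values, const DB::IColumn::Offsets & offsets,
    const DB::Field & default_field, size_t total_rows)
{
    /// Requires offsets.size() + shift == values.size().
    return values.createWithOffsets(offsets, default_field, total_rows, /*shift=*/ 0);
}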
diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp
index f6be478d4b..9ff563fbd4 100644
--- a/src/Columns/IColumn.cpp
+++ b/src/Columns/IColumn.cpp
@@ -15,12 +15,10 @@ String IColumn::dumpStructure() const
     WriteBufferFromOwnString res;
     res << getFamilyName() << "(size = " << size();
 
-    ColumnCallback callback = [&](ColumnPtr & subcolumn)
+    forEachSubcolumn([&](const auto & subcolumn)
     {
         res << ", " << subcolumn->dumpStructure();
-    };
-
-    const_cast<IColumn *>(this)->forEachSubcolumn(callback);
+    });
 
     res << ")";
     return res.str();
@@ -31,6 +29,50 @@ void IColumn::insertFrom(const IColumn & src, size_t n)
     insert(src[n]);
 }
 
+ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
+{
+    if (offsets.size() + shift != size())
+        throw Exception(ErrorCodes::LOGICAL_ERROR,
+            "Incompatible sizes of offsets ({}), shift ({}) and size of column {}", offsets.size(), shift, size());
+
+    auto res = cloneEmpty();
+    res->reserve(total_rows);
+
+    ssize_t current_offset = -1;
+    for (size_t i = 0; i < offsets.size(); ++i)
+    {
+        ssize_t offsets_diff = static_cast<ssize_t>(offsets[i]) - current_offset;
+        current_offset = offsets[i];
+
+        if (offsets_diff > 1)
+            res->insertMany(default_field, offsets_diff - 1);
+
+        res->insertFrom(*this, i + shift);
+    }
+
+    ssize_t offsets_diff = static_cast<ssize_t>(total_rows) - current_offset;
+    if (offsets_diff > 1)
+        res->insertMany(default_field, offsets_diff - 1);
+
+    return res;
+}
+
+void IColumn::forEachSubcolumn(ColumnCallback callback) const
+{
+    const_cast<IColumn *>(this)->forEachSubcolumn([&callback](WrappedPtr & subcolumn)
+    {
+        callback(std::as_const(subcolumn));
+    });
+}
+
+void IColumn::forEachSubcolumnRecursively(RecursiveColumnCallback callback) const
+{
+    const_cast<IColumn *>(this)->forEachSubcolumnRecursively([&callback](IColumn & subcolumn)
+    {
+        callback(std::as_const(subcolumn));
+    });
+}
+
 bool isColumnNullable(const IColumn & column)
 {
     return checkColumn<ColumnNullable>(column);
diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h
index 0f65303f0f..fd5e5d34eb 100644
--- a/src/Columns/IColumn.h
+++ b/src/Columns/IColumn.h
@@ -91,6 +91,10 @@ public:
     /// If column is ColumnLowCardinality, transforms it to full column.
     virtual Ptr convertToFullColumnIfLowCardinality() const { return getPtr(); }
 
+    /// If column isn't ColumnSparse, return itself.
+    /// If column is ColumnSparse, transforms it to full column.
+    [[nodiscard]] virtual Ptr convertToFullColumnIfSparse() const { return getPtr(); }
+
     /// Creates empty column with the same type.
     virtual MutablePtr cloneEmpty() const { return cloneResized(0); }
 
@@ -209,6 +213,13 @@
         insertFrom(src, position);
     }
 
+    /// Appends one field multiple times. Can be optimized in inherited classes.
+    virtual void insertMany(const Field & field, size_t length)
+    {
+        for (size_t i = 0; i < length; ++i)
+            insert(field);
+    }
+
     /// Appends data located in specified memory chunk if it is possible (throws an exception if it cannot be implemented).
     /// Is used to optimize some computations (in aggregation, for example).
     /// Parameter length could be ignored if column values have fixed size.
@@ -414,8 +425,18 @@ public:
     /// If the column contains subcolumns (such as Array, Nullable, etc), do callback on them.
     /// Shallow: doesn't do recursive calls; doesn't call for itself.
- using ColumnCallback = std::function<void(WrappedPtr &)>; - virtual void forEachSubcolumn(ColumnCallback) {} + + using ColumnCallback = std::function<void(const WrappedPtr &)>; + virtual void forEachSubcolumn(ColumnCallback) const; + + using MutableColumnCallback = std::function<void(WrappedPtr &)>; + virtual void forEachSubcolumn(MutableColumnCallback) {} + + using RecursiveColumnCallback = std::function<void(const IColumn &)>; + virtual void forEachSubcolumnRecursively(RecursiveColumnCallback) const; + + using RecursiveMutableColumnCallback = std::function<void(IColumn &)>; + virtual void forEachSubcolumnRecursively(RecursiveMutableColumnCallback) {} /// Similar to forEachSubcolumn but it also does recursive calls. virtual void forEachSubcolumnRecursively(ColumnCallback) {} @@ -427,6 +448,23 @@ public: throw Exception("Method structureEquals is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + /// Returns the ratio of values in the column that are equal to the column's default value. + /// Checks only a @sample_ratio fraction of the rows. + [[nodiscard]] virtual double getRatioOfDefaultRows(double sample_ratio = 1.0) const = 0; /// NOLINT + + /// Returns the number of values in the column that are equal to the column's default value. + [[nodiscard]] virtual UInt64 getNumberOfDefaultRows() const = 0; + + /// Returns indices of the values in the column that are not equal to the column's default value. + virtual void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const = 0; + + /// Returns a column with @total_rows elements. + /// In the result column, values from the current column are placed at the positions given by @offsets. + /// The remaining values are filled with @default_field. + /// @shift is the number of rows to skip from the beginning of the current column. + /// Used to create a full column from a sparse one. + [[nodiscard]] virtual Ptr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const; + /// Compress column in memory to some representation that allows to decompress it back. /// Return itself if compression is not applicable for this column type. virtual Ptr compress() const @@ -442,6 +480,16 @@ public: return getPtr(); } + /// Some columns may require finalization before other operations can be applied.
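The finalize()/isFinalized()/cloneFinalized() members declared just below formalize that requirement; a minimal usage sketch (assuming the common "finalize before serializing" pattern, not part of the patch):

#include <Columns/IColumn.h>

using namespace DB;

// Columns that buffer intermediate state (e.g. ColumnObject) are settled first;
// for already-final columns this is a no-op that avoids a copy.
ColumnPtr prepareForSerialization(const ColumnPtr & column)
{
    if (column->isFinalized())
        return column;
    return column->cloneFinalized();    // mutates a copy and calls finalize() on it
}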
+ virtual void finalize() {} + virtual bool isFinalized() const { return true; } + + MutablePtr cloneFinalized() const + { + auto finalized = IColumn::mutate(getPtr()); + finalized->finalize(); + return finalized; + } static MutablePtr mutate(Ptr ptr) { @@ -509,6 +557,8 @@ public: virtual bool lowCardinality() const { return false; } + [[nodiscard]] virtual bool isSparse() const { return false; } + virtual bool isCollationSupported() const { return false; } virtual ~IColumn() = default; @@ -545,6 +595,9 @@ protected: template <typename Derived> double getRatioOfDefaultRowsImpl(double sample_ratio) const; + template <typename Derived> + UInt64 getNumberOfDefaultRowsImpl() const; + template <typename Derived> void getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const; diff --git a/src/Columns/IColumnDummy.h b/src/Columns/IColumnDummy.h index e763d178d3..a51632abd6 100644 --- a/src/Columns/IColumnDummy.h +++ b/src/Columns/IColumnDummy.h @@ -180,6 +180,21 @@ public: return res; } + double getRatioOfDefaultRows(double) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getRatioOfDefaultRows is not supported for {}", getName()); + } + + UInt64 getNumberOfDefaultRows() const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getNumberOfDefaultRows is not supported for {}", getName()); + } + + void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getIndicesOfNonDefaultRows is not supported for {}", getName()); + } + void gather(ColumnGathererStream &) override { throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); diff --git a/src/Columns/IColumnImpl.h b/src/Columns/IColumnImpl.h index 53fbd72033..3ee7b6db8b 100644 --- a/src/Columns/IColumnImpl.h +++ b/src/Columns/IColumnImpl.h @@ -195,6 +195,16 @@ double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const return static_cast<double>(res) / num_checked_rows; } +template <typename Derived> +UInt64 IColumn::getNumberOfDefaultRowsImpl() const +{ + UInt64 res = 0; + size_t num_rows = size(); + for (size_t i = 0; i < num_rows; ++i) + res += static_cast<const Derived &>(*this).isDefaultAt(i); + return res; +} + template <typename Derived> void IColumn::getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const { diff --git a/src/Columns/IColumnUnique.h b/src/Columns/IColumnUnique.h index 1e8bf25bd0..746a2a8d2c 100644 --- a/src/Columns/IColumnUnique.h +++ b/src/Columns/IColumnUnique.h @@ -210,6 +210,21 @@ public: { throw Exception("Method hasEqualValues is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED); } + + double getRatioOfDefaultRows(double) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getRatioOfDefaultRows' not implemented for ColumnUnique"); + } + + UInt64 getNumberOfDefaultRows() const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getNumberOfDefaultRows' not implemented for ColumnUnique"); + } + + void getIndicesOfNonDefaultRows(IColumn::Offsets &, size_t, size_t) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getIndicesOfNonDefaultRows' not implemented for ColumnUnique"); + } }; using ColumnUniquePtr = IColumnUnique::ColumnUniquePtr; diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index ca5ee4c9c1..fa03fdd39d 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -48,6 +48,7 @@ M(11, POSITION_OUT_OF_BOUND) \ M(12, PARAMETER_OUT_OF_BOUND) \ M(13, SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH) \ + M(14,
TOO_MANY_SUBCOLUMNS_IN_JSON) \ M(15, DUPLICATE_COLUMN) \ M(16, NO_SUCH_COLUMN_IN_TABLE) \ M(17, DELIMITER_IN_STRING_LITERAL_DOESNT_MATCH) \ diff --git a/src/Common/FieldVisitorCompatibleBinary.cpp b/src/Common/FieldVisitorCompatibleBinary.cpp index a47cab61f2..8733d4d471 100644 --- a/src/Common/FieldVisitorCompatibleBinary.cpp +++ b/src/Common/FieldVisitorCompatibleBinary.cpp @@ -149,6 +149,19 @@ void FieldVisitorCompatibleWriteBinary::operator()(const BitMap64 & x, WriteBuff writeString(buffer.data(), bytes, buf); } +void FieldVisitorCompatibleWriteBinary::operator()(const Object & x, WriteBuffer & buf) const +{ + const size_t size = x.size(); + writeBinary(size, buf); + + for (const auto & [key, value] : x) + { + const UInt8 type = value.getType(); + writeBinary(type, buf); + writeBinary(key, buf); + Field::dispatch([&buf] (const auto & val) { FieldVisitorCompatibleWriteBinary()(val, buf); }, value); + } +} void FieldVisitorCompatibleReadBinary::deserialize(UInt64 & value, ReadBuffer & buf) { @@ -306,4 +319,19 @@ void FieldVisitorCompatibleReadBinary::deserialize(BitMap64 & value, ReadBuffer value = BitMap64::readSafe(bitmap_buffer.data(), bytes); } +void FieldVisitorCompatibleReadBinary::deserialize(Object & value, ReadBuffer & buf) +{ + size_t size; + readBinary(size, buf); + + for (size_t index = 0; index < size; ++index) + { + UInt8 type; + String key; + readBinary(type, buf); + readBinary(key, buf); + value[key] = Field::dispatch(FieldVisitorCompatibleReadBinary(buf), static_cast(type)); + } +} + } diff --git a/src/Common/FieldVisitorCompatibleBinary.h b/src/Common/FieldVisitorCompatibleBinary.h index bf7bb43b36..578f4d6882 100644 --- a/src/Common/FieldVisitorCompatibleBinary.h +++ b/src/Common/FieldVisitorCompatibleBinary.h @@ -35,6 +35,7 @@ public: void operator()(const DecimalField & x, WriteBuffer & buf) const; void operator()(const AggregateFunctionStateData & x, WriteBuffer & buf) const; void operator()(const BitMap64 & x, WriteBuffer & buf) const; + void operator()(const Object & x, WriteBuffer & buf) const; void operator()(const IPv4 & x, WriteBuffer & buf) const; void operator()(const IPv6 & x, WriteBuffer & buf) const; }; @@ -97,6 +98,7 @@ private: static void deserialize(Tuple & value, ReadBuffer & buf); static void deserialize(Map & value, ReadBuffer & buf); static void deserialize(BitMap64 & value, ReadBuffer & buf); + static void deserialize(Object & value, ReadBuffer & buf); }; } diff --git a/src/Common/FieldVisitorConvertToNumber.h b/src/Common/FieldVisitorConvertToNumber.h index 6b3f6c24f4..66d6210603 100644 --- a/src/Common/FieldVisitorConvertToNumber.h +++ b/src/Common/FieldVisitorConvertToNumber.h @@ -77,6 +77,11 @@ public: throw Exception("Cannot convert Map to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE); } + T operator() (const Object &) const + { + throw Exception("Cannot convert Object to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE); + } + T operator() (const UInt64 & x) const { return T(x); } T operator() (const Int64 & x) const { return T(x); } T operator() (const Int128 & x) const { return T(x); } diff --git a/src/Common/FieldVisitorDump.cpp b/src/Common/FieldVisitorDump.cpp index a0eccd65f8..53e9fab0d0 100644 --- a/src/Common/FieldVisitorDump.cpp +++ b/src/Common/FieldVisitorDump.cpp @@ -138,4 +138,21 @@ String FieldVisitorDump::operator() (const BitMap64 & x) const return wb.str(); } +String FieldVisitorDump::operator() (const Object & x) const +{ + WriteBufferFromOwnString wb; + + wb << "Object_("; + for (auto it 
= x.begin(); it != x.end(); ++it) + { + if (it != x.begin()) + wb << ", "; + wb << "(" << it->first << ", " << applyVisitor(*this, it->second) << ")"; + } + wb << ')'; + + return wb.str(); + +} + } diff --git a/src/Common/FieldVisitorDump.h b/src/Common/FieldVisitorDump.h index 50c5857706..a105fbccf5 100644 --- a/src/Common/FieldVisitorDump.h +++ b/src/Common/FieldVisitorDump.h @@ -53,6 +53,7 @@ public: String operator() (const DecimalField & x) const; String operator() (const AggregateFunctionStateData & x) const; String operator() (const BitMap64 & x) const; + String operator() (const Object & x) const; }; } diff --git a/src/Common/FieldVisitorHash.cpp b/src/Common/FieldVisitorHash.cpp index ac0ea23820..4ad68a8e03 100644 --- a/src/Common/FieldVisitorHash.cpp +++ b/src/Common/FieldVisitorHash.cpp @@ -238,4 +238,17 @@ void FieldVisitorHash::operator() (const BitMap64 & x) const applyVisitor(*this, Field(*it)); } +void FieldVisitorHash::operator() (const Object & x) const +{ + UInt8 type = Field::Types::Object; + hash.update(type); + hash.update(x.size()); + + for (const auto & [key, value]: x) + { + hash.update(key); + applyVisitor(*this, value); + } +} + } diff --git a/src/Common/FieldVisitorHash.h b/src/Common/FieldVisitorHash.h index ba20ba897d..75c0261db9 100644 --- a/src/Common/FieldVisitorHash.h +++ b/src/Common/FieldVisitorHash.h @@ -59,6 +59,7 @@ public: void operator() (const DecimalField & x) const; void operator() (const AggregateFunctionStateData & x) const; void operator() (const BitMap64 & x) const; + void operator() (const Object & x) const; }; } diff --git a/src/Common/FieldVisitorSum.cpp b/src/Common/FieldVisitorSum.cpp index dd5f09c216..712c94bb5a 100644 --- a/src/Common/FieldVisitorSum.cpp +++ b/src/Common/FieldVisitorSum.cpp @@ -53,6 +53,7 @@ bool FieldVisitorSum::operator() (UUID &) const { throw Exception("Cannot sum UU bool FieldVisitorSum::operator() (IPv4 &) const { throw Exception("Cannot sum IPv4s", ErrorCodes::LOGICAL_ERROR); } bool FieldVisitorSum::operator() (IPv6 &) const { throw Exception("Cannot sum IPv6s", ErrorCodes::LOGICAL_ERROR); } bool FieldVisitorSum::operator() (BitMap64 &) const { throw Exception("Cannot sum BitMap64", ErrorCodes::LOGICAL_ERROR); } +bool FieldVisitorSum::operator() (Object &) const { throw Exception("Cannot sum Objects", ErrorCodes::LOGICAL_ERROR); } bool FieldVisitorSum::operator() (AggregateFunctionStateData &) const { diff --git a/src/Common/FieldVisitorSum.h b/src/Common/FieldVisitorSum.h index 6a5ecdf827..143f6de05b 100644 --- a/src/Common/FieldVisitorSum.h +++ b/src/Common/FieldVisitorSum.h @@ -53,6 +53,7 @@ public: bool operator() (IPv6 &) const; bool operator() (AggregateFunctionStateData &) const; bool operator() (BitMap64 &) const; + bool operator() (Object &) const; template bool operator() (DecimalField & x) const diff --git a/src/Common/FieldVisitorToString.cpp b/src/Common/FieldVisitorToString.cpp index c612d6206a..a224384a53 100644 --- a/src/Common/FieldVisitorToString.cpp +++ b/src/Common/FieldVisitorToString.cpp @@ -160,4 +160,23 @@ String FieldVisitorToString::operator() (const BitMap64 & x) const return wb.str(); } +String FieldVisitorToString::operator() (const Object & x) const +{ + WriteBufferFromOwnString wb; + + wb << '{'; + for (auto it = x.begin(); it != x.end(); ++it) + { + if (it != x.begin()) + wb << ", "; + + writeDoubleQuoted(it->first, wb); + wb << ": " << applyVisitor(*this, it->second); + } + wb << '}'; + + return wb.str(); + +} + } diff --git a/src/Common/FieldVisitorToString.h 
b/src/Common/FieldVisitorToString.h index e0370f6cef..5db5421092 100644 --- a/src/Common/FieldVisitorToString.h +++ b/src/Common/FieldVisitorToString.h @@ -53,6 +53,7 @@ public: String operator() (const DecimalField & x) const; String operator() (const AggregateFunctionStateData & x) const; String operator() (const BitMap64 & x) const; + String operator() (const Object & x) const; }; } diff --git a/src/Common/FieldVisitorWriteBinary.cpp b/src/Common/FieldVisitorWriteBinary.cpp index dbda379d0d..0f2cfbeb84 100644 --- a/src/Common/FieldVisitorWriteBinary.cpp +++ b/src/Common/FieldVisitorWriteBinary.cpp @@ -104,4 +104,18 @@ void FieldVisitorWriteBinary::operator() (const BitMap64 & x, WriteBuffer & buf) writeString(buffer.data(), bytes, buf); } +void FieldVisitorWriteBinary::operator() (const Object & x, WriteBuffer & buf) const +{ + const size_t size = x.size(); + writeBinary(size, buf); + + for (const auto & [key, value] : x) + { + const UInt8 type = value.getType(); + writeBinary(type, buf); + writeBinary(key, buf); + Field::dispatch([&buf] (const auto & val) { FieldVisitorWriteBinary()(val, buf); }, value); + } +} + } diff --git a/src/Common/FieldVisitorWriteBinary.h b/src/Common/FieldVisitorWriteBinary.h index 2da61740c0..24edfd1bd7 100644 --- a/src/Common/FieldVisitorWriteBinary.h +++ b/src/Common/FieldVisitorWriteBinary.h @@ -52,6 +52,7 @@ public: void operator() (const DecimalField & x, WriteBuffer & buf) const; void operator() (const AggregateFunctionStateData & x, WriteBuffer & buf) const; void operator() (const BitMap64 & x, WriteBuffer & buf) const; + void operator() (const Object & x, WriteBuffer & buf) const; }; } diff --git a/src/Common/JSONParsers/DummyJSONParser.h b/src/Common/JSONParsers/DummyJSONParser.h index 6266ed48f6..01fdab1abb 100644 --- a/src/Common/JSONParsers/DummyJSONParser.h +++ b/src/Common/JSONParsers/DummyJSONParser.h @@ -2,9 +2,6 @@ #include #include -#include -#include "ElementTypes.h" - namespace DB { @@ -25,26 +22,25 @@ struct DummyJSONParser class Element { public: - Element() = default; - static ElementType type() { return ElementType::NULL_VALUE; } - static bool isInt64() { return false; } - static bool isUInt64() { return false; } - static bool isDouble() { return false; } - static bool isString() { return false; } - static bool isArray() { return false; } - static bool isObject() { return false; } - static bool isBool() { return false; } - static bool isNull() { return false; } + Element() {} + bool isInt64() const { return false; } + bool isUInt64() const { return false; } + bool isDouble() const { return false; } + bool isString() const { return false; } + bool isArray() const { return false; } + bool isObject() const { return false; } + bool isBool() const { return false; } + bool isNull() const { return false; } - static Int64 getInt64() { return 0; } - static UInt64 getUInt64() { return 0; } - static double getDouble() { return 0; } - static bool getBool() { return false; } - static std::string_view getString() { return {}; } - static Array getArray() { return {}; } - static Object getObject() { return {}; } + Int64 getInt64() const { return 0; } + UInt64 getUInt64() const { return 0; } + double getDouble() const { return 0; } + bool getBool() const { return false; } + std::string_view getString() const { return {}; } + Array getArray() const { return {}; } + Object getObject() const { return {}; } - static Element getElement() { return {}; } + Element getElement() { return {}; } }; /// References an array in a JSON document. 
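Returning to the Field-level visitors added above: a short sketch (illustrative, not part of the patch) of how an Object field renders through FieldVisitorToString:

#include <Core/Field.h>
#include <Common/FieldVisitorToString.h>

using namespace DB;

void objectToStringSketch()
{
    Object obj;
    obj["id"] = UInt64(1);
    obj["tag"] = String("a");

    // Keys are double-quoted and values use the usual Field formatting,
    // so this yields: {"id": 1, "tag": 'a'}
    String text = applyVisitor(FieldVisitorToString(), Field(obj));
}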
@@ -56,14 +52,14 @@ struct DummyJSONParser public: Element operator*() const { return {}; } Iterator & operator++() { return *this; } - Iterator operator++(int) { return *this; } /// NOLINT + Iterator operator++(int) { return *this; } friend bool operator==(const Iterator &, const Iterator &) { return true; } friend bool operator!=(const Iterator &, const Iterator &) { return false; } }; - static Iterator begin() { return {}; } - static Iterator end() { return {}; } - static size_t size() { return 0; } + Iterator begin() const { return {}; } + Iterator end() const { return {}; } + size_t size() const { return 0; } Element operator[](size_t) const { return {}; } }; @@ -78,15 +74,15 @@ struct DummyJSONParser public: KeyValuePair operator*() const { return {}; } Iterator & operator++() { return *this; } - Iterator operator++(int) { return *this; } /// NOLINT + Iterator operator++(int) { return *this; } friend bool operator==(const Iterator &, const Iterator &) { return true; } friend bool operator!=(const Iterator &, const Iterator &) { return false; } }; - static Iterator begin() { return {}; } - static Iterator end() { return {}; } - static size_t size() { return 0; } - bool find(std::string_view, Element &) const { return false; } /// NOLINT + Iterator begin() const { return {}; } + Iterator end() const { return {}; } + size_t size() const { return 0; } + bool find(const std::string_view &, Element &) const { return false; } #if 0 /// Optional: Provides access to an object's element by index. @@ -95,7 +91,7 @@ struct DummyJSONParser }; /// Parses a JSON document, returns the reference to its root element if succeeded. - bool parse(std::string_view, Element &) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Functions JSON* are not supported"); } /// NOLINT + bool parse(const std::string_view &, Element &) { throw Exception{"Functions JSON* are not supported", ErrorCodes::NOT_IMPLEMENTED}; } #if 0 /// Optional: Allocates memory to parse JSON documents faster. 
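The parser structs above (like SimdJSONParser and RapidJSONParser, wired up by the build changes earlier) share a duck-typed interface, so the JSON* functions are written as templates over the parser type. A minimal sketch of that contract (function name illustrative):

#include <string_view>

// The Dummy parser's parse() throws NOT_IMPLEMENTED; real parsers return false on error.
template <typename JSONParser>
bool countTopLevelKeys(JSONParser & parser, std::string_view json, size_t & count)
{
    typename JSONParser::Element root;
    if (!parser.parse(json, root) || !root.isObject())
        return false;
    count = root.getObject().size();
    return true;
}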
diff --git a/src/Functions/DummyJSONParser.h b/src/Common/JSONParsers/DummyJSONParser.h~cnch-ce-merge similarity index 56% rename from src/Functions/DummyJSONParser.h rename to src/Common/JSONParsers/DummyJSONParser.h~cnch-ce-merge index 01fdab1abb..6266ed48f6 100644 --- a/src/Functions/DummyJSONParser.h +++ b/src/Common/JSONParsers/DummyJSONParser.h~cnch-ce-merge @@ -2,6 +2,9 @@ #include #include +#include +#include "ElementTypes.h" + namespace DB { @@ -22,25 +25,26 @@ struct DummyJSONParser class Element { public: - Element() {} - bool isInt64() const { return false; } - bool isUInt64() const { return false; } - bool isDouble() const { return false; } - bool isString() const { return false; } - bool isArray() const { return false; } - bool isObject() const { return false; } - bool isBool() const { return false; } - bool isNull() const { return false; } + Element() = default; + static ElementType type() { return ElementType::NULL_VALUE; } + static bool isInt64() { return false; } + static bool isUInt64() { return false; } + static bool isDouble() { return false; } + static bool isString() { return false; } + static bool isArray() { return false; } + static bool isObject() { return false; } + static bool isBool() { return false; } + static bool isNull() { return false; } - Int64 getInt64() const { return 0; } - UInt64 getUInt64() const { return 0; } - double getDouble() const { return 0; } - bool getBool() const { return false; } - std::string_view getString() const { return {}; } - Array getArray() const { return {}; } - Object getObject() const { return {}; } + static Int64 getInt64() { return 0; } + static UInt64 getUInt64() { return 0; } + static double getDouble() { return 0; } + static bool getBool() { return false; } + static std::string_view getString() { return {}; } + static Array getArray() { return {}; } + static Object getObject() { return {}; } - Element getElement() { return {}; } + static Element getElement() { return {}; } }; /// References an array in a JSON document. @@ -52,14 +56,14 @@ struct DummyJSONParser public: Element operator*() const { return {}; } Iterator & operator++() { return *this; } - Iterator operator++(int) { return *this; } + Iterator operator++(int) { return *this; } /// NOLINT friend bool operator==(const Iterator &, const Iterator &) { return true; } friend bool operator!=(const Iterator &, const Iterator &) { return false; } }; - Iterator begin() const { return {}; } - Iterator end() const { return {}; } - size_t size() const { return 0; } + static Iterator begin() { return {}; } + static Iterator end() { return {}; } + static size_t size() { return 0; } Element operator[](size_t) const { return {}; } }; @@ -74,15 +78,15 @@ struct DummyJSONParser public: KeyValuePair operator*() const { return {}; } Iterator & operator++() { return *this; } - Iterator operator++(int) { return *this; } + Iterator operator++(int) { return *this; } /// NOLINT friend bool operator==(const Iterator &, const Iterator &) { return true; } friend bool operator!=(const Iterator &, const Iterator &) { return false; } }; - Iterator begin() const { return {}; } - Iterator end() const { return {}; } - size_t size() const { return 0; } - bool find(const std::string_view &, Element &) const { return false; } + static Iterator begin() { return {}; } + static Iterator end() { return {}; } + static size_t size() { return 0; } + bool find(std::string_view, Element &) const { return false; } /// NOLINT #if 0 /// Optional: Provides access to an object's element by index. 
@@ -91,7 +95,7 @@ struct DummyJSONParser }; /// Parses a JSON document, returns the reference to its root element if succeeded. - bool parse(const std::string_view &, Element &) { throw Exception{"Functions JSON* are not supported", ErrorCodes::NOT_IMPLEMENTED}; } + bool parse(std::string_view, Element &) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Functions JSON* are not supported"); } /// NOLINT #if 0 /// Optional: Allocates memory to parse JSON documents faster. diff --git a/src/Common/config.h.in b/src/Common/config.h.in index c88a76a138..28d0829b3a 100644 --- a/src/Common/config.h.in +++ b/src/Common/config.h.in @@ -25,6 +25,7 @@ #cmakedefine01 USE_KRB5 #cmakedefine01 USE_JEMALLOC #cmakedefine01 USE_BZIP2 +#cmakedefine01 USE_RAPIDJSON #cmakedefine01 USE_JAVA_EXTENSIONS #cmakedefine01 USE_TSQUERY #cmakedefine01 USE_SIMDJSON diff --git a/src/Core/Field.cpp b/src/Core/Field.cpp index 7a832a9cfd..3068029d2f 100644 --- a/src/Core/Field.cpp +++ b/src/Core/Field.cpp @@ -150,6 +150,12 @@ inline Field getBinaryValue(UInt8 type, ReadBuffer & buf) DB::readBinary(value.toUnderType(), buf); return value; } + case Field::Types::Object: + { + Object value; + readBinary(value, buf); + return value; + } } return Field(); } @@ -274,6 +280,40 @@ void writeBinary(const BitMap64 & x, WriteBuffer & buf) writeString(tmp_buf.data(), bytes, buf); } +void readBinary(Object & x, ReadBuffer & buf) +{ + size_t size; + readBinary(size, buf); + + for (size_t index = 0; index < size; ++index) + { + UInt8 type; + String key; + readBinary(type, buf); + readBinary(key, buf); + x[key] = getBinaryValue(type, buf); + } +} + +void writeBinary(const Object & x, WriteBuffer & buf) +{ + const size_t size = x.size(); + writeBinary(size, buf); + + for (const auto & [key, value] : x) + { + const UInt8 type = value.getType(); + writeBinary(type, buf); + writeBinary(key, buf); + Field::dispatch([&buf] (const auto & val) { FieldVisitorWriteBinary()(val, buf); }, value); + } +} + +void writeText(const Object & x, WriteBuffer & buf) +{ + writeFieldText(Field(x), buf); +} + template <typename T> void readQuoted(DecimalField<T> & x, ReadBuffer & buf) { diff --git a/src/Core/Field.h b/src/Core/Field.h index c5a7d672c4..41e125a37a 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -84,6 +85,18 @@ DEFINE_FIELD_VECTOR(Tuple); #undef DEFINE_FIELD_VECTOR +using FieldMap = std::map<String, Field, std::less<String>, AllocatorWithMemoryTracking<std::pair<const String, Field>>>; + +#define DEFINE_FIELD_MAP(X) \ +struct X : public FieldMap \ +{ \ + using FieldMap::FieldMap; \ +} + +DEFINE_FIELD_MAP(Object); + +#undef DEFINE_FIELD_MAP + struct AggregateFunctionStateData { String name; /// Name with arguments.
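With the readBinary/writeBinary(Object) overloads from the Field.cpp hunk above, an Object field round-trips through the tag-prefixed binary format (size, then a (type, key, value) triple per entry). A minimal sketch, not part of the patch:

#include <Core/Field.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>

using namespace DB;

void objectBinaryRoundTrip()
{
    Object in;
    in["k"] = UInt64(7);

    WriteBufferFromOwnString out;
    writeBinary(in, out);

    ReadBufferFromString rb(out.str());
    Object restored;
    readBinary(restored, rb);   // restored now equals in
}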
@@ -246,6 +259,7 @@ template <> struct NearestFieldTypeImpl { using Type = Tuple; }; template <> struct NearestFieldTypeImpl { using Type = Map; }; template <> struct NearestFieldTypeImpl { using Type = UInt64; }; template <> struct NearestFieldTypeImpl { using Type = Null; }; +template <> struct NearestFieldTypeImpl { using Type = Object; }; template <> struct NearestFieldTypeImpl { using Type = IPv4; }; template <> struct NearestFieldTypeImpl { using Type = IPv6; }; template <> struct NearestFieldTypeImpl { using Type = NegativeInfinity; }; @@ -306,6 +320,7 @@ public: (SketchBinary, 29), (IPv4, 30), (IPv6, 31), + (Object, 32), // Special types for index analysis (NegativeInfinity, 254), (PositiveInfinity, 255)); @@ -519,6 +534,7 @@ public: case Types::Array: return get() < rhs.get(); case Types::Tuple: return get() < rhs.get(); case Types::Map: return get() < rhs.get(); + case Types::Object: return get() < rhs.get(); case Types::Decimal32: return get>() < rhs.get>(); case Types::Decimal64: return get>() < rhs.get>(); case Types::Decimal128: return get>() < rhs.get>(); @@ -563,6 +579,7 @@ public: case Types::Array: return get() <= rhs.get(); case Types::Tuple: return get() <= rhs.get(); case Types::Map: return get() <= rhs.get(); + case Types::Object: return get() <= rhs.get(); case Types::Decimal32: return get>() <= rhs.get>(); case Types::Decimal64: return get>() <= rhs.get>(); case Types::Decimal128: return get>() <= rhs.get>(); @@ -617,6 +634,7 @@ public: case Types::Decimal256: return get>() == rhs.get>(); case Types::AggregateFunctionState: return get() == rhs.get(); case Types::BitMap64: return get() == rhs.get(); + case Types::Object: return get() == rhs.get(); } throw Exception("Bad type of Field", ErrorCodes::BAD_TYPE_OF_FIELD); @@ -663,6 +681,7 @@ public: case Types::Decimal256: return f(field.template get>()); case Types::AggregateFunctionState: return f(field.template get()); case Types::BitMap64: return f(field.template get()); + case Types::Object: return f(field.template get()); #if !defined(__clang__) #pragma GCC diagnostic pop #endif @@ -726,6 +745,8 @@ public: return f.template operator()(); case Types::BitMap64: return f.template operator()(); + case Types::Object: + return f.template operator()(); case Types::IPv4: return f.template operator()(); case Types::IPv6: @@ -885,6 +906,9 @@ private: case Types::BitMap64: destroy(); break; + case Types::Object: + destroy(); + break; default: break; } @@ -906,30 +930,31 @@ private: using Row = std::vector; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Null; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::NegativeInfinity; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::PositiveInfinity; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt128; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt256; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int128; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int256; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UUID; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Null; }; +template 
<> struct Field::TypeToEnum { static constexpr Types::Which value = Types::NegativeInfinity; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::PositiveInfinity; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UInt64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UInt128; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UInt256; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Int64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Int128; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Int256; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UUID; }; template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::IPv4; }; template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::IPv6; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Float64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::String; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Array; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Tuple; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Map; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal32; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal64; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal128; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal256; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal64; }; -template <> struct Field::TypeToEnum{ static const Types::Which value = Types::AggregateFunctionState; }; -template <> struct Field::TypeToEnum{ static const Types::Which value = Types::BitMap64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Float64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::String; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Array; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Tuple; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Map; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal32; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal64; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal128; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal256; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal64; }; +template <> struct Field::TypeToEnum{ static constexpr Types::Which value = Types::AggregateFunctionState; }; +template <> struct Field::TypeToEnum{ static constexpr Types::Which value = Types::BitMap64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Object; }; template <> struct Field::EnumToType { using Type = Null; }; template <> struct 
Field::EnumToType { using Type = NegativeInfinity; }; @@ -948,6 +973,7 @@ template <> struct Field::EnumToType { using Type = Strin template <> struct Field::EnumToType { using Type = Array; }; template <> struct Field::EnumToType { using Type = Tuple; }; template <> struct Field::EnumToType { using Type = Map; }; +template <> struct Field::EnumToType { using Type = Object; }; template <> struct Field::EnumToType { using Type = DecimalField; }; template <> struct Field::EnumToType { using Type = DecimalField; }; template <> struct Field::EnumToType { using Type = DecimalField; }; @@ -1099,34 +1125,39 @@ class WriteBuffer; /// It is assumed that all elements of the array have the same type. void readBinary(Array & x, ReadBuffer & buf); - [[noreturn]] inline void readText(Array &, ReadBuffer &) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } [[noreturn]] inline void readQuoted(Array &, ReadBuffer &) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } /// It is assumed that all elements of the array have the same type. /// Also write size and type into buf. UInt64 and Int64 is written in variadic size form void writeBinary(const Array & x, WriteBuffer & buf); - void writeText(const Array & x, WriteBuffer & buf); - [[noreturn]] inline void writeQuoted(const Array &, WriteBuffer &) { throw Exception("Cannot write Array quoted.", ErrorCodes::NOT_IMPLEMENTED); } void readBinary(Tuple & x, ReadBuffer & buf); - [[noreturn]] inline void readText(Tuple &, ReadBuffer &) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); } [[noreturn]] inline void readQuoted(Tuple &, ReadBuffer &) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); } void writeBinary(const Tuple & x, WriteBuffer & buf); - void writeText(const Tuple & x, WriteBuffer & buf); +[[noreturn]] inline void writeQuoted(const Tuple &, WriteBuffer &) { throw Exception("Cannot write Tuple quoted.", ErrorCodes::NOT_IMPLEMENTED); } void readBinary(Map & x, ReadBuffer & buf); [[noreturn]] inline void readText(Map &, ReadBuffer &) { throw Exception("Cannot read Map.", ErrorCodes::NOT_IMPLEMENTED); } [[noreturn]] inline void readQuoted(Map &, ReadBuffer &) { throw Exception("Cannot read Map.", ErrorCodes::NOT_IMPLEMENTED); } + void writeBinary(const Map & x, WriteBuffer & buf); void writeText(const Map & x, WriteBuffer & buf); [[noreturn]] inline void writeQuoted(const Map &, WriteBuffer &) { throw Exception("Cannot write Map quoted.", ErrorCodes::NOT_IMPLEMENTED); } +void readBinary(Object & x, ReadBuffer & buf); +[[noreturn]] inline void readText(Object &, ReadBuffer &) { throw Exception("Cannot read Object.", ErrorCodes::NOT_IMPLEMENTED); } +[[noreturn]] inline void readQuoted(Object &, ReadBuffer &) { throw Exception("Cannot read Object.", ErrorCodes::NOT_IMPLEMENTED); } + +void writeBinary(const Object & x, WriteBuffer & buf); +void writeText(const Object & x, WriteBuffer & buf); +[[noreturn]] inline void writeQuoted(const Object &, WriteBuffer &) { throw Exception("Cannot write Object quoted.", ErrorCodes::NOT_IMPLEMENTED); } + __attribute__ ((noreturn)) inline void writeText(const AggregateFunctionStateData &, WriteBuffer &) { // This probably doesn't make any sense, but we have to have it for @@ -1156,8 +1187,6 @@ void readFieldBinary(Field & field, ReadBuffer & buf); void writeFieldBinary(const Field & field, WriteBuffer & buf); -[[noreturn]] inline void writeQuoted(const Tuple &, WriteBuffer &) { throw Exception("Cannot write Tuple quoted.", 
ErrorCodes::NOT_IMPLEMENTED); } - String toString(const Field & x); } diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7093bd4905..6607bcb258 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -340,7 +340,6 @@ enum PreloadLevelSettings : UInt64 M(Float, totals_auto_threshold, 0.5, "The threshold for totals_mode = 'auto'.", 0) \ M(Bool, allow_suspicious_low_cardinality_types, true, "In CREATE TABLE statement allows specifying LowCardinality modifier for types of small fixed size (8 or less). Enabling this may increase merge times and memory consumption.", 0) \ M(Bool, allow_suspicious_fixed_string_types, false, "In CREATE TABLE statement allows creating columns of type FixedString(n) with n > 256. FixedString with length >= 256 is suspicious and most likely indicates misusage", 0) \ - M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, compile_expressions, true, "Compile some scalar functions and operators to native code.", 0) \ M(UInt64, min_count_to_compile_expression, 3, "The number of identical expressions before they are JIT-compiled", 0) \ M(Bool, compile_aggregate_expressions, true, "Compile aggregate functions to native code.", 0) \ @@ -1253,6 +1252,9 @@ enum PreloadLevelSettings : UInt64 M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \ M(Bool, optimize_skip_merged_partitions, false, "Skip partitions with one part with level > 0 in optimize final", 0) \ M(Bool, optimize_on_insert, true, "Do the same transformation for inserted block of data as if merge was done on this block.", 0) \ + M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ + M(Bool, describe_extend_object_types, false, "Deduce concrete type of columns of type Object in DESCRIBE query", 0) \ + M(Bool, describe_include_subcolumns, false, "If true, subcolumns of all table columns will be included into result of DESCRIBE query", 0) \ M(Bool, allow_experimental_map_type, true, "Obsolete setting, does nothing.", 0) \ M(Bool, allow_experimental_window_functions, true, "Allow experimental window functions", 0) \ M(Bool, allow_experimental_projection_optimization, false, "Enable projection optimization when processing SELECT queries", 0) \ diff --git a/src/Core/TypeId.h b/src/Core/TypeId.h index d2ae56b428..13fcf3f814 100644 --- a/src/Core/TypeId.h +++ b/src/Core/TypeId.h @@ -47,6 +47,7 @@ TYPEID_MAP(Int256) TYPEID_MAP(Float32) TYPEID_MAP(Float64) TYPEID_MAP(UUID) + TYPEID_MAP(IPv4) TYPEID_MAP(IPv6) diff --git a/src/Core/Types.h b/src/Core/Types.h index 5fc360137e..abf275fa5f 100644 --- a/src/Core/Types.h +++ b/src/Core/Types.h @@ -88,6 +88,7 @@ enum class TypeIndex BitMap64, Time, SketchBinary, + Object, IPv4, IPv6, }; @@ -340,6 +341,7 @@ inline constexpr const char * getTypeName(TypeIndex idx) case TypeIndex::Map: return "Map"; case TypeIndex::BitMap64: return "BitMap64"; case TypeIndex::SketchBinary: return "SketchBinary"; + case TypeIndex::Object: return "Object"; } __builtin_unreachable(); diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in index 85a80b4bcc..8d5d963a16 100644 --- a/src/Core/config_core.h.in +++ b/src/Core/config_core.h.in @@ -16,4 +16,6 @@ #cmakedefine01 USE_NURAFT #cmakedefine01 USE_KRB5 #cmakedefine01 USE_JEMALLOC +#cmakedefine01 USE_SIMDJSON +#cmakedefine01 USE_RAPIDJSON #cmakedefine01 USE_NLP diff --git a/src/DaemonManager/BGJobStatusInCatalog.cpp b/src/DaemonManager/BGJobStatusInCatalog.cpp index 46b006c689..f0a5b0fcb7 100644 --- 
a/src/DaemonManager/BGJobStatusInCatalog.cpp +++ b/src/DaemonManager/BGJobStatusInCatalog.cpp @@ -126,6 +126,7 @@ IBGJobStatusPersistentStoreProxy::CacheClearer CatalogBGJobStatusPersistentStore if (catalog) // catalog is nullptr in unittest statuses_cache = catalog->getBGJobStatuses(type); + is_cache_prefetched = true; return CacheClearer{this}; } diff --git a/src/DaemonManager/DaemonJob.cpp b/src/DaemonManager/DaemonJob.cpp index 81fde8777e..f8ac58b10b 100644 --- a/src/DaemonManager/DaemonJob.cpp +++ b/src/DaemonManager/DaemonJob.cpp @@ -53,6 +53,8 @@ bvar::Adder & getExecuteMetric(CnchBGThreadType type) return g_executeImpl_TxnGC; case CnchBGThreadType::Clustering: return g_executeImpl_Clustering; + case CnchBGThreadType::ObjectSchemaAssemble: + return g_executeImpl_ObjectSchemaAssemble; case CnchBGThreadType::MaterializedMySQL: return g_executeImpl_MaterializedMySQL; default: @@ -81,6 +83,8 @@ bvar::Adder & getExecuteErrorMetric(CnchBGThreadType type) return g_executeImpl_TxnGC_error; case CnchBGThreadType::Clustering: return g_executeImpl_Clustering_error; + case CnchBGThreadType::ObjectSchemaAssemble: + return g_executeImpl_ObjectSchemaAssemble_error; case CnchBGThreadType::MaterializedMySQL: return g_executeImpl_MaterializedMySQL_error; default: diff --git a/src/DaemonManager/DaemonJobServerBGThread.cpp b/src/DaemonManager/DaemonJobServerBGThread.cpp index 3b73eb6674..a2c17254bb 100644 --- a/src/DaemonManager/DaemonJobServerBGThread.cpp +++ b/src/DaemonManager/DaemonJobServerBGThread.cpp @@ -1138,6 +1138,7 @@ void registerServerBGThreads(DaemonFactory & factory) factory.registerDaemonJobForBGThreadInServer>("PART_CLUSTERING"); factory.registerDaemonJobForBGThreadInServer>("CONSUMER"); factory.registerDaemonJobForBGThreadInServer>("DEDUP_WORKER"); + factory.registerDaemonJobForBGThreadInServer>("OBJECT_SCHEMA_ASSEMBLE"); factory.registerDaemonJobForBGThreadInServer>("MATERIALIZED_MYSQL"); } diff --git a/src/DaemonManager/DaemonManager.cpp b/src/DaemonManager/DaemonManager.cpp index e1bbeee5eb..bf6e63f23d 100644 --- a/src/DaemonManager/DaemonManager.cpp +++ b/src/DaemonManager/DaemonManager.cpp @@ -179,6 +179,7 @@ std::unordered_map createDaemonJob { "CONSUMER", 10000}, { "DEDUP_WORKER", 10000}, { "PART_CLUSTERING", 10000}, + { "OBJECT_SCHEMA_ASSEMBLE", 10000}, { "MATERIALIZED_MYSQL", 10000} }; diff --git a/src/DaemonManager/Metrics.cpp b/src/DaemonManager/Metrics.cpp index f8877c8904..a2f537c457 100644 --- a/src/DaemonManager/Metrics.cpp +++ b/src/DaemonManager/Metrics.cpp @@ -63,6 +63,11 @@ namespace DB::DaemonManager::BRPCMetrics bvar::Window> g_executeImpl_Clustering_error_minute("DaemonManager_Internal", "executeImpl_Clustering_error", & g_executeImpl_Clustering_error, 60); bvar::Window> g_executeImpl_Clustering_minute("DaemonManager_Internal", "executeImpl_Clustering", & g_executeImpl_Clustering, 60); + bvar::Adder< int > g_executeImpl_ObjectSchemaAssemble_error; + bvar::Adder< int > g_executeImpl_ObjectSchemaAssemble; + bvar::Window> g_executeImpl_ObjectSchemaAssemble_error_minute("DaemonManager_Internal", "executeImpl_ObjectSchemaAssemble_error", & g_executeImpl_ObjectSchemaAssemble_error, 60); + bvar::Window> g_executeImpl_ObjectSchemaAssemble_minute("DaemonManager_Internal", "executeImpl_ObjectSchemaAssemble", & g_executeImpl_ObjectSchemaAssemble, 60); + bvar::Adder< int > g_executeImpl_MaterializedMySQL_error; bvar::Adder< int > g_executeImpl_MaterializedMySQL; bvar::Window> g_executeImpl_MaterializedMySQL_error_minute("DaemonManager_Internal", 
"executeImpl_MaterializedMySQL_error", & g_executeImpl_MaterializedMySQL_error, 60); diff --git a/src/DaemonManager/Metrics.h b/src/DaemonManager/Metrics.h index cb87d26e79..0205e147ac 100644 --- a/src/DaemonManager/Metrics.h +++ b/src/DaemonManager/Metrics.h @@ -37,6 +37,8 @@ namespace DB::DaemonManager::BRPCMetrics extern bvar::Adder< int > g_executeImpl_TxnGC; extern bvar::Adder< int > g_executeImpl_Clustering_error; extern bvar::Adder< int > g_executeImpl_Clustering; + extern bvar::Adder< int > g_executeImpl_ObjectSchemaAssemble_error; + extern bvar::Adder< int > g_executeImpl_ObjectSchemaAssemble; extern bvar::Adder< int > g_executeImpl_MaterializedMySQL_error; extern bvar::Adder< int > g_executeImpl_MaterializedMySQL; }/// end namespace diff --git a/src/DataStreams/NativeBlockOutputStream.cpp b/src/DataStreams/NativeBlockOutputStream.cpp index a2ae8dccd9..eeb47b94d4 100644 --- a/src/DataStreams/NativeBlockOutputStream.cpp +++ b/src/DataStreams/NativeBlockOutputStream.cpp @@ -85,7 +85,7 @@ static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuf auto full_type = lc_type->getFullLowCardinalityTypePtr(); auto serialization = full_type->getDefaultSerialization(); ISerialization::SerializeBinaryBulkStatePtr state; - serialization->serializeBinaryBulkStatePrefix(settings, state); + serialization->serializeBinaryBulkStatePrefix(*full_column, settings, state); serialization->serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state); serialization->serializeBinaryBulkStateSuffix(settings, state); return ; @@ -95,7 +95,7 @@ static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuf auto serialization = type.getDefaultSerialization(); ISerialization::SerializeBinaryBulkStatePtr state; - serialization->serializeBinaryBulkStatePrefix(settings, state); + serialization->serializeBinaryBulkStatePrefix(*full_column, settings, state); serialization->serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state); serialization->serializeBinaryBulkStateSuffix(settings, state); } diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index 7ddea38c42..cea4d50090 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -524,12 +524,13 @@ void RemoteQueryExecutor::sendExternalTables() { SelectQueryInfo query_info; auto metadata_snapshot = cur->getInMemoryMetadataPtr(); + auto storage_snapshot = cur->getStorageSnapshot(metadata_snapshot, context); QueryProcessingStage::Enum read_from_table_stage = cur->getQueryProcessingStage( - context, QueryProcessingStage::Complete, metadata_snapshot, query_info); + context, QueryProcessingStage::Complete, storage_snapshot, query_info); Pipe pipe = cur->read( metadata_snapshot->getColumns().getNamesOfPhysical(), - metadata_snapshot, query_info, context, + storage_snapshot, query_info, context, read_from_table_stage, DEFAULT_BLOCK_SIZE, 1); if (pipe.empty()) diff --git a/src/DataStreams/RemoteQueryExecutor.h b/src/DataStreams/RemoteQueryExecutor.h index 5ca4321d1c..2407f21a5d 100644 --- a/src/DataStreams/RemoteQueryExecutor.h +++ b/src/DataStreams/RemoteQueryExecutor.h @@ -87,7 +87,7 @@ public: /// Query is resent to a replica, the query itself can be modified. std::atomic resent_query { false }; - + /// Read next block of data. Returns empty block if query is finished. 
Block read(); diff --git a/src/DataTypes/CMakeLists.txt b/src/DataTypes/CMakeLists.txt index a6176efc7f..dd29c3146a 100644 --- a/src/DataTypes/CMakeLists.txt +++ b/src/DataTypes/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory (Serializations) + if (ENABLE_EXAMPLES) add_subdirectory(examples) endif () diff --git a/src/DataTypes/DataTypeArray.cpp b/src/DataTypes/DataTypeArray.cpp index bcf3a9c1f5..bd3876435a 100644 --- a/src/DataTypes/DataTypeArray.cpp +++ b/src/DataTypes/DataTypeArray.cpp @@ -95,26 +95,26 @@ ColumnPtr DataTypeArray::getSubcolumnImpl(const String & subcolumn_name, const I return ColumnArray::create(subcolumn, column_array.getOffsetsPtr()); } -SerializationPtr DataTypeArray::getSubcolumnSerialization( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const -{ - return getSubcolumnSerializationImpl(subcolumn_name, base_serialization_getter, 0); -} +// SerializationPtr DataTypeArray::getSubcolumnSerialization( +// const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const +// { +// return getSubcolumnSerializationImpl(subcolumn_name, base_serialization_getter, 0); +// } -SerializationPtr DataTypeArray::getSubcolumnSerializationImpl( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter, size_t level) const -{ - if (subcolumn_name == "size" + std::to_string(level)) - return std::make_shared(base_serialization_getter(DataTypeUInt64()), subcolumn_name, false); +// SerializationPtr DataTypeArray::getSubcolumnSerializationImpl( +// const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter, size_t level) const +// { +// if (subcolumn_name == "size" + std::to_string(level)) +// return std::make_shared(base_serialization_getter(DataTypeUInt64()), subcolumn_name, false); - SerializationPtr subcolumn; - if (const auto * nested_array = typeid_cast(nested.get())) - subcolumn = nested_array->getSubcolumnSerializationImpl(subcolumn_name, base_serialization_getter, level + 1); - else - subcolumn = nested->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); +// SerializationPtr subcolumn; +// if (const auto * nested_array = typeid_cast(nested.get())) +// subcolumn = nested_array->getSubcolumnSerializationImpl(subcolumn_name, base_serialization_getter, level + 1); +// else +// subcolumn = nested->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); - return std::make_shared(subcolumn); -} +// return std::make_shared(subcolumn); +// } SerializationPtr DataTypeArray::doGetDefaultSerialization() const { diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index a94cf785c8..a642f238df 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -78,8 +78,8 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; - SerializationPtr getSubcolumnSerialization( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; + // SerializationPtr getSubcolumnSerialization( + // const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; SerializationPtr doGetDefaultSerialization() const override; diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index 9dd6df66e3..88533ca238 100644 --- a/src/DataTypes/DataTypeEnum.h +++ 
b/src/DataTypes/DataTypeEnum.h @@ -27,6 +27,8 @@ public: bool isCategorial() const override { return true; } bool canBeInsideNullable() const override { return true; } bool isComparable() const override { return true; } + + virtual bool contains(const IDataType & rhs) const = 0; bool canBeMapKeyType() const override { return true; } }; @@ -78,7 +80,7 @@ public: /// Example: /// Enum('a' = 1, 'b' = 2) -> Enum('c' = 1, 'b' = 2, 'd' = 3) OK /// Enum('a' = 1, 'b' = 2) -> Enum('a' = 2, 'b' = 1) NOT OK - bool contains(const IDataType & rhs) const; + bool contains(const IDataType & rhs) const override; SerializationPtr doGetDefaultSerialization() const override; diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index 0e55401608..419f9dd788 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -288,6 +288,7 @@ DataTypeFactory::DataTypeFactory() registerDataTypeBitMap64(*this); registerDataTypeSketchBinary(*this); registerDataTypeDomainBool(*this); + registerDataTypeObject(*this); } DataTypeFactory & DataTypeFactory::instance() diff --git a/src/DataTypes/DataTypeFactory.h b/src/DataTypes/DataTypeFactory.h index ea7c29bb18..7eb39aab68 100644 --- a/src/DataTypes/DataTypeFactory.h +++ b/src/DataTypes/DataTypeFactory.h @@ -113,5 +113,6 @@ void registerDataTypeBitMap64(DataTypeFactory & factory); void registerDataTypeSet(DataTypeFactory & factory); void registerDataTypeSketchBinary(DataTypeFactory & factory); void registerDataTypeDomainBool(DataTypeFactory & factory); +void registerDataTypeObject(DataTypeFactory & factory); } diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index 7e1931eae2..d45ea51b66 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ b/src/DataTypes/DataTypeMap.cpp @@ -32,6 +32,27 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +DataTypeMap::DataTypeMap(const DataTypePtr & nested_) + : nested(nested_) +{ + const auto * type_array = typeid_cast(nested.get()); + if (!type_array) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expected Array(Tuple(key, value)) type, got {}", nested->getName()); + + const auto * type_tuple = typeid_cast(type_array->getNestedType().get()); + if (!type_tuple) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expected Array(Tuple(key, value)) type, got {}", nested->getName()); + + if (type_tuple->getElements().size() != 2) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expected Array(Tuple(key, value)) type, got {}", nested->getName()); + + key_type = type_tuple->getElement(0); + value_type = type_tuple->getElement(1); + checkKeyType(); +} DataTypeMap::DataTypeMap(const DataTypes & elems_) { @@ -93,11 +114,11 @@ ColumnPtr DataTypeMap::getSubcolumn(const String & subcolumn_name, const IColumn return nested->getSubcolumn(subcolumn_name, extractNestedColumn(column)); } -SerializationPtr DataTypeMap::getSubcolumnSerialization( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const -{ - return nested->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); -} +// SerializationPtr DataTypeMap::getSubcolumnSerialization( +// const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const +// { +// return nested->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); +// } MutableColumnPtr DataTypeMap::createColumn() const { diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index b738043cc4..bbb3ced938 100644 --- a/src/DataTypes/DataTypeMap.h 
+++ b/src/DataTypes/DataTypeMap.h @@ -31,6 +31,7 @@ private: public: static constexpr bool is_parametric = true; + explicit DataTypeMap(const DataTypePtr & nested_); DataTypeMap(const DataTypes & elems); DataTypeMap(const DataTypePtr & key_type_, const DataTypePtr & value_type_); @@ -42,8 +43,8 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; - SerializationPtr getSubcolumnSerialization( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; + // SerializationPtr getSubcolumnSerialization( + // const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; MutableColumnPtr createColumn() const override; @@ -73,7 +74,6 @@ public: private: void checkKeyType() const; - bool isMapKVStore() const { return flags & TYPE_MAP_KV_STORE_FLAG;} bool isMapByteStore() const { return flags & TYPE_MAP_BYTE_STORE_FLAG; } }; diff --git a/src/DataTypes/DataTypeNullable.cpp b/src/DataTypes/DataTypeNullable.cpp index 7568d285d8..3785451191 100644 --- a/src/DataTypes/DataTypeNullable.cpp +++ b/src/DataTypes/DataTypeNullable.cpp @@ -81,14 +81,14 @@ ColumnPtr DataTypeNullable::getSubcolumn(const String & subcolumn_name, const IC return nested_data_type->getSubcolumn(subcolumn_name, column_nullable.getNestedColumn()); } -SerializationPtr DataTypeNullable::getSubcolumnSerialization( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const -{ - if (subcolumn_name == "null") - return std::make_shared(base_serialization_getter(DataTypeUInt8()), subcolumn_name, false); +// SerializationPtr DataTypeNullable::getSubcolumnSerialization( +// const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const +// { +// if (subcolumn_name == "null") +// return std::make_shared(base_serialization_getter(DataTypeUInt8()), subcolumn_name, false); - return nested_data_type->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); -} +// return nested_data_type->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); +// } SerializationPtr DataTypeNullable::doGetDefaultSerialization() const { diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index 9220019881..b74f7d313b 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -68,8 +68,8 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; - SerializationPtr getSubcolumnSerialization( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; + // SerializationPtr getSubcolumnSerialization( + // const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; const DataTypePtr & getNestedType() const { return nested_data_type; } diff --git a/src/DataTypes/DataTypeObject.cpp b/src/DataTypes/DataTypeObject.cpp new file mode 100644 index 0000000000..720436d0e0 --- /dev/null +++ b/src/DataTypes/DataTypeObject.cpp @@ -0,0 +1,82 @@ +#include +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int UNEXPECTED_AST_STRUCTURE; +} + +DataTypeObject::DataTypeObject(const 
String & schema_format_, bool is_nullable_) + : schema_format(Poco::toLower(schema_format_)) + , is_nullable(is_nullable_) +{ +} + +bool DataTypeObject::equals(const IDataType & rhs) const +{ + if (const auto * object = typeid_cast<const DataTypeObject *>(&rhs)) + return schema_format == object->schema_format && is_nullable == object->is_nullable; + return false; +} + +SerializationPtr DataTypeObject::doGetDefaultSerialization() const +{ + return getObjectSerialization(schema_format); +} + +String DataTypeObject::doGetName() const +{ + WriteBufferFromOwnString out; + if (is_nullable) + out << "Object(Nullable(" << quote << schema_format << "))"; + else + out << "Object(" << quote << schema_format << ")"; + return out.str(); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Object data type family must have one argument - name of schema format"); + + ASTPtr schema_argument = arguments->children[0]; + bool is_nullable = false; + + if (const auto * func = schema_argument->as<ASTFunction>()) + { + if (func->name != "Nullable" || func->arguments->children.size() != 1) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Expected 'Nullable(<schema_name>)' as parameter for type Object (function: {})", func->name); + + schema_argument = func->arguments->children[0]; + is_nullable = true; + } + + const auto * literal = schema_argument->as<ASTLiteral>(); + if (!literal || literal->value.getType() != Field::Types::String) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Object data type family must have a const string as its schema name parameter"); + + return std::make_shared<DataTypeObject>(literal->value.get<const String &>(), is_nullable); +} + +void registerDataTypeObject(DataTypeFactory & factory) +{ + factory.registerDataType("Object", create); + factory.registerSimpleDataType("JSON", + [] { return std::make_shared<DataTypeObject>("JSON", false); }, + DataTypeFactory::CaseInsensitive); +} + +} diff --git a/src/DataTypes/DataTypeObject.h b/src/DataTypes/DataTypeObject.h new file mode 100644 index 0000000000..937a909137 --- /dev/null +++ b/src/DataTypes/DataTypeObject.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class DataTypeObject : public IDataType +{ +private: + String schema_format; + bool is_nullable; + +public: + DataTypeObject(const String & schema_format_, bool is_nullable_); + + const char * getFamilyName() const override { return "Object"; } + String doGetName() const override; + TypeIndex getTypeId() const override { return TypeIndex::Object; } + + MutableColumnPtr createColumn() const override { return ColumnObject::create(is_nullable); } + + Field getDefault() const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDefault() is not implemented for data type {}", getName()); + } + + bool haveSubtypes() const override { return false; } + bool equals(const IDataType & rhs) const override; + bool isParametric() const override { return true; } + bool hasDynamicSubcolumns() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override; + + bool hasNullableSubcolumns() const { return is_nullable; } + + const String & getSchemaFormat() const { return schema_format; } +}; + +} diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index 682e6afcb9..16be0a281f 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -291,25 +291,25 @@
ColumnPtr DataTypeTuple::getSubcolumn(const String & subcolumn_name, const IColu throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); } -SerializationPtr DataTypeTuple::getSubcolumnSerialization( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const -{ - auto on_success = [&](size_t pos) - { - return std::make_shared(base_serialization_getter(*elems[pos]), names[pos]); - }; +// SerializationPtr DataTypeTuple::getSubcolumnSerialization( +// const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const +// { +// auto on_success = [&](size_t pos) +// { +// return std::make_shared(base_serialization_getter(*elems[pos]), names[pos]); +// }; - auto on_continue = [&](size_t pos, const String & next_subcolumn) - { - auto next_serialization = elems[pos]->getSubcolumnSerialization(next_subcolumn, base_serialization_getter); - return std::make_shared(next_serialization, names[pos]); - }; +// auto on_continue = [&](size_t pos, const String & next_subcolumn) +// { +// auto next_serialization = elems[pos]->getSubcolumnSerialization(next_subcolumn, base_serialization_getter); +// return std::make_shared(next_serialization, names[pos]); +// }; - if (auto serialization = getSubcolumnEntity(subcolumn_name, on_success, on_continue)) - return serialization; +// if (auto serialization = getSubcolumnEntity(subcolumn_name, on_success, on_continue)) +// return serialization; - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); -} +// throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); +// } SerializationPtr DataTypeTuple::doGetDefaultSerialization() const @@ -320,7 +320,7 @@ SerializationPtr DataTypeTuple::doGetDefaultSerialization() const { String elem_name = use_explicit_names ? names[i] : toString(i + 1); auto serialization = elems[i]->getDefaultSerialization(); - serializations[i] = std::make_shared(serialization, elem_name); + serializations[i] = std::make_shared(serialization, elem_name); } return std::make_shared(std::move(serializations), use_explicit_names); @@ -335,7 +335,7 @@ SerializationPtr DataTypeTuple::getSerialization(const String & column_name, con String elem_name = use_explicit_names ? 
names[i] : toString(i + 1); auto subcolumn_name = Nested::concatenateName(column_name, elem_name); auto serializaion = elems[i]->getSerialization(subcolumn_name, callback); - serializations[i] = std::make_shared(serializaion, elem_name); + serializations[i] = std::make_shared(serializaion, elem_name); } return std::make_shared(std::move(serializations), use_explicit_names); diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index b129a82dba..6bbced0cbc 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -57,8 +57,8 @@ public: SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const override; - SerializationPtr getSubcolumnSerialization( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; + // SerializationPtr getSubcolumnSerialization( + // const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; SerializationPtr doGetDefaultSerialization() const override; diff --git a/src/DataTypes/FieldToDataType.cpp b/src/DataTypes/FieldToDataType.cpp index c783f877a5..ba1bd64401 100644 --- a/src/DataTypes/FieldToDataType.cpp +++ b/src/DataTypes/FieldToDataType.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -152,9 +153,9 @@ DataTypePtr FieldToDataType::operator() (const Array & x) const element_types.reserve(x.size()); for (const Field & elem : x) - element_types.emplace_back(applyVisitor(FieldToDataType(), elem)); + element_types.emplace_back(applyVisitor(FieldToDataType(allow_convertion_to_string), elem)); - return std::make_shared(getLeastSupertype(element_types)); + return std::make_shared(getLeastSupertype(element_types, allow_convertion_to_string)); } @@ -167,7 +168,7 @@ DataTypePtr FieldToDataType::operator() (const Tuple & tuple) const element_types.reserve(tuple.size()); for (const auto & element : tuple) - element_types.push_back(applyVisitor(FieldToDataType(), element)); + element_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), element)); return std::make_shared(element_types); } @@ -185,7 +186,9 @@ DataTypePtr FieldToDataType::operator() (const Map & map) const value_types.push_back(applyVisitor(FieldToDataType(), elem.second)); } - return std::make_shared(getLeastSupertype(key_types), getLeastSupertype(value_types)); + return std::make_shared( + getLeastSupertype(key_types, allow_convertion_to_string), + getLeastSupertype(value_types, allow_convertion_to_string)); } DataTypePtr FieldToDataType::operator() (const AggregateFunctionStateData & x) const @@ -199,4 +202,10 @@ DataTypePtr FieldToDataType::operator() (const BitMap64 &) const return std::make_shared(); } +DataTypePtr FieldToDataType::operator() (const Object &) const +{ + /// TODO: Do we need different parameters for type Object? 
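+    /// Illustrative note (an assumption consistent with the constructor above):
+    /// any Object field gets the default schema format here, so a Field parsed
+    /// from {"k1": 1} is typed as Object('json'); the concrete Tuple layout is
+    /// deduced later from the inserted data.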
+ return std::make_shared("json", false); +} + } diff --git a/src/DataTypes/FieldToDataType.h b/src/DataTypes/FieldToDataType.h index 6e8923f3d2..2ff3e79292 100644 --- a/src/DataTypes/FieldToDataType.h +++ b/src/DataTypes/FieldToDataType.h @@ -41,6 +41,11 @@ using DataTypePtr = std::shared_ptr; class FieldToDataType : public StaticVisitor { public: + FieldToDataType(bool allow_convertion_to_string_ = false) + : allow_convertion_to_string(allow_convertion_to_string_) + { + } + DataTypePtr operator() (const Null & x) const; DataTypePtr operator() (const NegativeInfinity & x) const; DataTypePtr operator() (const PositiveInfinity & x) const; @@ -64,6 +69,10 @@ public: DataTypePtr operator() (const DecimalField & x) const; DataTypePtr operator() (const AggregateFunctionStateData & x) const; DataTypePtr operator() (const BitMap64 & x) const; + DataTypePtr operator() (const Object & map) const; + +private: + bool allow_convertion_to_string; }; } diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 528eb0155f..615f688bbf 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -117,6 +118,50 @@ size_t IDataType::getSizeOfValueInMemory() const throw Exception("Value of type " + getName() + " in memory is not of fixed size.", ErrorCodes::LOGICAL_ERROR); } +void IDataType::forEachSubcolumn( + const SubcolumnCallback & callback, + const SubstreamData & data) +{ + ISerialization::StreamCallback callback_with_data = [&](const auto & subpath) + { + for (size_t i = 0; i < subpath.size(); ++i) + { + size_t prefix_len = i + 1; + if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, prefix_len)) + { + auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); + auto subdata = ISerialization::createFromPath(subpath, prefix_len); + callback(subpath, name, subdata); + } + subpath[i].visited = true; + } + }; + + ISerialization::EnumerateStreamsSettings settings; + settings.position_independent_encoding = false; + data.serialization->enumerateStreams(settings, callback_with_data, data); +} + +template +Ptr IDataType::getForSubcolumn( + const String & subcolumn_name, + const SubstreamData & data, + Ptr SubstreamData::*member, + bool throw_if_null) const +{ + Ptr res; + forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata) + { + if (name == subcolumn_name) + res = subdata.*member; + }, data); + + if (!res && throw_if_null) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); + + return res; +} + DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const { if (subcolumn_name == MAIN_SUBCOLUMN_NAME) @@ -127,11 +172,17 @@ DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const { - auto subcolumn_type = tryGetSubcolumnType(subcolumn_name); - if (subcolumn_type) - return subcolumn_type; + if (subcolumn_name == MAIN_SUBCOLUMN_NAME) + return shared_from_this(); + + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); + return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, true); +} - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); +ColumnPtr IDataType::tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const +{ + auto data = 
SubstreamData(getDefaultSerialization()).withColumn(column); + return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, false); } ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const IColumn &) const @@ -141,21 +192,12 @@ ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const IColumn & Names IDataType::getSubcolumnNames() const { - NameSet res; - getDefaultSerialization()->enumerateStreams([&res, this](const ISerialization::SubstreamPath & substream_path) + Names res; + forEachSubcolumn([&](const auto &, const auto & name, const auto &) { - ISerialization::SubstreamPath new_path; - /// Iterate over path to try to get intermediate subcolumns for complex nested types. - for (const auto & elem : substream_path) - { - new_path.push_back(elem); - auto subcolumn_name = ISerialization::getSubcolumnNameForStream(new_path); - if (!subcolumn_name.empty() && tryGetSubcolumnType(subcolumn_name)) - res.insert(subcolumn_name); - } - }); - - return Names(std::make_move_iterator(res.begin()), std::make_move_iterator(res.end())); + res.push_back(name); + }, SubstreamData(getDefaultSerialization())); + return res; } void IDataType::insertDefaultInto(IColumn & column) const @@ -173,6 +215,14 @@ void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const custom_serialization = std::move(custom_desc_->serialization); } +SerializationInfoPtr IDataType::getSerializationInfo(const IColumn & column) const +{ + if (const auto * column_const = checkAndGetColumn(&column)) + return getSerializationInfo(column_const->getDataColumn()); + + return std::make_shared(ISerialization::getKind(column), SerializationInfo::Settings{}); +} + SerializationPtr IDataType::getDefaultSerialization() const { if (custom_serialization) @@ -181,9 +231,23 @@ SerializationPtr IDataType::getDefaultSerialization() const return doGetDefaultSerialization(); } -SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const BaseSerializationGetter &) const +SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); + auto data = SubstreamData(serialization); + return getForSubcolumn(subcolumn_name, data, &SubstreamData::serialization, true); +} + +SerializationPtr IDataType::getSerialization(ISerialization::Kind /*kind*/) const +{ + // if (supportsSparseSerialization() && kind == ISerialization::Kind::SPARSE) + // return getSparseSerialization(); + + return getDefaultSerialization(); +} + +SerializationPtr IDataType::getSerialization(const SerializationInfo & info) const +{ + return getSerialization(info.getKind()); } // static @@ -191,14 +255,9 @@ SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, con { if (column.isSubcolumn()) { - /// Wrap to custom serialization deepest subcolumn, which is represented in non-complex type. 
- auto base_serialization_getter = [&](const IDataType & subcolumn_type) - { - return subcolumn_type.getSerialization(column.name, callback); - }; - - auto type_in_storage = column.getTypeInStorage(); - return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), base_serialization_getter); + const auto & type_in_storage = column.getTypeInStorage(); + auto default_serialization = type_in_storage->getDefaultSerialization(); + return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), default_serialization); } return column.type->getSerialization(column.name, callback); diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 3af2fe5d20..db8c2119c0 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -110,12 +110,34 @@ public: static constexpr auto MAIN_SUBCOLUMN_NAME = "__main"; virtual DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const; DataTypePtr getSubcolumnType(const String & subcolumn_name) const; + + ColumnPtr tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const; virtual ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const; Names getSubcolumnNames() const; + using SubstreamData = ISerialization::SubstreamData; + using SubstreamPath = ISerialization::SubstreamPath; + + using SubcolumnCallback = std::function; + + static void forEachSubcolumn( + const SubcolumnCallback & callback, + const SubstreamData & data); + + virtual SerializationInfoPtr getSerializationInfo(const IColumn & column) const; + /// Returns default serialization of data type. SerializationPtr getDefaultSerialization() const; + /// Chooses serialization according to serialization kind. + SerializationPtr getSerialization(ISerialization::Kind kind) const; + + /// Chooses serialization according to collected information about content of column. + virtual SerializationPtr getSerialization(const SerializationInfo & info) const; + /// Asks whether the stream with given name exists in table. /// If callback returned true for all streams, which are required for /// one of serialization types, that serialization will be chosen for reading. @@ -133,7 +155,7 @@ public: /// Returns serialization wrapper for reading one particular subcolumn of data type. 
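+    /// Sketch of the new contract (see IDataType.cpp for the implementation):
+    /// e.g. for a column `t Tuple(a UInt32)`, the subcolumn `a` is now read
+    /// through a serialization derived from the default serialization of `t`.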
virtual SerializationPtr getSubcolumnSerialization( - const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const; + const String & subcolumn_name, const SerializationPtr & serialization) const; using StreamCallbackWithType = std::function; @@ -371,6 +393,14 @@ protected: public: const IDataTypeCustomName * getCustomName() const { return custom_name.get(); } const ISerialization * getCustomSerialization() const { return custom_serialization.get(); } + +private: + template + Ptr getForSubcolumn( + const String & subcolumn_name, + const SubstreamData & data, + Ptr SubstreamData::*member, + bool throw_if_null) const; }; void setDefaultUseMapType(bool default_use_kv_map_type); @@ -443,6 +473,7 @@ struct WhichDataType constexpr bool isMap() const {return idx == TypeIndex::Map; } constexpr bool isSet() const { return idx == TypeIndex::Set; } constexpr bool isInterval() const { return idx == TypeIndex::Interval; } + constexpr bool isObject() const { return idx == TypeIndex::Object; } constexpr bool isNothing() const { return idx == TypeIndex::Nothing; } constexpr bool isNullable() const { return idx == TypeIndex::Nullable; } @@ -450,6 +481,7 @@ struct WhichDataType constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; } constexpr bool isSimple() const { return isInt() || isUInt() || isFloat() || isString(); } constexpr bool isBitmap64() const { return idx == TypeIndex::BitMap64; } + }; /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) @@ -481,6 +513,12 @@ inline bool isIPv4(const DataTypePtr & data_type) { return WhichDataType(data_ty inline bool isIPv6(const DataTypePtr & data_type) { return WhichDataType(data_type).isIPv6(); } inline bool isBitmap64(const DataTypePtr & data_type) { return WhichDataType(data_type).isBitmap64(); } +template +inline bool isObject(const T & data_type) +{ + return WhichDataType(data_type).isObject(); +} + template inline bool isUInt8(const T & data_type) { @@ -642,4 +680,19 @@ template <> inline constexpr bool IsDataTypeDateOrDateTime = t template <> inline constexpr bool IsDataTypeDateOrDateTime = true; template <> inline constexpr bool IsDataTypeDateOrDateTime = true; +#define FOR_NUMERIC_TYPES(M) \ + M(UInt8) \ + M(UInt16) \ + M(UInt32) \ + M(UInt64) \ + M(UInt128) \ + M(UInt256) \ + M(Int8) \ + M(Int16) \ + M(Int32) \ + M(Int64) \ + M(Int128) \ + M(Int256) \ + M(Float32) \ + M(Float64) } diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index 4ebe3876d0..592df88b02 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -30,6 +30,12 @@ namespace Nested std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name) { + if (nested_table_name.empty()) + return nested_field_name; + + if (nested_field_name.empty()) + return nested_table_name; + return nested_table_name + "." 
+ nested_field_name; } diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp new file mode 100644 index 0000000000..ab73ebc2cf --- /dev/null +++ b/src/DataTypes/ObjectUtils.cpp @@ -0,0 +1,1050 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int TYPE_MISMATCH; + extern const int LOGICAL_ERROR; + extern const int INCOMPATIBLE_COLUMNS; + extern const int TOO_MANY_SUBCOLUMNS_IN_JSON; +} + +size_t getNumberOfDimensions(const IDataType & type) +{ + if (const auto * type_array = typeid_cast(&type)) + return type_array->getNumberOfDimensions(); + return 0; +} + +size_t getNumberOfDimensions(const IColumn & column) +{ + if (const auto * column_array = checkAndGetColumn(column)) + return column_array->getNumberOfDimensions(); + return 0; +} + +DataTypePtr getBaseTypeOfArray(const DataTypePtr & type) +{ + /// Get raw pointers to avoid extra copying of type pointers. + const DataTypeArray * last_array = nullptr; + const auto * current_type = type.get(); + while (const auto * type_array = typeid_cast(current_type)) + { + current_type = type_array->getNestedType().get(); + last_array = type_array; + } + + return last_array ? last_array->getNestedType() : type; +} + +ColumnPtr getBaseColumnOfArray(const ColumnPtr & column) +{ + /// Get raw pointers to avoid extra copying of column pointers. + const ColumnArray * last_array = nullptr; + const auto * current_column = column.get(); + while (const auto * column_array = checkAndGetColumn(current_column)) + { + current_column = &column_array->getData(); + last_array = column_array; + } + + return last_array ? last_array->getDataPtr() : column; +} + +DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions) +{ + for (size_t i = 0; i < num_dimensions; ++i) + type = std::make_shared(std::move(type)); + return type; +} + +ColumnPtr createArrayOfColumn(ColumnPtr column, size_t num_dimensions) +{ + for (size_t i = 0; i < num_dimensions; ++i) + column = ColumnArray::create(column); + return column; +} + +Array createEmptyArrayField(size_t num_dimensions) +{ + if (num_dimensions == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create array field with 0 dimensions"); + + Array array; + Array * current_array = &array; + for (size_t i = 1; i < num_dimensions; ++i) + { + current_array->push_back(Array()); + current_array = ¤t_array->back().get(); + } + + return array; +} + +DataTypePtr getDataTypeByColumn(const IColumn & column) +{ + auto idx = column.getDataType(); + WhichDataType which(idx); + if (which.isSimple()) + return DataTypeFactory::instance().get(String(magic_enum::enum_name(idx))); + + if (which.isNothing()) + return std::make_shared(); + + if (const auto * column_array = checkAndGetColumn(&column)) + return std::make_shared(getDataTypeByColumn(column_array->getData())); + + if (const auto * column_nullable = checkAndGetColumn(&column)) + return makeNullable(getDataTypeByColumn(column_nullable->getNestedColumn())); + + /// TODO: add more types. 
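+    /// Rough sketch of the mapping above (names are resolved via magic_enum
+    /// from TypeIndex, so they must match the simple type names):
+    ///     ColumnUInt64                  -> UInt64
+    ///     ColumnArray(ColumnInt8)       -> Array(Int8)
+    ///     ColumnNullable(ColumnFloat64) -> Nullable(Float64)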
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get data type of column {}", column.getFamilyName()); +} + +template +static auto extractVector(const std::vector & vec) +{ + static_assert(I < std::tuple_size_v); + std::vector> res; + res.reserve(vec.size()); + for (const auto & elem : vec) + res.emplace_back(std::get(elem)); + return res; +} + +static DataTypePtr recreateTupleWithElements(const DataTypeTuple & type_tuple, const DataTypes & elements) +{ + return type_tuple.haveExplicitNames() + ? std::make_shared(elements, type_tuple.getElementNames()) + : std::make_shared(elements); +} + +static std::pair convertObjectColumnToTuple( + const ColumnObject & column_object, const DataTypeObject & type_object) +{ + if (!column_object.isFinalized()) + { + auto finalized = column_object.cloneFinalized(); + const auto & finalized_object = assert_cast(*finalized); + return convertObjectColumnToTuple(finalized_object, type_object); + } + + const auto & subcolumns = column_object.getSubcolumns(); + + PathsInData tuple_paths; + DataTypes tuple_types; + Columns tuple_columns; + + for (const auto & entry : subcolumns) + { + tuple_paths.emplace_back(entry->path); + tuple_types.emplace_back(entry->data.getLeastCommonType()); + tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr()); + } + + return unflattenTuple(tuple_paths, tuple_types, tuple_columns); +} + +static std::pair recursivelyConvertDynamicColumnToTuple( + const ColumnPtr & column, const DataTypePtr & type) +{ + if (!type->hasDynamicSubcolumns()) + return {column, type}; + + if (const auto * type_object = typeid_cast(type.get())) + { + const auto & column_object = assert_cast(*column); + return convertObjectColumnToTuple(column_object, *type_object); + } + + if (const auto * type_array = typeid_cast(type.get())) + { + const auto & column_array = assert_cast(*column); + auto [new_column, new_type] = recursivelyConvertDynamicColumnToTuple( + column_array.getDataPtr(), type_array->getNestedType()); + + return + { + ColumnArray::create(new_column, column_array.getOffsetsPtr()), + std::make_shared(std::move(new_type)), + }; + } + + if (const auto * type_map = typeid_cast(type.get())) + { + const auto & column_map = assert_cast(*column); + auto [new_column, new_type] = recursivelyConvertDynamicColumnToTuple( + column_map.getNestedColumnPtr(), type_map->getNestedType()); + + return + { + ColumnMap::create(new_column), + std::make_shared(std::move(new_type)), + }; + } + + if (const auto * type_tuple = typeid_cast(type.get())) + { + const auto & tuple_columns = assert_cast(*column).getColumns(); + const auto & tuple_types = type_tuple->getElements(); + + assert(tuple_columns.size() == tuple_types.size()); + const size_t tuple_size = tuple_types.size(); + + Columns new_tuple_columns(tuple_size); + DataTypes new_tuple_types(tuple_size); + + for (size_t i = 0; i < tuple_size; ++i) + { + std::tie(new_tuple_columns[i], new_tuple_types[i]) + = recursivelyConvertDynamicColumnToTuple(tuple_columns[i], tuple_types[i]); + } + + return + { + ColumnTuple::create(new_tuple_columns), + recreateTupleWithElements(*type_tuple, new_tuple_types) + }; + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type->getName()); +} + +static bool needCheckTypeCompatibilityLocal(const IStorage & storage) +{ + const auto * cloud_table = dynamic_cast(&storage); + return cloud_table == nullptr; +} + +void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & storage_snapshot) +{ + for (auto & column : block) + { 
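+        /// E.g. (illustrative) a column of type Object('json') whose rows contain
+        /// {"k1": 1} and {"k1": 2, "k2": "v"} is converted below to a column of
+        /// type Tuple(k1 Int64, k2 String), with defaults for missing paths.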
+ if (!column.type->hasDynamicSubcolumns()) + continue; + + std::tie(column.column, column.type) = recursivelyConvertDynamicColumnToTuple(column.column, column.type); + + GetColumnsOptions options(GetColumnsOptions::AllPhysical); + auto storage_column = storage_snapshot->tryGetColumn(options, column.name); + if (!storage_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", column.name); + + if (needCheckTypeCompatibilityLocal(storage_snapshot->storage)) + { + auto storage_column_concrete = storage_snapshot->getColumn(options.withExtendedObjects(), column.name); + + /// Check that constructed Tuple type and type in storage are compatible. + getLeastCommonTypeForDynamicColumns(storage_column->type, {column.type, storage_column_concrete.type}, true); + } + } +} + +static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts) +{ + if (prefix.size() > parts.size()) + return false; + + for (size_t i = 0; i < prefix.size(); ++i) + if (prefix[i].key != parts[i].key) + return false; + return true; +} + +/// Returns true if there exists a prefix with matched names, +/// but not matched structure (is Nested, number of dimensions). +static bool hasDifferentStructureInPrefix(const PathInData::Parts & lhs, const PathInData::Parts & rhs) +{ + for (size_t i = 0; i < std::min(lhs.size(), rhs.size()); ++i) + { + if (lhs[i].key != rhs[i].key) + return false; + else if (lhs[i] != rhs[i]) + return true; + } + return false; +} + +void checkObjectHasNoAmbiguousPaths(const PathsInData & paths) +{ + size_t size = paths.size(); + for (size_t i = 0; i < size; ++i) + { + for (size_t j = 0; j < i; ++j) + { + if (isPrefix(paths[i].getParts(), paths[j].getParts()) + || isPrefix(paths[j].getParts(), paths[i].getParts())) + throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, + "Data in Object has ambiguous paths: '{}' and '{}'", + paths[i].getPath(), paths[j].getPath()); + + if (hasDifferentStructureInPrefix(paths[i].getParts(), paths[j].getParts())) + throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, + "Data in Object has ambiguous paths: '{}' and '{}'. " + "Paths have prefixes matched by names, but different in structure", + paths[i].getPath(), paths[j].getPath()); + } + } +} + +static DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguous_paths) +{ + /// Types of subcolumns by path from all tuples. + std::unordered_map subcolumns_types; + + /// First we flatten tuples, then get common type for paths + /// and finally unflatten paths and create new tuple type. + for (const auto & type : types) + { + const auto * type_tuple = typeid_cast(type.get()); + if (!type_tuple) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Least common type for object can be deduced only from tuples, but {} given", type->getName()); + + auto [tuple_paths, tuple_types] = flattenTuple(type); + assert(tuple_paths.size() == tuple_types.size()); + + for (size_t i = 0; i < tuple_paths.size(); ++i) + subcolumns_types[tuple_paths[i]].push_back(tuple_types[i]); + } + + PathsInData tuple_paths; + DataTypes tuple_types; + + /// Get the least common type for all paths. 
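+    /// E.g. a path seen as Int64 in one tuple and as String in another resolves
+    /// to String via getLeastSupertypeOrString below; paths whose array
+    /// dimensions disagree cannot be reconciled and throw TYPE_MISMATCH.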
+    for (const auto & [key, subtypes] : subcolumns_types)
+    {
+        assert(!subtypes.empty());
+        if (key.getPath() == ColumnObject::COLUMN_NAME_DUMMY)
+            continue;
+
+        size_t first_dim = getNumberOfDimensions(*subtypes[0]);
+        for (size_t i = 1; i < subtypes.size(); ++i)
+            if (first_dim != getNumberOfDimensions(*subtypes[i]))
+                throw Exception(ErrorCodes::TYPE_MISMATCH,
+                    "Incompatible types of subcolumn '{}': {} and {}",
+                    key.getPath(), subtypes[0]->getName(), subtypes[i]->getName());
+
+        tuple_paths.emplace_back(key);
+        tuple_types.emplace_back(getLeastSupertypeOrString(subtypes));
+    }
+
+    if (tuple_paths.empty())
+    {
+        tuple_paths.emplace_back(ColumnObject::COLUMN_NAME_DUMMY);
+        tuple_types.emplace_back(std::make_shared<DataTypeUInt8>());
+    }
+
+    if (check_ambiguous_paths)
+        checkObjectHasNoAmbiguousPaths(tuple_paths);
+
+    return unflattenTuple(tuple_paths, tuple_types);
+}
+
+static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl(
+    const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths);
+
+template <typename Type>
+static DataTypePtr getLeastCommonTypeForColumnWithNestedType(
+    const Type & type, const DataTypes & concrete_types, bool check_ambiguos_paths)
+{
+    DataTypes nested_types;
+    nested_types.reserve(concrete_types.size());
+
+    for (const auto & concrete_type : concrete_types)
+    {
+        const auto * type_with_nested_conctete = typeid_cast<const Type *>(concrete_type.get());
+        if (!type_with_nested_conctete)
+            throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected {} type, got {}", demangle(typeid(Type).name()), concrete_type->getName());
+
+        nested_types.push_back(type_with_nested_conctete->getNestedType());
+    }
+
+    return std::make_shared<Type>(
+        getLeastCommonTypeForDynamicColumnsImpl(
+            type.getNestedType(), nested_types, check_ambiguos_paths));
+}
+
+static DataTypePtr getLeastCommonTypeForTuple(
+    const DataTypeTuple & type, const DataTypes & concrete_types, bool check_ambiguos_paths)
+{
+    const auto & element_types = type.getElements();
+    DataTypes new_element_types(element_types.size());
+
+    for (size_t i = 0; i < element_types.size(); ++i)
+    {
+        DataTypes concrete_element_types;
+        concrete_element_types.reserve(concrete_types.size());
+
+        for (const auto & type_concrete : concrete_types)
+        {
+            const auto * type_tuple_conctete = typeid_cast<const DataTypeTuple *>(type_concrete.get());
+            if (!type_tuple_conctete)
+                throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected Tuple type, got {}", type_concrete->getName());
+
+            concrete_element_types.push_back(type_tuple_conctete->getElement(i));
+        }
+
+        new_element_types[i] = getLeastCommonTypeForDynamicColumnsImpl(
+            element_types[i], concrete_element_types, check_ambiguos_paths);
+    }
+
+    return recreateTupleWithElements(type, new_element_types);
+}
+
+static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl(
+    const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths)
+{
+    if (!type_in_storage->hasDynamicSubcolumns())
+        return type_in_storage;
+
+    if (isObject(type_in_storage))
+        return getLeastCommonTypeForObject(concrete_types, check_ambiguos_paths);
+
+    if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get()))
+        return getLeastCommonTypeForColumnWithNestedType(*type_array, concrete_types, check_ambiguos_paths);
+
+    if (const auto * type_map = typeid_cast<const DataTypeMap *>(type_in_storage.get()))
+        return getLeastCommonTypeForColumnWithNestedType(*type_map, concrete_types, check_ambiguos_paths);
+
+    if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type_in_storage.get()))
+        return getLeastCommonTypeForTuple(*type_tuple, concrete_types,
check_ambiguos_paths); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName()); +} + +DataTypePtr getLeastCommonTypeForDynamicColumns( + const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths) +{ + if (concrete_types.empty()) + return nullptr; + + bool all_equal = true; + for (size_t i = 1; i < concrete_types.size(); ++i) + { + if (!concrete_types[i]->equals(*concrete_types[0])) + { + all_equal = false; + break; + } + } + + if (all_equal) + return concrete_types[0]; + + return getLeastCommonTypeForDynamicColumnsImpl(type_in_storage, concrete_types, check_ambiguos_paths); +} + +DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage) +{ + if (!type_in_storage->hasDynamicSubcolumns()) + return type_in_storage; + + if (isObject(type_in_storage)) + return std::make_shared( + DataTypes{std::make_shared()}, Names{ColumnObject::COLUMN_NAME_DUMMY}); + + if (const auto * type_array = typeid_cast(type_in_storage.get())) + return std::make_shared( + createConcreteEmptyDynamicColumn(type_array->getNestedType())); + + if (const auto * type_map = typeid_cast(type_in_storage.get())) + return std::make_shared( + createConcreteEmptyDynamicColumn(type_map->getNestedType())); + + if (const auto * type_tuple = typeid_cast(type_in_storage.get())) + { + const auto & elements = type_tuple->getElements(); + DataTypes new_elements; + new_elements.reserve(elements.size()); + + for (const auto & element : elements) + new_elements.push_back(createConcreteEmptyDynamicColumn(element)); + + return recreateTupleWithElements(*type_tuple, new_elements); + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName()); +} + +bool hasDynamicSubcolumns(const ColumnsDescription & columns) +{ + return std::any_of(columns.begin(), columns.end(), + [](const auto & column) + { + return column.type->hasDynamicSubcolumns(); + }); +} + +void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns) +{ + NamesAndTypesList subcolumns_list; + for (auto & column : columns_list) + { + auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, column.name); + if (object_column) + { + column.type = object_column->type; + + if (with_subcolumns) + subcolumns_list.splice(subcolumns_list.end(), object_columns.getSubcolumns(column.name)); + } + } + + columns_list.splice(columns_list.end(), std::move(subcolumns_list)); +} + +void updateObjectColumns( + ColumnsDescription & object_columns, + const ColumnsDescription & storage_columns, + const NamesAndTypesList & new_columns) +{ + for (const auto & new_column : new_columns) + { + auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, new_column.name); + if (object_column && !object_column->type->equals(*new_column.type)) + { + auto storage_column = storage_columns.getColumn(GetColumnsOptions::All, new_column.name); + object_columns.modify(new_column.name, [&](auto & column) + { + column.type = getLeastCommonTypeForDynamicColumns(storage_column.type, {object_column->type, new_column.type}); + }); + } + } +} + +namespace +{ + +void flattenTupleImpl( + PathInDataBuilder & builder, + DataTypePtr type, + std::vector & new_paths, + DataTypes & new_types) +{ + if (const auto * type_tuple = typeid_cast(type.get())) + { + const auto & tuple_names = type_tuple->getElementNames(); + const auto & tuple_types = 
type_tuple->getElements(); + + for (size_t i = 0; i < tuple_names.size(); ++i) + { + builder.append(tuple_names[i], false); + flattenTupleImpl(builder, tuple_types[i], new_paths, new_types); + builder.popBack(); + } + } + else if (const auto * type_array = typeid_cast(type.get())) + { + PathInDataBuilder element_builder; + std::vector element_paths; + DataTypes element_types; + + flattenTupleImpl(element_builder, type_array->getNestedType(), element_paths, element_types); + assert(element_paths.size() == element_types.size()); + + for (size_t i = 0; i < element_paths.size(); ++i) + { + builder.append(element_paths[i], true); + new_paths.emplace_back(builder.getParts()); + new_types.emplace_back(std::make_shared(element_types[i])); + builder.popBack(element_paths[i].size()); + } + } + else + { + new_paths.emplace_back(builder.getParts()); + new_types.emplace_back(type); + } +} + +/// @offsets_columns are used as stack of array offsets and allows to recreate Array columns. +void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns & offsets_columns) +{ + if (const auto * column_tuple = checkAndGetColumn(column.get())) + { + const auto & subcolumns = column_tuple->getColumns(); + for (const auto & subcolumn : subcolumns) + flattenTupleImpl(subcolumn, new_columns, offsets_columns); + } + else if (const auto * column_array = checkAndGetColumn(column.get())) + { + offsets_columns.push_back(column_array->getOffsetsPtr()); + flattenTupleImpl(column_array->getDataPtr(), new_columns, offsets_columns); + offsets_columns.pop_back(); + } + else + { + if (!offsets_columns.empty()) + { + auto new_column = ColumnArray::create(column, offsets_columns.back()); + for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it) + new_column = ColumnArray::create(new_column, *it); + + new_columns.push_back(std::move(new_column)); + } + else + { + new_columns.push_back(column); + } + } +} + +DataTypePtr reduceNumberOfDimensions(DataTypePtr type, size_t dimensions_to_reduce) +{ + while (dimensions_to_reduce--) + { + const auto * type_array = typeid_cast(type.get()); + if (!type_array) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce"); + + type = type_array->getNestedType(); + } + + return type; +} + +ColumnPtr reduceNumberOfDimensions(ColumnPtr column, size_t dimensions_to_reduce) +{ + while (dimensions_to_reduce--) + { + const auto * column_array = typeid_cast(column.get()); + if (!column_array) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce"); + + column = column_array->getDataPtr(); + } + + return column; +} + +/// We save intermediate column, type and number of array +/// dimensions for each intermediate node in path in subcolumns tree. +struct ColumnWithTypeAndDimensions +{ + ColumnPtr column; + DataTypePtr type; + size_t array_dimensions; +}; + +using SubcolumnsTreeWithColumns = SubcolumnsTree; +using Node = SubcolumnsTreeWithColumns::Node; + +/// Creates data type and column from tree of subcolumns. 
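+/// E.g. (illustrative) a tree with scalar leaves for paths 'a.b' and 'a.c'
+/// yields Tuple(a Tuple(b T1, c T2)); a NESTED node yields Array(Nested(...)),
+/// re-wrapped in any anonymous array levels of its children.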
+ColumnWithTypeAndDimensions createTypeFromNode(const Node & node)
+{
+    auto collect_tuple_elemets = [](const auto & children)
+    {
+        if (children.empty())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create type from empty Tuple or Nested node");
+
+        std::vector<std::tuple<String, ColumnWithTypeAndDimensions>> tuple_elements;
+        tuple_elements.reserve(children.size());
+        for (const auto & [name, child] : children)
+        {
+            assert(child);
+            auto column = createTypeFromNode(*child);
+            tuple_elements.emplace_back(name, std::move(column));
+        }
+
+        /// Sort to always create the same type for the same set of subcolumns.
+        std::sort(tuple_elements.begin(), tuple_elements.end(),
+            [](const auto & lhs, const auto & rhs) { return std::get<0>(lhs) < std::get<0>(rhs); });
+
+        auto tuple_names = extractVector<0>(tuple_elements);
+        auto tuple_columns = extractVector<1>(tuple_elements);
+
+        return std::make_tuple(std::move(tuple_names), std::move(tuple_columns));
+    };
+
+    if (node.kind == Node::SCALAR)
+    {
+        return node.data;
+    }
+    else if (node.kind == Node::NESTED)
+    {
+        auto [tuple_names, tuple_columns] = collect_tuple_elemets(node.children);
+
+        Columns offsets_columns;
+        offsets_columns.reserve(tuple_columns[0].array_dimensions + 1);
+
+        /// If we have a Nested node and a child node with anonymous array levels,
+        /// we need to push a Nested type through all array levels.
+        /// Example: { "k1": [[{"k2": 1, "k3": 2}]] } should be parsed as
+        /// `k1 Array(Nested(k2 Int, k3 Int))`; k1 is marked as Nested,
+        /// and `k2` and `k3` have anonymous_array_level = 1 in that case.
+
+        const auto & current_array = assert_cast<const ColumnArray &>(*node.data.column);
+        offsets_columns.push_back(current_array.getOffsetsPtr());
+
+        auto first_column = tuple_columns[0].column;
+        for (size_t i = 0; i < tuple_columns[0].array_dimensions; ++i)
+        {
+            const auto & column_array = assert_cast<const ColumnArray &>(*first_column);
+            offsets_columns.push_back(column_array.getOffsetsPtr());
+            first_column = column_array.getDataPtr();
+        }
+
+        size_t num_elements = tuple_columns.size();
+        Columns tuple_elements_columns(num_elements);
+        DataTypes tuple_elements_types(num_elements);
+
+        /// Reduce extra array dimensions to get columns and types of Nested elements.
+        for (size_t i = 0; i < num_elements; ++i)
+        {
+            assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
+            tuple_elements_columns[i] = reduceNumberOfDimensions(tuple_columns[i].column, tuple_columns[i].array_dimensions);
+            tuple_elements_types[i] = reduceNumberOfDimensions(tuple_columns[i].type, tuple_columns[i].array_dimensions);
+        }
+
+        auto result_column = ColumnArray::create(ColumnTuple::create(tuple_elements_columns), offsets_columns.back());
+        auto result_type = createNested(tuple_elements_types, tuple_names);
+
+        /// Recreate result Array type and Array column.
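+        /// E.g. for the `k1 Array(Nested(k2 Int, k3 Int))` example above, the
+        /// remaining stacked offsets re-wrap the Nested type into its outer
+        /// Array levels one by one.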
+ for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it) + { + result_column = ColumnArray::create(result_column, *it); + result_type = std::make_shared(result_type); + } + + return {result_column, result_type, tuple_columns[0].array_dimensions}; + } + else + { + auto [tuple_names, tuple_columns] = collect_tuple_elemets(node.children); + + size_t num_elements = tuple_columns.size(); + Columns tuple_elements_columns(num_elements); + DataTypes tuple_elements_types(num_elements); + + for (size_t i = 0; i < tuple_columns.size(); ++i) + { + assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions); + tuple_elements_columns[i] = tuple_columns[i].column; + tuple_elements_types[i] = tuple_columns[i].type; + } + + auto result_column = ColumnTuple::create(tuple_elements_columns); + auto result_type = std::make_shared(tuple_elements_types, tuple_names); + + return {result_column, result_type, tuple_columns[0].array_dimensions}; + } +} + +} + +std::pair flattenTuple(const DataTypePtr & type) +{ + std::vector new_path_parts; + DataTypes new_types; + PathInDataBuilder builder; + + flattenTupleImpl(builder, type, new_path_parts, new_types); + + PathsInData new_paths(new_path_parts.begin(), new_path_parts.end()); + return {new_paths, new_types}; +} + +ColumnPtr flattenTuple(const ColumnPtr & column) +{ + Columns new_columns; + Columns offsets_columns; + + flattenTupleImpl(column, new_columns, offsets_columns); + return ColumnTuple::create(new_columns); +} + +DataTypePtr unflattenTuple(const PathsInData & paths, const DataTypes & tuple_types) +{ + assert(paths.size() == tuple_types.size()); + Columns tuple_columns; + tuple_columns.reserve(tuple_types.size()); + for (const auto & type : tuple_types) + tuple_columns.emplace_back(type->createColumn()); + + return unflattenTuple(paths, tuple_types, tuple_columns).second; +} + +std::pair unflattenObjectToTuple(const ColumnObject & column) +{ + const auto & subcolumns = column.getSubcolumns(); + + if (subcolumns.empty()) + { + auto type = std::make_shared( + DataTypes{std::make_shared()}, + Names{ColumnObject::COLUMN_NAME_DUMMY}); + + return {type->createColumn()->cloneResized(column.size()), type}; + } + + PathsInData paths; + DataTypes types; + Columns columns; + + paths.reserve(subcolumns.size()); + types.reserve(subcolumns.size()); + columns.reserve(subcolumns.size()); + + for (const auto & entry : subcolumns) + { + paths.emplace_back(entry->path); + types.emplace_back(entry->data.getLeastCommonType()); + columns.emplace_back(entry->data.getFinalizedColumnPtr()); + } + + return unflattenTuple(paths, types, columns); +} + +std::pair unflattenTuple( + const PathsInData & paths, + const DataTypes & tuple_types, + const Columns & tuple_columns) +{ + assert(paths.size() == tuple_types.size()); + assert(paths.size() == tuple_columns.size()); + + if (paths.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot unflatten empty Tuple"); + + /// We add all paths to the subcolumn tree and then create a type from it. + /// The tree stores column, type and number of array dimensions + /// for each intermediate node. 
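+    /// E.g. paths {'a.b', 'a.c'} with types {Int64, String} are assembled back
+    /// into Tuple(a Tuple(b Int64, c String)), the inverse of flattenTuple.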
+    SubcolumnsTreeWithColumns tree;
+
+    for (size_t i = 0; i < paths.size(); ++i)
+    {
+        auto column = tuple_columns[i];
+        auto type = tuple_types[i];
+
+        const auto & parts = paths[i].getParts();
+        size_t num_parts = parts.size();
+
+        size_t pos = 0;
+        tree.add(paths[i], [&](Node::Kind kind, bool exists) -> std::shared_ptr<Node>
+        {
+            if (pos >= num_parts)
+                throw Exception(ErrorCodes::LOGICAL_ERROR,
+                    "Not enough name parts for path {}. Expected at least {}, got {}",
+                    paths[i].getPath(), pos + 1, num_parts);
+
+            size_t array_dimensions = kind == Node::NESTED ? 1 : parts[pos].anonymous_array_level;
+            ColumnWithTypeAndDimensions current_column{column, type, array_dimensions};
+
+            /// Get type and column for next node.
+            if (array_dimensions)
+            {
+                type = reduceNumberOfDimensions(type, array_dimensions);
+                column = reduceNumberOfDimensions(column, array_dimensions);
+            }
+
+            ++pos;
+            if (exists)
+                return nullptr;
+
+            return kind == Node::SCALAR
+                ? std::make_shared<Node>(kind, current_column, paths[i])
+                : std::make_shared<Node>(kind, current_column);
+        });
+    }
+
+    auto [column, type, _] = createTypeFromNode(tree.getRoot());
+    return std::make_pair(std::move(column), std::move(type));
+}
+
+static void addConstantToWithClause(const ASTPtr & query, const String & column_name, const DataTypePtr & data_type)
+{
+    auto & select = query->as<ASTSelectQuery &>();
+    if (!select.with())
+        select.setExpression(ASTSelectQuery::Expression::WITH, std::make_shared<ASTExpressionList>());
+
+    /// TODO: avoid materialize
+    auto node = makeASTFunction("materialize",
+        makeASTFunction("CAST",
+            std::make_shared<ASTLiteral>(data_type->getDefault()),
+            std::make_shared<ASTLiteral>(data_type->getName())));
+
+    node->alias = column_name;
+    node->prefer_alias_to_column_name = true;
+    select.with()->children.push_back(std::move(node));
+}
+
+/// @expected_columns and @available_columns contain descriptions
+/// of extended Object columns.
+void replaceMissedSubcolumnsByConstants(
+    const ColumnsDescription & expected_columns,
+    const ColumnsDescription & available_columns,
+    ASTPtr query)
+{
+    NamesAndTypes missed_names_types;
+
+    /// Find all subcolumns that are in @expected_columns, but not in @available_columns.
+    for (const auto & column : available_columns)
+    {
+        auto expected_column = expected_columns.getColumn(GetColumnsOptions::All, column.name);
+
+        /// Extract all paths from both descriptions to easily check existence of subcolumns.
+        auto [available_paths, available_types] = flattenTuple(column.type);
+        auto [expected_paths, expected_types] = flattenTuple(expected_column.type);
+
+        auto extract_names_and_types = [&column](const auto & paths, const auto & types)
+        {
+            NamesAndTypes res;
+            res.reserve(paths.size());
+            for (size_t i = 0; i < paths.size(); ++i)
+            {
+                auto full_name = Nested::concatenateName(column.name, paths[i].getPath());
+                res.emplace_back(full_name, types[i]);
+            }
+
+            std::sort(res.begin(), res.end());
+            return res;
+        };
+
+        auto available_names_types = extract_names_and_types(available_paths, available_types);
+        auto expected_names_types = extract_names_and_types(expected_paths, expected_types);
+
+        std::set_difference(
+            expected_names_types.begin(), expected_names_types.end(),
+            available_names_types.begin(), available_names_types.end(),
+            std::back_inserter(missed_names_types),
+            [](const auto & lhs, const auto & rhs) { return lhs.name < rhs.name; });
+    }
+
+    if (missed_names_types.empty())
+        return;
+
+    IdentifierNameSet identifiers;
+    query->collectIdentifierNames(identifiers);
+
+    /// Replace missed subcolumns with default literals of their type.
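+    /// E.g. if the query references a missed subcolumn `obj.k UInt64`, the
+    /// WITH clause gets `materialize(CAST(0, 'UInt64')) AS obj.k`
+    /// (see addConstantToWithClause above).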
+ for (const auto & [name, type] : missed_names_types) + if (identifiers.contains(name)) + addConstantToWithClause(query, name, type); +} + +Field FieldVisitorReplaceScalars::operator()(const Array & x) const +{ + if (num_dimensions_to_keep == 0) + return replacement; + + const size_t size = x.size(); + Array res(size); + for (size_t i = 0; i < size; ++i) + res[i] = applyVisitor(FieldVisitorReplaceScalars(replacement, num_dimensions_to_keep - 1), x[i]); + return res; +} + +size_t FieldVisitorToNumberOfDimensions::operator()(const Array & x) +{ + const size_t size = x.size(); + size_t dimensions = 0; + + for (size_t i = 0; i < size; ++i) + { + size_t element_dimensions = applyVisitor(*this, x[i]); + if (i > 0 && element_dimensions != dimensions) + need_fold_dimension = true; + + dimensions = std::max(dimensions, element_dimensions); + } + + return 1 + dimensions; +} + +Field FieldVisitorFoldDimension::operator()(const Array & x) const +{ + if (num_dimensions_to_fold == 0) + return x; + + const size_t size = x.size(); + Array res(size); + for (size_t i = 0; i < size; ++i) + res[i] = applyVisitor(FieldVisitorFoldDimension(num_dimensions_to_fold - 1), x[i]); + + return res; +} + +void setAllObjectsToDummyTupleType(NamesAndTypesList & columns) +{ + for (auto & column : columns) + if (column.type->hasDynamicSubcolumns()) + column.type = createConcreteEmptyDynamicColumn(column.type); +} + +void limitObjectSubcolumns(const ColumnsDescription & object_columns, const UInt64 & subcolumns_threshold) +{ + auto columns = object_columns.getAll(); + for (const auto & column : columns) + { + auto subcolumns = object_columns.getSubcolumns(column.getNameInStorage()); + + if (subcolumns.size() > subcolumns_threshold) + throw Exception( + ErrorCodes::TOO_MANY_SUBCOLUMNS_IN_JSON, + fmt::format( + "Column [{}] has [{}] subcolumns exceed threshold [{}].", + column.getNameInStorage(), + subcolumns.size(), + subcolumns_threshold)); + } +} + +UInt64 getColumnsCommitTimeForJSONTable(const IStorage & table, const NamesAndTypesList & search_part_columns) +{ + Names exclude_column_names; + for (const auto & column : table.getInMemoryMetadata().columns) + { + if (column.type->hasDynamicSubcolumns()) + exclude_column_names.emplace_back(column.name); + } + + auto exclude_json_column = [&](const NamesAndTypesList & columns) -> NamesAndTypesList { + NamesAndTypesList res; + for (const auto & column : columns) + { + if (std::find(exclude_column_names.begin(), exclude_column_names.end(), column.name) == exclude_column_names.end()) + res.push_back(column); + } + return res; + }; + + auto search_part_columns_excluded_json = exclude_json_column(search_part_columns); + + if (search_part_columns_excluded_json == exclude_json_column(*table.part_columns)) + return table.commit_time.toUInt64(); + for (const auto & version : table.previous_versions_part_columns) + { + if (search_part_columns_excluded_json == exclude_json_column(*version.second)) + return version.first; + } + return 0; +} +} diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h new file mode 100644 index 0000000000..aee9e39f19 --- /dev/null +++ b/src/DataTypes/ObjectUtils.h @@ -0,0 +1,218 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +struct StorageSnapshot; +using StorageSnapshotPtr = std::shared_ptr; + +/// Returns number of dimensions in Array type. 0 if type is not array. 
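+/// E.g. Array(Array(Int64)) has 2 dimensions, while Int64 has 0.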
+size_t getNumberOfDimensions(const IDataType & type);
+
+/// Returns number of dimensions in Array column. 0 if column is not array.
+size_t getNumberOfDimensions(const IColumn & column);
+
+/// Returns type of scalars of Array of arbitrary dimensions.
+DataTypePtr getBaseTypeOfArray(const DataTypePtr & type);
+
+/// Returns Array type with requested scalar type and number of dimensions.
+DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions);
+
+/// Returns column of scalars of Array of arbitrary dimensions.
+ColumnPtr getBaseColumnOfArray(const ColumnPtr & column);
+
+/// Returns empty Array column with requested scalar column and number of dimensions.
+ColumnPtr createArrayOfColumn(const ColumnPtr & column, size_t num_dimensions);
+
+/// Returns Array with requested number of dimensions and no scalars.
+Array createEmptyArrayField(size_t num_dimensions);
+
+/// Tries to get data type by column. Only a limited subset of types is supported.
+DataTypePtr getDataTypeByColumn(const IColumn & column);
+
+/// Converts Object types and columns to Tuples in @columns_list and @block
+/// and checks that types are consistent with types in @storage_snapshot.
+void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & storage_snapshot);
+
+/// Checks that each path is not the prefix of any other path.
+void checkObjectHasNoAmbiguousPaths(const PathsInData & paths);
+
+/// Receives several Tuple types and deduces the least common type among them.
+DataTypePtr getLeastCommonTypeForDynamicColumns(
+    const DataTypePtr & type_in_storage, const DataTypes & types, bool check_ambiguos_paths = false);
+
+DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage);
+
+/// Converts types of object columns to tuples in @columns_list
+/// according to @object_columns and adds all tuples' subcolumns if needed.
+void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns);
+
+/// Checks whether @columns contain any column with dynamic subcolumns.
+bool hasDynamicSubcolumns(const ColumnsDescription & columns);
+
+/// Updates types of objects in @object_columns in place
+/// according to types in @new_columns.
+void updateObjectColumns(
+    ColumnsDescription & object_columns,
+    const ColumnsDescription & storage_columns,
+    const NamesAndTypesList & new_columns);
+
+using DataTypeTuplePtr = std::shared_ptr<DataTypeTuple>;
+
+/// Flattens nested Tuple to plain Tuple. I.e. extracts all paths and types from tuple.
+/// E.g. Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64) -> Tuple(t.c1 UInt32, t.c2 String, c3 UInt64)
+std::pair<PathsInData, DataTypes> flattenTuple(const DataTypePtr & type);
+
+/// Flattens nested Tuple column to plain Tuple column.
+ColumnPtr flattenTuple(const ColumnPtr & column);
+
+/// The reverse operation to 'flattenTuple'.
+/// Creates nested Tuple from all paths and types.
+/// E.g. Tuple(t.c1 UInt32, t.c2 String, c3 UInt64) -> Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64)
+DataTypePtr unflattenTuple(
+    const PathsInData & paths,
+    const DataTypes & tuple_types);
+
+std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObject & column);
+
+std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
+    const PathsInData & paths,
+    const DataTypes & tuple_types,
+    const Columns & tuple_columns);
+
+
+/// For all columns that exist in @expected_columns but not in @available_columns,
+/// adds to the WITH clause an alias that maps the column name
+/// to a literal with the default value of the column type.
+void replaceMissedSubcolumnsByConstants(
+    const ColumnsDescription & expected_columns,
+    const ColumnsDescription & available_columns,
+    ASTPtr query);
+
+void limitObjectSubcolumns(const ColumnsDescription & object_columns, const UInt64 & subcolumns_threshold);
+
+UInt64 getColumnsCommitTimeForJSONTable(const IStorage & table, const NamesAndTypesList & search_part_columns);
+
+/// Visitor that keeps @num_dimensions_to_keep dimensions in arrays
+/// and replaces all scalars or nested arrays with @replacement at that level.
+class FieldVisitorReplaceScalars : public StaticVisitor<Field>
+{
+public:
+    FieldVisitorReplaceScalars(const Field & replacement_, size_t num_dimensions_to_keep_)
+        : replacement(replacement_), num_dimensions_to_keep(num_dimensions_to_keep_)
+    {
+    }
+
+    Field operator()(const Array & x) const;
+
+    template <typename T>
+    Field operator()(const T &) const { return replacement; }
+
+private:
+    const Field & replacement;
+    size_t num_dimensions_to_keep;
+};
+
+/// Calculates number of dimensions in array field.
+/// Returns 0 for scalar fields.
+class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t>
+{
+public:
+    size_t operator()(const Array & x);
+
+    template <typename T>
+    size_t operator()(const T &) const { return 0; }
+
+    bool need_fold_dimension = false;
+};
+
+/// Folds a field (except Null) to a higher dimension, e.g. `1` -- fold 2 --> `[[1]]`.
+/// Used to normalize the dimensions of elements in an array, e.g. [1, [2]] --> [[1], [2]].
+class FieldVisitorFoldDimension : public StaticVisitor<Field>
+{
+public:
+    explicit FieldVisitorFoldDimension(size_t num_dimensions_to_fold_) : num_dimensions_to_fold(num_dimensions_to_fold_) { }
+
+    Field operator()(const Array & x) const;
+
+    Field operator()(const Null & x) const { return x; }
+
+    template <typename T>
+    Field operator()(const T & x) const
+    {
+        if (num_dimensions_to_fold == 0)
+            return x;
+
+        Array res(1, x);
+        for (size_t i = 1; i < num_dimensions_to_fold; ++i)
+        {
+            Array new_res;
+            new_res.push_back(std::move(res));
+            res = std::move(new_res);
+        }
+
+        return res;
+    }
+
+private:
+    size_t num_dimensions_to_fold;
+};
+
+void setAllObjectsToDummyTupleType(NamesAndTypesList & columns);
+
+/// Receives a range of objects, each of which contains a collection
+/// of column-like objects (e.g. ColumnsDescription or NamesAndTypesList),
+/// and deduces the common types of object columns for all entries.
+/// @entry_columns_getter should extract a reference to the collection of
+/// column-like objects from the entry to which Iterator points.
+/// A column-like object should have the fields "name" and "type".
+template <typename Iterator, typename EntryColumnsGetter>
+ColumnsDescription getConcreteObjectColumns(
+    Iterator begin, Iterator end,
+    const ColumnsDescription & storage_columns,
+    EntryColumnsGetter && entry_columns_getter)
+{
+    std::unordered_map<String, DataTypes> types_in_entries;
+
+    /// Add a dummy column for every Object column, so that no column is lost
+    /// if it's missing in all entries. If it exists in any entry,
+    /// the dummy column will be removed.
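+    /// E.g. if the storage declares `obj Object('json')` but some entries lack
+    /// `obj`, the dummy entry added here still yields a valid deduced type.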
+ for (const auto & column : storage_columns) + { + if (column.type->hasDynamicSubcolumns()) + types_in_entries[column.name].push_back(createConcreteEmptyDynamicColumn(column.type)); + } + + for (auto it = begin; it != end; ++it) + { + const auto & entry_columns = entry_columns_getter(*it); + for (const auto & column : entry_columns) + { + auto storage_column = storage_columns.tryGetPhysical(column.name); + if (storage_column && storage_column->type->hasDynamicSubcolumns()) + types_in_entries[column.name].push_back(column.type); + } + } + + ColumnsDescription res; + for (const auto & [name, types] : types_in_entries) + { + auto storage_column = storage_columns.getPhysical(name); + res.add({name, getLeastCommonTypeForDynamicColumns(storage_column.type, types)}); + } + + return res; +} + +} diff --git a/src/DataTypes/Serializations/CMakeLists.txt b/src/DataTypes/Serializations/CMakeLists.txt new file mode 100644 index 0000000000..7d8afb1e04 --- /dev/null +++ b/src/DataTypes/Serializations/CMakeLists.txt @@ -0,0 +1,3 @@ +if (ENABLE_TESTS) + add_subdirectory (tests) +endif () \ No newline at end of file diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 7db64b786f..b407196645 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -25,6 +25,7 @@ #include #include #include +#include namespace DB @@ -37,36 +38,42 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +ISerialization::Kind ISerialization::getKind(const IColumn & column) +{ + if (column.isSparse()) + return Kind::SPARSE; + + return Kind::DEFAULT; +} + +String ISerialization::kindToString(Kind kind) +{ + switch (kind) + { + case Kind::DEFAULT: + return "Default"; + case Kind::SPARSE: + return "Sparse"; + } + UNREACHABLE(); +} + +ISerialization::Kind ISerialization::stringToKind(const String & str) +{ + if (str == "Default") + return Kind::DEFAULT; + else if (str == "Sparse") + return Kind::SPARSE; + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown serialization kind '{}'", str); +} + String ISerialization::Substream::toString() const { - switch (type) - { - case ArrayElements: - return "ArrayElements"; - case ArraySizes: - return "ArraySizes"; - case NullableElements: - return "NullableElements"; - case NullMap: - return "NullMap"; - case TupleElement: - return "TupleElement(" + tuple_element_name + ", " - + std::to_string(escape_tuple_delimiter) + ")"; - case DictionaryKeys: - return "DictionaryKeys"; - case DictionaryIndexes: - return "DictionaryIndexes"; - case SparseElements: - return "SparseElements"; - case SparseOffsets: - return "SparseOffsets"; - case StringElements: - return "StringElements"; - case StringOffsets: - return "StringOffsets"; - } + if (type == TupleElement) + return fmt::format("TupleElement({}, escape_tuple_delimiter = {})", tuple_element_name, escape_tuple_delimiter ? 
"true" : "false"); - __builtin_unreachable(); + return String(magic_enum::enum_name(type)); } String ISerialization::SubstreamPath::toString() const @@ -83,9 +90,37 @@ String ISerialization::SubstreamPath::toString() const return wb.str(); } -void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +void ISerialization::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const { - callback(path); + settings.path.push_back(Substream::Regular); + settings.path.back().data = data; + callback(settings.path); + settings.path.pop_back(); +} + +void ISerialization::enumerateStreams( + const StreamCallback & callback, + const DataTypePtr & type, + const ColumnPtr & column) const +{ + EnumerateStreamsSettings settings; + auto data = SubstreamData(getPtr()).withType(type).withColumn(column); + enumerateStreams(settings, callback, data); +} + +void ISerialization::enumerateStreams( + const StreamCallback & callback, + const SubstreamPath & path, + const DataTypePtr & type, + const ColumnPtr & column) const +{ + EnumerateStreamsSettings settings; + settings.path = path; + auto data = SubstreamData(getPtr()).withType(type).withColumn(column); + enumerateStreams(settings, callback, data); } void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const @@ -130,6 +165,8 @@ void ISerialization::deserializeBinaryBulkWithMultipleStreams( } } +using SubstreamIterator = ISerialization::SubstreamPath::const_iterator; + size_t ISerialization::skipBinaryBulkWithMultipleStreams( const NameAndTypePair & name_and_type, size_t limit, @@ -144,39 +181,42 @@ size_t ISerialization::skipBinaryBulkWithMultipleStreams( static String getNameForSubstreamPath( String stream_name, - const ISerialization::SubstreamPath & path, + SubstreamIterator begin, + SubstreamIterator end, bool escape_tuple_delimiter) { using Substream = ISerialization::Substream; size_t array_level = 0; size_t null_level = 0; - for (const auto & elem : path) + for (auto it = begin; it != end; ++it) { - if (elem.type == Substream::NullMap) + if (it->type == Substream::NullMap) stream_name += ".null" + (null_level > 0 ? toString(null_level): ""); - else if (elem.type == Substream::ArraySizes) + else if (it->type == Substream::ArraySizes) stream_name += ".size" + toString(array_level); - else if (elem.type == Substream::ArrayElements) + else if (it->type == Substream::ArrayElements) ++array_level; - else if (elem.type == Substream::NullableElements) + else if (it->type == Substream::NullableElements) ++null_level; - else if (elem.type == Substream::DictionaryKeys) + else if (it->type == Substream::DictionaryKeys) stream_name += ".dict"; - else if (elem.type == Substream::SparseOffsets) + else if (it->type == Substream::SparseOffsets) stream_name += ".sparse.idx"; - else if (elem.type == Substream::TupleElement) + else if (it->type == Substream::TupleElement) { /// For compatibility reasons, we use %2E (escaped dot) instead of dot. /// Because nested data may be represented not by Array of Tuple, /// but by separate Array columns with names in a form of a.b, /// and name is encoded as a whole. - stream_name += (escape_tuple_delimiter && elem.escape_tuple_delimiter ? - escapeForFileName(".") : ".") + escapeForFileName(elem.tuple_element_name); + if (escape_tuple_delimiter && it->escape_tuple_delimiter) + stream_name += escapeForFileName("." + it->tuple_element_name); + else + stream_name += "." 
+ it->tuple_element_name; } - else if (elem.type == Substream::StringElements) + else if (it->type == Substream::StringElements) stream_name += ".str_elements"; - else if (elem.type == Substream::StringOffsets) + else if (it->type == Substream::StringOffsets) stream_name += ".str_offsets"; } @@ -197,12 +237,17 @@ String ISerialization::getFileNameForStream(const String & name_in_storage, cons else stream_name = escapeForFileName(name_in_storage); - return getNameForSubstreamPath(std::move(stream_name), path, true); + return getNameForSubstreamPath(std::move(stream_name), path.begin(), path.end(), true); } String ISerialization::getSubcolumnNameForStream(const SubstreamPath & path) { - auto subcolumn_name = getNameForSubstreamPath("", path, false); + return getSubcolumnNameForStream(path, path.size()); +} + +String ISerialization::getSubcolumnNameForStream(const SubstreamPath & path, size_t prefix_len) +{ + auto subcolumn_name = getNameForSubstreamPath("", path.begin(), path.begin() + prefix_len, false); if (!subcolumn_name.empty()) subcolumn_name = subcolumn_name.substr(1); // It starts with a dot. @@ -250,6 +295,47 @@ void ISerialization::deserializeMemComparable(IColumn &, ReadBuffer &) const throw Exception("Serialization type doesn't support mem-comparable encoding", ErrorCodes::NOT_IMPLEMENTED); } +size_t ISerialization::getArrayLevel(const SubstreamPath & path) +{ + size_t level = 0; + for (const auto & elem : path) + level += elem.type == Substream::ArrayElements; + return level; +} + +bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len) +{ + if (prefix_len == 0 || prefix_len > path.size()) + return false; + + size_t last_elem = prefix_len - 1; + return path[last_elem].type == Substream::NullMap + || path[last_elem].type == Substream::TupleElement + || path[last_elem].type == Substream::ArraySizes; +} + +ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len) +{ + assert(prefix_len <= path.size()); + if (prefix_len == 0) + return {}; + + ssize_t last_elem = prefix_len - 1; + auto res = path[last_elem].data; + for (ssize_t i = last_elem - 1; i >= 0; --i) + { + const auto & creator = path[i].creator; + if (creator) + { + res.type = res.type ? creator->create(res.type) : res.type; + res.serialization = res.serialization ? creator->create(res.serialization) : res.serialization; + res.column = res.column ? 
creator->create(res.column) : res.column; + } + } + + return res; +} + void ISerialization::throwUnexpectedDataAfterParsedValue(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & type_name) const { WriteBufferFromOwnString ostr; diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 85963d4a38..b11182995a 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -23,7 +23,9 @@ #include #include +#include +#include #include #include @@ -41,6 +43,15 @@ class IColumn; using ColumnPtr = COW::Ptr; using MutableColumnPtr = COW::MutablePtr; +class IDataType; +using DataTypePtr = std::shared_ptr; + +class ISerialization; +using SerializationPtr = std::shared_ptr; + +class SerializationInfo; +using SerializationInfoPtr = std::shared_ptr; + class Field; struct FormatSettings; @@ -48,12 +59,24 @@ struct NameAndTypePair; class CompressedDataIndex; -class ISerialization +class ISerialization : private boost::noncopyable, public std::enable_shared_from_this { public: ISerialization() = default; virtual ~ISerialization() = default; + enum class Kind : UInt8 + { + DEFAULT = 0, + SPARSE = 1, + }; + + SerializationPtr getPtr() const { return shared_from_this(); } + + static Kind getKind(const IColumn & column); + static String kindToString(Kind kind); + static Kind stringToKind(const String & str); + /** Binary serialization for range of values in column - for writing to disk/network, etc. * * Some data types are represented in multiple streams while being serialized. @@ -76,6 +99,47 @@ public: * * Default implementations of ...WithMultipleStreams methods will call serializeBinaryBulk, deserializeBinaryBulk for single stream. */ + struct ISubcolumnCreator + { + virtual DataTypePtr create(const DataTypePtr & prev) const = 0; + virtual SerializationPtr create(const SerializationPtr & prev) const = 0; + virtual ColumnPtr create(const ColumnPtr & prev) const = 0; + virtual ~ISubcolumnCreator() = default; + }; + + using SubcolumnCreatorPtr = std::shared_ptr; + + struct SubstreamData + { + SubstreamData() = default; + SubstreamData(SerializationPtr serialization_) + : serialization(std::move(serialization_)) + { + } + + SubstreamData & withType(DataTypePtr type_) + { + type = std::move(type_); + return *this; + } + + SubstreamData & withColumn(ColumnPtr column_) + { + column = std::move(column_); + return *this; + } + + SubstreamData & withSerializationInfo(SerializationInfoPtr serialization_info_) + { + serialization_info = std::move(serialization_info_); + return *this; + } + + SerializationPtr serialization; + DataTypePtr type; + ColumnPtr column; + SerializationInfoPtr serialization_info; + }; struct Substream { @@ -95,6 +159,10 @@ public: SparseElements, SparseOffsets, + ObjectStructure, + ObjectData, + + Regular, StringElements, StringOffsets, }; @@ -103,9 +171,21 @@ public: /// Index of tuple element, starting at 1 or name. String tuple_element_name; + /// Name of subcolumn of object column. + String object_key_name; + /// Do we need to escape a dot in filenames for tuple elements. bool escape_tuple_delimiter = true; + /// Data for current substream. + SubstreamData data; + + /// Creator of subcolumn for current substream. + SubcolumnCreatorPtr creator = nullptr; + + /// Flag, that may help to traverse substream paths. 
+ mutable bool visited = false; + Substream(Type type_) : type(type_) {} String toString() const; @@ -122,9 +202,28 @@ public: using StreamCallback = std::function; - virtual void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const; - void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); } - void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); } + struct EnumerateStreamsSettings + { + SubstreamPath path; + bool position_independent_encoding = true; + }; + + virtual void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const; + + /// Enumerate streams with default settings. + void enumerateStreams( + const StreamCallback & callback, + const DataTypePtr & type = nullptr, + const ColumnPtr & column = nullptr) const; + + void enumerateStreams( + const StreamCallback & callback, + const SubstreamPath & path, + const DataTypePtr & type = nullptr, + const ColumnPtr & column = nullptr) const; using OutputStreamGetter = std::function; using InputStreamGetter = std::function; @@ -174,7 +273,9 @@ public: }; /// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark. + /// Column may be used only to retrieve the structure. virtual void serializeBinaryBulkStatePrefix( + const IColumn & /*column*/, SerializeBinaryBulkSettings & /*settings*/, SerializeBinaryBulkStatePtr & /*state*/) const {} @@ -300,17 +401,45 @@ public: static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path); static String getFileNameForStream(const String & name_in_storage, const SubstreamPath & path); static String getSubcolumnNameForStream(const SubstreamPath & path); + static String getSubcolumnNameForStream(const SubstreamPath & path, size_t prefix_len); static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column); static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path); static bool isSpecialCompressionAllowed(const SubstreamPath & path); + + static size_t getArrayLevel(const SubstreamPath & path); + static bool hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len); + static SubstreamData createFromPath(const SubstreamPath & path, size_t prefix_len); protected: + template + State * checkAndGetState(const StatePtr & state) const; [[noreturn]] void throwUnexpectedDataAfterParsedValue(IColumn & column, ReadBuffer & istr, const FormatSettings &, const String & type_name) const; }; using SerializationPtr = std::shared_ptr; using Serializations = std::vector; +template +State * ISerialization::checkAndGetState(const StatePtr & state) const +{ + if (!state) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Got empty state for {}", demangle(typeid(*this).name())); + + auto * state_concrete = typeid_cast(state.get()); + if (!state_concrete) + { + auto & state_ref = *state; + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Invalid State for {}. 
Expected: {}, got {}", + demangle(typeid(*this).name()), + demangle(typeid(State).name()), + demangle(typeid(state_ref).name())); + } + + return state_concrete; +} + } diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index 70a72c51e7..b06561ea75 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -8,6 +9,8 @@ #include #include +#include +#include namespace DB { @@ -177,23 +180,68 @@ ColumnPtr arrayOffsetsToSizes(const IColumn & column) return column_sizes; } - -void SerializationArray::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +DataTypePtr SerializationArray::SubcolumnCreator::create(const DataTypePtr & prev) const { - path.push_back(Substream::ArraySizes); - callback(path); - path.back() = Substream::ArrayElements; - nested->enumerateStreams(callback, path); - path.pop_back(); + return std::make_shared(prev); } +SerializationPtr SerializationArray::SubcolumnCreator::create(const SerializationPtr & prev) const +{ + return std::make_shared(prev); +} + +ColumnPtr SerializationArray::SubcolumnCreator::create(const ColumnPtr & prev) const +{ + return ColumnArray::create(prev, offsets); +} + +void SerializationArray::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_array = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_array = data.column ? &assert_cast(*data.column) : nullptr; + auto offsets = column_array ? column_array->getOffsetsPtr() : nullptr; + + auto offsets_serialization = + std::make_shared( + std::make_shared>(), + "size" + std::to_string(getArrayLevel(settings.path)), false); + + auto offsets_column = offsets && !settings.position_independent_encoding + ? arrayOffsetsToSizes(*offsets) + : offsets; + + settings.path.push_back(Substream::ArraySizes); + settings.path.back().data = SubstreamData(offsets_serialization) + .withType(type_array ? std::make_shared() : nullptr) + .withColumn(std::move(offsets_column)) + .withSerializationInfo(data.serialization_info); + + callback(settings.path); + + settings.path.back() = Substream::ArrayElements; + settings.path.back().data = data; + settings.path.back().creator = std::make_shared(offsets); + + auto next_data = SubstreamData(nested) + .withType(type_array ? type_array->getNestedType() : nullptr) + .withColumn(column_array ? 
column_array->getDataPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); +} void SerializationArray::serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { settings.path.push_back(Substream::ArrayElements); - nested->serializeBinaryBulkStatePrefix(settings, state); + const auto & column_array = assert_cast(column); + nested->serializeBinaryBulkStatePrefix(column_array.getData(), settings, state); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index 71037090a4..cdcbaadeb2 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -35,9 +35,13 @@ public: * This is necessary, because when implementing nested structures, several arrays can have common sizes. */ - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; @@ -62,6 +66,18 @@ public: DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const override; + +private: + struct SubcolumnCreator : public ISubcolumnCreator + { + const ColumnPtr offsets; + + explicit SubcolumnCreator(const ColumnPtr & offsets_) : offsets(offsets_) {} + + DataTypePtr create(const DataTypePtr & prev) const override; + SerializationPtr create(const SerializationPtr & prev) const override; + ColumnPtr create(const ColumnPtr & prev) const override; + }; }; ColumnPtr arrayOffsetsToSizes(const IColumn & column); diff --git a/src/DataTypes/Serializations/SerializationBigString.cpp b/src/DataTypes/Serializations/SerializationBigString.cpp index 881e49f996..b956df93d1 100644 --- a/src/DataTypes/Serializations/SerializationBigString.cpp +++ b/src/DataTypes/Serializations/SerializationBigString.cpp @@ -131,14 +131,14 @@ DeserializeBinaryBulkStateBigString * checkAndGetBigStringDeserializeState( } -void SerializationBigString::enumerateStreams(const StreamCallback & callback, - SubstreamPath & path) const +void SerializationBigString::enumerateStreams( + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & /*data*/) const { - path.push_back(Substream::StringElements); - callback(path); - path.back() = Substream::StringOffsets; - callback(path); - path.pop_back(); + settings.path.push_back(Substream::StringElements); + callback(settings.path); + settings.path.back() = Substream::StringOffsets; + callback(settings.path); + settings.path.pop_back(); } void SerializationBigString::serializeBinaryBulkWithMultipleStreams( diff --git a/src/DataTypes/Serializations/SerializationBigString.h b/src/DataTypes/Serializations/SerializationBigString.h index 07f86b84ae..e1be08c505 100644 --- a/src/DataTypes/Serializations/SerializationBigString.h +++ b/src/DataTypes/Serializations/SerializationBigString.h @@ -11,7 +11,7 @@ namespace DB class SerializationBigString final : public ISerialization { public: - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + void 
enumerateStreams(EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; void serializeBinaryBulkWithMultipleStreams(const IColumn & column, size_t offset, size_t limit, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; diff --git a/src/DataTypes/Serializations/SerializationInfo.cpp b/src/DataTypes/Serializations/SerializationInfo.cpp new file mode 100644 index 0000000000..9513e94bf9 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationInfo.cpp @@ -0,0 +1,283 @@ +#include +// #include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CORRUPTED_DATA; +} + +namespace +{ + +constexpr auto KEY_VERSION = "version"; +constexpr auto KEY_NUM_ROWS = "num_rows"; +constexpr auto KEY_COLUMNS = "columns"; +constexpr auto KEY_NUM_DEFAULTS = "num_defaults"; +constexpr auto KEY_KIND = "kind"; +constexpr auto KEY_NAME = "name"; + +} + +void SerializationInfo::Data::add(const IColumn & /*column*/) +{ + // size_t rows = column.size(); + // double ratio = column.getRatioOfDefaultRows(ColumnSparse::DEFAULT_ROWS_SEARCH_SAMPLE_RATIO); + + // num_rows += rows; + // num_defaults += static_cast(ratio * rows); +} + +void SerializationInfo::Data::add(const Data & other) +{ + num_rows += other.num_rows; + num_defaults += other.num_defaults; +} + +void SerializationInfo::Data::addDefaults(size_t length) +{ + num_rows += length; + num_defaults += length; +} + +SerializationInfo::SerializationInfo(ISerialization::Kind kind_, const Settings & settings_) + : settings(settings_) + , kind(kind_) +{ +} + +SerializationInfo::SerializationInfo(ISerialization::Kind kind_, const Settings & settings_, const Data & data_) + : settings(settings_) + , kind(kind_) + , data(data_) +{ +} + +void SerializationInfo::add(const IColumn & column) +{ + data.add(column); + if (settings.choose_kind) + kind = chooseKind(data, settings); +} + +void SerializationInfo::add(const SerializationInfo & other) +{ + data.add(other.data); + if (settings.choose_kind) + kind = chooseKind(data, settings); +} + +void SerializationInfo::addDefaults(size_t length) +{ + data.addDefaults(length); + if (settings.choose_kind) + kind = chooseKind(data, settings); +} + +void SerializationInfo::replaceData(const SerializationInfo & other) +{ + data = other.data; +} + +MutableSerializationInfoPtr SerializationInfo::clone() const +{ + return std::make_shared(kind, settings, data); +} + +/// Returns true if all rows with default values of type 'lhs' +/// are mapped to default values of type 'rhs' after conversion. 
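+/// For example (illustrative): Int32 -> Int64 preserves defaults (0 maps to 0),
+/// as does String -> FixedString (an empty string maps to zero bytes), while
+/// Int32 -> String does not (0 becomes "0", but the default String is empty).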
+[[maybe_unused]]static bool preserveDefaultsAfterConversion(const IDataType & lhs, const IDataType & rhs) +{ + if (lhs.equals(rhs)) + return true; + + bool lhs_is_columned_as_numeric = isColumnedAsNumber(lhs) || isColumnedAsDecimal(lhs); + bool rhs_is_columned_as_numeric = isColumnedAsNumber(rhs) || isColumnedAsDecimal(rhs); + + if (lhs_is_columned_as_numeric && rhs_is_columned_as_numeric) + return true; + + if (isStringOrFixedString(lhs) && isStringOrFixedString(rhs)) + return true; + + return false; +} + +std::shared_ptr SerializationInfo::createWithType( + const IDataType & /*old_type*/, + const IDataType & /*new_type*/, + const Settings & new_settings) const +{ + auto new_kind = kind; + // if (new_kind == ISerialization::Kind::SPARSE) + // { + // if (!new_type.supportsSparseSerialization() + // || !preserveDefaultsAfterConversion(old_type, new_type)) + // new_kind = ISerialization::Kind::DEFAULT; + // } + + return std::make_shared(new_kind, new_settings); +} + +void SerializationInfo::serialializeKindBinary(WriteBuffer & out) const +{ + writeBinary(static_cast(kind), out); +} + +void SerializationInfo::deserializeFromKindsBinary(ReadBuffer & in) +{ + UInt8 kind_num; + readBinary(kind_num, in); + auto maybe_kind = magic_enum::enum_cast(kind_num); + if (!maybe_kind) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Unknown serialization kind {}", std::to_string(kind_num)); + + kind = *maybe_kind; +} + +Poco::JSON::Object SerializationInfo::toJSON() const +{ + Poco::JSON::Object object; + object.set(KEY_KIND, ISerialization::kindToString(kind)); + object.set(KEY_NUM_DEFAULTS, data.num_defaults); + object.set(KEY_NUM_ROWS, data.num_rows); + return object; +} + +void SerializationInfo::fromJSON(const Poco::JSON::Object & object) +{ + if (!object.has(KEY_KIND) || !object.has(KEY_NUM_DEFAULTS) || !object.has(KEY_NUM_ROWS)) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Missed field '{}' or '{}' or '{}' in SerializationInfo of columns", + KEY_KIND, KEY_NUM_DEFAULTS, KEY_NUM_ROWS); + + data.num_rows = object.getValue(KEY_NUM_ROWS); + data.num_defaults = object.getValue(KEY_NUM_DEFAULTS); + kind = ISerialization::stringToKind(object.getValue(KEY_KIND)); +} + +ISerialization::Kind SerializationInfo::chooseKind(const Data & data, const Settings & settings) +{ + double ratio = data.num_rows ? std::min(static_cast(data.num_defaults) / data.num_rows, 1.0) : 0.0; + return ratio > settings.ratio_of_defaults_for_sparse ? 
ISerialization::Kind::SPARSE : ISerialization::Kind::DEFAULT; +} + +SerializationInfoByName::SerializationInfoByName( + const NamesAndTypesList & /*columns*/, + const SerializationInfo::Settings & /*settings*/) +{ + // if (settings.isAlwaysDefault()) + // return; + + // for (const auto & column : columns) + // if (column.type->supportsSparseSerialization()) + // emplace(column.name, column.type->createSerializationInfo(settings)); +} + +void SerializationInfoByName::add(const Block & block) +{ + for (const auto & column : block) + { + auto it = find(column.name); + if (it == end()) + continue; + + it->second->add(*column.column); + } +} + +void SerializationInfoByName::add(const SerializationInfoByName & other) +{ + for (const auto & [name, info] : other) + { + auto it = find(name); + if (it == end()) + continue; + + it->second->add(*info); + } +} + +void SerializationInfoByName::replaceData(const SerializationInfoByName & other) +{ + for (const auto & [name, new_info] : other) + { + auto & old_info = (*this)[name]; + + if (old_info) + old_info->replaceData(*new_info); + else + old_info = new_info->clone(); + } +} + +void SerializationInfoByName::writeJSON(WriteBuffer & out) const +{ + Poco::JSON::Object object; + object.set(KEY_VERSION, SERIALIZATION_INFO_VERSION); + + Poco::JSON::Array column_infos; + for (const auto & [name, info] : *this) + { + auto info_json = info->toJSON(); + info_json.set(KEY_NAME, name); + column_infos.add(std::move(info_json)); /// NOLINT + } + + object.set(KEY_COLUMNS, std::move(column_infos)); /// NOLINT + + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(object, oss); + + return writeString(oss.str(), out); +} + +void SerializationInfoByName::readJSON(ReadBuffer & in) +{ + String json_str; + readString(json_str, in); + + Poco::JSON::Parser parser; + auto object = parser.parse(json_str).extract(); + + if (!object->has(KEY_VERSION)) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Missed version of serialization infos"); + + if (object->getValue(KEY_VERSION) > SERIALIZATION_INFO_VERSION) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Unknown version of serialization infos ({}). Should be less or equal than {}", + object->getValue(KEY_VERSION), SERIALIZATION_INFO_VERSION); + + if (object->has(KEY_COLUMNS)) + { + auto array = object->getArray(KEY_COLUMNS); + for (const auto & elem : *array) + { + auto elem_object = elem.extract(); + + if (!elem_object->has(KEY_NAME)) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Missed field '{}' in SerializationInfo of columns", KEY_NAME); + + auto name = elem_object->getValue(KEY_NAME); + if (auto it = find(name); it != end()) + it->second->fromJSON(*elem_object); + } + } +} + +} diff --git a/src/DataTypes/Serializations/SerializationInfo.h b/src/DataTypes/Serializations/SerializationInfo.h new file mode 100644 index 0000000000..5b802b379e --- /dev/null +++ b/src/DataTypes/Serializations/SerializationInfo.h @@ -0,0 +1,114 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class ReadBuffer; +class ReadBuffer; +class WriteBuffer; +class NamesAndTypesList; +class Block; + +constexpr auto SERIALIZATION_INFO_VERSION = 0; + +/** Contains information about kind of serialization of column and its subcolumns. + * Also contains information about content of columns, + * that helps to choose kind of serialization of column. 
+ * + * Currently has only information about number of default rows, + * that helps to choose sparse serialization. + * + * Should be extended, when new kinds of serialization will be implemented. + */ +class SerializationInfo +{ +public: + struct Data + { + size_t num_rows = 0; + size_t num_defaults = 0; + + void add(const IColumn & column); + void add(const Data & other); + void addDefaults(size_t length); + }; + + struct Settings + { + const double ratio_of_defaults_for_sparse = 1.0; + const bool choose_kind = false; + + bool isAlwaysDefault() const { return ratio_of_defaults_for_sparse >= 1.0; } + }; + + SerializationInfo(ISerialization::Kind kind_, const Settings & settings_); + SerializationInfo(ISerialization::Kind kind_, const Settings & settings_, const Data & data_); + + virtual ~SerializationInfo() = default; + + virtual bool hasCustomSerialization() const { return kind != ISerialization::Kind::DEFAULT; } + virtual bool structureEquals(const SerializationInfo & rhs) const { return typeid(SerializationInfo) == typeid(rhs); } + + virtual void add(const IColumn & column); + virtual void add(const SerializationInfo & other); + virtual void addDefaults(size_t length); + virtual void replaceData(const SerializationInfo & other); + + virtual std::shared_ptr clone() const; + + virtual std::shared_ptr createWithType( + const IDataType & old_type, + const IDataType & new_type, + const Settings & new_settings) const; + + virtual void serialializeKindBinary(WriteBuffer & out) const; + virtual void deserializeFromKindsBinary(ReadBuffer & in); + + virtual Poco::JSON::Object toJSON() const; + virtual void fromJSON(const Poco::JSON::Object & object); + + void setKind(ISerialization::Kind kind_) { kind = kind_; } + const Settings & getSettings() const { return settings; } + const Data & getData() const { return data; } + ISerialization::Kind getKind() const { return kind; } + + static ISerialization::Kind chooseKind(const Data & data, const Settings & settings); + +protected: + const Settings settings; + + ISerialization::Kind kind; + Data data; +}; + +using SerializationInfoPtr = std::shared_ptr; +using MutableSerializationInfoPtr = std::shared_ptr; + +using SerializationInfos = std::vector; +using MutableSerializationInfos = std::vector; + +/// The order is important because info is serialized to part metadata. +class SerializationInfoByName : public std::map +{ +public: + SerializationInfoByName() = default; + SerializationInfoByName(const NamesAndTypesList & columns, const SerializationInfo::Settings & settings); + + void add(const Block & block); + void add(const SerializationInfoByName & other); + + /// Takes data from @other, but keeps current serialization kinds. + /// If column exists in @other infos, but not in current infos, + /// it's cloned to current infos. 
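+    /// For example (illustrative): a caller that has already fixed the
+    /// serialization kinds can refresh the row/default counters from @other
+    /// without re-choosing kinds.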
+    void replaceData(const SerializationInfoByName & other);
+
+    void writeJSON(WriteBuffer & out) const;
+    void readJSON(ReadBuffer & in);
+};
+
+}
diff --git a/src/DataTypes/Serializations/SerializationInfoTuple.cpp b/src/DataTypes/Serializations/SerializationInfoTuple.cpp
new file mode 100644
index 0000000000..d36668f03b
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationInfoTuple.cpp
@@ -0,0 +1,165 @@
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CORRUPTED_DATA;
+    extern const int THERE_IS_NO_COLUMN;
+}
+
+SerializationInfoTuple::SerializationInfoTuple(
+    MutableSerializationInfos elems_, Names names_, const Settings & settings_)
+    : SerializationInfo(ISerialization::Kind::DEFAULT, settings_)
+    , elems(std::move(elems_))
+    , names(std::move(names_))
+{
+    assert(names.size() == elems.size());
+    for (size_t i = 0; i < names.size(); ++i)
+        name_to_elem[names[i]] = elems[i];
+}
+
+bool SerializationInfoTuple::hasCustomSerialization() const
+{
+    return std::any_of(elems.begin(), elems.end(), [](const auto & elem) { return elem->hasCustomSerialization(); });
+}
+
+bool SerializationInfoTuple::structureEquals(const SerializationInfo & rhs) const
+{
+    const auto * rhs_tuple = typeid_cast(&rhs);
+    if (!rhs_tuple || elems.size() != rhs_tuple->elems.size())
+        return false;
+
+    for (size_t i = 0; i < elems.size(); ++i)
+        if (!elems[i]->structureEquals(*rhs_tuple->elems[i]))
+            return false;
+
+    return true;
+}
+
+void SerializationInfoTuple::add(const IColumn & column)
+{
+    SerializationInfo::add(column);
+
+    const auto & column_tuple = assert_cast(column);
+    const auto & right_elems = column_tuple.getColumns();
+    assert(elems.size() == right_elems.size());
+
+    for (size_t i = 0; i < elems.size(); ++i)
+        elems[i]->add(*right_elems[i]);
+}
+
+void SerializationInfoTuple::add(const SerializationInfo & other)
+{
+    SerializationInfo::add(other);
+
+    const auto & other_info = assert_cast(other);
+    for (const auto & [name, elem] : name_to_elem)
+    {
+        auto it = other_info.name_to_elem.find(name);
+        if (it != other_info.name_to_elem.end())
+            elem->add(*it->second);
+        else
+            elem->addDefaults(other_info.getData().num_rows);
+    }
+}
+
+void SerializationInfoTuple::addDefaults(size_t length)
+{
+    for (const auto & elem : elems)
+        elem->addDefaults(length);
+}
+
+void SerializationInfoTuple::replaceData(const SerializationInfo & other)
+{
+    SerializationInfo::replaceData(other);
+
+    const auto & other_info = assert_cast(other);
+    for (const auto & [name, elem] : name_to_elem)
+    {
+        auto it = other_info.name_to_elem.find(name);
+        if (it != other_info.name_to_elem.end())
+            elem->replaceData(*it->second);
+    }
+}
+
+MutableSerializationInfoPtr SerializationInfoTuple::clone() const
+{
+    MutableSerializationInfos elems_cloned;
+    elems_cloned.reserve(elems.size());
+    for (const auto & elem : elems)
+        elems_cloned.push_back(elem->clone());
+
+    return std::make_shared(std::move(elems_cloned), names, settings);
+}
+
+MutableSerializationInfoPtr SerializationInfoTuple::createWithType(
+    const IDataType & old_type,
+    const IDataType & new_type,
+    const Settings & new_settings) const
+{
+    const auto & old_tuple = assert_cast(old_type);
+    const auto & new_tuple = assert_cast(new_type);
+
+    const auto & old_elements = old_tuple.getElements();
+    const auto & new_elements = new_tuple.getElements();
+
+    assert(elems.size() == old_elements.size());
+    assert(elems.size() == new_elements.size());
+
+    MutableSerializationInfos infos;
+    infos.reserve(elems.size());
+    for
(size_t i = 0; i < elems.size(); ++i) + infos.push_back(elems[i]->createWithType(*old_elements[i], *new_elements[i], new_settings)); + + return std::make_shared(std::move(infos), names, new_settings); +} + +void SerializationInfoTuple::serialializeKindBinary(WriteBuffer & out) const +{ + SerializationInfo::serialializeKindBinary(out); + for (const auto & elem : elems) + elem->serialializeKindBinary(out); +} + +void SerializationInfoTuple::deserializeFromKindsBinary(ReadBuffer & in) +{ + SerializationInfo::deserializeFromKindsBinary(in); + for (const auto & elem : elems) + elem->deserializeFromKindsBinary(in); +} + +Poco::JSON::Object SerializationInfoTuple::toJSON() const +{ + auto object = SerializationInfo::toJSON(); + Poco::JSON::Array subcolumns; + for (const auto & elem : elems) + subcolumns.add(elem->toJSON()); + + object.set("subcolumns", subcolumns); + return object; +} + +void SerializationInfoTuple::fromJSON(const Poco::JSON::Object & object) +{ + SerializationInfo::fromJSON(object); + + if (!object.has("subcolumns")) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Missed field 'subcolumns' in SerializationInfo of columns SerializationInfoTuple"); + + auto subcolumns = object.getArray("subcolumns"); + if (elems.size() != subcolumns->size()) + throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, + "Mismatched number of subcolumns between JSON and SerializationInfoTuple." + "Expected: {}, got: {}", elems.size(), subcolumns->size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->fromJSON(*subcolumns->getObject(static_cast(i))); +} + +} diff --git a/src/DataTypes/Serializations/SerializationInfoTuple.h b/src/DataTypes/Serializations/SerializationInfoTuple.h new file mode 100644 index 0000000000..a9f3bdb6c6 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationInfoTuple.h @@ -0,0 +1,45 @@ +#pragma once +#include +#include + +namespace DB +{ + +class SerializationInfoTuple : public SerializationInfo +{ +public: + SerializationInfoTuple(MutableSerializationInfos elems_, Names names_, const Settings & settings_); + + bool hasCustomSerialization() const override; + bool structureEquals(const SerializationInfo & rhs) const override; + + void add(const IColumn & column) override; + void add(const SerializationInfo & other) override; + void addDefaults(size_t length) override; + void replaceData(const SerializationInfo & other) override; + + MutableSerializationInfoPtr clone() const override; + + MutableSerializationInfoPtr createWithType( + const IDataType & old_type, + const IDataType & new_type, + const Settings & new_settings) const override; + + void serialializeKindBinary(WriteBuffer & out) const override; + void deserializeFromKindsBinary(ReadBuffer & in) override; + + Poco::JSON::Object toJSON() const override; + void fromJSON(const Poco::JSON::Object & object) override; + + const MutableSerializationInfoPtr & getElementInfo(size_t i) const { return elems[i]; } + ISerialization::Kind getElementKind(size_t i) const { return elems[i]->getKind(); } + +private: + MutableSerializationInfos elems; + Names names; + + using NameToElem = std::unordered_map; + NameToElem name_to_elem; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 0e885533b0..e2e137798d 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -61,16 +61,27 @@ SerializationLowCardinality::SerializationLowCardinality(const 
DataTypePtr & dic { } -void SerializationLowCardinality::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +void SerializationLowCardinality::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const { - path.push_back(Substream::DictionaryKeys); - dict_inner_serialization->enumerateStreams(callback, path); - path.back() = Substream::DictionaryIndexes; - // for fall-back compatibility - dictionary_type->getDefaultSerialization()->enumerateStreams(callback, path); - // callback(path); + const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr; - path.pop_back(); + settings.path.push_back(Substream::DictionaryKeys); + auto dict_data = SubstreamData(dict_inner_serialization) + .withType(data.type ? dictionary_type : nullptr) + .withColumn(column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr) + .withSerializationInfo(data.serialization_info); + + settings.path.back().data = dict_data; + dict_inner_serialization->enumerateStreams(settings, callback, dict_data); + + settings.path.back() = Substream::DictionaryIndexes; + settings.path.back().data = data; + + callback(settings.path); + settings.path.pop_back(); } struct KeysSerializationVersion @@ -269,6 +280,7 @@ static DeserializeStateLowCardinality * checkAndGetLowCardinalityDeserializeStat } void SerializationLowCardinality::serializeBinaryBulkStatePrefix( + const IColumn & /*column*/, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { @@ -895,15 +907,20 @@ SerializationFullLowCardinality::SerializationFullLowCardinality(const DataTypeP dict_inner_serialization = dictionary_type_->getDefaultSerialization(); } -void SerializationFullLowCardinality::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +void SerializationFullLowCardinality::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const { - dict_inner_serialization->enumerateStreams(callback, path); - path.push_back(Substream::DictionaryKeys); - callback(path); - path.pop_back(); + /// TODO(fredwang) verify + dict_inner_serialization->enumerateStreams(settings, callback, data); + settings.path.push_back(Substream::DictionaryKeys); + callback(settings.path); + settings.path.pop_back(); } void SerializationFullLowCardinality::serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { @@ -918,7 +935,7 @@ void SerializationFullLowCardinality::serializeBinaryBulkStatePrefix( /// Write version and create SerializeBinaryBulkState. 
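 /// Note: the nested dictionary serialization writes its state prefix first;
 /// the version marker is then written to this stream, so readers mirror this order.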
UInt64 key_version; key_version = KeysSerializationVersion::DictionariesInFullState; - dict_inner_serialization->serializeBinaryBulkStatePrefix(settings, state); + dict_inner_serialization->serializeBinaryBulkStatePrefix(column, settings, state); writeIntBinary(key_version, *stream); diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h index f4ef071fbd..0d67c627e5 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.h +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -38,9 +38,13 @@ protected: public: SerializationLowCardinality(const DataTypePtr & dictionary_type); - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; @@ -106,10 +110,11 @@ class SerializationFullLowCardinality : public SerializationLowCardinality { public: SerializationFullLowCardinality(const DataTypePtr & dictionary_type); - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; - + + void enumerateStreams(EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index 6435e3bf0a..ba4c275ae4 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -297,17 +298,25 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c deserializeText(column, rb, settings); } - -void SerializationMap::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +void SerializationMap::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const { - nested->enumerateStreams(callback, path); + auto next_data = SubstreamData(nested) + .withType(data.type ? assert_cast(*data.type).getNestedType() : nullptr) + .withColumn(data.column ? 
assert_cast(*data.column).getNestedColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); } void SerializationMap::serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { - nested->serializeBinaryBulkStatePrefix(settings, state); + nested->serializeBinaryBulkStatePrefix(extractNestedColumn(column), settings, state); } void SerializationMap::serializeBinaryBulkStateSuffix( diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index 851a6b0add..d101ac5a3e 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -31,9 +31,13 @@ public: void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; - + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + void serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp new file mode 100644 index 0000000000..ca60948ce6 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationNamed.cpp @@ -0,0 +1,78 @@ +#include + +namespace DB +{ + +void SerializationNamed::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + addToPath(settings.path); + settings.path.back().data = data; + settings.path.back().creator = std::make_shared(name, escape_delimiter); + + nested_serialization->enumerateStreams(settings, callback, data); + settings.path.pop_back(); +} + +void SerializationNamed::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->serializeBinaryBulkStatePrefix(column, settings, state); + settings.path.pop_back(); +} + +void SerializationNamed::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->serializeBinaryBulkStateSuffix(settings, state); + settings.path.pop_back(); +} + +void SerializationNamed::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + +void SerializationNamed::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state); + settings.path.pop_back(); +} + +void SerializationNamed::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + 
DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + addToPath(settings.path); + nested_serialization->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache); + settings.path.pop_back(); +} + +void SerializationNamed::addToPath(SubstreamPath & path) const +{ + path.push_back(Substream::TupleElement); + path.back().tuple_element_name = name; + path.back().escape_tuple_delimiter = escape_delimiter; +} + +} diff --git a/src/DataTypes/Serializations/SerializationNamed.h b/src/DataTypes/Serializations/SerializationNamed.h new file mode 100644 index 0000000000..52bbb03944 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationNamed.h @@ -0,0 +1,80 @@ +#pragma once + +#include + +namespace DB +{ + +/// Serialization wrapper that acts like nested serialization, +/// but adds a passed name to the substream path like the +/// read column was the tuple element with this name. +/// It's used while reading subcolumns of complex types. +/// In particular while reading components of named tuples. +class SerializationNamed final : public SerializationWrapper +{ +private: + String name; + bool escape_delimiter; + +public: + SerializationNamed(const SerializationPtr & nested_, const String & name_, bool escape_delimiter_ = true) + : SerializationWrapper(nested_) + , name(name_), escape_delimiter(escape_delimiter_) + { + } + + const String & getElementName() const { return name; } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + +private: + struct SubcolumnCreator : public ISubcolumnCreator + { + const String name; + const bool escape_delimiter; + + SubcolumnCreator(const String & name_, bool escape_delimiter_) + : name(name_), escape_delimiter(escape_delimiter_) {} + + DataTypePtr create(const DataTypePtr & prev) const override { return prev; } + ColumnPtr create(const ColumnPtr & prev) const override { return prev; } + SerializationPtr create(const SerializationPtr & prev) const override + { + return std::make_shared(prev, name, escape_delimiter); + } + }; + + void addToPath(SubstreamPath & path) const; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 4de2b08c04..c10887d48e 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -1,5 +1,8 @@ #include #include +#include +#include +#include #include #include @@ -20,22 +23,61 @@ namespace ErrorCodes extern const int CANNOT_READ_ALL_DATA; } -void SerializationNullable::enumerateStreams(const 
StreamCallback & callback, SubstreamPath & path) const +DataTypePtr SerializationNullable::SubcolumnCreator::create(const DataTypePtr & prev) const { - path.push_back(Substream::NullMap); - callback(path); - path.back() = Substream::NullableElements; - nested->enumerateStreams(callback, path); - path.pop_back(); + return std::make_shared(prev); } +SerializationPtr SerializationNullable::SubcolumnCreator::create(const SerializationPtr & prev) const +{ + return std::make_shared(prev); +} + +ColumnPtr SerializationNullable::SubcolumnCreator::create(const ColumnPtr & prev) const +{ + return ColumnNullable::create(prev, null_map); +} + +void SerializationNullable::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_nullable = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_nullable = data.column ? &assert_cast(*data.column) : nullptr; + + auto null_map_serialization = std::make_shared(std::make_shared>(), "null", false); + + settings.path.push_back(Substream::NullMap); + auto null_map_data = SubstreamData(null_map_serialization) + .withType(type_nullable ? std::make_shared() : nullptr) + .withColumn(column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + settings.path.back().data = null_map_data; + callback(settings.path); + + settings.path.back() = Substream::NullableElements; + settings.path.back().creator = std::make_shared(null_map_data.column); + settings.path.back().data = data; + + auto next_data = SubstreamData(nested) + .withType(type_nullable ? type_nullable->getNestedType() : nullptr) + .withColumn(column_nullable ? column_nullable->getNestedColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); +} void SerializationNullable::serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { settings.path.push_back(Substream::NullableElements); - nested->serializeBinaryBulkStatePrefix(settings, state); + const auto & column_nullable = assert_cast(column); + nested->serializeBinaryBulkStatePrefix(column_nullable.getNestedColumn(), settings, state); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index b0b96c021d..d1ebebe356 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -13,9 +13,13 @@ private: public: SerializationNullable(const SerializationPtr & nested_) : nested(nested_) {} - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; @@ -80,6 +84,18 @@ public: static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); template static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); + +private: + struct SubcolumnCreator : public ISubcolumnCreator 
+ { + const ColumnPtr null_map; + + explicit SubcolumnCreator(const ColumnPtr & null_map_) : null_map(null_map_) {} + + DataTypePtr create(const DataTypePtr & prev) const override; + SerializationPtr create(const SerializationPtr & prev) const override; + ColumnPtr create(const ColumnPtr & prev) const override; + }; }; } diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp new file mode 100644 index 0000000000..8e6b4288a9 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationObject.cpp @@ -0,0 +1,523 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_DATA; + extern const int CANNOT_READ_ALL_DATA; + extern const int ARGUMENT_OUT_OF_BOUND; + extern const int LOGICAL_ERROR; +} + +template +template +void SerializationObject::deserializeTextImpl(IColumn & column, Reader && reader) const +{ + auto & column_object = assert_cast(column); + + String buf; + reader(buf); + std::optional result; + + /// Treat empty string as an empty object + /// for better CAST from String to Object. + if (!buf.empty()) + { + auto parser = parsers_pool.get([] { return new Parser; }); + result = parser->parse(buf.data(), buf.size()); + } + else + { + result = ParseResult{}; + } + + if (!result) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse object"); + + auto & [paths, values] = *result; + assert(paths.size() == values.size()); + + size_t old_column_size = column_object.size(); + for (size_t i = 0; i < paths.size(); ++i) + { + auto field_info = getFieldInfo(values[i]); + if (field_info.need_fold_dimension) + values[i] = applyVisitor(FieldVisitorFoldDimension(field_info.num_dimensions), std::move(values[i])); + if (isNothing(field_info.scalar_type)) + continue; + + if (!column_object.hasSubcolumn(paths[i])) + { + if (paths[i].hasNested()) + column_object.addNestedSubcolumn(paths[i], field_info, old_column_size); + else + column_object.addSubcolumn(paths[i], old_column_size); + } + + auto & subcolumn = column_object.getSubcolumn(paths[i]); + assert(subcolumn.size() == old_column_size); + + subcolumn.insert(std::move(values[i]), std::move(field_info)); + } + + /// Insert default values to missed subcolumns. 
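+    /// For example (illustrative): after inserting {"a": 1} and then {"b": 2},
+    /// subcolumn `a` has no value for the second row, so a default is appended
+    /// (taken from a Nested sibling column when possible).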
+ const auto & subcolumns = column_object.getSubcolumns(); + for (const auto & entry : subcolumns) + { + if (entry->data.size() == old_column_size) + { + bool inserted = column_object.tryInsertDefaultFromNested(entry); + if (!inserted) + entry->data.insertDefault(); + } + } + + column_object.incrementNumRows(); +} + +template +void SerializationObject::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { readStringInto(s, istr); }); +} + +template +void SerializationObject::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { readEscapedString(s, istr); }); +} + +template +void SerializationObject::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { readQuotedStringInto(s, istr); }); +} + +template +void SerializationObject::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { Parser::readJSON(s, istr); }); +} + +template +void SerializationObject::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextImpl(column, [&](String & s) { readCSVStringInto(s, istr, settings.csv); }); +} + +template +template +void SerializationObject::checkSerializationIsSupported(const TSettings & settings) const +{ + if (settings.position_independent_encoding) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject doesn't support serialization with position independent encoding"); +} + +template +struct SerializationObject::SerializeStateObject : public ISerialization::SerializeBinaryBulkState +{ + DataTypePtr nested_type; + SerializationPtr nested_serialization; + SerializeBinaryBulkStatePtr nested_state; +}; + +template +struct SerializationObject::DeserializeStateObject : public ISerialization::DeserializeBinaryBulkState +{ + BinarySerializationKind kind; + DataTypePtr nested_type; + SerializationPtr nested_serialization; + DeserializeBinaryBulkStatePtr nested_state; +}; + +template +void SerializationObject::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings); + if (state) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject doesn't support serialization with non-trivial state"); + + const auto & column_object = assert_cast(column); + if (!column_object.isFinalized()) + { + auto finalized = column_object.cloneFinalized(); + serializeBinaryBulkStatePrefix(*finalized, settings, state); + return; + } + + settings.path.push_back(Substream::ObjectStructure); + auto * stream = settings.getter(settings.path); + + if (!stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for kind of binary serialization"); + + auto [tuple_column, tuple_type] = unflattenObjectToTuple(column_object); + + writeIntBinary(static_cast(BinarySerializationKind::TUPLE), *stream); + writeStringBinary(tuple_type->getName(), *stream); + + auto state_object = std::make_shared(); + state_object->nested_type = tuple_type; + state_object->nested_serialization = tuple_type->getDefaultSerialization(); + + settings.path.back() = Substream::ObjectData; + state_object->nested_serialization->serializeBinaryBulkStatePrefix(*tuple_column, settings, state_object->nested_state); + + 
state = std::move(state_object); + settings.path.pop_back(); +} + +template +void SerializationObject::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings); + auto * state_object = checkAndGetState(state); + + settings.path.push_back(Substream::ObjectData); + state_object->nested_serialization->serializeBinaryBulkStateSuffix(settings, state_object->nested_state); + settings.path.pop_back(); +} + +template +void SerializationObject::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings); + if (state) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject doesn't support serialization with non-trivial state"); + + settings.path.push_back(Substream::ObjectStructure); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, + "Cannot read kind of binary serialization of DataTypeObject, because its stream is missing"); + + UInt8 kind_raw; + readIntBinary(kind_raw, *stream); + auto kind = magic_enum::enum_cast(kind_raw); + if (!kind) + throw Exception(ErrorCodes::INCORRECT_DATA, + "Unknown binary serialization kind of Object: {}", std::to_string(kind_raw)); + + auto state_object = std::make_shared(); + state_object->kind = *kind; + + if (state_object->kind == BinarySerializationKind::TUPLE) + { + String data_type_name; + readStringBinary(data_type_name, *stream); + state_object->nested_type = DataTypeFactory::instance().get(data_type_name); + state_object->nested_serialization = state_object->nested_type->getDefaultSerialization(); + + if (!isTuple(state_object->nested_type)) + throw Exception(ErrorCodes::INCORRECT_DATA, + "Data of type Object should be written as Tuple, got: {}", data_type_name); + } + else if (state_object->kind == BinarySerializationKind::STRING) + { + state_object->nested_type = std::make_shared(); + state_object->nested_serialization = std::make_shared(); + } + else + { + throw Exception(ErrorCodes::INCORRECT_DATA, + "Unknown binary serialization kind of Object: {}", std::to_string(kind_raw)); + } + + settings.path.push_back(Substream::ObjectData); + state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state); + settings.path.pop_back(); + + state = std::move(state_object); +} + +template +void SerializationObject::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings); + const auto & column_object = assert_cast(column); + auto * state_object = checkAndGetState(state); + + if (!column_object.isFinalized()) + { + auto finalized = column_object.cloneFinalized(); + serializeBinaryBulkWithMultipleStreams(*finalized, offset, limit, settings, state); + return; + } + + auto [tuple_column, tuple_type] = unflattenObjectToTuple(column_object); + + if (!state_object->nested_type->equals(*tuple_type)) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Types of internal column of Object mismatched. 
Expected: {}, Got: {}", + state_object->nested_type->getName(), tuple_type->getName()); + } + + settings.path.push_back(Substream::ObjectData); + if (auto * stream = settings.getter(settings.path)) + { + state_object->nested_serialization->serializeBinaryBulkWithMultipleStreams( + *tuple_column, offset, limit, settings, state_object->nested_state); + } + + settings.path.pop_back(); +} + +template +void SerializationObject::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + checkSerializationIsSupported(settings); + if (!column->empty()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject cannot be deserialized to non-empty column"); + + auto mutable_column = column->assumeMutable(); + auto & column_object = assert_cast(*mutable_column); + auto * state_object = checkAndGetState(state); + + settings.path.push_back(Substream::ObjectData); + if (state_object->kind == BinarySerializationKind::STRING) + deserializeBinaryBulkFromString(column_object, limit, settings, *state_object, cache); + else + deserializeBinaryBulkFromTuple(column_object, limit, settings, *state_object, cache); + + settings.path.pop_back(); + column_object.checkConsistency(); + column_object.finalize(); + column = std::move(mutable_column); +} + +template +void SerializationObject::deserializeBinaryBulkFromString( + ColumnObject & column_object, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeStateObject & state, + SubstreamsCache * cache) const +{ + ColumnPtr column_string = state.nested_type->createColumn(); + state.nested_serialization->deserializeBinaryBulkWithMultipleStreams( + column_string, limit, settings, state.nested_state, cache); + + ConvertImplGenericFromString::executeImpl(*column_string, column_object, *this, column_string->size()); +} + +template +void SerializationObject::deserializeBinaryBulkFromTuple( + ColumnObject & column_object, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeStateObject & state, + SubstreamsCache * cache) const +{ + ColumnPtr column_tuple = state.nested_type->createColumn(); + state.nested_serialization->deserializeBinaryBulkWithMultipleStreams( + column_tuple, limit, settings, state.nested_state, cache); + + auto [tuple_paths, tuple_types] = flattenTuple(state.nested_type); + auto flattened_tuple = flattenTuple(column_tuple); + const auto & tuple_columns = assert_cast(*flattened_tuple).getColumns(); + + assert(tuple_paths.size() == tuple_types.size()); + size_t num_subcolumns = tuple_paths.size(); + + if (tuple_columns.size() != num_subcolumns) + throw Exception(ErrorCodes::INCORRECT_DATA, + "Inconsistent type ({}) and column ({}) while reading column of type Object", + state.nested_type->getName(), column_tuple->getName()); + + for (size_t i = 0; i < num_subcolumns; ++i) + column_object.addSubcolumn(tuple_paths[i], tuple_columns[i]->assumeMutable()); +} + +template +void SerializationObject::serializeBinary(const Field &, WriteBuffer &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +template +void SerializationObject::deserializeBinary(Field &, ReadBuffer &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +template +void SerializationObject::serializeBinary(const IColumn &, size_t, WriteBuffer &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not 
implemented for SerializationObject"); +} + +template +void SerializationObject::deserializeBinary(IColumn &, ReadBuffer &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +/// TODO: use a format other than JSON in these serializations. + +template +void SerializationObject::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_object = assert_cast(column); + const auto & subcolumns = column_object.getSubcolumns(); + + writeChar('{', ostr); + for (auto it = subcolumns.begin(); it != subcolumns.end(); ++it) + { + const auto & entry = *it; + if (it != subcolumns.begin()) + writeCString(",", ostr); + + writeDoubleQuoted(entry->path.getPath(), ostr); + writeChar(':', ostr); + serializeTextFromSubcolumn(entry->data, row_num, ostr, settings); + } + writeChar('}', ostr); +} + +template +void SerializationObject::serializeTextFromSubcolumn( + const ColumnObject::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & least_common_type = subcolumn.getLeastCommonType(); + + if (subcolumn.isFinalized()) + { + const auto & finalized_column = subcolumn.getFinalizedColumn(); + auto info = least_common_type->getSerializationInfo(finalized_column); + auto serialization = least_common_type->getSerialization(*info); + serialization->serializeTextJSON(finalized_column, row_num, ostr, settings); + return; + } + + size_t ind = row_num; + if (ind < subcolumn.getNumberOfDefaultsInPrefix()) + { + /// Suboptimal, but it should happen rarely. + auto tmp_column = subcolumn.getLeastCommonType()->createColumn(); + tmp_column->insertDefault(); + + auto info = least_common_type->getSerializationInfo(*tmp_column); + auto serialization = least_common_type->getSerialization(*info); + serialization->serializeTextJSON(*tmp_column, 0, ostr, settings); + return; + } + + ind -= subcolumn.getNumberOfDefaultsInPrefix(); + for (const auto & part : subcolumn.getData()) + { + if (ind < part->size()) + { + auto part_type = getDataTypeByColumn(*part); + auto info = part_type->getSerializationInfo(*part); + auto serialization = part_type->getSerialization(*info); + serialization->serializeTextJSON(*part, ind, ostr, settings); + return; + } + + ind -= part->size(); + } + + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Index ({}) for text serialization is out of range", row_num); +} + +template +void SerializationObject::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, settings); +} + +template +void SerializationObject::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString ostr_str; + serializeTextImpl(column, row_num, ostr_str, settings); + writeEscapedString(ostr_str.str(), ostr); +} + +template +void SerializationObject::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString ostr_str; + serializeTextImpl(column, row_num, ostr_str, settings); + writeQuotedString(ostr_str.str(), ostr); +} + +template +void SerializationObject::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, settings); +} + +template +void 
SerializationObject::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString ostr_str; + serializeTextImpl(column, row_num, ostr_str, settings); + writeCSVString(ostr_str.str(), ostr); +} + +SerializationPtr getObjectSerialization(const String & schema_format) +{ + if (schema_format == "json") + { +#if USE_SIMDJSON + return std::make_shared>>(); +#elif USE_RAPIDJSON + return std::make_shared>>(); +#else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "To use data type Object with JSON format ClickHouse should be built with Simdjson or Rapidjson"); +#endif + } + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown schema format '{}'", schema_format); +} + +} diff --git a/src/DataTypes/Serializations/SerializationObject.h b/src/DataTypes/Serializations/SerializationObject.h new file mode 100644 index 0000000000..47a7127cd1 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationObject.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +/** Serialization for data type Object. + * Only text serialization/deserialization and binary bulk serialization/deserialization + * without position independent encoding are supported, + * i.e. serialization/deserialization into the Native format. + */ +template +class SerializationObject : public ISerialization +{ +public: + /** In Native format ColumnObject can be serialized + * in two formats: as Tuple or as String. + * The format is the following: + * + * 1 byte -- 0 if Tuple, 1 if String. + * [type_name] -- Only for tuple serialization. + * ... data of internal column ... + * + * The ClickHouse client serializes objects as tuples. + * String serialization exists for clients that cannot + * parse objects themselves; they can send the raw data as + * a string, and it will be parsed on the server side.
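+ *
+ * For example, a Tuple-encoded stream looks roughly like:
+ *     0x00 "Tuple(k1 Int8, k2 String)" <binary data of the internal tuple column>
+ * and a String-encoded stream like:
+ *     0x01 <binary data of a String column holding the unparsed JSON rows>
+ * (a sketch of the layout above: the type name is actually written with
+ * writeStringBinary, i.e. length-prefixed, not as a quoted literal).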
+ */ + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + +private: + enum class BinarySerializationKind : UInt8 + { + TUPLE = 0, + STRING = 1, + }; + + struct SerializeStateObject; + struct DeserializeStateObject; + + void deserializeBinaryBulkFromString( + ColumnObject & column_object, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeStateObject & state, + SubstreamsCache * cache) const; + + void deserializeBinaryBulkFromTuple( + ColumnObject & column_object, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeStateObject & state, + SubstreamsCache * cache) const; + + template + void checkSerializationIsSupported(const TSettings & settings) const; + + template + void deserializeTextImpl(IColumn & column, Reader && reader) const; + + void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; + void serializeTextFromSubcolumn(const ColumnObject::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; + + /// Pool of parser objects to make SerializationObject thread safe. 
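+    /// A parser instance is stateful and not thread safe on its own, while one
+    /// SerializationObject may be used from several threads at once, so each
+    /// deserializeTextImpl call borrows a parser from this pool and returns it
+    /// when parsing is done.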
+ mutable SimpleObjectPool parsers_pool; +}; + +SerializationPtr getObjectSerialization(const String & schema_format); + +} diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 6ce3761ff9..b595e73928 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -1,9 +1,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -281,10 +283,24 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, }); } -void SerializationTuple::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +void SerializationTuple::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const { - for (const auto & elem : elems) - elem->enumerateStreams(callback, path); + const auto * type_tuple = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_tuple = data.column ? &assert_cast(*data.column) : nullptr; + const auto * info_tuple = data.serialization_info ? &assert_cast(*data.serialization_info) : nullptr; + + for (size_t i = 0; i < elems.size(); ++i) + { + auto next_data = SubstreamData(elems[i]) + .withType(type_tuple ? type_tuple->getElement(i) : nullptr) + .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr) + .withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr); + + elems[i]->enumerateStreams(settings, callback, next_data); + } } struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState @@ -332,6 +348,7 @@ static DeserializeBinaryBulkStateTuple * checkAndGetTupleDeserializeState(ISeria } void SerializationTuple::serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { @@ -339,7 +356,7 @@ void SerializationTuple::serializeBinaryBulkStatePrefix( tuple_state->states.resize(elems.size()); for (size_t i = 0; i < elems.size(); ++i) - elems[i]->serializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); + elems[i]->serializeBinaryBulkStatePrefix(extractElementColumn(column, i), settings, tuple_state->states[i]); state = std::move(tuple_state); } diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index 13668572ff..e94468e1b7 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { @@ -9,7 +9,7 @@ namespace DB class SerializationTuple final : public SimpleTextSerialization { public: - using ElementSerializationPtr = std::shared_ptr; + using ElementSerializationPtr = std::shared_ptr; using ElementSerializations = std::vector; SerializationTuple(const ElementSerializations & elems_, bool have_explicit_names_) @@ -29,11 +29,15 @@ public: void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - /** Each sub-column in a tuple is serialized in separate stream. + /** Each sub-column in a tuple is serialized in a separate stream.
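+ * For example, Tuple(a UInt32, b String) enumerates the streams of `a` and `b`
+ * separately (each under its own per-element substream path), so one element
+ * can be read without touching the others.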
*/ - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; diff --git a/src/DataTypes/Serializations/SerializationTupleElement.cpp b/src/DataTypes/Serializations/SerializationTupleElement.cpp index 4b50810fcd..ebcaad28b1 100644 --- a/src/DataTypes/Serializations/SerializationTupleElement.cpp +++ b/src/DataTypes/Serializations/SerializationTupleElement.cpp @@ -4,20 +4,22 @@ namespace DB { void SerializationTupleElement::enumerateStreams( + EnumerateStreamsSettings & settings, const StreamCallback & callback, - SubstreamPath & path) const + const SubstreamData & data) const { - addToPath(path); - nested_serialization->enumerateStreams(callback, path); - path.pop_back(); + addToPath(settings.path); + nested_serialization->enumerateStreams(settings, callback, data); + settings.path.pop_back(); } void SerializationTupleElement::serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { addToPath(settings.path); - nested_serialization->serializeBinaryBulkStatePrefix(settings, state); + nested_serialization->serializeBinaryBulkStatePrefix(column, settings, state); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationTupleElement.h b/src/DataTypes/Serializations/SerializationTupleElement.h index b85014c9e6..01c26968a5 100644 --- a/src/DataTypes/Serializations/SerializationTupleElement.h +++ b/src/DataTypes/Serializations/SerializationTupleElement.h @@ -21,11 +21,13 @@ public: const String & getElementName() const { return name; } void enumerateStreams( + EnumerateStreamsSettings & settings, const StreamCallback & callback, - SubstreamPath & path) const override; + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( - SerializeBinaryBulkSettings & settings, + const IColumn & column, + SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; void serializeBinaryBulkStateSuffix( diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp index f75c9a1dd8..4ee31571b9 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.cpp +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -4,16 +4,22 @@ namespace DB { -void SerializationWrapper::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const + +void SerializationWrapper::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const { - nested_serialization->enumerateStreams(callback, path); + nested_serialization->enumerateStreams(settings, callback, data); } + void SerializationWrapper::serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { - nested_serialization->serializeBinaryBulkStatePrefix(settings, state); + nested_serialization->serializeBinaryBulkStatePrefix(column, settings, state); } void SerializationWrapper::serializeBinaryBulkStateSuffix( diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h index 
43b88ee2fa..32381b2e96 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.h +++ b/src/DataTypes/Serializations/SerializationWrapper.h @@ -16,9 +16,13 @@ protected: public: explicit SerializationWrapper(const SerializationPtr & nested_serialization_) : nested_serialization(nested_serialization_) {} - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; diff --git a/src/DataTypes/Serializations/tests/CMakeLists.txt b/src/DataTypes/Serializations/tests/CMakeLists.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/DataTypes/Serializations/tests/gtest_json_parser.cpp b/src/DataTypes/Serializations/tests/gtest_json_parser.cpp new file mode 100644 index 0000000000..9b0c8e44d0 --- /dev/null +++ b/src/DataTypes/Serializations/tests/gtest_json_parser.cpp @@ -0,0 +1,216 @@ +#include +#include +#include +#include + +#include +#include + +#if USE_SIMDJSON + +using namespace DB; + +const String json1 = R"({"k1" : 1, "k2" : {"k3" : "aa", "k4" : 2}})"; + +/// Nested(k2 String, k3 Nested(k4 String)) +const String json2 = +R"({"k1" : [ + { + "k2" : "aaa", + "k3" : [{ "k4" : "bbb" }, { "k4" : "ccc" }] + }, + { + "k2" : "ddd", + "k3" : [{ "k4" : "eee" }, { "k4" : "fff" }] + } + ] +})"; + +TEST(JSONDataParser, ReadJSON) +{ + { + String json_bad = json1 + "aaaaaaa"; + + JSONDataParser parser; + ReadBufferFromString buf(json_bad); + String res; + parser.readJSON(res, buf); + ASSERT_EQ(json1, res); + } + + { + String json_bad = json2 + "aaaaaaa"; + + JSONDataParser parser; + ReadBufferFromString buf(json_bad); + String res; + parser.readJSON(res, buf); + ASSERT_EQ(json2, res); + } +} + +struct JSONPathAndValue +{ + PathInData path; + Field value; + + JSONPathAndValue(const PathInData & path_, const Field & value_) + : path(path_), value(value_) + { + } + + bool operator==(const JSONPathAndValue & other) const = default; + bool operator<(const JSONPathAndValue & other) const { return path.getPath() < other.path.getPath(); } +}; + +static std::ostream & operator<<(std::ostream & ostr, const JSONPathAndValue & path_and_value) +{ + ostr << "{ PathInData{"; + bool first = true; + for (const auto & part : path_and_value.path.getParts()) + { + ostr << (first ? 
"{" : ", {") << part.key << ", " << part.is_nested << ", " << static_cast(part.anonymous_array_level) << "}"; + first = false; + } + + ostr << "}, Field{" << applyVisitor(FieldVisitorToString(), path_and_value.value) << "} }"; + return ostr; +} + +using JSONValues = std::vector; + +static void check( + const String & json_str, + const String & tag, + JSONValues expected_values) +{ + JSONDataParser parser; + auto res = parser.parse(json_str.data(), json_str.size()); + ASSERT_TRUE(res.has_value()) << tag; + + const auto & [paths, values] = *res; + + ASSERT_EQ(paths.size(), expected_values.size()) << tag; + ASSERT_EQ(values.size(), expected_values.size()) << tag; + + JSONValues result_values; + for (size_t i = 0; i < paths.size(); ++i) + result_values.emplace_back(paths[i], values[i]); + + std::sort(expected_values.begin(), expected_values.end()); + std::sort(result_values.begin(), result_values.end()); + + ASSERT_EQ(result_values, expected_values) << tag; +} + +TEST(JSONDataParser, Parse) +{ + { + check(json1, "json1", + { + { PathInData{{{"k1", false, 0}}}, 1 }, + { PathInData{{{"k2", false, 0}, {"k3", false, 0}}}, "aa" }, + { PathInData{{{"k2", false, 0}, {"k4", false, 0}}}, 2 }, + }); + } + + { + check(json2, "json2", + { + { PathInData{{{"k1", true, 0}, {"k2", false, 0}}}, Array{"aaa", "ddd"} }, + { PathInData{{{"k1", true, 0}, {"k3", true, 0}, {"k4", false, 0}}}, Array{Array{"bbb", "ccc"}, Array{"eee", "fff"}} }, + }); + } + + { + /// Nested(k2 Tuple(k3 Array(Int), k4 Array(Int)), k5 String) + const String json3 = + R"({"k1": [ + { + "k2": { + "k3": [1, 2], + "k4": [3, 4] + }, + "k5": "foo" + }, + { + "k2": { + "k3": [5, 6], + "k4": [7, 8] + }, + "k5": "bar" + } + ]})"; + + check(json3, "json3", + { + { PathInData{{{"k1", true, 0}, {"k5", false, 0}}}, Array{"foo", "bar"} }, + { PathInData{{{"k1", true, 0}, {"k2", false, 0}, {"k3", false, 0}}}, Array{Array{1, 2}, Array{5, 6}} }, + { PathInData{{{"k1", true, 0}, {"k2", false, 0}, {"k4", false, 0}}}, Array{Array{3, 4}, Array{7, 8}} }, + }); + } + + { + /// Nested(k2 Nested(k3 Int, k4 Int), k5 String) + const String json4 = + R"({"k1": [ + { + "k2": [{"k3": 1, "k4": 3}, {"k3": 2, "k4": 4}], + "k5": "foo" + }, + { + "k2": [{"k3": 5, "k4": 7}, {"k3": 6, "k4": 8}], + "k5": "bar" + } + ]})"; + + check(json4, "json4", + { + { PathInData{{{"k1", true, 0}, {"k5", false, 0}}}, Array{"foo", "bar"} }, + { PathInData{{{"k1", true, 0}, {"k2", true, 0}, {"k3", false, 0}}}, Array{Array{1, 2}, Array{5, 6}} }, + { PathInData{{{"k1", true, 0}, {"k2", true, 0}, {"k4", false, 0}}}, Array{Array{3, 4}, Array{7, 8}} }, + }); + } + + { + const String json5 = R"({"k1": [[1, 2, 3], [4, 5], [6]]})"; + check(json5, "json5", + { + { PathInData{{{"k1", false, 0}}}, Array{Array{1, 2, 3}, Array{4, 5}, Array{6}} } + }); + } + + { + /// Array(Nested(k2 Int, k3 Int)) + const String json6 = R"({ + "k1": [ + [{"k2": 1, "k3": 2}, {"k2": 3, "k3": 4}], + [{"k2": 5, "k3": 6}] + ] + })"; + + check(json6, "json6", + { + { PathInData{{{"k1", true, 0}, {"k2", false, 1}}}, Array{Array{1, 3}, Array{5}} }, + { PathInData{{{"k1", true, 0}, {"k3", false, 1}}}, Array{Array{2, 4}, Array{6}} }, + }); + } + + { + /// Nested(k2 Array(Int), k3 Array(Int)) + const String json7 = R"({ + "k1": [ + {"k2": [1, 3], "k3": [2, 4]}, + {"k2": [5], "k3": [6]} + ] + })"; + + check(json7, "json7", + { + { PathInData{{{"k1", true, 0}, {"k2", false, 0}}}, Array{Array{1, 3}, Array{5}} }, + { PathInData{{{"k1", true, 0}, {"k3", false, 0}}}, Array{Array{2, 4}, Array{6}} }, + }); + } +} + +#endif diff --git 
a/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp b/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp new file mode 100644 index 0000000000..fc7432d5bf --- /dev/null +++ b/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if USE_SIMDJSON + +using namespace DB; + +TEST(SerializationObject, FromString) +{ + WriteBufferFromOwnString out; + + auto column_string = ColumnString::create(); + column_string->insert(R"({"k1" : 1, "k2" : [{"k3" : "aa", "k4" : 2}, {"k3": "bb", "k4": 3}]})"); + column_string->insert(R"({"k1" : 2, "k2" : [{"k3" : "cc", "k5" : 4}, {"k4": 5}, {"k4": 6}]})"); + + { + auto serialization = std::make_shared(); + + ISerialization::SerializeBinaryBulkSettings settings; + ISerialization::SerializeBinaryBulkStatePtr state; + settings.position_independent_encoding = false; + settings.getter = [&out](const auto &) { return &out; }; + + writeIntBinary(static_cast(1), out); + serialization->serializeBinaryBulkStatePrefix(*column_string, settings, state); + serialization->serializeBinaryBulkWithMultipleStreams(*column_string, 0, column_string->size(), settings, state); + serialization->serializeBinaryBulkStateSuffix(settings, state); + } + + auto type_object = std::make_shared("json", false); + ColumnPtr result_column = type_object->createColumn(); + + ReadBufferFromOwnString in(out.str()); + + { + auto serialization = type_object->getDefaultSerialization(); + + ISerialization::DeserializeBinaryBulkSettings settings; + ISerialization::DeserializeBinaryBulkStatePtr state; + settings.position_independent_encoding = false; + settings.getter = [&in](const auto &) { return &in; }; + + serialization->deserializeBinaryBulkStatePrefix(settings, state); + serialization->deserializeBinaryBulkWithMultipleStreams(result_column, column_string->size(), settings, state, nullptr); + } + + auto & column_object = assert_cast(*result_column->assumeMutable()); + column_object.finalize(); + + ASSERT_TRUE(column_object.size() == 2); + ASSERT_TRUE(column_object.getSubcolumns().size() == 4); + + auto check_subcolumn = [&](const auto & name, const auto & type_name, const std::vector & expected) + { + const auto & subcolumn = column_object.getSubcolumn(PathInData{name}); + ASSERT_EQ(subcolumn.getLeastCommonType()->getName(), type_name); + + const auto & data = subcolumn.getFinalizedColumn(); + for (size_t i = 0; i < expected.size(); ++i) + ASSERT_EQ( + applyVisitor(FieldVisitorToString(), data[i]), + applyVisitor(FieldVisitorToString(), expected[i])); + }; + + check_subcolumn("k1", "Int8", {1, 2}); + check_subcolumn("k2.k3", "Array(String)", {Array{"aa", "bb"}, Array{"cc", "", ""}}); + check_subcolumn("k2.k4", "Array(Int8)", {Array{2, 3}, Array{0, 5, 6}}); + check_subcolumn("k2.k5", "Array(Int8)", {Array{0, 0}, Array{4, 0, 0}}); +} + +#endif diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index 3c6b2aba5a..3757c38d22 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -29,23 +29,23 @@ #include #include -#include -#include -#include -#include -#include -#include #include #include -#include #include -#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include #include namespace DB { - namespace ErrorCodes { extern const int NO_COMMON_TYPE; @@ -53,10 +53,12 @@ namespace ErrorCodes namespace { + String typeToString(const 
DataTypePtr & type) { return type->getName(); } + String typeToString(const TypeIndex & type) { return String(magic_enum::enum_name(type)); } + String getExceptionMessagePrefix(const DataTypes & types) { WriteBufferFromOwnString res; - res << "There is no supertype for types "; bool first = true; for (const auto & type : types) @@ -65,17 +67,35 @@ namespace res << ", "; first = false; - res << type->getName(); - if (type->isMap()) - res << " KV"; - else if (type->isByteMap()) - res << " BYTE"; + res << typeToString(type); + + if (type->isMap()) + res << " KV"; + else if (type->isByteMap()) + res << " BYTE"; } return res.str(); } - template + String getExceptionMessagePrefix(const TypeIndexSet & types) + { + WriteBufferFromOwnString res; + + bool first = true; + for (const auto & type : types) + { + if (!first) + res << ", "; + first = false; + + res << typeToString(type); + } + + return res.str(); + } + + template DataTypePtr throwOrReturn(const DataTypes & types, std::string_view message_suffix, int error_code) { if constexpr (on_error == LeastSupertypeOnError::String) @@ -91,7 +111,7 @@ namespace } template - DataTypePtr getNumericType(const DataTypes & types, bool allow_extended_conversion) + DataTypePtr getNumericType(const TypeIndexSet & types, bool allow_extended_conversion) { bool all_numbers = true; @@ -99,50 +119,52 @@ namespace size_t max_bits_of_unsigned_integer = 0; size_t max_mantissa_bits_of_floating = 0; - auto maximize = [](size_t & what, size_t value) - { + auto maximize = [](size_t & what, size_t value) { if (value > what) what = value; }; for (const auto & type : types) { - if (typeid_cast(type.get())) + if (type == TypeIndex::UInt8) maximize(max_bits_of_unsigned_integer, 8); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::UInt16) maximize(max_bits_of_unsigned_integer, 16); - else if (typeid_cast(type.get()) || typeid_cast(type.get())) + else if (type == TypeIndex::UInt32 || type == TypeIndex::IPv4) maximize(max_bits_of_unsigned_integer, 32); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::UInt64) maximize(max_bits_of_unsigned_integer, 64); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::UInt128) maximize(max_bits_of_unsigned_integer, 128); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::UInt256) maximize(max_bits_of_unsigned_integer, 256); - else if (typeid_cast(type.get()) || typeid_cast(type.get())) + else if (type == TypeIndex::Int8 || type == TypeIndex::Enum8) maximize(max_bits_of_signed_integer, 8); - else if (typeid_cast(type.get()) || typeid_cast(type.get())) + else if (type == TypeIndex::Int16 || type == TypeIndex::Enum16) maximize(max_bits_of_signed_integer, 16); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::Int32) maximize(max_bits_of_signed_integer, 32); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::Int64) maximize(max_bits_of_signed_integer, 64); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::Int128) maximize(max_bits_of_signed_integer, 128); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::Int256) maximize(max_bits_of_signed_integer, 256); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::Float32) maximize(max_mantissa_bits_of_floating, 24); - else if (typeid_cast(type.get())) + else if (type == TypeIndex::Float64) maximize(max_mantissa_bits_of_floating, 53); - else if (typeid_cast(type.get())) + else if (type != 
TypeIndex::Nothing) all_numbers = false; } if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating) { if (!all_numbers) - return throwOrReturn(types, "because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return throwOrReturn( + types, "because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit. /// Example, common of Int32, UInt32 = Int64. @@ -150,14 +172,15 @@ namespace size_t min_bit_width_of_integer = std::max(max_bits_of_signed_integer, max_bits_of_unsigned_integer); /// If unsigned is not covered by signed. - if (max_bits_of_signed_integer && max_bits_of_unsigned_integer >= max_bits_of_signed_integer) + if (max_bits_of_signed_integer && max_bits_of_unsigned_integer >= max_bits_of_signed_integer) //-V1051 { // Because 128 and 256 bit integers are significantly slower, we should not promote to them. // But if we already have wide numbers, promotion is necessary. if (min_bit_width_of_integer != 64 || allow_extended_conversion) ++min_bit_width_of_integer; else - return throwOrReturn(types, + return throwOrReturn( + types, "because some of them are signed integers and some are unsigned integers," " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); @@ -172,9 +195,11 @@ namespace else if (min_mantissa_bits <= 53 || allow_extended_conversion) return std::make_shared(); else - return throwOrReturn(types, - " because some of them are integers and some are floating point," - " but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE); + return throwOrReturn( + types, + "because some of them are integers and some are floating point," + " but there is no floating point type, that can exactly represent all required integers", + ErrorCodes::NO_COMMON_TYPE); } /// If the result must be signed integer. @@ -193,9 +218,11 @@ namespace else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - return throwOrReturn(types, - " because some of them are signed integers and some are unsigned integers," - " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); + return throwOrReturn( + types, + "because some of them are signed integers and some are unsigned integers," + " but there is no signed integer type, that can exactly represent all required unsigned integer values", + ErrorCodes::NO_COMMON_TYPE); } /// All unsigned. 
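+            /// E.g. the common type of UInt8 and UInt32 is UInt32: the smallest
+            /// unsigned type wide enough for every argument.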
@@ -213,8 +240,10 @@ namespace else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - return throwOrReturn(types, - " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); + return throwOrReturn( + types, + "but as all data types are unsigned integers, we must have found maximum unsigned integer type", + ErrorCodes::NO_COMMON_TYPE); } } @@ -244,7 +273,6 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_extended_conve break; } } - if (all_equal) return types[0]; } @@ -523,9 +551,12 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_extended_conve } size_t max_scale_date_time_index; - if (have_datetime64) { + if (have_datetime64) + { max_scale_date_time_index = getMaxScaleIndex(types); - } else { + } + else + { max_scale_date_time_index = getMaxScaleIndex(types); } @@ -544,8 +575,15 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_extended_conve { UInt32 num_supported = have_decimal32 + have_decimal64 + have_decimal128 + have_decimal256; - std::vector int_ids = {TypeIndex::Int8, TypeIndex::UInt8, TypeIndex::Int16, TypeIndex::UInt16, - TypeIndex::Int32, TypeIndex::UInt32, TypeIndex::Int64, TypeIndex::UInt64}; + std::vector int_ids + = {TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64}; std::vector num_ints(int_ids.size(), 0); TypeIndex max_int = TypeIndex::Nothing; @@ -567,7 +605,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_extended_conve constexpr std::array float_ids = {TypeIndex::Float32, TypeIndex::Float64}; bool have_float = false; - for (const auto & float_id: float_ids) + for (const auto & float_id : float_ids) { UInt32 num = type_ids.count(float_id); num_supported += num; @@ -584,7 +622,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_extended_conve for (const auto & type : types) { - if(!strcmp(type->getFamilyName(), "Decimal")) + if (!strcmp(type->getFamilyName(), "Decimal")) { auto bits_of_integer_part = getDecimalPrecision(*type) - getDecimalScale(*type, 0); @@ -649,7 +687,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_extended_conve /// For numeric types, the most complicated part. 
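+    /// E.g. getLeastSupertype({Int32, UInt32}) yields Int64 (an extra bit for the sign),
+    /// while {Int64, UInt64} has no exact common type and throws, unless
+    /// allow_extended_conversion permits widening to Int128.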
{ - auto numeric_type = getNumericType(types, allow_extended_conversion); + auto numeric_type = getNumericType(type_ids, allow_extended_conversion); if (numeric_type) return numeric_type; } @@ -658,10 +696,67 @@ return throwOrReturn(types, "", ErrorCodes::NO_COMMON_TYPE); } -DataTypePtr tryGetLeastSupertype(const DataTypes & types, bool allow_extended_conversio) +DataTypePtr getLeastSupertypeOrString(const DataTypes & types, bool allow_extended_conversion) { - return getLeastSupertype(types, allow_extended_conversio); + return getLeastSupertype(types, allow_extended_conversion); } -template DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_extended_conversio); +DataTypePtr tryGetLeastSupertype(const DataTypes & types, bool allow_extended_conversion) +{ + return getLeastSupertype(types, allow_extended_conversion); +} + +template +DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_extended_conversion) +{ + if (types.empty()) + return std::make_shared(); + + if (types.size() == 1) + { + WhichDataType which(*types.begin()); + if (which.isNothing()) + return std::make_shared(); + + #define DISPATCH(TYPE) \ + if (which.idx == TypeIndex::TYPE) \ + return std::make_shared>(); /// NOLINT + + FOR_NUMERIC_TYPES(DISPATCH) + #undef DISPATCH + + if (which.isString()) + return std::make_shared(); + + return throwOrReturn(types, "because a common type cannot be determined from type indexes for non-simple types", ErrorCodes::NO_COMMON_TYPE); + } + + if (types.contains(TypeIndex::String)) + { + bool only_string = types.size() == 2 && types.contains(TypeIndex::Nothing); + if (!only_string) + return throwOrReturn(types, "because some of them are String and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + return std::make_shared(); + } + + auto numeric_type = getNumericType(types, allow_extended_conversion); + if (numeric_type) + return numeric_type; + + return throwOrReturn(types, "", ErrorCodes::NO_COMMON_TYPE); +} + +DataTypePtr getLeastSupertypeOrString(const TypeIndexSet & types, bool allow_extended_conversion) +{ + return getLeastSupertype(types, allow_extended_conversion); +} + +DataTypePtr tryGetLeastSupertype(const TypeIndexSet & types, bool allow_extended_conversion) +{ + return getLeastSupertype(types, allow_extended_conversion); +} + +template DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_extended_conversion); +template DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_extended_conversion); } diff --git a/src/DataTypes/getLeastSupertype.h b/src/DataTypes/getLeastSupertype.h index 10b2cb0c50..201fab2043 100644 --- a/src/DataTypes/getLeastSupertype.h +++ b/src/DataTypes/getLeastSupertype.h @@ -20,13 +20,10 @@ */ #pragma once - #include - namespace DB { - template || std::is_same_v, T>> size_t getMaxScaleIndex(const DataTypes &types) @@ -67,8 +64,20 @@ enum class LeastSupertypeOnError template DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_extended_conversio = false); +/// Same as above but return String type instead of throwing exception. +/// All types can be cast to String, because they can be serialized to String. +DataTypePtr getLeastSupertypeOrString(const DataTypes & types, bool allow_extended_conversion = false); + /// Same as above but return nullptr instead of throwing exception.
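+/// E.g. for {String, UInt8} there is no common type: tryGetLeastSupertype returns
+/// nullptr, getLeastSupertype throws NO_COMMON_TYPE, and getLeastSupertypeOrString
+/// falls back to String.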
-DataTypePtr tryGetLeastSupertype(const DataTypes & types, bool allow_extended_conversio = false); +DataTypePtr tryGetLeastSupertype(const DataTypes & types, bool allow_extended_conversion = false); using TypeIndexSet = std::unordered_set; + +template +DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_extended_conversion = false); + +DataTypePtr getLeastSupertypeOrString(const TypeIndexSet & types, bool allow_extended_conversion = false); + +DataTypePtr tryGetLeastSupertype(const TypeIndexSet & types, bool allow_extended_conversion = false); + +} diff --git a/src/Databases/DatabaseCnch.h b/src/Databases/DatabaseCnch.h index dc5b5ac0d7..3bb9d1eeb5 100644 --- a/src/Databases/DatabaseCnch.h +++ b/src/Databases/DatabaseCnch.h @@ -24,7 +24,7 @@ #include #include #include -#include "Storages/IStorage_fwd.h" +#include #include namespace DB diff --git a/src/FormaterTool/PartConverter.cpp b/src/FormaterTool/PartConverter.cpp index 302cabcf59..214b0fdff8 100644 --- a/src/FormaterTool/PartConverter.cpp +++ b/src/FormaterTool/PartConverter.cpp @@ -190,9 +190,10 @@ void PartConverter::execute() out->writeSuffix(); }; + auto storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), getContext()); for (auto & part : data_parts) { - auto input = std::make_shared(*storage, storage->getInMemoryMetadataPtr(), part, column_names, false, true); + auto input = std::make_shared(*storage, storage_snapshot, part, column_names, false, true); QueryPipeline pipeline; pipeline.init(Pipe(std::move(input))); pipeline.setMaxThreads(1); diff --git a/src/Formats/NativeWriter.cpp b/src/Formats/NativeWriter.cpp index 0c1e9d2b4e..e7e246b277 100644 --- a/src/Formats/NativeWriter.cpp +++ b/src/Formats/NativeWriter.cpp @@ -58,7 +58,7 @@ static void writeData(const ISerialization & serialization, const ColumnPtr & co ISerialization::SerializeBinaryBulkStatePtr state; - // todo aron pass compile - // serialization.serializeBinaryBulkStatePrefix(*full_column, settings, state); - serialization.serializeBinaryBulkStatePrefix(settings, state); + serialization.serializeBinaryBulkStatePrefix(*full_column, settings, state); serialization.serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state); serialization.serializeBinaryBulkStateSuffix(settings, state); } diff --git a/src/Functions/FunctionSQLJSON.h b/src/Functions/FunctionSQLJSON.h index fd37207ea8..37a391a82f 100644 --- a/src/Functions/FunctionSQLJSON.h +++ b/src/Functions/FunctionSQLJSON.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Functions/FunctionsComparison.h b/src/Functions/FunctionsComparison.h index 263d9f95f1..8c811ab6fe 100644 --- a/src/Functions/FunctionsComparison.h +++ b/src/Functions/FunctionsComparison.h @@ -1081,7 +1081,7 @@ private: ColumnPtr executeGeneric(const ColumnWithTypeAndName & c0, const ColumnWithTypeAndName & c1) const { - DataTypePtr common_type = getLeastSupertype({c0.type, c1.type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{c0.type, c1.type}); ColumnPtr c0_converted = castColumn(c0, common_type); ColumnPtr c1_converted = castColumn(c1, common_type); diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index a6ae83c225..78b7096043 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -47,6 +47,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -1368,13 +1371,11 @@ inline bool 
tryParseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & /** Throw exception with verbose message when string value is not parsed completely. */ -[[noreturn]] inline void throwExceptionForIncompletelyParsedValue(ReadBuffer & read_buffer, const DataTypePtr result_type) +[[noreturn]] inline void throwExceptionForIncompletelyParsedValue(ReadBuffer & read_buffer, const IDataType & result_type) { - const IDataType & to_type = *result_type; - WriteBufferFromOwnString message_buf; message_buf << "Cannot parse string " << quote << String(read_buffer.buffer().begin(), read_buffer.buffer().size()) - << " as " << to_type.getName() + << " as " << result_type.getName() << ": syntax error"; if (read_buffer.offset()) @@ -1383,13 +1384,12 @@ inline bool tryParseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & else message_buf << " at begin of string"; - if (isNativeNumber(to_type)) - message_buf << ". Note: there are to" << to_type.getName() << "OrZero and to" << to_type.getName() << "OrNull functions, which returns zero/NULL instead of throwing exception."; + if (isNativeNumber(result_type)) + message_buf << ". Note: there are to" << result_type.getName() << "OrZero and to" << result_type.getName() << "OrNull functions, which return zero/NULL instead of throwing an exception."; throw Exception(message_buf.str(), ErrorCodes::CANNOT_PARSE_TEXT); } - /** Conversion of DateTime to Date: throw off time component. */ template struct ConvertImpl @@ -1676,7 +1676,7 @@ struct ConvertThroughParsing } if (!isAllRead(read_buffer)) - throwExceptionForIncompletelyParsedValue(read_buffer, res_type); + throwExceptionForIncompletelyParsedValue(read_buffer, *res_type); } else { @@ -1807,18 +1807,32 @@ struct ConvertImplGenericFromString static_assert(std::is_same_v || std::is_same_v, "Can be used only to parse from ColumnString or ColumnFixedString"); - const IColumn & col_from = *arguments[0].column; + const IColumn & column_from = *arguments[0].column; const IDataType & data_type_to = *result_type; - if (const StringColumnType * col_from_string = checkAndGetColumn(&col_from)) - { - auto res = data_type_to.createColumn(); + auto res = data_type_to.createColumn(); + auto serialization = data_type_to.getDefaultSerialization(); + const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr; - IColumn & column_to = *res; + executeImpl(column_from, *res, *serialization, input_rows_count, null_map, result_type.get()); + return res; + } + + static void executeImpl( + const IColumn & column_from, + IColumn & column_to, + const ISerialization & serialization_from, + size_t input_rows_count, + const PaddedPODArray * null_map = nullptr, + const IDataType * result_type = nullptr) + { + static_assert(std::is_same_v || std::is_same_v, + "Can be used only to parse from ColumnString or ColumnFixedString"); + + if (const StringColumnType * col_from_string = checkAndGetColumn(&column_from)) + { + column_to.reserve(input_rows_count); FormatSettings format_settings; - auto serialization = data_type_to.getDefaultSerialization(); - const auto * null_map = column_nullable ? 
&column_nullable->getNullMapData() : nullptr; for (size_t i = 0; i < input_rows_count; ++i) { if (null_map && (*null_map)[i]) @@ -1829,19 +1843,22 @@ struct ConvertImplGenericFromString const auto & val = col_from_string->getDataAt(i); ReadBufferFromMemory read_buffer(val.data, val.size); - - serialization->deserializeWholeText(column_to, read_buffer, format_settings); + serialization_from.deserializeWholeText(column_to, read_buffer, format_settings); if (!read_buffer.eof()) - throwExceptionForIncompletelyParsedValue(read_buffer, result_type); + { + if (result_type) + throwExceptionForIncompletelyParsedValue(read_buffer, *result_type); + else + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, + "Cannot parse string to column {}. Expected eof", column_to.getName()); + } } - - return res; } else - throw Exception("Illegal column " + arguments[0].column->getName() - + " of first argument of conversion function from string", - ErrorCodes::ILLEGAL_COLUMN); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of conversion function from string", + column_from.getName()); } }; @@ -3477,20 +3494,61 @@ private: throw Exception{"CAST AS Tuple can only be performed between tuple types or from String.\nLeft type: " + from_type_untyped->getName() + ", right type: " + to_type->getName(), ErrorCodes::TYPE_MISMATCH}; - if (from_type->getElements().size() != to_type->getElements().size()) - throw Exception{"CAST AS Tuple can only be performed between tuple types with the same number of elements or from String.\n" - "Left type: " + from_type->getName() + ", right type: " + to_type->getName(), ErrorCodes::TYPE_MISMATCH}; - const auto & from_element_types = from_type->getElements(); const auto & to_element_types = to_type->getElements(); - auto element_wrappers = getElementWrappers(from_element_types, to_element_types); - return [element_wrappers, from_element_types, to_element_types] + + std::vector element_wrappers; + std::vector> to_reverse_index; + + /// For named tuples, allow conversions between tuples with + /// different sets of elements. If an element exists in @to_type + /// and doesn't exist in @from_type, it will be filled with default values.
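+    /// E.g. CAST from Tuple(a Int32, b String) to Tuple(b String, c Float64):
+    /// `b` is converted by name, `c` is filled with default values, and `a`,
+    /// which is absent in the destination type, is dropped.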
+ if (from_type->haveExplicitNames() && to_type->haveExplicitNames()) + { + const auto & from_names = from_type->getElementNames(); + std::unordered_map from_positions; + from_positions.reserve(from_names.size()); + for (size_t i = 0; i < from_names.size(); ++i) + from_positions[from_names[i]] = i; + + const auto & to_names = to_type->getElementNames(); + element_wrappers.reserve(to_names.size()); + to_reverse_index.reserve(from_names.size()); + + for (size_t i = 0; i < to_names.size(); ++i) + { + auto it = from_positions.find(to_names[i]); + if (it != from_positions.end()) + { + element_wrappers.emplace_back(prepareUnpackDictionaries(from_element_types[it->second], to_element_types[i])); + to_reverse_index.emplace_back(it->second); + } + else + { + element_wrappers.emplace_back(); + to_reverse_index.emplace_back(); + } + } + } + else + { + if (from_element_types.size() != to_element_types.size()) + throw Exception{"CAST AS Tuple can only be performed between tuple types with the same number of elements or from String.\n" + "Left type: " + from_type->getName() + ", right type: " + to_type->getName(), ErrorCodes::TYPE_MISMATCH}; + + element_wrappers = getElementWrappers(from_element_types, to_element_types); + to_reverse_index.reserve(to_element_types.size()); + for (size_t i = 0; i < to_element_types.size(); ++i) + to_reverse_index.emplace_back(i); + } + + return [element_wrappers, from_element_types, to_element_types, to_reverse_index] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) -> ColumnPtr { const auto * col = arguments.front().column.get(); - size_t tuple_size = from_element_types.size(); + size_t tuple_size = to_element_types.size(); const ColumnTuple & column_tuple = typeid_cast(*col); Columns converted_columns(tuple_size); @@ -3498,14 +3556,84 @@ private: /// invoke conversion for each element for (size_t i = 0; i < tuple_size; ++i) { - ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_element_types[i], "" }}; - converted_columns[i] = element_wrappers[i](element, to_element_types[i], nullable_source, input_rows_count); + if (to_reverse_index[i]) + { + size_t from_idx = *to_reverse_index[i]; + ColumnsWithTypeAndName element = {{column_tuple.getColumns()[from_idx], from_element_types[from_idx], "" }}; + converted_columns[i] = element_wrappers[i](element, to_element_types[i], nullable_source, input_rows_count); + } + else + { + converted_columns[i] = to_element_types[i]->createColumn()->cloneResized(input_rows_count); + } } return ColumnTuple::create(converted_columns); }; } + WrapperType createObjectWrapper(const DataTypePtr & from_type, const DataTypeObject * to_type) const + { + if (const auto * from_tuple = checkAndGetDataType(from_type.get())) + { + if (!from_tuple->haveExplicitNames()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from a flattened named Tuple. Got: {}", from_type->getName()); + + PathsInData paths; + DataTypes from_types; + + std::tie(paths, from_types) = flattenTuple(from_type); + auto to_types = from_types; + + for (auto & type : to_types) + { + if (isTuple(type) || isNested(type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from a flattened named Tuple. 
Got: {}", from_type->getName()); + + type = recursiveRemoveLowCardinality(type); + } + + return [element_wrappers = getElementWrappers(from_types, to_types), + has_nullable_subcolumns = to_type->hasNullableSubcolumns(), from_types, to_types, paths] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) + { + size_t tuple_size = to_types.size(); + auto flattened_column = flattenTuple(arguments.front().column); + const auto & column_tuple = assert_cast(*flattened_column); + + if (tuple_size != column_tuple.getColumns().size()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Expected tuple with {} subcolumn, but got {} subcolumns", + tuple_size, column_tuple.getColumns().size()); + + auto res = ColumnObject::create(has_nullable_subcolumns); + for (size_t i = 0; i < tuple_size; ++i) + { + ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; + auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); + res->addSubcolumn(paths[i], converted_column->assumeMutable()); + } + + return res; + }; + } + else if (checkAndGetDataType(from_type.get())) + { + return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) + { + auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count); + auto & res_object = assert_cast(res->assumeMutableRef()); + res_object.finalize(); + return res; + }; + } + + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten named tuple or string. Got: {}", from_type->getName()); + } + /// The case of: tuple([key1, key2, ..., key_n], [value1, value2, ..., value_n]) WrapperType createTupleToMapWrapper(const DataTypes & from_kv_types, const DataTypes & to_kv_types) const { @@ -4140,6 +4268,8 @@ private: return createMapWrapper(from_type, checkAndGetDataType(to_type.get())); case TypeIndex::AggregateFunction: return createAggregateFunctionWrapper(from_type, checkAndGetDataType(to_type.get())); + case TypeIndex::Object: + return createObjectWrapper(from_type, checkAndGetDataType(to_type.get())); default: break; } diff --git a/src/Functions/FunctionsJSON.h b/src/Functions/FunctionsJSON.h index 48a1ca78ce..f4feac93ba 100644 --- a/src/Functions/FunctionsJSON.h +++ b/src/Functions/FunctionsJSON.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h index 1f8be6de9d..a797213465 100644 --- a/src/Functions/FunctionsRound.h +++ b/src/Functions/FunctionsRound.h @@ -687,7 +687,7 @@ public: throw Exception{"Elements of array of second argument of function " + getName() + " must be numeric type.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; } - return getLeastSupertype({type_x, type_arr_nested}); + return getLeastSupertype(DataTypes{type_x, type_arr_nested}); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override diff --git a/src/Functions/array/arrayIndex.h b/src/Functions/array/arrayIndex.h index fc94fa18f1..cf0653776f 100644 --- a/src/Functions/array/arrayIndex.h +++ b/src/Functions/array/arrayIndex.h @@ -531,7 +531,7 @@ private: auto arg_decayed = removeNullable(removeLowCardinality(arg)); return ((isNativeNumber(inner_type_decayed) || isEnum(inner_type_decayed)) && isNativeNumber(arg_decayed)) - || 
getLeastSupertype({inner_type_decayed, arg_decayed}); + || getLeastSupertype(DataTypes{inner_type_decayed, arg_decayed}); } #define INTEGRAL_TPL_PACK UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64 @@ -1005,7 +1005,7 @@ private: DataTypePtr array_elements_type = assert_cast(*arguments[0].type).getNestedType(); const DataTypePtr & index_type = arguments[1].type; - DataTypePtr common_type = getLeastSupertype({array_elements_type, index_type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{array_elements_type, index_type}); ColumnPtr col_nested = castColumn({ col->getDataPtr(), array_elements_type, "" }, common_type); diff --git a/src/Functions/array/arrayResize.cpp b/src/Functions/array/arrayResize.cpp index 30a6f963a6..6f2148b438 100644 --- a/src/Functions/array/arrayResize.cpp +++ b/src/Functions/array/arrayResize.cpp @@ -60,7 +60,7 @@ public: if (number_of_arguments == 2) return arguments[0]; else /* if (number_of_arguments == 3) */ - return std::make_shared(getLeastSupertype({array_type->getNestedType(), arguments[2]})); + return std::make_shared(getLeastSupertype(DataTypes{array_type->getNestedType(), arguments[2]})); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type, size_t input_rows_count) const override diff --git a/src/Functions/blockSerializedSize.cpp b/src/Functions/blockSerializedSize.cpp index d406984c51..35be65f3fe 100644 --- a/src/Functions/blockSerializedSize.cpp +++ b/src/Functions/blockSerializedSize.cpp @@ -54,7 +54,7 @@ public: auto serialization = elem.type->getDefaultSerialization(); - serialization->serializeBinaryBulkStatePrefix(settings, state); + serialization->serializeBinaryBulkStatePrefix(*full_column, settings, state); serialization->serializeBinaryBulkWithMultipleStreams(*full_column, 0 /** offset */, 0 /** limit */, settings, state); diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index aa78ff8afa..2d0036017d 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -575,7 +575,7 @@ private: const ColumnWithTypeAndName & arg1 = arguments[1]; const ColumnWithTypeAndName & arg2 = arguments[2]; - DataTypePtr common_type = getLeastSupertype({arg1.type, arg2.type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); ColumnPtr col_then = castColumn(arg1, common_type); ColumnPtr col_else = castColumn(arg2, common_type); @@ -913,7 +913,7 @@ public: throw Exception("Illegal type " + arguments[0]->getName() + " of first argument (condition) of function if. 
Must be UInt8.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - return getLeastSupertype({arguments[1], arguments[2]}); + return getLeastSupertype(DataTypes{arguments[1], arguments[2]}); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override diff --git a/src/Functions/ifNull.cpp b/src/Functions/ifNull.cpp index 8b910043ec..a353930b96 100644 --- a/src/Functions/ifNull.cpp +++ b/src/Functions/ifNull.cpp @@ -46,7 +46,7 @@ public: if (!arguments[0]->isNullable()) return arguments[0]; - return getLeastSupertype({removeNullable(arguments[0]), arguments[1]}); + return getLeastSupertype(DataTypes{removeNullable(arguments[0]), arguments[1]}); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override diff --git a/src/Functions/neighbor.cpp b/src/Functions/neighbor.cpp index e9b23cdb64..e428df0b60 100644 --- a/src/Functions/neighbor.cpp +++ b/src/Functions/neighbor.cpp @@ -76,7 +76,7 @@ public: // check that default value column has supertype with first argument if (number_of_arguments == 3) - return getLeastSupertype({arguments[0], arguments[2]}); + return getLeastSupertype(DataTypes{arguments[0], arguments[2]}); return arguments[0]; } diff --git a/src/Functions/transform.cpp b/src/Functions/transform.cpp index 706d7ae68f..2c67ee1f0e 100644 --- a/src/Functions/transform.cpp +++ b/src/Functions/transform.cpp @@ -11,6 +11,7 @@ #include #include #include +#include "DataTypes/IDataType.h" #include #include #include @@ -138,7 +139,7 @@ public: if (type_arr_to_nested->isValueRepresentedByNumber() && type_default->isValueRepresentedByNumber()) { /// We take the smallest common type for the elements of the array of values `to` and for `default`. - return getLeastSupertype({type_arr_to_nested, type_default}); + return getLeastSupertype(DataTypes{type_arr_to_nested, type_default}); } /// TODO More checks. diff --git a/src/Functions/tuple.cpp b/src/Functions/tuple.cpp index be661390f9..ca47962648 100644 --- a/src/Functions/tuple.cpp +++ b/src/Functions/tuple.cpp @@ -56,29 +56,36 @@ public: } bool useDefaultImplementationForConstants() const override { return true; } - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + // DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + // { + // if (arguments.empty()) + // throw Exception("Function " + getName() + " requires at least one argument.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + // DataTypes types; + // Strings names; + + // for (const auto & argument : arguments) + // { + // types.emplace_back(argument.type); + // names.emplace_back(argument.name); + // } + + // /// Create named tuple if possible. We don't print tuple element names + // /// because they are bad anyway -- aliases are not used, e.g. tuple(1 a) + // /// will have element name '1' and not 'a'. If we ever change this, and + // /// add the ability to access tuple elements by name, like tuple(1 a).a, + // /// we should probably enable printing for better discoverability. 
+ // if (DataTypeTuple::canBeCreatedWithNames(names)) + // return std::make_shared(types, names, false /*print names*/); + + // return std::make_shared(types); + // } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (arguments.empty()) throw Exception("Function " + getName() + " requires at least one argument.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - DataTypes types; - Strings names; - - for (const auto & argument : arguments) - { - types.emplace_back(argument.type); - names.emplace_back(argument.name); - } - - /// Create named tuple if possible. We don't print tuple element names - /// because they are bad anyway -- aliases are not used, e.g. tuple(1 a) - /// will have element name '1' and not 'a'. If we ever change this, and - /// add the ability to access tuple elements by name, like tuple(1 a).a, - /// we should probably enable printing for better discoverability. - if (DataTypeTuple::canBeCreatedWithNames(names)) - return std::make_shared(types, names, false /*print names*/); - - return std::make_shared(types); + return std::make_shared(arguments); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override diff --git a/src/IO/BufferBase.h b/src/IO/BufferBase.h index 263da60109..403aae8741 100644 --- a/src/IO/BufferBase.h +++ b/src/IO/BufferBase.h @@ -139,6 +139,10 @@ public: target.bytes = bytes; } + BufferBase(const BufferBase &) = default; + + BufferBase & operator=(const BufferBase &) = default; + virtual ~BufferBase() { // dummy function diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 42bfab7749..2b69bd8f65 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -274,6 +274,8 @@ void readString(String & s, ReadBuffer & buf) } template void readStringInto>(PaddedPODArray & s, ReadBuffer & buf); +template void readStringInto(String & s, ReadBuffer & buf); +// template void readStringInto(NullOutput & s, ReadBuffer & buf); template void readWordInto(Vector & s, ReadBuffer & buf) @@ -875,7 +877,6 @@ template bool readJSONStringInto, bool>(PaddedPODArray(NullOutput & s, ReadBuffer & buf); template void readJSONStringInto(String & s, ReadBuffer & buf); - template ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf) { diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index 221018e004..4199c9eb91 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -1605,14 +1605,14 @@ Block Aggregator::prepareBlockAndFill( if (aggregate_functions[i]->isState()) { - auto callback = [&](auto & subcolumn) + auto callback = [&](IColumn & subcolumn) { /// The ColumnAggregateFunction column captures the shared ownership of the arena with aggregate function states. 
- if (auto * column_aggregate_func = typeid_cast(subcolumn.get())) + if (auto * column_aggregate_func = typeid_cast(&subcolumn)) for (auto & pool : data_variants.aggregates_pools) column_aggregate_func->addArena(pool); }; - callback(final_aggregate_columns[i]); + callback(*final_aggregate_columns[i]); final_aggregate_columns[i]->forEachSubcolumnRecursively(callback); } } diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp index 0b9cec2d59..a082d6da87 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -62,29 +63,39 @@ namespace ClusterProxy SelectStreamFactory::SelectStreamFactory( const Block & header_, + const ColumnsDescriptionByShardNum & objects_by_shard_, + const StorageSnapshotPtr & storage_snapshot_, QueryProcessingStage::Enum processed_stage_, StorageID main_table_, const Scalars & scalars_, bool has_virtual_shard_num_column_, - const Tables & external_tables_) + const Tables & external_tables_, + ExpressionActionsPtr actions_for_remote_) : header(header_), + objects_by_shard(objects_by_shard_), + storage_snapshot(storage_snapshot_), processed_stage{processed_stage_}, main_table(std::move(main_table_)), table_func_ptr{nullptr}, scalars{scalars_}, has_virtual_shard_num_column(has_virtual_shard_num_column_), - external_tables{external_tables_} + external_tables{external_tables_}, + actions_for_remote{actions_for_remote_} { } SelectStreamFactory::SelectStreamFactory( const Block & header_, + const ColumnsDescriptionByShardNum & objects_by_shard_, + const StorageSnapshotPtr & storage_snapshot_, QueryProcessingStage::Enum processed_stage_, ASTPtr table_func_ptr_, const Scalars & scalars_, bool has_virtual_shard_num_column_, const Tables & external_tables_) : header(header_), + objects_by_shard(objects_by_shard_), + storage_snapshot(storage_snapshot_), processed_stage{processed_stage_}, table_func_ptr{table_func_ptr_}, scalars{scalars_}, @@ -249,6 +260,10 @@ void SelectStreamFactory::createForShard( } } + auto it = objects_by_shard.find(shard_info.shard_num); + if (it != objects_by_shard.end()) + replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, query_ast); + auto emplace_local_stream = [&]() { plans.emplace_back(createLocalPlan(modified_query_ast, modified_header, context, processed_stage)); diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.h b/src/Interpreters/ClusterProxy/SelectStreamFactory.h index 701351a172..405bc49b28 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h @@ -25,6 +25,7 @@ #include #include #include +#include namespace DB { @@ -32,21 +33,28 @@ namespace DB namespace ClusterProxy { +using ColumnsDescriptionByShardNum = std::unordered_map; + class SelectStreamFactory final : public IStreamFactory { public: /// Database in a query. SelectStreamFactory( const Block & header_, + const ColumnsDescriptionByShardNum & objects_by_shard_, + const StorageSnapshotPtr & storage_snapshot_, QueryProcessingStage::Enum processed_stage_, StorageID main_table_, const Scalars & scalars_, bool has_virtual_shard_num_column_, - const Tables & external_tables); + const Tables & external_tables, + ExpressionActionsPtr actions_for_remote_ = nullptr); /// TableFunction in a query. 
SelectStreamFactory( const Block & header_, + const ColumnsDescriptionByShardNum & objects_by_shard_, + const StorageSnapshotPtr & storage_snapshot_, QueryProcessingStage::Enum processed_stage_, ASTPtr table_func_ptr_, const Scalars & scalars_, @@ -64,12 +72,16 @@ public: private: const Block header; + const ColumnsDescriptionByShardNum objects_by_shard; + const StorageSnapshotPtr storage_snapshot; QueryProcessingStage::Enum processed_stage; StorageID main_table = StorageID::createEmpty(); ASTPtr table_func_ptr; Scalars scalars; bool has_virtual_shard_num_column = false; Tables external_tables; + + ExpressionActionsPtr actions_for_remote; }; } diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 79f127c4a9..e5ea1f0ef4 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -38,25 +38,30 @@ #include #include #include +#include #include #include #include #include #include +#include +#include +#include +#include #include #include #include #include -#include #include -#include "Core/SettingsEnums.h" -#include "Databases/DatabaseExternalHive.h" -#include "Interpreters/StorageID.h" -#include "Transaction/TxnTimestamp.h" +#include +#include +#include +#include +#include #if !defined(ARCADIA_BUILD) -# include "config_core.h" +# include #endif #if USE_MYSQL @@ -378,6 +383,12 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( if (!table) database = nullptr; + if (table && hasDynamicSubcolumns(table->getInMemoryMetadata().getColumns())) + { + if (auto cnch_table = std::dynamic_pointer_cast(table)) + cnch_table->resetObjectColumns(context_); + } + return {database, table}; } diff --git a/src/Interpreters/DistributedStages/InterpreterDistributedStages.cpp b/src/Interpreters/DistributedStages/InterpreterDistributedStages.cpp index 0bfa2388d7..bdde12217d 100644 --- a/src/Interpreters/DistributedStages/InterpreterDistributedStages.cpp +++ b/src/Interpreters/DistributedStages/InterpreterDistributedStages.cpp @@ -125,11 +125,12 @@ PlanSegmentPtr MockPlanSegment(ContextPtr context) */ StorageID table_id = StorageID("system", "one"); StoragePtr storage = DatabaseCatalog::instance().getTable(table_id, context); - + auto metadata_snapshot = storage->getInMemoryMetadataPtr(); + auto storage_snapshot = storage->getStorageSnapshot(metadata_snapshot, context); QueryPlan query_plan; SelectQueryInfo select_query_info; - storage->read(query_plan, {"dummy"}, storage->getInMemoryMetadataPtr(), select_query_info, context, {}, 0, 0); + storage->read(query_plan, {"dummy"}, storage_snapshot, select_query_info, context, {}, 0, 0); plan_segment->setQueryPlan(std::move(query_plan)); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 29fcfbfc72..2aa9aee442 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -89,6 +90,7 @@ #include #include +#include #include #include #include @@ -900,6 +902,21 @@ void InterpreterCreateQuery::validateTableStructure(const ASTCreateQuery & creat } } } + + if (!create.attach && !settings.allow_experimental_object_type) + { + for (const auto & [name, type] : properties.columns.getAllPhysical()) + { + if (isObject(type)) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Cannot create table with column '{}' which type is '{}' " + "because experimental Object type is not allowed. 
" + "Set setting allow_experimental_object_type = 1 in order to allow it", + name, type->getName()); + } + } + } } void InterpreterCreateQuery::setEngine(ASTCreateQuery & create) const @@ -1533,6 +1550,14 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, /// we can safely destroy the object without a call to "shutdown", because there is guarantee /// that no background threads/similar resources remain after exception from "startup". + if (!res->supportsDynamicSubcolumns() && hasDynamicSubcolumns(res->getInMemoryMetadataPtr()->getColumns())) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Cannot create table with column of type Object, " + "because storage {} doesn't support dynamic subcolumns", + res->getName()); + } + res->startup(); return true; } diff --git a/src/Interpreters/InterpreterDescribeQuery.cpp b/src/Interpreters/InterpreterDescribeQuery.cpp index 4d686d0728..6cf0fd1eca 100644 --- a/src/Interpreters/InterpreterDescribeQuery.cpp +++ b/src/Interpreters/InterpreterDescribeQuery.cpp @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -53,7 +55,7 @@ BlockIO InterpreterDescribeQuery::execute() } -Block InterpreterDescribeQuery::getSampleBlock(ContextPtr context_) +Block InterpreterDescribeQuery::getSampleBlock(ContextPtr context_, bool include_subcolumns) { Block block; @@ -90,6 +92,14 @@ Block InterpreterDescribeQuery::getSampleBlock(ContextPtr context_) col.name = "ttl_expression"; block.insert(col); + if (include_subcolumns) + { + col.name = "is_subcolumn"; + col.type = std::make_shared(); + col.column = col.type->createColumn(); + block.insert(col); + } + return block; } @@ -97,9 +107,12 @@ Block InterpreterDescribeQuery::getSampleBlock(ContextPtr context_) BlockInputStreamPtr InterpreterDescribeQuery::executeImpl() { ColumnsDescription columns; + StorageSnapshotPtr storage_snapshot; const auto & ast = query_ptr->as(); const auto & table_expression = ast.table_expression->as(); + const auto & settings = getContext()->getSettingsRef(); + if (table_expression.subquery) { auto names_and_types = InterpreterSelectWithUnionQuery::getSampleBlock( @@ -116,12 +129,15 @@ BlockInputStreamPtr InterpreterDescribeQuery::executeImpl() auto table_id = getContext()->resolveStorageID(table_expression.database_and_table_name); getContext()->checkAccess(AccessType::SHOW_COLUMNS, table_id); auto table = DatabaseCatalog::instance().getTable(table_id, getContext()); - auto table_lock = table->lockForShare(getContext()->getInitialQueryId(), getContext()->getSettingsRef().lock_acquire_timeout); + auto table_lock = table->lockForShare(getContext()->getInitialQueryId(), settings.lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); + storage_snapshot = table->getStorageSnapshot(metadata_snapshot, getContext()); columns = metadata_snapshot->getColumns(); } - Block sample_block = getSampleBlock(getContext()); + bool extend_object_types = settings.describe_extend_object_types && storage_snapshot; + bool include_subcolumns = settings.describe_include_subcolumns; + Block sample_block = getSampleBlock(getContext(), include_subcolumns); MutableColumns res_columns = sample_block.cloneEmptyColumns(); auto dialect_type = getContext()->getSettingsRef().dialect_type; @@ -130,22 +146,21 @@ BlockInputStreamPtr InterpreterDescribeQuery::executeImpl() { size_t i = 0; res_columns[i++]->insert(column.name); - - /// Under ANSI mode, data type will be parsed by ANSI type parser, which converts Nullable to - /// null modifiers. 
And the nullability of root type will be demonstrated in the separate - /// field: `nullable`. For instance, the type field Nullable(Array(Array(Nullable(Array(String))))) - /// will be divided into two fields under ANSI mode: - /// type: Array(Array(Array(String NOT NULL) NULL) NOT NULL - /// nullable: true - if (dialect_type == DialectType::ANSI) + if (extend_object_types) + res_columns[i++]->insert(storage_snapshot->getConcreteType(column.name)->getName()); + else if (dialect_type == DialectType::ANSI) { + /// Under ANSI mode, data type will be parsed by ANSI type parser, which converts Nullable to + /// null modifiers. And the nullability of root type will be demonstrated in the separate + /// field: `nullable`. For instance, the type field Nullable(Array(Array(Nullable(Array(String))))) + /// will be divided into two fields under ANSI mode: + /// type: Array(Array(Array(String NOT NULL) NULL) NOT NULL + /// nullable: true ParserDataType type_parser(ParserSettings::ANSI); String type_name = column.type->getName(); const char * type_name_pos = type_name.data(); const char * type_name_end = type_name_pos + type_name.size(); - auto type_ast = parseQuery(type_parser, - type_name_pos, type_name_end, "data type", 0, - DBMS_DEFAULT_MAX_PARSER_DEPTH); + auto type_ast = parseQuery(type_parser, type_name_pos, type_name_end, "data type", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH); bool nullable; if (const auto * t_ast = type_ast->as()) @@ -198,6 +213,38 @@ BlockInputStreamPtr InterpreterDescribeQuery::executeImpl() res_columns[i++]->insertDefault(); } + if (include_subcolumns) + { + for (const auto & column : columns) + { + auto type = extend_object_types ? storage_snapshot->getConcreteType(column.name) : column.type; + + IDataType::forEachSubcolumn([&](const auto & path, const auto & name, const auto & data) + { + res_columns[0]->insert(Nested::concatenateName(column.name, name)); + res_columns[1]->insert(data.type->getName()); + + /// It's not trivial to calculate default expression for subcolumn. + /// So, leave it empty. 
+ res_columns[2]->insertDefault(); + res_columns[3]->insertDefault(); + res_columns[4]->insert(column.comment); + + if (column.codec && ISerialization::isSpecialCompressionAllowed(path)) + res_columns[5]->insert(queryToString(column.codec->as()->arguments)); + else + res_columns[5]->insertDefault(); + + if (column.ttl) + res_columns[6]->insert(queryToString(column.ttl)); + else + res_columns[6]->insertDefault(); + + res_columns[7]->insert(1u); + }, ISerialization::SubstreamData(type->getDefaultSerialization()).withType(type)); + } + } + return std::make_shared(sample_block.cloneWithColumns(std::move(res_columns))); } diff --git a/src/Interpreters/InterpreterDescribeQuery.h b/src/Interpreters/InterpreterDescribeQuery.h index bdb4d307f3..b0882b3ab5 100644 --- a/src/Interpreters/InterpreterDescribeQuery.h +++ b/src/Interpreters/InterpreterDescribeQuery.h @@ -16,7 +16,7 @@ public: BlockIO execute() override; - static Block getSampleBlock(ContextPtr context_); + static Block getSampleBlock(ContextPtr context_, bool include_subcolumns); private: ASTPtr query_ptr; diff --git a/src/Interpreters/InterpreterOptimizeQuery.cpp b/src/Interpreters/InterpreterOptimizeQuery.cpp index 64de5ee047..1ecf7817eb 100644 --- a/src/Interpreters/InterpreterOptimizeQuery.cpp +++ b/src/Interpreters/InterpreterOptimizeQuery.cpp @@ -32,6 +32,7 @@ BlockIO InterpreterOptimizeQuery::execute() auto table_id = getContext()->resolveStorageID(ast, Context::ResolveOrdinary); StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); auto metadata_snapshot = table->getInMemoryMetadataPtr(); + auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot, getContext()); // Empty list of names means we deduplicate by all columns, but user can explicitly state which columns to use. 
Names column_names; @@ -46,7 +47,7 @@ BlockIO InterpreterOptimizeQuery::execute() column_names.emplace_back(col->getColumnName()); } - metadata_snapshot->check(column_names, NamesAndTypesList{}, table_id); + storage_snapshot->check(column_names); Names required_columns; { required_columns = metadata_snapshot->getColumnsRequiredForSortingKey(); diff --git a/src/Interpreters/InterpreterPerfectShard.cpp b/src/Interpreters/InterpreterPerfectShard.cpp index 21903c0c6d..3a3a9194e6 100644 --- a/src/Interpreters/InterpreterPerfectShard.cpp +++ b/src/Interpreters/InterpreterPerfectShard.cpp @@ -209,7 +209,7 @@ void InterpreterPerfectShard::sendQuery(QueryPlan & query_plan) } ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory( - header, processed_stage, StorageID(main_database, main_table), scalars, false, context->getExternalTables() + header, {}, {}, processed_stage, StorageID(main_database, main_table), scalars, false, context->getExternalTables() ); LOG_TRACE(log, "Perfect-Shard will send query {} ", queryToString(query)); diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 5c41958e5f..de8f3b2a71 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -167,7 +167,7 @@ String InterpreterSelectQuery::generateFilterActions(ActionsDAGPtr & actions, co table_expr->children.push_back(table_expr->database_and_table_name); /// Using separate expression analyzer to prevent any possible alias injection - auto syntax_result = TreeRewriter(context).analyzeSelect(query_ast, TreeRewriterResult({}, storage, metadata_snapshot)); + auto syntax_result = TreeRewriter(context).analyzeSelect(query_ast, TreeRewriterResult({}, storage, storage_snapshot)); SelectQueryExpressionAnalyzer analyzer(query_ast, syntax_result, context, metadata_snapshot); actions = analyzer.simpleSelectActions(); @@ -366,6 +366,8 @@ InterpreterSelectQuery::InterpreterSelectQuery( table_id = storage->getStorageID(); if (!metadata_snapshot) metadata_snapshot = storage->getInMemoryMetadataPtr(); + + storage_snapshot = storage->getStorageSnapshotForQuery(metadata_snapshot, query_ptr, context); } if (has_input || !joined_tables.resolveTables()) @@ -424,7 +426,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( syntax_analyzer_result = TreeRewriter(context).analyzeSelect( query_ptr, - TreeRewriterResult(source_header.getNamesAndTypesList(), storage, metadata_snapshot), + TreeRewriterResult(source_header.getNamesAndTypesList(), storage, storage_snapshot), options, joined_tables.tablesWithColumns(), required_result_column_names, table_join); query_info.syntax_analyzer_result = syntax_analyzer_result; @@ -563,7 +565,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( } } - source_header = metadata_snapshot->getSampleBlockForColumns(required_columns, storage->getVirtuals(), storage->getStorageID()); + source_header = storage_snapshot->getSampleBlockForColumns(required_columns); } /// Calculate structure of the result. 
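
The hunks in this file replace ad-hoc metadata lookups with a StorageSnapshot captured once per query (see getStorageSnapshotForQuery above), so every later header and column resolution observes the same table state. A standalone sketch of why the capture matters, with illustrative stand-in types rather than the real StorageSnapshot API:

```cpp
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

// Stand-ins: a "type" is just its name; the real code deals in Block and DataTypePtr.
using TypeName = std::string;
using ColumnsByName = std::map<std::string, TypeName>;

// A snapshot copies the column set visible to one query at its start,
// so later DDL on the live table cannot skew headers derived from it.
struct Snapshot
{
    ColumnsByName columns;

    explicit Snapshot(const ColumnsByName & live_columns) : columns(live_columns) {}

    // Loosely modeled on getSampleBlockForColumns: resolve names against the snapshot.
    std::vector<TypeName> getTypesForColumns(const std::vector<std::string> & names) const
    {
        std::vector<TypeName> result;
        for (const auto & name : names)
        {
            auto it = columns.find(name);
            if (it == columns.end())
                throw std::runtime_error("No column " + name + " in snapshot");
            result.push_back(it->second);
        }
        return result;
    }
};

int main()
{
    ColumnsByName live{{"id", "UInt64"}, {"payload", "Object('json')"}};
    Snapshot snapshot(live);
    live.erase("payload");  // a concurrent ALTER mutates the live table...
    // ...but the query still resolves against the state it captured:
    return snapshot.getTypesForColumns({"id", "payload"}).size() == 2 ? 0 : 1;
}
```
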
@@ -629,6 +631,9 @@ InterpreterSelectQuery::InterpreterSelectQuery( analysis_result.required_columns = required_columns; } + if (query_info.projection) + storage_snapshot->addProjection(query_info.projection->desc); + LOG_TRACE(log, "query: " + queryToString(query)); // std::ostringstream ostr; // for (auto & c : required_columns) @@ -729,8 +734,7 @@ Block InterpreterSelectQuery::getSampleBlockImpl() if (storage && !options.only_analyze) { - from_stage = storage->getQueryProcessingStage(context, options.to_stage, metadata_snapshot, query_info); - + from_stage = storage->getQueryProcessingStage(context, options.to_stage, storage_snapshot, query_info); /// TODO how can we make IN index work if we cache parts before selecting a projection? /// XXX Used for IN set index analysis. Is this a proper way? if (query_info.projection) @@ -1888,7 +1892,7 @@ void InterpreterSelectQuery::addPrewhereAliasActions() } auto syntax_result - = TreeRewriter(context).analyze(required_columns_all_expr, required_columns_after_prewhere, storage, metadata_snapshot); + = TreeRewriter(context).analyze(required_columns_all_expr, required_columns_after_prewhere, storage, storage_snapshot); alias_actions = ExpressionAnalyzer(required_columns_all_expr, syntax_result, context).getActionsDAG(true); /// The set of required columns could be added as a result of adding an action to calculate ALIAS. @@ -2199,9 +2203,9 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc * can be serialized / deserialized between workers. */ if (options.distributed_stages) - storage->read(query_plan, required_columns, metadata_snapshot, query_info, context, processing_stage, max_block_size, max_streams, true); + storage->read(query_plan, required_columns, storage_snapshot, query_info, context, processing_stage, max_block_size, max_streams, true); else - storage->read(query_plan, required_columns, metadata_snapshot, query_info, context, processing_stage, max_block_size, max_streams); + storage->read(query_plan, required_columns, storage_snapshot, query_info, context, processing_stage, max_block_size, max_streams); if (context->hasQueryContext() && !options.is_internal) { @@ -2216,10 +2220,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc /// Create step which reads from empty source if storage has no data. if (!query_plan.isInitialized()) { - auto header = query_info.projection - ? 
query_info.projection->desc->metadata->getSampleBlockForColumns( - query_info.projection->required_columns, storage->getVirtuals(), storage->getStorageID()) - : metadata_snapshot->getSampleBlockForColumns(required_columns, storage->getVirtuals(), storage->getStorageID()); + auto header = storage_snapshot->getSampleBlockForColumns(required_columns); /// add bitmap index result column for null source if (auto * bitmap_index_info = dynamic_cast(query_analyzer->getIndexContext()->get(MergeTreeIndexInfo::Type::BITMAP).get())) diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index ad2a30956a..866d070c25 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -247,6 +247,7 @@ private: Poco::Logger * log; StorageMetadataPtr metadata_snapshot; bool has_join = false; + StorageSnapshotPtr storage_snapshot; }; } diff --git a/src/Interpreters/InterpreterSelectQueryUseOptimizer.cpp b/src/Interpreters/InterpreterSelectQueryUseOptimizer.cpp index 124332ac9a..b4b96154ed 100644 --- a/src/Interpreters/InterpreterSelectQueryUseOptimizer.cpp +++ b/src/Interpreters/InterpreterSelectQueryUseOptimizer.cpp @@ -42,6 +42,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -61,6 +65,7 @@ namespace DB namespace ErrorCodes { extern const int TOO_MANY_PLAN_SEGMENTS; +extern const int OPTIMIZER_NONSUPPORT; extern const int LOGICAL_ERROR; } @@ -150,6 +155,16 @@ QueryPlanPtr InterpreterSelectQueryUseOptimizer::buildQueryPlan(bool skip_optimi return query_plan; } +static void blockQueryJSONUseOptimizer(std::set used_storage_ids, ContextMutablePtr context) +{ + for (const auto & storage_id : used_storage_ids) + { + auto storage = DatabaseCatalog::instance().getTable(storage_id, context); + if (hasDynamicSubcolumns(storage->getInMemoryMetadata().columns)) + throw Exception("JSON query is not supported in Optimizer mode.", ErrorCodes::OPTIMIZER_NONSUPPORT); + } +} + std::pair> InterpreterSelectQueryUseOptimizer::getPlanSegment() { Stopwatch stage_watch, total_watch; @@ -170,6 +185,8 @@ std::pair> InterpreterSelectQueryUseOpti stage_watch.restart(); std::set used_storage_ids = plan.allocateLocalTable(context); + + blockQueryJSONUseOptimizer(used_storage_ids, context); // select health worker before split if (context->getSettingsRef().enable_adaptive_scheduler && context->tryGetCurrentWorkerGroup()) { diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index f99b832b34..3f5bc3db1d 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -829,7 +829,7 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector & for (const String & column : stage.output_columns) all_asts->children.push_back(std::make_shared(column)); - auto syntax_result = TreeRewriter(context).analyze(all_asts, all_columns, storage, metadata_snapshot); + auto syntax_result = TreeRewriter(context).analyze(all_asts, all_columns, storage, storage->getStorageSnapshot(metadata_snapshot, context)); if (context->hasQueryContext()) for (const auto & it : syntax_result->getScalars()) context->getQueryContext()->addScalar(it.first, it.second); diff --git a/src/Interpreters/RuntimeFilter/RuntimeFilterTypes.h b/src/Interpreters/RuntimeFilter/RuntimeFilterTypes.h index 6735b0d631..e7e6bac9df 100644 --- a/src/Interpreters/RuntimeFilter/RuntimeFilterTypes.h +++ b/src/Interpreters/RuntimeFilter/RuntimeFilterTypes.h @@ -12,20 +12,6 
@@ namespace DB { -#define FOR_NUMERIC_TYPES(M) \ -M(UInt8) \ -M(UInt16) \ -M(UInt32) \ -M(UInt64) \ -M(UInt128) \ -M(UInt256) \ -M(Int8) \ -M(Int16) \ -M(Int32) \ -M(Int64) \ -M(Int128) \ -M(Int256) - static size_t getFieldDiff(const Field & min, const Field & max) { if (min.getType() == Field::Types::Int64) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 2cb1d9809f..3b8a9a2eb7 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -493,7 +493,7 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam DataTypePtr supertype; try { - supertype = DB::getLeastSupertype({ltype->second, rtype->second}, allow_extended_conversion); + supertype = DB::getLeastSupertype(DataTypes{ltype->second, rtype->second}, allow_extended_conversion); } catch (DB::Exception & ex) { diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 4d1751d978..1e767f15d6 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -657,8 +657,8 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result, throw Exception("Select analyze for not select asts.", ErrorCodes::LOGICAL_ERROR); if (settings.optimize_functions_to_subcolumns && result.storage - && result.storage->supportsSubcolumns() && result.metadata_snapshot) - optimizeFunctionsToSubcolumns(query, result.metadata_snapshot); + && result.storage->supportsSubcolumns() && result.storage_snapshot) + optimizeFunctionsToSubcolumns(query, result.storage_snapshot->metadata); optimizeIf(query, result.aliases, settings.optimize_if_chain_to_multiif); @@ -719,7 +719,7 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result, /// Replace monotonous functions with its argument if (settings.optimize_monotonous_functions_in_order_by) optimizeMonotonousFunctionsInOrderBy(select_query, context, tables_with_columns, - result.metadata_snapshot ? result.metadata_snapshot->getSortingKeyColumns() : Names{}); + result.storage_snapshot ? result.storage_snapshot->metadata->getSortingKeyColumns() : Names{}); /// Remove duplicate items from ORDER BY. /// Execute it after all order by optimizations, diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index cb2ad04b09..1bed8a22ee 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -19,6 +19,7 @@ * All Bytedance's Modifications are Copyright (2023) Bytedance Ltd. and/or its affiliates. */ +#include #include #include @@ -931,10 +932,10 @@ void markTupleLiteralsAsLegacy(ASTPtr & query) TreeRewriterResult::TreeRewriterResult( const NamesAndTypesList & source_columns_, ConstStoragePtr storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, bool add_special) : storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , source_columns(source_columns_) { collectSourceColumns(add_special); @@ -947,13 +948,12 @@ void TreeRewriterResult::collectSourceColumns(bool add_special) { if (storage) { - const ColumnsDescription & columns = metadata_snapshot->getColumns(); - - NamesAndTypesList columns_from_storage; + auto options = GetColumnsOptions(add_special ? GetColumnsOptions::All : GetColumnsOptions::AllPhysical); + options.withExtendedObjects(); if (storage->supportsSubcolumns()) - columns_from_storage = add_special ? 
columns.getAllWithSubcolumns() : columns.getAllPhysicalWithSubcolumns(); - else - columns_from_storage = add_special ? columns.getAll() : columns.getAllPhysical(); + options.withSubcolumns(); + + auto columns_from_storage = storage_snapshot->getColumns(options); if (source_columns.empty()) source_columns.swap(columns_from_storage); @@ -1060,9 +1060,9 @@ void TreeRewriterResult::collectUsedColumns(const ContextPtr & context, ASTPtr & /// If we have no information about columns sizes, choose a column of minimum size of its data type. required.insert(ExpressionActions::getSmallestColumn(source_columns)); } - else if (is_select && metadata_snapshot && !columns_context.has_array_join) + else if (is_select && storage_snapshot && !columns_context.has_array_join) { - const auto & partition_desc = metadata_snapshot->getPartitionKey(); + const auto & partition_desc = storage_snapshot->metadata->getPartitionKey(); if (partition_desc.expression) { auto partition_source_columns = partition_desc.expression->getRequiredColumns(); @@ -1363,7 +1363,16 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( all_source_columns_set.insert(name); } - normalize(query, result.aliases, all_source_columns_set, select_options.ignore_alias, settings, /* allow_self_aliases = */ true, getContext(), result.storage, result.metadata_snapshot); + normalize( + query, + result.aliases, + all_source_columns_set, + select_options.ignore_alias, + settings, + /* allow_self_aliases = */ true, + getContext(), + result.storage, + result.storage_snapshot ? result.storage_snapshot->metadata : nullptr); // expand GROUP BY ALL if (select_query->group_by_all) @@ -1392,10 +1401,10 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( if (const auto * join_ast = select_query->join(); join_ast && tables_with_columns.size() >= 2) { auto & table_join_ast = join_ast->table_join->as(); - if (table_join_ast.using_expression_list && result.metadata_snapshot) - replaceAliasColumnsInQuery(table_join_ast.using_expression_list, result.metadata_snapshot->getColumns(), result.array_join_result_to_source, getContext()); - if (table_join_ast.on_expression && result.metadata_snapshot) - replaceAliasColumnsInQuery(table_join_ast.on_expression, result.metadata_snapshot->getColumns(), result.array_join_result_to_source, getContext()); + if (table_join_ast.using_expression_list && result.storage_snapshot && result.storage_snapshot->metadata) + replaceAliasColumnsInQuery(table_join_ast.using_expression_list, result.storage_snapshot->metadata->getColumns(), result.array_join_result_to_source, getContext()); + if (table_join_ast.on_expression && result.storage_snapshot && result.storage_snapshot->metadata) + replaceAliasColumnsInQuery(table_join_ast.on_expression, result.storage_snapshot->metadata->getColumns(), result.array_join_result_to_source, getContext()); collectJoinedColumns(*result.analyzed_join, table_join_ast, tables_with_columns, result.aliases, settings.join_using_null_safe, settings.ignore_array_join_check_in_join_on_condition, getContext(), settings.enable_join_on_1_equals_1); } @@ -1407,10 +1416,10 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( /// rewrite filters for select query, must go after getArrayJoinedColumns bool is_initiator = getContext()->getClientInfo().distributed_depth == 0; - if (settings.optimize_respect_aliases && result.metadata_snapshot && is_initiator) + if (settings.optimize_respect_aliases && result.storage_snapshot && is_initiator) { /// If query is changed, we need to redo some work to correct name resolution. 
- if (replaceAliasColumnsInQuery(query, result.metadata_snapshot->getColumns(), result.array_join_result_to_source, getContext())) + if (replaceAliasColumnsInQuery(query, result.storage_snapshot->metadata->getColumns(), result.array_join_result_to_source, getContext())) { result.aggregates = getAggregates(query, *select_query); result.window_function_asts = getWindowFunctions(query, *select_query); @@ -1434,7 +1443,7 @@ TreeRewriterResultPtr TreeRewriter::analyze( ASTPtr & query, const NamesAndTypesList & source_columns, ConstStoragePtr storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, bool allow_aggregations, bool allow_self_aliases) const { @@ -1443,9 +1452,18 @@ TreeRewriterResultPtr TreeRewriter::analyze( const auto & settings = getContext()->getSettingsRef(); - TreeRewriterResult result(source_columns, storage, metadata_snapshot, false); + TreeRewriterResult result(source_columns, storage, storage_snapshot, false); - normalize(query, result.aliases, result.source_columns_set, false, settings, allow_self_aliases, this->getContext(), storage, metadata_snapshot); + normalize( + query, + result.aliases, + result.source_columns_set, + false, + settings, + allow_self_aliases, + this->getContext(), + storage, + storage_snapshot ? storage_snapshot->metadata : nullptr); /// Executing scalar subqueries. Column defaults could be a scalar subquery. executeScalarSubqueries(query, getContext(), 0, result.scalars, false); diff --git a/src/Interpreters/TreeRewriter.h b/src/Interpreters/TreeRewriter.h index 8bd5056180..1559d9db3e 100644 --- a/src/Interpreters/TreeRewriter.h +++ b/src/Interpreters/TreeRewriter.h @@ -28,6 +28,7 @@ #include #include #include +#include namespace DB { @@ -44,7 +45,7 @@ using StorageMetadataPtr = std::shared_ptr; struct TreeRewriterResult { ConstStoragePtr storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; std::shared_ptr analyzed_join; const ASTTablesInSelectQueryElement * ast_join = nullptr; @@ -99,10 +100,10 @@ struct TreeRewriterResult /// Rewrite columns for compatibility. 
TablesWithColumns join_tables_to_rewrite; - TreeRewriterResult( + explicit TreeRewriterResult( const NamesAndTypesList & source_columns_, ConstStoragePtr storage_ = {}, - const StorageMetadataPtr & metadata_snapshot_ = {}, + const StorageSnapshotPtr & storage_snapshot_ = {}, bool add_special = true); void collectSourceColumns(bool add_special); @@ -137,7 +138,7 @@ public: ASTPtr & query, const NamesAndTypesList & source_columns_, ConstStoragePtr storage = {}, - const StorageMetadataPtr & metadata_snapshot = {}, + const StorageSnapshotPtr & storage_snapshot = {}, bool allow_aggregations = false, bool allow_self_aliases = true) const; diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 2d1cf4ce34..4e773a5594 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -298,6 +299,8 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID } return src; } + + return applyVisitor(FieldVisitorToString(), src); } else if (const DataTypeArray * type_array = typeid_cast(&type)) { @@ -407,6 +410,48 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID if (src.getType() == Field::Types::BitMap64) return src; } + else if (isObject(type)) + { + if (src.getType() == Field::Types::Object) + return src; /// Already in needed type. + + const auto * from_type_tuple = typeid_cast(from_type_hint); + if (src.getType() == Field::Types::Tuple && from_type_tuple && from_type_tuple->haveExplicitNames()) + { + const auto & names = from_type_tuple->getElementNames(); + const auto & tuple = src.get(); + + if (names.size() != tuple.size()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Bad size of tuple in IN or VALUES section (while converting to Object). Expected size: {}, actual size: {}", + names.size(), tuple.size()); + + Object object; + for (size_t i = 0; i < names.size(); ++i) + object[names[i]] = tuple[i]; + + return object; + } + + if (src.getType() == Field::Types::Map) + { + Object object; + const auto & map = src.get(); + for (const auto & i : map) + { + const auto & key = i.first; + const auto & value = i.second; + + if (key.getType() != Field::Types::String) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cannot convert from Map with key of type {} to Object", key.getTypeName()); + + object[key.get()] = value; + } + + return object; + } + } /// Conversion from string by parsing. 
 if (src.getType() == Field::Types::String)
diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp
index 7b7e7d2722..b311e25f46 100644
--- a/src/Interpreters/executeQuery.cpp
+++ b/src/Interpreters/executeQuery.cpp
@@ -116,7 +116,7 @@
 #include
 #include
 #include
-#include "Interpreters/Context_fwd.h"
+#include <Interpreters/Context_fwd.h>
 #include
 #include
diff --git a/src/Interpreters/getColumnFromBlock.cpp b/src/Interpreters/getColumnFromBlock.cpp
new file mode 100644
index 0000000000..ce6fa2904d
--- /dev/null
+++ b/src/Interpreters/getColumnFromBlock.cpp
@@ -0,0 +1,50 @@
+#include <Interpreters/getColumnFromBlock.h>
+#include <Interpreters/castColumn.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NOT_FOUND_COLUMN_IN_BLOCK;
+}
+
+ColumnPtr tryGetColumnFromBlock(const Block & block, const NameAndTypePair & requested_column)
+{
+    const auto * elem = block.findByName(requested_column.getNameInStorage());
+    if (!elem)
+        return nullptr;
+
+    DataTypePtr elem_type;
+    ColumnPtr elem_column;
+
+    if (requested_column.isSubcolumn())
+    {
+        auto subcolumn_name = requested_column.getSubcolumnName();
+        elem_type = elem->type->tryGetSubcolumnType(subcolumn_name);
+        elem_column = elem->type->tryGetSubcolumn(subcolumn_name, elem->column);
+
+        if (!elem_type || !elem_column)
+            return nullptr;
+    }
+    else
+    {
+        elem_type = elem->type;
+        elem_column = elem->column;
+    }
+
+    return castColumn({elem_column, elem_type, ""}, requested_column.type);
+}
+
+ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & requested_column)
+{
+    auto result_column = tryGetColumnFromBlock(block, requested_column);
+    if (!result_column)
+        throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK,
+            "Not found column or subcolumn {} in block. There are only columns: {}",
+            requested_column.name, block.dumpNames());
+
+    return result_column;
+}
+
+}
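
tryGetColumnFromBlock above resolves the physical column by its name in storage, then narrows to the requested subcolumn through the type, returning nullptr on any miss; getColumnFromBlock turns a miss into NOT_FOUND_COLUMN_IN_BLOCK. A toy model of that resolve-then-narrow, try/throw pairing (hypothetical stand-in types, not the real Block or IDataType API):

```cpp
#include <memory>
#include <optional>
#include <stdexcept>
#include <string>
#include <unordered_map>

// Hypothetical stand-ins: a "column" is just name -> value, a subcolumn one entry of it.
// The real code works on Block, IDataType and COW columns instead.
using Column = std::unordered_map<std::string, int>;
using ColumnPtr = std::shared_ptr<const Column>;

struct Block
{
    std::unordered_map<std::string, ColumnPtr> columns;
};

// Soft variant: nullopt on any miss, so callers can fall back (e.g. to defaults).
std::optional<int> tryGetSubcolumnFromBlock(
    const Block & block, const std::string & name_in_storage, const std::string & subcolumn)
{
    auto it = block.columns.find(name_in_storage);  // resolve the physical column first
    if (it == block.columns.end())
        return std::nullopt;

    auto sub = it->second->find(subcolumn);         // then narrow to the subcolumn
    if (sub == it->second->end())
        return std::nullopt;

    return sub->second;
}

// Hard variant: the same lookup, but a miss becomes an exception.
int getSubcolumnFromBlock(const Block & block, const std::string & name_in_storage, const std::string & subcolumn)
{
    auto result = tryGetSubcolumnFromBlock(block, name_in_storage, subcolumn);
    if (!result)
        throw std::runtime_error("Not found column or subcolumn " + name_in_storage + "." + subcolumn + " in block");
    return *result;
}

int main()
{
    Block block{{{"data", std::make_shared<const Column>(Column{{"a", 1}, {"b", 2}})}}};
    return getSubcolumnFromBlock(block, "data", "b") == 2 ? 0 : 1;
}
```
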
diff --git a/src/Interpreters/getColumnFromBlock.h b/src/Interpreters/getColumnFromBlock.h
new file mode 100644
index 0000000000..26500cfdd1
--- /dev/null
+++ b/src/Interpreters/getColumnFromBlock.h
@@ -0,0 +1,13 @@
+#pragma once
+#include <Core/Block.h>
+
+namespace DB
+{
+
+/// Helps in-memory storages to extract columns from a block.
+/// Properly handles cases when the requested column is a subcolumn or is compressed.
+ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & requested_column);
+
+ColumnPtr tryGetColumnFromBlock(const Block & block, const NameAndTypePair & requested_column);
+
+}
diff --git a/src/Interpreters/getHeaderForProcessingStage.cpp b/src/Interpreters/getHeaderForProcessingStage.cpp
index 19837cc05d..1c1f200827 100644
--- a/src/Interpreters/getHeaderForProcessingStage.cpp
+++ b/src/Interpreters/getHeaderForProcessingStage.cpp
@@ -80,9 +80,8 @@ bool removeJoin(ASTSelectQuery & select, TreeRewriterResult & rewriter_result, C
 }
 
 Block getHeaderForProcessingStage(
-    const IStorage & storage,
     const Names & column_names,
-    const StorageMetadataPtr & metadata_snapshot,
+    const StorageSnapshotPtr & storage_snapshot,
     const SelectQueryInfo & query_info,
     ContextPtr context,
     QueryProcessingStage::Enum processed_stage)
@@ -91,7 +90,7 @@
     {
         case QueryProcessingStage::FetchColumns:
         {
-            Block header = metadata_snapshot->getSampleBlockForColumns(column_names, storage.getVirtuals(), storage.getStorageID());
+            Block header = storage_snapshot->getSampleBlockForColumns(column_names);
             if (query_info.prewhere_info)
             {
                 auto & prewhere_info = *query_info.prewhere_info;
@@ -121,7 +120,7 @@
             removeJoin(*query->as<ASTSelectQuery>(), new_rewriter_result, context);
 
             auto stream = std::make_shared<OneBlockInputStream>(
-                metadata_snapshot->getSampleBlockForColumns(column_names, storage.getVirtuals(), storage.getStorageID()));
+                storage_snapshot->getSampleBlockForColumns(column_names));
             return InterpreterSelectQuery(query, context, stream, SelectQueryOptions(processed_stage).analyze()).getSampleBlock();
         }
     }
diff --git a/src/Interpreters/getHeaderForProcessingStage.h b/src/Interpreters/getHeaderForProcessingStage.h
index 54a1126a3d..a578c414e3 100644
--- a/src/Interpreters/getHeaderForProcessingStage.h
+++ b/src/Interpreters/getHeaderForProcessingStage.h
@@ -10,8 +10,8 @@ namespace DB
 {
 
 class IStorage;
-struct StorageInMemoryMetadata;
-using StorageMetadataPtr = std::shared_ptr<const StorageInMemoryMetadata>;
+struct StorageSnapshot;
+using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>;
 struct SelectQueryInfo;
 struct TreeRewriterResult;
 class ASTSelectQuery;
@@ -20,9 +20,8 @@
 bool hasJoin(const ASTSelectQuery & select);
 bool removeJoin(ASTSelectQuery & select, TreeRewriterResult & rewriter_result, ContextPtr context);
 
 Block getHeaderForProcessingStage(
-    const IStorage & storage,
     const Names & column_names,
-    const StorageMetadataPtr & metadata_snapshot,
+    const StorageSnapshotPtr & storage_snapshot,
     const SelectQueryInfo & query_info,
     ContextPtr context,
     QueryProcessingStage::Enum processed_stage);
diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp
index d1c2968417..b54706e300 100644
--- a/src/Interpreters/inplaceBlockConversions.cpp
+++ b/src/Interpreters/inplaceBlockConversions.cpp
@@ -15,11 +15,20 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 
 namespace DB
 {
 
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
 namespace
 {
@@ -180,4 +189,103 @@ ActionsDAGPtr evaluateMissingDefaults(
     return createExpressions(header, expr_list, save_unneeded_columns, required_columns, context);
 }
 
+static bool arrayHasNoElementsRead(const IColumn & column)
+{
+    const auto * column_array = typeid_cast<const ColumnArray *>(&column);
+
+    if (!column_array)
+        return false;
+
+    size_t size = column_array->size();
+    if (!size)
+        return false;
+
+    size_t data_size = column_array->getData().size();
+    if (data_size)
+        return false;
+
+    size_t last_offset =
column_array->getOffsets()[size - 1]; + return last_offset != 0; +} + +void fillMissingColumns( + Columns & res_columns, + size_t num_rows, + const NamesAndTypesList & requested_columns, + StorageMetadataPtr metadata_snapshot, + size_t num_bitmap_columns) +{ + size_t num_columns = requested_columns.size(); + + if (res_columns.size() != num_columns + num_bitmap_columns) + throw Exception( + "invalid number of columns passed to MergeTreeReader::fillMissingColumns. " + "Expected " + + toString(num_columns + num_bitmap_columns) + + ", " + "got " + + toString(res_columns.size()), + ErrorCodes::LOGICAL_ERROR); + /// For a missing column of a nested data structure we must create not a column of empty + /// arrays, but a column of arrays of correct length. + + /// First, collect offset columns for all arrays in the block. + + std::unordered_map offset_columns; + auto requested_column = requested_columns.begin(); + for (size_t i = 0; i < num_columns; ++i, ++requested_column) + { + if (res_columns[i] == nullptr) + continue; + + if (const auto * array = typeid_cast(res_columns[i].get())) + { + String offsets_name = Nested::extractTableName(requested_column->name); + auto & offsets_column = offset_columns[offsets_name]; + + /// If for some reason multiple offsets columns are present for the same nested data structure, + /// choose the one that is not empty. + if (!offsets_column || offsets_column->empty()) + offsets_column = array->getOffsetsPtr(); + } + } + + /// insert default values only for columns without default expressions + requested_column = requested_columns.begin(); + for (size_t i = 0; i < num_columns; ++i, ++requested_column) + { + const auto & [name, type] = *requested_column; + + if (res_columns[i] && arrayHasNoElementsRead(*res_columns[i])) + res_columns[i] = nullptr; + + if (res_columns[i] == nullptr) + { + if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name)) + continue; + + String offsets_name = Nested::extractTableName(name); + auto offset_it = offset_columns.find(offsets_name); + const auto * array_type = typeid_cast(type.get()); + if (offset_it != offset_columns.end() && array_type) + { + const auto & nested_type = array_type->getNestedType(); + ColumnPtr offsets_column = offset_it->second; + size_t nested_rows = typeid_cast(*offsets_column).getData().back(); + + ColumnPtr nested_column = + nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst(); + + res_columns[i] = ColumnArray::create(nested_column, offsets_column); + } + else + { + /// We must turn a constant column into a full column because the interpreter could infer + /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. 
+ res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst(); + } + } + } +} + } diff --git a/src/Interpreters/inplaceBlockConversions.h b/src/Interpreters/inplaceBlockConversions.h index cc8261693f..9eb1e8d24c 100644 --- a/src/Interpreters/inplaceBlockConversions.h +++ b/src/Interpreters/inplaceBlockConversions.h @@ -5,7 +5,7 @@ #include #include #include - +#include namespace DB { @@ -14,6 +14,13 @@ class Block; class NamesAndTypesList; class ColumnsDescription; +class IColumn; +using ColumnPtr = COW::Ptr; +using Columns = std::vector; + +struct StorageInMemoryMetadata; +using StorageMetadataPtr = std::shared_ptr; + class ActionsDAG; using ActionsDAGPtr = std::shared_ptr; @@ -31,4 +38,11 @@ ActionsDAGPtr evaluateMissingDefaults( /// Tries to convert columns in block to required_columns void performRequiredConversions(Block & block, const NamesAndTypesList & required_columns, ContextPtr context); +void fillMissingColumns( + Columns & res_columns, + size_t num_rows, + const NamesAndTypesList & requested_columns, + StorageMetadataPtr metadata_snapshot, + size_t num_bitmap_columns = 0); + } diff --git a/src/MergeTreeCommon/CnchStorageCommon.h b/src/MergeTreeCommon/CnchStorageCommon.h index cde460a35c..aa0d0eae36 100644 --- a/src/MergeTreeCommon/CnchStorageCommon.h +++ b/src/MergeTreeCommon/CnchStorageCommon.h @@ -69,6 +69,23 @@ enum class CNCHStorageMediumType String toStr(CNCHStorageMediumType tp); CNCHStorageMediumType fromStr(const String & type_str); +enum class WorkerEngineType : uint8_t +{ + CLOUD, + DICT, +}; + +inline static String toString(WorkerEngineType type) +{ + switch (type) + { + case WorkerEngineType::CLOUD: + return "Cloud"; + case WorkerEngineType::DICT: + return "DictCloud"; + } +} + class CnchStorageCommonHelper { public: diff --git a/src/MergeTreeCommon/MergeTreeMetaBase.cpp b/src/MergeTreeCommon/MergeTreeMetaBase.cpp index 077b89e3c6..4b127484b5 100644 --- a/src/MergeTreeCommon/MergeTreeMetaBase.cpp +++ b/src/MergeTreeCommon/MergeTreeMetaBase.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -906,6 +907,53 @@ MergeTreeMetaBase::DataParts MergeTreeMetaBase::getDataParts(const DataPartState return res; } +MergeTreeMetaBase::DataPartsVector MergeTreeMetaBase::getDataPartsVectorUnlocked( + const DataPartStates & affordable_states, + const DataPartsLock & /*lock*/, + DataPartStateVector * out_states, + bool require_projection_parts) const +{ + DataPartsVector res; + DataPartsVector buf; + + for (auto state : affordable_states) + { + auto range = getDataPartsStateRange(state); + + if (require_projection_parts) + { + for (const auto & part : range) + { + for (const auto & [_, projection_part] : part->getProjectionParts()) + res.push_back(projection_part); + } + } + else + { + std::swap(buf, res); + res.clear(); + std::merge(range.begin(), range.end(), buf.begin(), buf.end(), std::back_inserter(res), LessDataPart()); //-V783 + } + } + + if (out_states != nullptr) + { + out_states->resize(res.size()); + if (require_projection_parts) + { + for (size_t i = 0; i < res.size(); ++i) + (*out_states)[i] = res[i]->getParentPart()->getState(); + } + else + { + for (size_t i = 0; i < res.size(); ++i) + (*out_states)[i] = res[i]->getState(); + } + } + + return res; +} + MergeTreeMetaBase::DataPartsVector MergeTreeMetaBase::getDataPartsVector( const DataPartStates & affordable_states, DataPartStateVector * out_states, bool require_projection_parts) const { @@ -1615,6 +1663,16 @@ UInt64 
MergeTreeMetaBase::getTableHashForClusterBy() const } +StorageSnapshotPtr MergeTreeMetaBase::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const +{ + return std::make_shared(*this, metadata_snapshot, object_columns); +} + +StorageSnapshotPtr MergeTreeMetaBase::getStorageSnapshotWithoutParts(const StorageMetadataPtr & metadata_snapshot) const +{ + return std::make_shared(*this, metadata_snapshot, object_columns); +} + MergeTreeSettingsPtr MergeTreeMetaBase::getChangedSettings(const ASTPtr new_settings) const { MergeTreeSettingsPtr changed_settings = getSettings(); diff --git a/src/MergeTreeCommon/MergeTreeMetaBase.h b/src/MergeTreeCommon/MergeTreeMetaBase.h index a77d3c0158..08a61b07c9 100644 --- a/src/MergeTreeCommon/MergeTreeMetaBase.h +++ b/src/MergeTreeCommon/MergeTreeMetaBase.h @@ -24,6 +24,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -174,6 +178,7 @@ public: } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } bool supportsPrewhere() const override { return true; } bool supportsSampling() const override { return true; } bool supportsIndexForIn() const override { return true; } @@ -197,6 +202,13 @@ public: /// out_states will contain snapshot of each part state DataPartsVector getDataPartsVector( const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr, bool require_projection_parts = false) const; + + DataPartsVector getDataPartsVectorUnlocked( + const DataPartStates & affordable_states, + const DataPartsLock & lock, + DataPartStateVector * out_states = nullptr, + bool require_projection_parts = false) const; + /// Returns all parts in specified partition DataPartsVector getDataPartsVectorInPartition(DataPartState /*state*/, const String & /*partition_id*/) const; @@ -385,6 +397,13 @@ public: bool isBucketTable() const override { return getInMemoryMetadata().isClusterByKeyDefined(); } UInt64 getTableHashForClusterBy() const override; // to compare table engines efficiently + /// Snapshot for MergeTree contains the current set of data parts + /// at the moment of the start of query. + struct SnapshotData : public StorageSnapshot::Data + { + DataPartsVector parts; + }; + void addMutationEntry(const CnchMergeTreeMutationEntry & entry); void removeMutationEntry(TxnTimestamp create_time); Strings getPlainMutationEntries(); @@ -394,6 +413,14 @@ public: virtual bool supportsOptimizer() const override { return true; } + void resetObjectColumns(const ColumnsDescription & object_columns_) { object_columns = object_columns_; } + + // TODO: @lianwenlong not thread safe if storage cache enabled + virtual StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const override; + + /// The same as above but does not hold vector of data parts. + virtual StorageSnapshotPtr getStorageSnapshotWithoutParts(const StorageMetadataPtr & metadata_snapshot) const; + protected: friend class IMergeTreeDataPart; friend class MergeTreeDataPartCNCH; @@ -402,6 +429,11 @@ protected: friend class StorageReplicatedMergeTree; friend class MergeTreeDataWriter; + /// Current description of columns of data type Object. + /// It changes only when set of parts is changed and is + /// protected by @data_parts_mutex. + ColumnsDescription object_columns; + bool require_part_metadata; /// Current column sizes in compressed and uncompressed form. 
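
MergeTreeMetaBase now keeps object_columns next to the part set (both guarded by data_parts_mutex), and SnapshotData lets a query pin the data-parts vector for its whole lifetime. A minimal sketch of the pinning idea, with stand-in types instead of the real IMergeTreeDataPart and StorageSnapshot interfaces:

```cpp
#include <memory>
#include <mutex>
#include <string>
#include <vector>

// Stand-in for a data part; the real type is IMergeTreeDataPart.
struct Part { std::string name; };
using PartPtr = std::shared_ptr<const Part>;

// A query pins the part set it will read by copying the shared_ptr vector
// under the same mutex that guards modifications to the live set.
class Storage
{
public:
    struct Snapshot { std::vector<PartPtr> parts; };

    Snapshot getSnapshot() const
    {
        std::lock_guard lock(parts_mutex);
        return Snapshot{parts};  // cheap: copies shared_ptrs, not parts
    }

    void replaceParts(std::vector<PartPtr> new_parts)
    {
        std::lock_guard lock(parts_mutex);
        parts = std::move(new_parts);  // merges and drops swap the live set...
    }

private:
    mutable std::mutex parts_mutex;
    std::vector<PartPtr> parts;  // ...while snapshots keep the old parts alive
};

int main()
{
    Storage storage;
    storage.replaceParts({std::make_shared<const Part>(Part{"all_1_1_0"})});
    auto snapshot = storage.getSnapshot();      // pin the current set
    storage.replaceParts({});                   // a merge replaces the live set
    return snapshot.parts.size() == 1 ? 0 : 1;  // the query still sees its part
}
```
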
diff --git a/src/Optimizer/Correlation.cpp b/src/Optimizer/Correlation.cpp index 064fec77f0..28dd3ed816 100644 --- a/src/Optimizer/Correlation.cpp +++ b/src/Optimizer/Correlation.cpp @@ -127,7 +127,7 @@ std::pair DecorrelationResult::buildJoinClause(PlanNodePtr & query if (!JoinCommon::isJoinCompatibleTypes(query_key_type, subquery_key_type)) { - auto common_type = getLeastSupertype({query_key_type, subquery_key_type}, + auto common_type = getLeastSupertype(DataTypes{query_key_type, subquery_key_type}, context->getSettingsRef().allow_extended_type_conversion); query_key_name = query_planner.addColumn(makeCastFunction(query_expr, common_type)).first; subquery_key_name = subquery_planner.addColumn(makeCastFunction(subquery_expr, common_type)).first; diff --git a/src/Optimizer/DomainTranslator.cpp b/src/Optimizer/DomainTranslator.cpp index 06cb064514..2d7fdd061f 100644 --- a/src/Optimizer/DomainTranslator.cpp +++ b/src/Optimizer/DomainTranslator.cpp @@ -576,7 +576,7 @@ DomainVisitor::canImplicitCoerceValue(Field & value, DataTypePtr & from_type_ return convertFieldToType(value, *to_type, from_type.get()); //Based on whether there is a super type between them - DataTypePtr super_type = tryGetLeastSupertype({from_type, to_type}, context->getSettingsRef().allow_extended_type_conversion); + DataTypePtr super_type = tryGetLeastSupertype(DataTypes{from_type, to_type}, context->getSettingsRef().allow_extended_type_conversion); //have super_type and super_type equals to_type, which means to_type is wider type; if (super_type != nullptr && super_type->equals(*to_type)) diff --git a/src/Optimizer/Rewriter/ColumnPruning.cpp b/src/Optimizer/Rewriter/ColumnPruning.cpp index 9cc4249f89..65d6d046d4 100644 --- a/src/Optimizer/Rewriter/ColumnPruning.cpp +++ b/src/Optimizer/Rewriter/ColumnPruning.cpp @@ -488,7 +488,7 @@ PlanNodePtr ColumnPruningVisitor::visitTableScanNode(TableScanNode & node, NameS // Hack: ColumnPruning::selectColumnWithMinSize ignores subcolumn, by checking `NameAndTypePair::subcolumn_delimiter_position`. // This is unexpected, so we rebuild the NameAndTypePair candidate_columns.emplace_back( - pair.first, columns_desc.getColumnOrSubcolumn(ColumnsDescription::AllPhysical, pair.first).type); + pair.first, columns_desc.getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, pair.first).type); } else { diff --git a/src/Optimizer/Rewriter/EliminateJoinByForeignKey.cpp b/src/Optimizer/Rewriter/EliminateJoinByForeignKey.cpp index aa1d2748e9..3e548fde86 100644 --- a/src/Optimizer/Rewriter/EliminateJoinByForeignKey.cpp +++ b/src/Optimizer/Rewriter/EliminateJoinByForeignKey.cpp @@ -1227,7 +1227,7 @@ PlanNodePtr EliminateJoinByFK::Eliminator::createNewJoinThenEnd(const String & f // 3-1. add new projection at pk side with cast function. if (!JoinCommon::isJoinCompatibleTypes(left_type, right_type)) { - auto common_type = getLeastSupertype({left_type, right_type}, context->getSettingsRef().allow_extended_type_conversion); + auto common_type = getLeastSupertype(DataTypes{left_type, right_type}, context->getSettingsRef().allow_extended_type_conversion); if (!common_type->equals(*left_type)) throw Exception(ErrorCodes::LOGICAL_ERROR, "EliminateByForeignKey::Eliminator logical error! 
fk type isn't the leastSupertype of pk type."); auto cast_function = makeCastFunction(std::make_shared(pk_name), common_type); diff --git a/src/Optimizer/Rewriter/PredicatePushdown.cpp b/src/Optimizer/Rewriter/PredicatePushdown.cpp index 4ef0445731..6ba5f47cf3 100644 --- a/src/Optimizer/Rewriter/PredicatePushdown.cpp +++ b/src/Optimizer/Rewriter/PredicatePushdown.cpp @@ -559,7 +559,7 @@ PlanNodePtr PredicateVisitor::visitJoinNode(JoinNode & node, PredicateContext & if (!JoinCommon::isJoinCompatibleTypes(left_type, right_type)) { - auto common_type = getLeastSupertype({left_type, right_type}, allow_extended_type_conversion); + auto common_type = getLeastSupertype(DataTypes{left_type, right_type}, allow_extended_type_conversion); left_key = left_planner.addColumn(makeCastFunction(std::make_shared(left_key), common_type)).first; right_key = right_planner.addColumn(makeCastFunction(std::make_shared(right_key), common_type)).first; need_project = true; diff --git a/src/Optimizer/SelectQueryInfoHelper.cpp b/src/Optimizer/SelectQueryInfoHelper.cpp index 40e2cd0463..d843345da0 100644 --- a/src/Optimizer/SelectQueryInfoHelper.cpp +++ b/src/Optimizer/SelectQueryInfoHelper.cpp @@ -8,6 +8,7 @@ #include #include #include +#include "Storages/StorageSnapshot.h" namespace DB { @@ -23,17 +24,19 @@ SelectQueryInfo buildSelectQueryInfoForQuery(const ASTPtr & query, ContextPtr co StoragePtr storage; StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; const auto * table_expression = getTableExpression(*select_query, 0); if (table_expression && table_expression->database_and_table_name) { storage = DatabaseCatalog::instance().getTable(StorageID{table_expression->database_and_table_name}, context); metadata_snapshot = storage->getInMemoryMetadataPtr(); + storage_snapshot = storage->getStorageSnapshot(metadata_snapshot, context); } // fill syntax_analyzer_result query_info.syntax_analyzer_result - = TreeRewriter(context).analyzeSelect(query_info.query, TreeRewriterResult({}, storage, metadata_snapshot)); + = TreeRewriter(context).analyzeSelect(query_info.query, TreeRewriterResult({}, storage, storage_snapshot)); // fill prepared_set auto query_analyzer diff --git a/src/Optimizer/SymbolTransformMap.cpp b/src/Optimizer/SymbolTransformMap.cpp index 112ab5e54a..edf04d5825 100644 --- a/src/Optimizer/SymbolTransformMap.cpp +++ b/src/Optimizer/SymbolTransformMap.cpp @@ -241,7 +241,7 @@ ASTPtr IdentifierToColumnReference::visitASTIdentifier(ASTPtr & node, Void &) { const auto & iden = node->as(); const auto & columns = storage_metadata->getColumns(); - if (columns.hasColumnOrSubcolumn(ColumnsDescription::AllPhysical, iden.name())) + if (columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, iden.name())) return std::make_shared(storage, unique_id, iden.name()); return node; } diff --git a/src/Processors/Exchange/DataTrans/NativeChunkOutputStream.cpp b/src/Processors/Exchange/DataTrans/NativeChunkOutputStream.cpp index afe3838153..24c4a93562 100644 --- a/src/Processors/Exchange/DataTrans/NativeChunkOutputStream.cpp +++ b/src/Processors/Exchange/DataTrans/NativeChunkOutputStream.cpp @@ -48,7 +48,7 @@ static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuf auto serialization = type.getDefaultSerialization(); ISerialization::SerializeBinaryBulkStatePtr state; - serialization->serializeBinaryBulkStatePrefix(settings, state); + serialization->serializeBinaryBulkStatePrefix(*full_column, settings, state); serialization->serializeBinaryBulkWithMultipleStreams(*full_column,
offset, limit, settings, state); serialization->serializeBinaryBulkStateSuffix(settings, state); } diff --git a/src/Processors/Formats/IInputFormat.cpp b/src/Processors/Formats/IInputFormat.cpp index 5594e04dc7..cdaf9982a5 100644 --- a/src/Processors/Formats/IInputFormat.cpp +++ b/src/Processors/Formats/IInputFormat.cpp @@ -21,4 +21,9 @@ void IInputFormat::resetParser() getPort().getInputPort().reopen(); } +void IInputFormat::setReadBuffer(ReadBuffer & in_) +{ + in = in_; +} + } diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index e451192f91..295f94a36a 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -79,6 +79,8 @@ public: */ virtual void resetParser(); + virtual void setReadBuffer(ReadBuffer & in_); + virtual const BlockMissingValues & getMissingValues() const { static const BlockMissingValues none; diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index be29906113..20088481bc 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -242,6 +242,9 @@ Chunk IRowInputFormat::generate() return {}; } + for (const auto & column : columns) + column->finalize(); + Chunk chunk(std::move(columns), num_rows); //chunk.setChunkInfo(std::move(chunk_missing_values)); return chunk; diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index bc57803152..8723b0bba3 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -14,116 +14,197 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } -JSONAsStringRowInputFormat::JSONAsStringRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) : - IRowInputFormat(header_, in_, std::move(params_)), buf(in) + +JSONAsRowInputFormat::JSONAsRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) + : JSONAsRowInputFormat(header_, std::make_unique(in_), params_) {} + +JSONAsRowInputFormat::JSONAsRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_) : + IRowInputFormat(header_, *buf_, std::move(params_)), buf(std::move(buf_)) { if (header_.columns() > 1) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "This input format is only suitable for tables with a single column of type String but the number of columns is {}", + "This input format is only suitable for tables with a single column of type String or Object, but the number of columns is {}", header_.columns()); +} +void JSONAsRowInputFormat::resetParser() +{ + IRowInputFormat::resetParser(); + buf.reset(); +} + +void JSONAsRowInputFormat::readPrefix() +{ + /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it. 
+ skipBOMIfExists(*buf); + + skipWhitespaceIfAny(*buf); + if (!buf->eof() && *buf->position() == '[') + { + ++buf->position(); + data_in_square_brackets = true; + } +} + +void JSONAsRowInputFormat::readSuffix() +{ + skipWhitespaceIfAny(*buf); + if (data_in_square_brackets) + { + assertChar(']', *buf); + skipWhitespaceIfAny(*buf); + data_in_square_brackets = false; + } + if (!buf->eof() && *buf->position() == ';') + { + ++buf->position(); + skipWhitespaceIfAny(*buf); + } + assertEOF(*buf); +} + +bool JSONAsRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) +{ + assert(columns.size() == 1); + assert(serializations.size() == 1); + + if (!allow_new_rows) + return false; + + skipWhitespaceIfAny(*buf); + if (!buf->eof()) + { + if (!data_in_square_brackets && *buf->position() == ';') + { + /// ';' means the end of query, but it cannot be before ']'. + return allow_new_rows = false; + } + else if (data_in_square_brackets && *buf->position() == ']') + { + /// ']' means the end of query. + return allow_new_rows = false; + } + } + + if (!buf->eof()) + readJSONObject(*columns[0]); + + skipWhitespaceIfAny(*buf); + if (!buf->eof() && *buf->position() == ',') + ++buf->position(); + skipWhitespaceIfAny(*buf); + + return !buf->eof(); +} + +void JSONAsRowInputFormat::setReadBuffer(ReadBuffer & in_) +{ + buf = std::make_unique(in_); + IInputFormat::setReadBuffer(*buf); +} + + +JSONAsStringRowInputFormat::JSONAsStringRowInputFormat( + const Block & header_, ReadBuffer & in_, Params params_) + : JSONAsRowInputFormat(header_, in_, params_) +{ if (!isString(removeNullable(removeLowCardinality(header_.getByPosition(0).type)))) throw Exception(ErrorCodes::BAD_ARGUMENTS, "This input format is only suitable for tables with a single column of type String but the column type is {}", header_.getByPosition(0).type->getName()); } -void JSONAsStringRowInputFormat::resetParser() -{ - IRowInputFormat::resetParser(); - buf.reset(); -} - void JSONAsStringRowInputFormat::readJSONObject(IColumn & column) { - PeekableReadBufferCheckpoint checkpoint{buf}; + PeekableReadBufferCheckpoint checkpoint{*buf}; size_t balance = 0; bool quotes = false; - if (*buf.position() != '{') + if (*buf->position() != '{') throw Exception("JSON object must begin with '{'.", ErrorCodes::INCORRECT_DATA); - ++buf.position(); + ++buf->position(); ++balance; char * pos; while (balance) { - if (buf.eof()) + if (buf->eof()) throw Exception("Unexpected end of file while parsing JSON object.", ErrorCodes::INCORRECT_DATA); if (quotes) { - pos = find_first_symbols<'"', '\\'>(buf.position(), buf.buffer().end()); - buf.position() = pos; - if (buf.position() == buf.buffer().end()) + pos = find_first_symbols<'"', '\\'>(buf->position(), buf->buffer().end()); + buf->position() = pos; + if (buf->position() == buf->buffer().end()) continue; - if (*buf.position() == '"') + if (*buf->position() == '"') { quotes = false; - ++buf.position(); + ++buf->position(); } - else if (*buf.position() == '\\') + else if (*buf->position() == '\\') { - ++buf.position(); - if (!buf.eof()) + ++buf->position(); + if (!buf->eof()) { - ++buf.position(); + ++buf->position(); } } } else { - pos = find_first_symbols<'"', '{', '}', '\\'>(buf.position(), buf.buffer().end()); - buf.position() = pos; - if (buf.position() == buf.buffer().end()) + pos = find_first_symbols<'"', '{', '}', '\\'>(buf->position(), buf->buffer().end()); + buf->position() = pos; + if (buf->position() == buf->buffer().end()) continue; - if (*buf.position() == '{') + if (*buf->position() == '{') { 
++balance; - ++buf.position(); + ++buf->position(); } - else if (*buf.position() == '}') + else if (*buf->position() == '}') { --balance; - ++buf.position(); + ++buf->position(); } - else if (*buf.position() == '\\') + else if (*buf->position() == '\\') { - ++buf.position(); - if (!buf.eof()) + ++buf->position(); + if (!buf->eof()) { - ++buf.position(); + ++buf->position(); } } - else if (*buf.position() == '"') + else if (*buf->position() == '"') { quotes = true; - ++buf.position(); + ++buf->position(); } } } - buf.makeContinuousMemoryFromCheckpointToPos(); - char * end = buf.position(); - buf.rollbackToCheckpoint(); - column.insertData(buf.position(), end - buf.position()); - buf.position() = end; + buf->makeContinuousMemoryFromCheckpointToPos(); + char * end = buf->position(); + buf->rollbackToCheckpoint(); + column.insertData(buf->position(), end - buf->position()); + buf->position() = end; } -bool JSONAsStringRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) +JSONAsObjectRowInputFormat::JSONAsObjectRowInputFormat( + const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_) + : JSONAsRowInputFormat(header_, in_, params_) + , format_settings(format_settings_) { - skipWhitespaceIfAny(buf); + if (!isObject(header_.getByPosition(0).type)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Input format JSONAsObject is only suitable for tables with a single column of type Object but the column type is {}", + header_.getByPosition(0).type->getName()); +} - if (!buf.eof()) - readJSONObject(*columns[0]); - - skipWhitespaceIfAny(buf); - if (!buf.eof() && *buf.position() == ',') - ++buf.position(); - skipWhitespaceIfAny(buf); - - return !buf.eof(); +void JSONAsObjectRowInputFormat::readJSONObject(IColumn & column) +{ + serializations[0]->deserializeTextJSON(column, *buf, format_settings); } void registerInputFormatProcessorJSONAsString(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h index c15a769343..cde53b2416 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h @@ -10,22 +10,56 @@ namespace DB class ReadBuffer; /// This format parses a sequence of JSON objects separated by newlines, spaces and/or comma. +class JSONAsRowInputFormat : public IRowInputFormat +{ +public: + JSONAsRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_); + + void resetParser() override; + void setReadBuffer(ReadBuffer & in_) override; + +private: + JSONAsRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_); + + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; + + void readPrefix() override; + void readSuffix() override; + +protected: + virtual void readJSONObject(IColumn & column) = 0; + std::unique_ptr buf; + +private: + /// This flag is needed to know if data is in square brackets. + bool data_in_square_brackets = false; + bool allow_new_rows = true; +}; + /// Each JSON object is parsed as a whole to string. /// This format can only parse a table with single field of type String. 
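The readJSONObject() scanner in the .cpp hunk above finds the end of one raw object by tracking brace depth, entering and leaving string-literal mode on unescaped quotes. A self-contained sketch of the same scanning rule over a flat buffer, useful for reasoning about that loop (the function name and signature are illustrative, not part of this patch):

    #include <cstddef>
    #include <optional>
    #include <string_view>

    /// Sketch: span of the first complete top-level JSON object in `data`.
    /// Same rule as readJSONObject(): count '{' / '}' only outside string
    /// literals, and treat a backslash inside a literal as an escape.
    std::optional<std::string_view> scanJSONObject(std::string_view data)
    {
        if (data.empty() || data.front() != '{')
            return std::nullopt;

        size_t balance = 0;
        bool in_string = false;

        for (size_t i = 0; i < data.size(); ++i)
        {
            char c = data[i];
            if (in_string)
            {
                if (c == '\\')
                    ++i;                      /// skip the escaped character
                else if (c == '"')
                    in_string = false;
            }
            else if (c == '"')
                in_string = true;
            else if (c == '{')
                ++balance;
            else if (c == '}' && --balance == 0)
                return data.substr(0, i + 1); /// include the closing brace
        }
        return std::nullopt;                  /// truncated object
    }

For input like {"k":"a\"}b"} the whole object is returned: the '}' inside the string literal is ignored because the scanner is in string mode, and the escaped quote does not terminate the literal. This is what lets JSONAsString ingest arbitrary objects verbatim; JSONAsObject instead delegates to the Object type's deserializeTextJSON, as shown in the hunk above.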
- -class JSONAsStringRowInputFormat : public IRowInputFormat +class JSONAsStringRowInputFormat final : public JSONAsRowInputFormat { public: JSONAsStringRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_); - - bool readRow(MutableColumns & columns, RowReadExtension & ext) override; String getName() const override { return "JSONAsStringRowInputFormat"; } - void resetParser() override; private: - void readJSONObject(IColumn & column); + void readJSONObject(IColumn & column) override; +}; - PeekableReadBuffer buf; + +/// Each JSON object is parsed as a whole to object. +/// This format can only parse a table with single field of type Object. +class JSONAsObjectRowInputFormat final : public JSONAsRowInputFormat +{ +public: + JSONAsObjectRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_); + String getName() const override { return "JSONAsObjectRowInputFormat"; } + +private: + void readJSONObject(IColumn & column) override; + const FormatSettings format_settings; }; } diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index 42ec1457a2..565f8be5f7 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -113,6 +113,9 @@ Chunk ValuesBlockInputFormat::generate() return {}; } + for (const auto & column : columns) + column->finalize(); + size_t rows_in_block = columns[0]->size(); return Chunk{std::move(columns), rows_in_block}; } diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 8675bb3579..8215997dfe 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -2060,6 +2060,7 @@ struct WindowFunctionLagLeadInFrame final : public WindowFunction #include +#include +#include +#include #include #include -#include #include namespace DB @@ -287,6 +289,14 @@ void fillPartModel(const IStorage & storage, const IMergeTreeDataPart & part, Pr part_model.set_columns(part.getColumns().toString()); } + if (DB::hasDynamicSubcolumns(storage.getInMemoryMetadata().columns)) + { + if (auto commit_time = DB::getColumnsCommitTimeForJSONTable(storage, *(part.getColumnsPtr()))) + part_model.set_columns_commit_time(commit_time); + + part_model.set_columns(part.getColumns().toString()); + } + if (!part.min_unique_key.empty()) part_model.set_min_unique_key(part.min_unique_key); if (!part.max_unique_key.empty()) @@ -364,9 +374,21 @@ void fillPartsModelForSend( part_model = part->part_model(); part_model.set_commit_time(part->getCommitTime()); part_model.set_virtual_part_size(part->getVirtualPartSize()); + + auto table_has_dynamic_subcolumns = hasDynamicSubcolumns(storage.getInMemoryMetadataPtr()->columns); + if (table_has_dynamic_subcolumns) + { + if (part_model.has_columns()) + { + continue; + } + } + if (part_model.has_columns_commit_time() && sent_columns_commit_time.count(part_model.columns_commit_time()) == 0) { - part_model.set_columns(storage.getPartColumns(part_model.columns_commit_time())->toString()); + auto storage_columns = storage.getPartColumns(part_model.columns_commit_time()); + + part_model.set_columns(storage_columns->toString()); sent_columns_commit_time.insert(part_model.columns_commit_time()); } part_model.set_disk_cache_host_port(part->disk_cache_host_port); diff --git a/src/Protos/cnch_worker_rpc.proto b/src/Protos/cnch_worker_rpc.proto index 
821f4897d6..0414455e78 100644 --- a/src/Protos/cnch_worker_rpc.proto +++ b/src/Protos/cnch_worker_rpc.proto @@ -41,6 +41,8 @@ message SubmitManipulationTaskReq optional string mutate_commands = 10; /// used to mark task of bucket table optional bool is_bucket_table = 11; + /// used for json column merge + optional string dynamic_object_column_schema = 12; } message SubmitManipulationTaskResp @@ -435,6 +437,8 @@ message SendResourcesReq /// data parts repeated TableDataParts data_parts = 5; optional string disk_cache_mode = 6; + repeated UDFInfo udf_infos = 7; + repeated string dynamic_object_column_schema = 8; } message SendResourcesResp diff --git a/src/Protos/enum.proto b/src/Protos/enum.proto index 6ecb146b98..94aadc3132 100644 --- a/src/Protos/enum.proto +++ b/src/Protos/enum.proto @@ -138,6 +138,7 @@ message FieldType { SketchBinary = 30; IPv4 = 31; IPv6 = 32; + Object = 33; NegativeInfinity = 254; PositiveInfinity = 255; } diff --git a/src/QueryPlan/PlanSegmentSourceStep.cpp b/src/QueryPlan/PlanSegmentSourceStep.cpp index 9fd2ec4e1b..fc7ba63ec1 100644 --- a/src/QueryPlan/PlanSegmentSourceStep.cpp +++ b/src/QueryPlan/PlanSegmentSourceStep.cpp @@ -64,9 +64,10 @@ void PlanSegmentSourceStep::initializePipeline(QueryPipeline & pipeline, const B QueryPlanStepPtr PlanSegmentSourceStep::generateStep() { StoragePtr storage = DatabaseCatalog::instance().getTable({storage_id.database_name, storage_id.table_name}, context); + auto storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context); auto pipe = storage->read(column_names, - storage->getInMemoryMetadataPtr(), + storage_snapshot, query_info, context, processed_stage, diff --git a/src/QueryPlan/PlanSerDerHelper.cpp b/src/QueryPlan/PlanSerDerHelper.cpp index bb56b47240..982b0e342f 100644 --- a/src/QueryPlan/PlanSerDerHelper.cpp +++ b/src/QueryPlan/PlanSerDerHelper.cpp @@ -114,7 +114,7 @@ void serializeColumn(const ColumnPtr & column, const DataTypePtr & data_type, Wr auto serialization = data_type->getDefaultSerialization(); ISerialization::SerializeBinaryBulkStatePtr state; - serialization->serializeBinaryBulkStatePrefix(settings, state); + serialization->serializeBinaryBulkStatePrefix(*full_column, settings, state); serialization->serializeBinaryBulkWithMultipleStreams(*full_column, 0, 0, settings, state); serialization->serializeBinaryBulkStateSuffix(settings, state); } diff --git a/src/QueryPlan/ReadFromMergeTree.cpp b/src/QueryPlan/ReadFromMergeTree.cpp index ed487c6c4b..4a047cc6da 100644 --- a/src/QueryPlan/ReadFromMergeTree.cpp +++ b/src/QueryPlan/ReadFromMergeTree.cpp @@ -167,8 +167,7 @@ ReadFromMergeTree::ReadFromMergeTree( Names virt_column_names_, const MergeTreeMetaBase & data_, const SelectQueryInfo & query_info_, - StorageMetadataPtr metadata_snapshot_, - StorageMetadataPtr metadata_snapshot_base_, + StorageSnapshotPtr storage_snapshot_, ContextPtr context_, size_t max_block_size_, size_t num_streams_, @@ -177,15 +176,13 @@ ReadFromMergeTree::ReadFromMergeTree( std::shared_ptr max_block_numbers_to_read_, Poco::Logger * log_, MergeTreeDataSelectAnalysisResultPtr analyzed_result_ptr_) - : ISourceStep(DataStream{ - .header = MergeTreeBaseSelectProcessor::transformHeader( - metadata_snapshot_->getSampleBlockForColumns(real_column_names_, data_.getVirtuals(), data_.getStorageID()), - getPrewhereInfo(query_info_), - data_.getPartitionValueType(), - virt_column_names_, - getIndexContext(query_info_), query_info_.read_bitmap_index)}) + : ISourceStep(DataStream{.header = 
MergeTreeBaseSelectProcessor::transformHeader( + storage_snapshot_->getSampleBlockForColumns(real_column_names_), + getPrewhereInfo(query_info_), + data_.getPartitionValueType(), + virt_column_names_, + getIndexContext(query_info_), query_info_.read_bitmap_index)}) , reader_settings(getMergeTreeReaderSettings(context_, data_)) - , prepared_parts(std::move(parts_)) , delete_bitmap_getter(std::move(delete_bitmap_getter_)) , real_column_names(std::move(real_column_names_)) @@ -194,8 +191,8 @@ ReadFromMergeTree::ReadFromMergeTree( , query_info(query_info_) , prewhere_info(getPrewhereInfo(query_info)) , actions_settings(ExpressionActionsSettings::fromContext(context_)) - , metadata_snapshot(std::move(metadata_snapshot_)) - , metadata_snapshot_base(std::move(metadata_snapshot_base_)) + , storage_snapshot(std::move(storage_snapshot_)) + , metadata_for_reading(storage_snapshot->getMetadataForQuery()) , context(std::move(context_)) , max_block_size(max_block_size_) , requested_num_streams(num_streams_) @@ -249,7 +246,7 @@ Pipe ReadFromMergeTree::readFromPool( std::move(parts_with_range), delete_bitmap_getter, data, - metadata_snapshot, + storage_snapshot, query_info, true, required_columns, @@ -265,7 +262,7 @@ Pipe ReadFromMergeTree::readFromPool( auto source = std::make_shared( i, pool, min_marks_for_concurrent_read, max_block_size, settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes, - data, metadata_snapshot, use_uncompressed_cache, + data, storage_snapshot, use_uncompressed_cache, query_info, actions_settings, reader_settings, virt_column_names); if (i == 0) @@ -287,7 +284,7 @@ ProcessorPtr ReadFromMergeTree::createSource( bool use_uncompressed_cache) { return std::make_shared( - data, metadata_snapshot, part.data_part, std::move(combineFilterBitmap(part, delete_bitmap_getter)), max_block_size, preferred_block_size_bytes, + data, storage_snapshot, part.data_part, std::move(combineFilterBitmap(part, delete_bitmap_getter)), max_block_size, preferred_block_size_bytes, preferred_max_column_in_block_size_bytes, required_columns, part.ranges, use_uncompressed_cache, query_info, actions_settings, true, reader_settings, virt_column_names, part.part_index_in_query); } @@ -570,7 +567,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder( { SortDescription sort_description; for (size_t j = 0; j < input_order_info->order_key_prefix_descr.size(); ++j) - sort_description.emplace_back(metadata_snapshot->getSortingKey().column_names[j], + sort_description.emplace_back(storage_snapshot->metadata->getSortingKey().column_names[j], input_order_info->direction, 1); auto sorting_key_expr = std::make_shared(sorting_key_prefix_expr); @@ -795,7 +792,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( } auto sorting_expr = std::make_shared( - metadata_snapshot->getSortingKey().expression->getActionsDAG().clone()); + metadata_for_reading->getSortingKey().expression->getActionsDAG().clone()); pipe.addSimpleTransform([sorting_expr](const Block & header) { @@ -812,12 +809,12 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( continue; } - Names sort_columns = metadata_snapshot->getSortingKeyColumns(); + Names sort_columns = metadata_for_reading->getSortingKeyColumns(); SortDescription sort_description; size_t sort_columns_size = sort_columns.size(); sort_description.reserve(sort_columns_size); - Names partition_key_columns = metadata_snapshot->getPartitionKey().column_names; + Names partition_key_columns = 
metadata_for_reading->getPartitionKey().column_names; const auto & header = pipe.getHeader(); for (size_t i = 0; i < sort_columns_size; ++i) @@ -857,7 +854,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( out_projection = createProjection(pipe.getHeader()); auto sorting_expr = std::make_shared( - metadata_snapshot->getSortingKey().expression->getActionsDAG().clone()); + metadata_for_reading->getSortingKey().expression->getActionsDAG().clone()); pipe.addSimpleTransform([sorting_expr](const Block & header) { @@ -874,8 +871,8 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(Merge { return selectRangesToRead( std::move(parts), - metadata_snapshot_base, - metadata_snapshot, + storage_snapshot->metadata, + storage_snapshot->getMetadataForQuery(), query_info, context, requested_num_streams, @@ -917,8 +914,8 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( result.column_names_to_read.push_back(ExpressionActions::getSmallestColumn(available_real_columns)); } - metadata_snapshot->check(result.column_names_to_read, data.getVirtuals(), data.getStorageID()); - + // storage_snapshot->check(result.column_names_to_read); + // Build and check if primary key is used when necessary const auto & primary_key = metadata_snapshot->getPrimaryKey(); Names primary_key_columns = primary_key.column_names; @@ -1170,7 +1167,7 @@ void ReadFromMergeTree::initializePipeline(QueryPipeline & pipeline, const Build if (select.final()) { /// Add columns needed to calculate the sorting expression and the sign. - std::vector add_columns = metadata_snapshot->getColumnsRequiredForSortingKey(); + std::vector add_columns = metadata_for_reading->getColumnsRequiredForSortingKey(); column_names_to_read.insert(column_names_to_read.end(), add_columns.begin(), add_columns.end()); if (!data.merging_params.sign_column.empty()) @@ -1189,10 +1186,10 @@ void ReadFromMergeTree::initializePipeline(QueryPipeline & pipeline, const Build else if ((settings.optimize_read_in_order || settings.optimize_aggregation_in_order) && input_order_info) { size_t prefix_size = input_order_info->order_key_prefix_descr.size(); - auto order_key_prefix_ast = metadata_snapshot->getSortingKey().expression_list_ast->clone(); + auto order_key_prefix_ast = metadata_for_reading->getSortingKey().expression_list_ast->clone(); order_key_prefix_ast->children.resize(prefix_size); - auto syntax_result = TreeRewriter(context).analyze(order_key_prefix_ast, metadata_snapshot->getColumns().getAllPhysical()); + auto syntax_result = TreeRewriter(context).analyze(order_key_prefix_ast, metadata_for_reading->getColumns().getAllPhysical()); auto sorting_key_prefix_expr = ExpressionAnalyzer(order_key_prefix_ast, syntax_result, context).getActionsDAG(false); pipe = spreadMarkRangesAmongStreamsWithOrder( diff --git a/src/QueryPlan/ReadFromMergeTree.h b/src/QueryPlan/ReadFromMergeTree.h index 629c537c08..d69917cb48 100644 --- a/src/QueryPlan/ReadFromMergeTree.h +++ b/src/QueryPlan/ReadFromMergeTree.h @@ -94,8 +94,7 @@ public: Names virt_column_names_, const MergeTreeMetaBase & data_, const SelectQueryInfo & query_info_, - StorageMetadataPtr metadata_snapshot_, - StorageMetadataPtr metadata_snapshot_base_, + StorageSnapshotPtr storage_snapshot, ContextPtr context_, size_t max_block_size_, size_t num_streams_, @@ -150,8 +149,8 @@ private: PrewhereInfoPtr prewhere_info; ExpressionActionsSettings actions_settings; - StorageMetadataPtr metadata_snapshot; - StorageMetadataPtr metadata_snapshot_base; + StorageSnapshotPtr 
storage_snapshot; + StorageMetadataPtr metadata_for_reading; ContextPtr context; diff --git a/src/QueryPlan/TableScanStep.cpp b/src/QueryPlan/TableScanStep.cpp index 70e1b4ad7c..2107a78391 100644 --- a/src/QueryPlan/TableScanStep.cpp +++ b/src/QueryPlan/TableScanStep.cpp @@ -13,8 +13,7 @@ * limitations under the License. */ -#include -#include +#include #include #include @@ -795,8 +794,8 @@ void TableScanStep::formatOutputStream(ContextPtr context) storage = DatabaseCatalog::instance().getTable(storage_id, context); storage_id.uuid = storage->getStorageUUID(); - // TODO: in long term, we should use different constructor for server/worker - Block header = storage->getInMemoryMetadataPtr()->getSampleBlockForColumns(getRequiredColumns(), storage->getVirtuals()); + // TODO: in long term, we should use different constructor for server/worker + Block header = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context)->getSampleBlockForColumns(getRequiredColumns()); NameToNameMap name_to_name_map; for (auto & item : column_alias) @@ -1085,12 +1084,10 @@ void TableScanStep::initializePipeline(QueryPipeline & pipeline, const BuildQuer if (use_projection_index) { - auto metadata_snapshot = storage->getInMemoryMetadataPtr(); - auto input_columns_block - = metadata_snapshot->getSampleBlockForColumns(getRequiredColumns(), storage->getVirtuals(), storage_id); + auto storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), build_context.context); + auto input_columns_block = storage_snapshot->getSampleBlockForColumns(getRequiredColumns()); auto input_columns = input_columns_block.getNamesAndTypesList(); - auto required_columns_block = metadata_snapshot->getSampleBlockForColumns( - getRequiredColumns(OutputAndPrewhere), storage->getVirtuals(), storage_id); + auto required_columns_block = storage_snapshot->getSampleBlockForColumns(getRequiredColumns(OutputAndPrewhere)); auto required_columns = required_columns_block.getNamesAndTypesList(); if (log->debug()) @@ -1123,7 +1120,7 @@ void TableScanStep::initializePipeline(QueryPipeline & pipeline, const BuildQuer .prepared_sets = query_info.sets, .outputs = column_alias /* no meaning */}; - query_info.index_context = MergeTreeIndexContext::buildFromProjection(inline_expressions, index_building_context, metadata_snapshot); + query_info.index_context = MergeTreeIndexContext::buildFromProjection(inline_expressions, index_building_context, storage_snapshot->metadata); } ExecutePlan execute_plan; @@ -1144,14 +1141,14 @@ void TableScanStep::initializePipeline(QueryPipeline & pipeline, const BuildQuer if (execute_plan.empty()) { + auto storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), build_context.context); auto pipe = storage->read( - interpreter->getRequiredColumns(), storage->getInMemoryMetadataPtr(), query_info, build_context.context, QueryProcessingStage::Enum::FetchColumns, max_block_size, max_streams); + interpreter->getRequiredColumns(), storage_snapshot, query_info, build_context.context, QueryProcessingStage::Enum::FetchColumns, max_block_size, max_streams); QueryPlanStepPtr step; if (pipe.empty()) { - auto header - = storage->getInMemoryMetadataPtr()->getSampleBlockForColumns(getRequiredColumns(), storage->getVirtuals(), storage_id); + auto header = storage_snapshot->getSampleBlockForColumns(getRequiredColumns()); auto null_pipe = InterpreterSelectQuery::generateNullSourcePipe(header, query_info); auto read_from_pipe = std::make_shared(std::move(null_pipe)); 
read_from_pipe->setStepDescription("Read from NullSource"); @@ -1181,8 +1178,9 @@ void TableScanStep::initializePipeline(QueryPipeline & pipeline, const BuildQuer MergeTreeDataSelectExecutor merge_tree_reader{*merge_tree_storage}; auto metadata_snapshot = storage->getInMemoryMetadataPtr(); auto context = build_context.context; + auto storage_snapshot = storage->getStorageSnapshot(metadata_snapshot, context); Pipes pipes; - + // num of pipes may be smaller than num of plan elements since MergeTreeDataSelectExecutor // can infer an empty result for a part group. hence we record a mapping of pipe->plan element std::vector plan_element_ids; @@ -1208,12 +1206,15 @@ void TableScanStep::initializePipeline(QueryPipeline & pipeline, const BuildQuer projection_query_info.prewhere_info->prewhere_actions = plan_element.prewhere_actions; } MergeTreeData::DeleteBitmapGetter null_getter = [](auto & /*part*/) { return nullptr; }; + auto proj_snapshot = std::make_shared( + storage_snapshot->storage, storage_snapshot->metadata, storage_snapshot->object_columns); + proj_snapshot->addProjection(plan_element.projection_desc); + auto read_plan = merge_tree_reader.readFromParts( {}, null_getter, plan_element.projection_required_columns, - metadata_snapshot, - plan_element.projection_desc->metadata, + proj_snapshot, projection_query_info, context, max_block_size, @@ -1258,8 +1259,7 @@ void TableScanStep::initializePipeline(QueryPipeline & pipeline, const BuildQuer {}, delete_bitmap_getter, getRequiredColumns(), - metadata_snapshot, - metadata_snapshot, + storage_snapshot, query_info_for_index, context, max_block_size, @@ -1538,9 +1538,11 @@ void TableScanStep::allocate(ContextPtr context) // init query_info.syntax_analyzer if (!query_info.syntax_analyzer_result) { - Block header = storage->getInMemoryMetadataPtr()->getSampleBlockForColumns(getRequiredColumns(), storage->getVirtuals()); + auto storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context); + Block header = storage_snapshot->getSampleBlockForColumns(getRequiredColumns()); + auto tree_rewriter_result - = std::make_shared(header.getNamesAndTypesList(), storage, storage->getInMemoryMetadataPtr()); + = std::make_shared(header.getNamesAndTypesList(), storage, storage_snapshot); tree_rewriter_result->required_source_columns = header.getNamesAndTypesList(); tree_rewriter_result->analyzed_join = std::make_shared(); query_info.syntax_analyzer_result = tree_rewriter_result; diff --git a/src/Storages/ColumnDefault.h b/src/Storages/ColumnDefault.h index 38b61415a9..096a1f177a 100644 --- a/src/Storages/ColumnDefault.h +++ b/src/Storages/ColumnDefault.h @@ -13,7 +13,8 @@ enum class ColumnDefaultKind { Default, Materialized, - Alias + Alias, + Ephemeral }; diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 6a9ee5903c..a46987639d 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -278,7 +278,7 @@ static auto getNameRange(const ColumnsDescription::ColumnsContainer & columns, c return std::make_pair(begin, end); } -void ColumnsDescription::add(ColumnDescription column, const String & after_column, bool first) +void ColumnsDescription::add(ColumnDescription column, const String & after_column, bool first, bool add_subcolumns) { if (has(column.name)) throw Exception("Cannot add column " + column.name + ": column with this name already exists", @@ -298,7 +298,8 @@ void ColumnsDescription::add(ColumnDescription column, const String & after_colu insert_it = 
range.second; } - addSubcolumns(column.name, column.type); + if (add_subcolumns) + addSubcolumns(column.name, column.type); columns.get<0>().insert(insert_it, std::move(column)); } @@ -442,6 +443,65 @@ NamesAndTypesList ColumnsDescription::getAll() const return ret; } +NamesAndTypesList ColumnsDescription::getEphemeral() const +{ + NamesAndTypesList ret; + for (const auto & col : columns) + if (col.default_desc.kind == ColumnDefaultKind::Ephemeral) + ret.emplace_back(col.name, col.type); + return ret; +} + +NamesAndTypesList ColumnsDescription::getSubcolumns(const String & name_in_storage) const +{ + auto range = subcolumns.get<1>().equal_range(name_in_storage); + return NamesAndTypesList(range.first, range.second); +} + +void ColumnsDescription::addSubcolumnsToList(NamesAndTypesList & source_list) const +{ + NamesAndTypesList subcolumns_list; + for (const auto & col : source_list) + { + auto range = subcolumns.get<1>().equal_range(col.name); + if (range.first != range.second) + subcolumns_list.insert(subcolumns_list.end(), range.first, range.second); + } + + source_list.splice(source_list.end(), std::move(subcolumns_list)); +} + +NamesAndTypesList ColumnsDescription::get(const GetColumnsOptions & options) const +{ + NamesAndTypesList res; + switch (options.kind) + { + case GetColumnsOptions::All: + res = getAll(); + break; + case GetColumnsOptions::AllPhysical: + res = getAllPhysical(); + break; + case GetColumnsOptions::Ordinary: + res = getOrdinary(); + break; + case GetColumnsOptions::Materialized: + res = getMaterialized(); + break; + case GetColumnsOptions::Aliases: + res = getAliases(); + break; + case GetColumnsOptions::Ephemeral: + res = getEphemeral(); + break; + } + + if (options.with_subcolumns) + addSubcolumnsToList(res); + + return res; +} + bool ColumnsDescription::has(const String & column_name) const { return columns.get<1>().find(column_name) != columns.get<1>().end(); @@ -468,29 +528,31 @@ const ColumnDescription & ColumnsDescription::get(const String & column_name) co return *it; } -static ColumnsDescription::GetFlags defaultKindToGetFlag(ColumnDefaultKind kind) +static GetColumnsOptions::Kind defaultKindToGetKind(ColumnDefaultKind kind) { switch (kind) { case ColumnDefaultKind::Default: - return ColumnsDescription::Ordinary; + return GetColumnsOptions::Ordinary; case ColumnDefaultKind::Materialized: - return ColumnsDescription::Materialized; + return GetColumnsOptions::Materialized; case ColumnDefaultKind::Alias: - return ColumnsDescription::Aliases; + return GetColumnsOptions::Aliases; + case ColumnDefaultKind::Ephemeral: + return GetColumnsOptions::Ephemeral; } __builtin_unreachable(); } -NamesAndTypesList ColumnsDescription::getByNames(GetFlags flags, const Names & names, bool with_subcolumns) const +NamesAndTypesList ColumnsDescription::getByNames(const GetColumnsOptions & options, const Names & names) const { NamesAndTypesList res; for (const auto & name : names) { if (auto it = columns.get<1>().find(name); it != columns.get<1>().end()) { - auto kind = defaultKindToGetFlag(it->default_desc.kind); - if (flags & kind) + auto kind = defaultKindToGetKind(it->default_desc.kind); + if (options.kind & kind) { res.emplace_back(name, it->type); continue; @@ -512,7 +574,7 @@ NamesAndTypesList ColumnsDescription::getByNames(GetFlags flags, const Names & n res.emplace_back(RowExistsColumn::ROW_EXISTS_COLUMN); continue; } - else if (with_subcolumns) + else if (options.with_subcolumns) { auto jt = subcolumns.get<0>().find(name); if (jt != subcolumns.get<0>().end()) @@ -561,38 
+623,83 @@ std::optional ColumnsDescription::tryGetMapImplicitColumn(const return {}; } -std::optional ColumnsDescription::tryGetColumnOrSubcolumn(GetFlags flags, const String & column_name) const +std::optional ColumnsDescription::tryGetColumn(const GetColumnsOptions & options, const String & column_name) const { auto it = columns.get<1>().find(column_name); - if (it != columns.get<1>().end() && (defaultKindToGetFlag(it->default_desc.kind) & flags)) + if (it != columns.get<1>().end() && (defaultKindToGetKind(it->default_desc.kind) & options.kind)) return NameAndTypePair(it->name, it->type); - auto jt = subcolumns.get<0>().find(column_name); - if (jt != subcolumns.get<0>().end()) - return *jt; + if (options.with_subcolumns) + { + auto jt = subcolumns.get<0>().find(column_name); + if (jt != subcolumns.get<0>().end()) + return *jt; + } + if (auto res = tryGetMapImplicitColumn(column_name)) return res; + if (column_name == "_part_row_number") + { + return NameAndTypePair("_part_row_number", std::make_shared()); + } + + if (column_name == RowExistsColumn::ROW_EXISTS_COLUMN.name) + { + return RowExistsColumn::ROW_EXISTS_COLUMN; + } + return {}; } -NameAndTypePair ColumnsDescription::getColumnOrSubcolumn(GetFlags flags, const String & column_name) const +NameAndTypePair ColumnsDescription::getColumn(const GetColumnsOptions & options, const String & column_name) const { - auto column = tryGetColumnOrSubcolumn(flags, column_name); + auto column = tryGetColumn(options, column_name); + if (!column) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, + "There is no column {} in table.", column_name); + + return *column; +} + +std::optional ColumnsDescription::tryGetColumnDescription(const GetColumnsOptions & options, const String & column_name) const +{ + auto it = columns.get<1>().find(column_name); + if (it != columns.get<1>().end() && (defaultKindToGetKind(it->default_desc.kind) & options.kind)) + return *it; + + if (options.with_subcolumns) + { + auto jt = subcolumns.get<0>().find(column_name); + if (jt != subcolumns.get<0>().end()) + return ColumnDescription{jt->name, jt->type}; + } + + return {}; +} + +std::optional ColumnsDescription::tryGetColumnOrSubcolumnDescription(GetColumnsOptions::Kind kind, const String & column_name) const +{ + return tryGetColumnDescription(GetColumnsOptions(kind).withSubcolumns(), column_name); +} + +std::optional ColumnsDescription::tryGetColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const +{ + return tryGetColumn(GetColumnsOptions(kind).withSubcolumns(), column_name); +} + +NameAndTypePair ColumnsDescription::getColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const +{ + auto column = tryGetColumnOrSubcolumn(kind, column_name); if (!column) throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column or subcolumn {} in table.", column_name); - return *column; } std::optional ColumnsDescription::tryGetPhysical(const String & column_name) const { - auto it = columns.get<1>().find(column_name); - if (it == columns.get<1>().end() || it->default_desc.kind == ColumnDefaultKind::Alias) - return {}; - - return NameAndTypePair(it->name, it->type); + return tryGetColumn(GetColumnsOptions::AllPhysical, column_name); } NameAndTypePair ColumnsDescription::getPhysical(const String & column_name) const @@ -611,27 +718,16 @@ bool ColumnsDescription::hasPhysical(const String & column_name) const return it != columns.get<1>().end() && it->default_desc.kind != ColumnDefaultKind::Alias; } -bool 
ColumnsDescription::hasColumnOrSubcolumn(GetFlags flags, const String & column_name) const -{ +bool ColumnsDescription::hasColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const { auto it = columns.get<1>().find(column_name); // @ByteMap if (tryGetMapImplicitColumn(column_name)) return true; return (it != columns.get<1>().end() - && (defaultKindToGetFlag(it->default_desc.kind) & flags)) + && (defaultKindToGetKind(it->default_desc.kind) & kind)) || hasSubcolumn(column_name); } -void ColumnsDescription::addSubcolumnsToList(NamesAndTypesList & source_list) const -{ - for (const auto & col : source_list) - { - auto range = subcolumns.get<1>().equal_range(col.name); - if (range.first != range.second) - source_list.insert(source_list.end(), range.first, range.second); - } -} - NamesAndTypesList ColumnsDescription::getAllWithSubcolumns() const { auto columns_list = getAll(); diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 28ae462ae5..2bcc6a1c53 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -49,6 +49,51 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +struct GetColumnsOptions +{ + enum Kind : UInt8 + { + Ordinary = 1, + Materialized = 2, + Aliases = 4, + Ephemeral = 8, + + AllPhysical = Ordinary | Materialized, + All = AllPhysical | Aliases | Ephemeral, + }; + + GetColumnsOptions(Kind kind_) : kind(kind_) {} + + GetColumnsOptions & withSubcolumns(bool value = true) + { + with_subcolumns = value; + return *this; + } + + GetColumnsOptions & withVirtuals(bool value = true) + { + with_virtuals = value; + return *this; + } + + GetColumnsOptions & withExtendedObjects(bool value = true) + { + with_extended_objects = value; + return *this; + } + + GetColumnsOptions & withSystemColumns(bool value = true) + { + with_system_columns = value; + return *this; + } + + Kind kind; + bool with_subcolumns = false; + bool with_virtuals = false; + bool with_extended_objects = false; + bool with_system_columns = false; +};
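GetColumnsOptions replaces the old GetFlags bitmask with a small builder object, so call sites state which column kinds they want and opt in to subcolumns, virtuals, or extended Object columns explicitly. A hypothetical call site, assuming a ColumnsDescription `columns_desc`; only get(), tryGetColumn() and the builder methods come from this patch:

    // Ordinary + materialized columns, with subcolumns (e.g. Nested elements
    // or registered Object('json') paths) appended to the result list.
    auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns();
    NamesAndTypesList physical = columns_desc.get(options);

    // Point lookups take the same options object; per the hunk above,
    // tryGetColumn also resolves map-implicit columns and _part_row_number.
    if (auto column = columns_desc.tryGetColumn(options, "payload.user.id"))
        doSomethingWith(column->name, column->type->getName()); // stand-in helper

/// Description of a single table column (in CREATE TABLE for example).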
struct ColumnDescription @@ -87,7 +132,7 @@ public: explicit ColumnsDescription(NamesAndTypesList ordinary, NamesAndAliases aliases); /// `after_column` can be a Nested column name; - void add(ColumnDescription column, const String & after_column = String(), bool first = false); + void add(ColumnDescription column, const String & after_column = String(), bool first = false, bool add_subcolumns = true); /// `column_name` can be a Nested column name; void remove(const String & column_name); @@ -104,17 +149,8 @@ public: auto begin() const { return columns.begin(); } auto end() const { return columns.end(); } - enum GetFlags : UInt8 - { - Ordinary = 1, - Materialized = 2, - Aliases = 4, - - AllPhysical = Ordinary | Materialized, - All = AllPhysical | Aliases, - }; - - NamesAndTypesList getByNames(GetFlags flags, const Names & names, bool with_subcolumns) const; + NamesAndTypesList get(const GetColumnsOptions & options) const; + NamesAndTypesList getByNames(const GetColumnsOptions & options, const Names & names) const; NamesAndTypesList getOrdinary() const; NamesAndTypesList getMaterialized() const; @@ -124,6 +160,8 @@ public: NamesAndTypesList getAllWithSubcolumns() const; NamesAndTypesList getAllPhysicalWithSubcolumns() const; NamesAndTypesList getSubcolumnsOfAllPhysical() const; + NamesAndTypesList getSubcolumns(const String & name_in_storage) const; + NamesAndTypesList getEphemeral() const; using ColumnTTLs = std::unordered_map; ColumnTTLs getColumnTTLs() const; @@ -145,9 +183,12 @@ public: auto it = columns.get<1>().find(column_name); if (it == columns.get<1>().end()) throw Exception("Cannot find column " + column_name + " in ColumnsDescription", ErrorCodes::LOGICAL_ERROR); + + removeSubcolumns(it->name); if (!columns.get<1>().modify(it, std::forward(f))) throw Exception("Cannot modify ColumnDescription for column " + column_name + ": column name cannot be changed", ErrorCodes::LOGICAL_ERROR); + addSubcolumns(it->name, it->type); modifyColumnOrder(column_name, after_column, first); } @@ -155,15 +196,20 @@ public: Names getNamesOfOrdinary() const; bool hasPhysical(const String & column_name) const; - bool hasColumnOrSubcolumn(GetFlags flags, const String & column_name) const; + bool hasColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const; NameAndTypePair getPhysical(const String & column_name) const; - NameAndTypePair getColumnOrSubcolumn(GetFlags flags, const String & column_name) const; + NameAndTypePair getColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const; + NameAndTypePair getColumn(const GetColumnsOptions & options, const String & column_name) const; std::optional tryGetPhysical(const String & column_name) const; - std::optional tryGetColumnOrSubcolumn(GetFlags flags, const String & column_name) const; + std::optional tryGetColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const; + std::optional tryGetColumn(const GetColumnsOptions & options, const String & column_name) const; std::optional tryGetMapImplicitColumn(const String & column_name) const; + std::optional tryGetColumnOrSubcolumnDescription(GetColumnsOptions::Kind kind, const String & column_name) const; + std::optional tryGetColumnDescription(const GetColumnsOptions & options, const String & column_name) const; + ColumnDefaults getDefaults() const; /// TODO: remove bool hasDefault(const String & column_name) const; bool hasDefaults() const; diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 
578da239c2..a8ab5c00ba 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -274,7 +274,7 @@ Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, c Pipe StorageHDFS::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context_, QueryProcessingStage::Enum /*processed_stage*/, @@ -309,7 +309,7 @@ Pipe StorageHDFS::read( for (size_t i = 0; i < num_streams; ++i) pipes.emplace_back(std::make_shared( - sources_info, uri_without_path, format_name, compression_method, metadata_snapshot->getSampleBlock(), context_, max_block_size)); + sources_info, uri_without_path, format_name, compression_method, storage_snapshot->metadata->getSampleBlock(), context_, max_block_size)); return Pipe::unitePipes(std::move(pipes)); } diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index 4a6614be2e..f24992ba41 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -25,7 +25,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/Hive/StorageCloudHive.cpp b/src/Storages/Hive/StorageCloudHive.cpp index db3ec51b95..b3f2866c66 100644 --- a/src/Storages/Hive/StorageCloudHive.cpp +++ b/src/Storages/Hive/StorageCloudHive.cpp @@ -32,7 +32,7 @@ StorageCloudHive::StorageCloudHive( Pipe StorageCloudHive::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum /*processed_stage*/, @@ -54,11 +54,11 @@ Pipe StorageCloudHive::read( } HiveFiles hive_files = getHiveFiles(); - selectFiles(local_context, metadata_snapshot, query_info, hive_files, num_streams); + selectFiles(local_context, storage_snapshot->metadata, query_info, hive_files, num_streams); Pipes pipes; auto block_info = std::make_shared( - metadata_snapshot->getSampleBlockForColumns(real_columns), need_path_colum, need_file_column, metadata_snapshot->getPartitionKey()); + storage_snapshot->getSampleBlockForColumns(real_columns), need_path_colum, need_file_column, storage_snapshot->metadata->getPartitionKey()); auto allocator = std::make_shared(std::move(hive_files)); if (block_info->to_read.columns() == 0) diff --git a/src/Storages/Hive/StorageCloudHive.h b/src/Storages/Hive/StorageCloudHive.h index 326f21fe9b..0bf2223c6f 100644 --- a/src/Storages/Hive/StorageCloudHive.h +++ b/src/Storages/Hive/StorageCloudHive.h @@ -27,7 +27,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/Hive/StorageCnchHive.cpp b/src/Storages/Hive/StorageCnchHive.cpp index 76d73f093f..7af75e73e4 100644 --- a/src/Storages/Hive/StorageCnchHive.cpp +++ b/src/Storages/Hive/StorageCnchHive.cpp @@ -118,7 +118,7 @@ bool StorageCnchHive::isBucketTable() const } QueryProcessingStage::Enum StorageCnchHive::getQueryProcessingStage( - ContextPtr local_context, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const + ContextPtr local_context, QueryProcessingStage::Enum, 
const StorageSnapshotPtr &, SelectQueryInfo &) const { const auto & local_settings = local_context->getSettingsRef(); @@ -239,7 +239,7 @@ PrepareContextResult StorageCnchHive::prepareReadContext( Pipe StorageCnchHive::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -247,7 +247,7 @@ Pipe StorageCnchHive::read( unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); } @@ -255,14 +255,14 @@ Pipe StorageCnchHive::read( void StorageCnchHive::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, const size_t /*max_block_size*/, unsigned num_streams) { - PrepareContextResult result = prepareReadContext(column_names, metadata_snapshot, query_info, local_context, num_streams); + PrepareContextResult result = prepareReadContext(column_names, storage_snapshot->metadata, query_info, local_context, num_streams); Block header = InterpreterSelectQuery(query_info.query, local_context, SelectQueryOptions(processed_stage)).getSampleBlock(); auto worker_group = getWorkerGroupForTable(local_context, shared_from_this()); @@ -282,6 +282,8 @@ void StorageCnchHive::read( ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory( header, + {}, + storage_snapshot, processed_stage, StorageID::createEmpty(), /// Don't check whether table exists in cnch-worker scalars, diff --git a/src/Storages/Hive/StorageCnchHive.h b/src/Storages/Hive/StorageCnchHive.h index 15d6400473..0671892a51 100644 --- a/src/Storages/Hive/StorageCnchHive.h +++ b/src/Storages/Hive/StorageCnchHive.h @@ -50,7 +50,7 @@ public: std::shared_ptr settings_); QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; std::optional getVirtualWarehouseName(VirtualWarehouseType vw_type) const override; @@ -63,7 +63,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -73,7 +73,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index db08dfcf5d..e3459a8d63 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -112,7 +112,7 @@ TableExclusiveLockHolder IStorage::lockExclusively(const String & query_id, cons Pipe IStorage::read( const Names & /*column_names*/, - const 
StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -125,17 +125,17 @@ Pipe IStorage::read( void IStorage::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { - auto pipe = read(column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams); + auto pipe = read(column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); if (pipe.empty()) { - auto header = (query_info.projection ? query_info.projection->desc->metadata : metadata_snapshot) + auto header = (query_info.projection ? query_info.projection->desc->metadata : storage_snapshot->metadata) ->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info, context); } @@ -149,7 +149,7 @@ void IStorage::read( void IStorage::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -160,7 +160,7 @@ void IStorage::read( if (distributed_stages) { //IStorage::read(query_plan, column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams); - auto header = getHeaderForProcessingStage(*this, column_names, metadata_snapshot, query_info, context, processed_stage); + auto header = getHeaderForProcessingStage(column_names, storage_snapshot, query_info, context, processed_stage); auto read_step = std::make_unique(header, getStorageID(), query_info, diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 9d4d6d480c..18177bee34 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -44,6 +44,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -187,6 +191,10 @@ public: /// Support trivial count optimization for high level storage, only TRUE for StorageCnchMergeTree virtual bool supportsTrivialCount() const { return false; } + /// Returns true if the storage supports storing of dynamic subcolumns. + /// For now it makes sense only for data type Object. + virtual bool supportsDynamicSubcolumns() const { return false; } + /// Requires squashing small blocks to large for optimal storage. /// This is true for most storages that store data on disk. virtual bool prefersLargeBlocks() const { return true; } @@ -243,6 +251,9 @@ public: virtual bool isBucketTable() const {return false;} virtual UInt64 getTableHashForClusterBy() const {return 0;} + /// Return true if there is at least one part containing lightweight deleted mask. + virtual bool hasLightweightDeletedMask() const { return false; } + /// Return true if storage can execute lightweight delete. virtual bool supportsLightweightDelete() const { return false; } @@ -324,7 +335,7 @@ public: * since it cannot return Complete for intermediate queries never. 
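 *
 * The snapshot accepted here is then handed on to read(). A minimal sketch of
 * what an affected override looks like after this change (StorageExample is
 * hypothetical; only the interface visible in this file is assumed):
 *
 *     Pipe StorageExample::read(
 *         const Names & column_names,
 *         const StorageSnapshotPtr & storage_snapshot, /// was: const StorageMetadataPtr & metadata_snapshot
 *         SelectQueryInfo & query_info,
 *         ContextPtr context,
 *         QueryProcessingStage::Enum processed_stage,
 *         size_t max_block_size,
 *         unsigned num_streams)
 *     {
 *         /// Metadata that used to arrive as its own argument is now reached through
 *         /// the snapshot, which also pins the active parts and the concrete types
 *         /// of Object columns for the duration of the query.
 *         Block header = storage_snapshot->metadata->getSampleBlock();
 *         return Pipe(std::make_shared<NullSource>(header)); /// placeholder source for the sketch
 *     }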
*/ virtual QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const { return QueryProcessingStage::FetchColumns; } @@ -385,7 +396,7 @@ public: */ virtual Pipe read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -397,7 +408,7 @@ public: virtual void read( QueryPlan & query_plan, const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -408,7 +419,7 @@ public: virtual void read( QueryPlan & query_plan, const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -690,6 +701,18 @@ public: /// Does not takes underlying Storage (if any) into account. virtual std::optional lifetimeBytes() const { return {}; } + /// Creates a storage snapshot from given metadata. + virtual StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const + { + return std::make_shared(*this, metadata_snapshot); + } + + /// Creates a storage snapshot from given metadata and columns, which are used in query. + virtual StorageSnapshotPtr getStorageSnapshotForQuery(const StorageMetadataPtr & metadata_snapshot, const ASTPtr & /*query*/, ContextPtr query_context) const + { + return getStorageSnapshot(metadata_snapshot, query_context); + } + bool is_detached{false}; TxnTimestamp commit_time; diff --git a/src/Storages/IndicesDescription.cpp b/src/Storages/IndicesDescription.cpp index d7cd5ef65b..cd66b92c7f 100644 --- a/src/Storages/IndicesDescription.cpp +++ b/src/Storages/IndicesDescription.cpp @@ -40,6 +40,7 @@ namespace ErrorCodes { extern const int INCORRECT_QUERY; extern const int LOGICAL_ERROR; + extern const int TYPE_MISMATCH; }; IndexDescription::IndexDescription(const IndexDescription & other) @@ -120,6 +121,10 @@ IndexDescription IndexDescription::getIndexFromAST(const ASTPtr & definition_ast for (size_t i = 0; i < block_without_columns.columns(); ++i) { const auto & column = block_without_columns.getByPosition(i); + if (column.type->hasDynamicSubcolumns()) + throw Exception( + fmt::format("Column {} with type {} is not allowed in index expression.", column.name, column.type->getName()), + ErrorCodes::TYPE_MISMATCH); result.column_names.emplace_back(column.name); result.data_types.emplace_back(column.type); result.sample_block.insert(ColumnWithTypeAndName(column.type->createColumn(), column.type, column.name)); diff --git a/src/Storages/IngestColumnCnch/IngestColumnBlockInputStream.cpp b/src/Storages/IngestColumnCnch/IngestColumnBlockInputStream.cpp index a07cb5886b..3d40cb438c 100644 --- a/src/Storages/IngestColumnCnch/IngestColumnBlockInputStream.cpp +++ b/src/Storages/IngestColumnCnch/IngestColumnBlockInputStream.cpp @@ -75,8 +75,11 @@ IngestColumnBlockInputStream::IngestColumnBlockInputStream( if (!source_cloud_merge_tree) throw Exception("source table in worker is not CloudMeregTree", 
ErrorCodes::LOGICAL_ERROR); - target_meta_data_ptr = target_cloud_merge_tree->getInMemoryMetadataPtr(); - source_meta_data_ptr = source_cloud_merge_tree->getInMemoryMetadataPtr(); + auto target_meta_data_ptr = target_cloud_merge_tree->getInMemoryMetadataPtr(); + auto source_meta_data_ptr = source_cloud_merge_tree->getInMemoryMetadataPtr(); + target_storage_snapshot = target_cloud_merge_tree->getStorageSnapshot(target_meta_data_ptr, local_context); + source_storage_snapshot = source_cloud_merge_tree->getStorageSnapshot(source_meta_data_ptr, local_context); + ordered_key_names = getOrderedKeys(command.key_names, *target_meta_data_ptr); partition_id = target_cloud_merge_tree->getPartitionIDFromQuery(command.partition, context); diff --git a/src/Storages/IngestColumnCnch/IngestColumnBlockInputStream.h b/src/Storages/IngestColumnCnch/IngestColumnBlockInputStream.h index 42341bd5eb..f2e3959bec 100644 --- a/src/Storages/IngestColumnCnch/IngestColumnBlockInputStream.h +++ b/src/Storages/IngestColumnCnch/IngestColumnBlockInputStream.h @@ -5,6 +5,7 @@ #include #include #include +#include "Storages/StorageSnapshot.h" namespace DB { @@ -33,8 +34,8 @@ private: StoragePtr source_storage; StorageCloudMergeTree * target_cloud_merge_tree; StorageCloudMergeTree * source_cloud_merge_tree; - StorageMetadataPtr target_meta_data_ptr; - StorageMetadataPtr source_meta_data_ptr; + StorageSnapshotPtr target_storage_snapshot; + StorageSnapshotPtr source_storage_snapshot; Names ordered_key_names; Names ingest_column_names; String partition_id; diff --git a/src/Storages/IngestColumnCnch/IngestColumnHelper.cpp b/src/Storages/IngestColumnCnch/IngestColumnHelper.cpp index 6fc1e43aee..783ed99a9a 100644 --- a/src/Storages/IngestColumnCnch/IngestColumnHelper.cpp +++ b/src/Storages/IngestColumnCnch/IngestColumnHelper.cpp @@ -39,8 +39,8 @@ void checkColumnStructure(const StorageInMemoryMetadata & target_data, const Sto { for (const auto & col_name : names) { - const auto & target = target_data.getColumns().getColumnOrSubcolumn(ColumnsDescription::GetFlags::AllPhysical, col_name); - const auto & src = src_data.getColumns().getColumnOrSubcolumn(ColumnsDescription::GetFlags::AllPhysical, col_name); + const auto & target = target_data.getColumns().getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, col_name); + const auto & src = src_data.getColumns().getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, col_name); if (target.name != src.name) throw Exception("Column structure mismatch, found different names of column " + backQuoteIfNeed(col_name), diff --git a/src/Storages/IngestColumnCnch/memoryEfficientIngestColumn.cpp b/src/Storages/IngestColumnCnch/memoryEfficientIngestColumn.cpp index 1b7ecdf68c..b389305b97 100644 --- a/src/Storages/IngestColumnCnch/memoryEfficientIngestColumn.cpp +++ b/src/Storages/IngestColumnCnch/memoryEfficientIngestColumn.cpp @@ -35,8 +35,8 @@ MemoryEfficientIngestColumn::MemoryEfficientIngestColumn(IngestColumnBlockInputS settings{stream_.context->getSettingsRef()}, target_cloud_merge_tree{*stream_.target_cloud_merge_tree}, source_cloud_merge_tree{*stream_.source_cloud_merge_tree}, - target_meta_data_ptr{stream_.target_meta_data_ptr}, - source_meta_data_ptr{stream_.source_meta_data_ptr}, + target_storage_snapshot{stream_.target_storage_snapshot}, + source_storage_snapshot{stream_.source_storage_snapshot}, visible_target_parts{stream_.visible_target_parts}, visible_source_parts{stream_.visible_source_parts}, number_of_threads_for_read_source_parts{std::min(settings.parallel_ingest_threads.value, 
stream_.visible_source_parts.size())}, @@ -130,7 +130,7 @@ void MemoryEfficientIngestColumn::probeHashMapWithTargetData( auto source_input = std::make_unique( target_cloud_merge_tree, - target_meta_data_ptr, + target_storage_snapshot, part, stream.ordered_key_names, read_with_direct_io, true); QueryPipeline source_pipeline; @@ -225,7 +225,7 @@ HashMapWithSavedHash MemoryEfficientIngestCol part->bytes_on_disk >= settings.min_bytes_to_use_direct_io; auto source_input = std::make_unique( source_cloud_merge_tree, - source_meta_data_ptr, + source_storage_snapshot, part, stream.ordered_key_names, read_with_direct_io, true); QueryPipeline source_pipeline; source_pipeline.init(Pipe(std::move(source_input))); @@ -285,11 +285,11 @@ void MemoryEfficientIngestColumn::insertNewData( std::atomic has_read_exception = false; const Names source_read_columns = IngestColumn::getColumnsFromSourceTableForInsertNewPart(stream.ordered_key_names, - stream.ingest_column_names, source_meta_data_ptr); + stream.ingest_column_names, source_storage_snapshot->metadata); - BlockOutputStreamPtr new_part_output = target_cloud_merge_tree.write(ASTPtr(), target_meta_data_ptr, context); + BlockOutputStreamPtr new_part_output = target_cloud_merge_tree.write(ASTPtr(), target_storage_snapshot->metadata, context); new_part_output = std::make_shared( - new_part_output, target_meta_data_ptr->getSampleBlock(), settings.min_insert_block_size_rows, settings.min_insert_block_size_bytes); + new_part_output, target_storage_snapshot->metadata->getSampleBlock(), settings.min_insert_block_size_rows, settings.min_insert_block_size_bytes); Block ingested_header; for (const auto & name : source_read_columns) @@ -298,12 +298,12 @@ void MemoryEfficientIngestColumn::insertNewData( /// No need to add implicit map column if (isMapImplicitKey(name)) column_name = parseMapNameFromImplicitColName(name); - auto column = target_meta_data_ptr->getColumns().getColumnOrSubcolumn(ColumnsDescription::GetFlags::AllPhysical, column_name); + auto column = target_storage_snapshot->metadata->getColumns().getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name); ingested_header.insertUnique(ColumnWithTypeAndName(column.type, column.name)); } new_part_output = std::make_shared( - new_part_output, ingested_header, target_meta_data_ptr->getColumns(), context); + new_part_output, ingested_header, target_storage_snapshot->metadata->getColumns(), context); new_part_output->writePrefix(); @@ -341,7 +341,7 @@ void MemoryEfficientIngestColumn::insertNewData( auto source_input = std::make_unique( source_cloud_merge_tree, - source_meta_data_ptr, + source_storage_snapshot, part, source_read_columns, read_with_direct_io, true); QueryPipeline source_pipeline; source_pipeline.init(Pipe(std::move(source_input))); @@ -362,7 +362,7 @@ void MemoryEfficientIngestColumn::insertNewData( new_part_output_mutex, number_of_buckets, *new_part_output, - target_meta_data_ptr, + target_storage_snapshot->metadata, log ); } @@ -485,10 +485,10 @@ MergeTreeMutableDataPartPtr MemoryEfficientIngestColumn::updateTargetPartWithout BlockInputStreamPtr res_block_in = std::make_shared( - target_meta_data_ptr->getSampleBlockForColumns(stream.ingest_column_names), + target_storage_snapshot->getSampleBlockForColumns(stream.ingest_column_names), target_part->rows_count, settings.min_insert_block_size_rows); - updateTempPartWithData(new_partial_part, target_part, res_block_in, target_meta_data_ptr); + updateTempPartWithData(new_partial_part, target_part, res_block_in, 
target_storage_snapshot->metadata); return new_partial_part; } @@ -522,7 +522,7 @@ MergeTreeMutableDataPartPtr MemoryEfficientIngestColumn::updateTargetPart( BlockInputStreamPtr res_block_in = std::make_shared(std::move(res_block_list)); - updateTempPartWithData(new_partial_part, target_part, res_block_in, target_meta_data_ptr); + updateTempPartWithData(new_partial_part, target_part, res_block_in, target_storage_snapshot->metadata); return new_partial_part; } @@ -545,7 +545,7 @@ IngestColumn::TargetPartData MemoryEfficientIngestColumn::readTargetPartForUpdat auto source_input = std::make_unique( target_cloud_merge_tree, - target_meta_data_ptr, + target_storage_snapshot, target_part, all_columns, read_with_direct_io, true); QueryPipeline source_pipeline; source_pipeline.init(Pipe(std::move(source_input))); @@ -567,7 +567,7 @@ void MemoryEfficientIngestColumn::updateTargetDataWithSourcePart( auto source_input = std::make_unique( source_cloud_merge_tree, - source_meta_data_ptr, + source_storage_snapshot, source_part, all_columns, read_with_direct_io, true); QueryPipeline source_pipeline; source_pipeline.init(Pipe(std::move(source_input))); diff --git a/src/Storages/IngestColumnCnch/memoryEfficientIngestColumn.h b/src/Storages/IngestColumnCnch/memoryEfficientIngestColumn.h index 43fea49959..8a9bc31b73 100644 --- a/src/Storages/IngestColumnCnch/memoryEfficientIngestColumn.h +++ b/src/Storages/IngestColumnCnch/memoryEfficientIngestColumn.h @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace DB @@ -66,8 +67,8 @@ private: StorageCloudMergeTree & target_cloud_merge_tree; const StorageCloudMergeTree & source_cloud_merge_tree; - const StorageMetadataPtr & target_meta_data_ptr; - const StorageMetadataPtr & source_meta_data_ptr; + const StorageSnapshotPtr & target_storage_snapshot; + const StorageSnapshotPtr & source_storage_snapshot; const MergeTreeDataPartsVector & visible_target_parts; const MergeTreeDataPartsVector & visible_source_parts; const size_t number_of_threads_for_read_source_parts; diff --git a/src/Storages/IngestPartition.cpp b/src/Storages/IngestPartition.cpp index b9bef779f4..99748c30e3 100644 --- a/src/Storages/IngestPartition.cpp +++ b/src/Storages/IngestPartition.cpp @@ -274,8 +274,8 @@ void IngestPartition::checkColumnStructure(const StorageInMemoryMetadata & targe { for (const auto & col_name : names) { - const auto & target = target_data.getColumns().getColumnOrSubcolumn(ColumnsDescription::GetFlags::AllPhysical, col_name); - const auto & src = src_data.getColumns().getColumnOrSubcolumn(ColumnsDescription::GetFlags::AllPhysical, col_name); + const auto & target = target_data.getColumns().getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, col_name); + const auto & src = src_data.getColumns().getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, col_name); if (target.name != src.name) throw Exception("Column structure mismatch, found different names of column " + backQuoteIfNeed(col_name), @@ -499,7 +499,7 @@ bool IngestPartition::ingestPartition() /// No need to add implicit map column if (isMapImplicitKey(name)) column_name = parseMapNameFromImplicitColName(name); - auto column = target_meta->getColumns().getColumnOrSubcolumn(ColumnsDescription::GetFlags::AllPhysical, column_name); + auto column = target_meta->getColumns().getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name); ingested_header.insertUnique(ColumnWithTypeAndName(column.type, column.name)); } @@ -526,6 +526,7 @@ IngestPartition::IngestSources 
IngestPartition::generateSourceBlocks(MergeTreeDa auto settings = context->getSettingsRef(); IngestPartition::IngestSources src_blocks; auto match_type = std::make_shared(); + auto storage_snapshot = source_data.getStorageSnapshot(source_data.getInMemoryMetadataPtr(), context); for (auto & read_part : parts_to_read) { @@ -534,7 +535,7 @@ IngestPartition::IngestSources IngestPartition::generateSourceBlocks(MergeTreeDa auto source_input = std::make_shared(source_data, - source_data.getInMemoryMetadataPtr(), + storage_snapshot, read_part, nullptr, all_columns_with_partition_key, @@ -593,7 +594,7 @@ IngestParts IngestPartition::generateIngestParts(MergeTreeData & data, const Mer ASTPtr IngestPartition::getDefaultFilter(const String & column_name) { - auto name_type = target_table->getInMemoryMetadata().getColumns().getColumnOrSubcolumn(ColumnsDescription::GetFlags::AllPhysical, column_name); + auto name_type = target_table->getInMemoryMetadata().getColumns().getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name); Field value = name_type.type->getDefault(); auto literal = std::make_shared(value); auto identifier = std::make_shared(column_name); @@ -869,8 +870,9 @@ void IngestPartition::ingestWidePart(MergeTreeData & data, * and source block has 2 too. if 2 of target block is not read, a new part will be generated, * so that we get duplicated key. */ + auto storage_snapshot = data.getStorageSnapshot(data.getInMemoryMetadataPtr(), context); auto source_input = std::make_shared(data, - data.getInMemoryMetadataPtr(), + storage_snapshot, target_part, nullptr, all_columns, @@ -937,8 +939,10 @@ void IngestPartition::ingestCompactPart( { bool read_with_direct_io = settings.min_bytes_to_use_direct_io != 0 && target_part->getBytesOnDisk() >= settings.min_bytes_to_use_direct_io; + + auto storage_snapshot = data.getStorageSnapshot(data.getInMemoryMetadataPtr(), {}); auto source_input = std::make_shared(data, - data.getInMemoryMetadataPtr(), + storage_snapshot, target_part, nullptr, part_columns.getNames(), diff --git a/src/Storages/LiveView/StorageBlocks.h b/src/Storages/LiveView/StorageBlocks.h index 6cf7ce59fa..388a94462e 100644 --- a/src/Storages/LiveView/StorageBlocks.h +++ b/src/Storages/LiveView/StorageBlocks.h @@ -34,14 +34,14 @@ public: bool supportsFinal() const override { return true; } QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override { return to_stage; } Pipe read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/LiveView/StorageLiveView.cpp b/src/Storages/LiveView/StorageLiveView.cpp index d462ae6e3c..0dd4f27b4b 100644 --- a/src/Storages/LiveView/StorageLiveView.cpp +++ b/src/Storages/LiveView/StorageLiveView.cpp @@ -502,7 +502,7 @@ void StorageLiveView::refresh(bool grab_lock) Pipe StorageLiveView::read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/LiveView/StorageLiveView.h b/src/Storages/LiveView/StorageLiveView.h index 
23a9c84cb9..deaf74bed4 100644 --- a/src/Storages/LiveView/StorageLiveView.h +++ b/src/Storages/LiveView/StorageLiveView.h @@ -146,7 +146,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index f7f268c342..62213a3d9d 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -98,8 +97,18 @@ IMergeTreeReader::IMergeTreeReader( } columns_from_part.set_empty_key(StringRef()); + for (const auto & column_from_part : part_columns) columns_from_part.emplace(column_from_part.name, &column_from_part.type); + + // auto requested_columns = data_part_->getType() == MergeTreeDataPartType::WIDE ? Nested::convertToSubcolumns(columns_) : columns_; + + // columns_to_read.reserve(requested_columns.size()); + + // for (const auto & column : requested_columns) + // { + // columns_to_read.emplace_back(getColumnFromPart(column)); + // } } IMergeTreeReader::~IMergeTreeReader() = default; @@ -110,101 +119,15 @@ const IMergeTreeReader::ValueSizeMap & IMergeTreeReader::getAvgValueSizeHints() return avg_value_size_hints; } - -static bool arrayHasNoElementsRead(const IColumn & column) -{ - const auto * column_array = typeid_cast(&column); - - if (!column_array) - return false; - - size_t size = column_array->size(); - if (!size) - return false; - - size_t data_size = column_array->getData().size(); - if (data_size) - return false; - - size_t last_offset = column_array->getOffsets()[size - 1]; - return last_offset != 0; -} - -void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows, bool /* check_column_size */) +void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows, bool /* check_column_size */) const { try { - size_t num_columns = columns.size(); size_t num_bitmap_columns = hasBitmapIndexReader() ? getBitmapOutputColumns().size() : 0; - if (res_columns.size() != num_columns + num_bitmap_columns) - throw Exception("invalid number of columns passed to MergeTreeReader::fillMissingColumns. " - "Expected " + toString(num_columns + num_bitmap_columns) + ", " - "got " + toString(res_columns.size()), ErrorCodes::LOGICAL_ERROR); - - /// For a missing column of a nested data structure we must create not a column of empty - /// arrays, but a column of arrays of correct length. - - /// First, collect offset columns for all arrays in the block. - OffsetColumns offset_columns; - auto requested_column = columns.begin(); - for (size_t i = 0; i < num_columns; ++i, ++requested_column) - { - if (res_columns[i] == nullptr) - continue; - - if (const auto * array = typeid_cast(res_columns[i].get())) - { - String offsets_name = Nested::extractTableName(requested_column->name); - auto & offsets_column = offset_columns[offsets_name]; - - /// If for some reason multiple offsets columns are present for the same nested data structure, - /// choose the one that is not empty. 
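
The block removed here moves into the shared DB::fillMissingColumns helper. Its core rule: a missing component of a Nested structure must reuse a sibling's offsets, so the substituted defaults get the correct per-row array lengths. A rough standalone sketch of that reconstruction, with a hypothetical helper name but the same column primitives the surrounding code uses:

    /// Build a column of default-valued arrays whose lengths come from a sibling's offsets.
    ColumnPtr makeDefaultNestedColumn(const DataTypePtr & nested_type, const ColumnPtr & offsets_column)
    {
        /// The last offset is the total number of nested elements to materialize.
        const size_t nested_rows = typeid_cast<const ColumnUInt64 &>(*offsets_column).getData().back();
        /// A constant column of defaults, expanded to a full column and wrapped
        /// into arrays that borrow the sibling's offsets.
        ColumnPtr nested = nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst();
        return ColumnArray::create(nested, offsets_column);
    }
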
- if (!offsets_column || offsets_column->empty()) - offsets_column = array->getOffsetsPtr(); - } - } - - should_evaluate_missing_defaults = false; - - /// insert default values only for columns without default expressions - requested_column = columns.begin(); - for (size_t i = 0; i < num_columns; ++i, ++requested_column) - { - auto & [name, type] = *requested_column; - - if (res_columns[i] && arrayHasNoElementsRead(*res_columns[i])) - res_columns[i] = nullptr; - - if (res_columns[i] == nullptr) - { - if (metadata_snapshot->getColumns().hasDefault(name)) - { - should_evaluate_missing_defaults = true; - continue; - } - - String offsets_name = Nested::extractTableName(name); - auto offset_it = offset_columns.find(offsets_name); - const auto * array_type = typeid_cast(type.get()); - if (offset_it != offset_columns.end() && array_type) - { - const auto & nested_type = array_type->getNestedType(); - ColumnPtr offsets_column = offset_it->second; - size_t nested_rows = typeid_cast(*offsets_column).getData().back(); - - ColumnPtr nested_column = - nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst(); - - res_columns[i] = ColumnArray::create(nested_column, offsets_column); - } - else - { - /// We must turn a constant column into a full column because the interpreter could infer - /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. - res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst(); - } - } - } + + DB::fillMissingColumns(res_columns, num_rows, columns, metadata_snapshot, num_bitmap_columns); + should_evaluate_missing_defaults = std::any_of( + res_columns.begin(), res_columns.end(), [](const auto & column) { return column == nullptr; }); } catch (Exception & e) { @@ -214,7 +137,7 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e } } -void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns & res_columns) +void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns & res_columns) const { try { diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index 29e463bd19..5f0cc99d7f 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -66,9 +66,10 @@ public: /// Add columns from ordered_names that are not present in the block. /// Missing columns are added in the order specified by ordered_names. /// num_rows is needed in case if all res_columns are nullptr. - void fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows, bool check_column_size = true); + void fillMissingColumns( + Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows, bool check_column_size = true) const; /// Evaluate defaulted columns if necessary. - void evaluateMissingDefaults(Block additional_columns, Columns & res_columns); + void evaluateMissingDefaults(Block additional_columns, Columns & res_columns) const; /// If part metadata is not equal to storage metadata, than /// try to perform conversions of columns. @@ -160,6 +161,10 @@ protected: std::set dup_implicit_keys; Names names; // only initialized if duplicate implicit key exit + /// Actual column names and types of columns in part, + /// which may differ from table metadata. 
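
In practice the divergence comes from dynamic subcolumns: the table declares an Object type, while each part stores the concrete Tuple inferred from the rows it actually received. A hypothetical illustration:

    /// Table metadata:   json Object('json')
    /// Part 1 columns:   json Tuple(k1 Int8, k2 String)
    /// Part 2 columns:   json Tuple(k1 Int8, k3 Array(Float64))
    /// getColumnFromPart() maps a requested table column to the per-part entry.
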
+ NamesAndTypes columns_to_read; + UncompressedCache * uncompressed_cache; MarkCache * mark_cache; diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp index 2e5a71afd5..860ad9fb38 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp @@ -56,8 +56,7 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart( [&](const ISerialization::SubstreamPath & substream_path) { ++stream_counts[ISerialization::getFileNameForStream(column, substream_path)]; - }, - {}); + }); } NameSet remove_files; diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index bdb93dc277..da67f8b533 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1612,7 +1612,7 @@ bool KeyCondition::tryParseAtomFromAST(const ASTPtr & node, ContextPtr context, } else { - DataTypePtr common_type = tryGetLeastSupertype({key_expr_type_not_null, const_type}); + DataTypePtr common_type = tryGetLeastSupertype(DataTypes{key_expr_type_not_null, const_type}); if (!common_type) return false; diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index f81221dd47..292f428067 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -49,7 +49,7 @@ namespace ErrorCodes MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( Block header, const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const SelectQueryInfo & query_info_, ExpressionActionsSettings actions_settings, UInt64 max_block_size_rows_, @@ -60,7 +60,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( const Names & virt_column_names_) : SourceWithProgress(transformHeader(std::move(header), getPrewhereInfo(query_info_), storage_.getPartitionValueType(), virt_column_names_, getIndexContext(query_info_), query_info_.read_bitmap_index)) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , prewhere_info(getPrewhereInfo(query_info_)) , index_context(getIndexContext(query_info_)) , max_block_size_rows(max_block_size_rows_) @@ -201,7 +201,7 @@ void MergeTreeBaseSelectProcessor::initializeReaders( reader = task->data_part->getReader( task->task_columns.columns, - metadata_snapshot, + storage_snapshot->metadata, mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), @@ -240,7 +240,7 @@ void MergeTreeBaseSelectProcessor::initializeReaders( pre_reader = task->data_part->getReader( task->task_columns.pre_columns, - metadata_snapshot, + storage_snapshot->metadata, mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index bcdb447d5b..0691dfd61e 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -45,7 +45,7 @@ public: MergeTreeBaseSelectProcessor( Block header, const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const SelectQueryInfo & query_info_, ExpressionActionsSettings actions_settings, UInt64 max_block_size_rows_, @@ -83,7 +83,7 @@ protected: protected: const 
MergeTreeMetaBase & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; PrewhereInfoPtr prewhere_info; std::unique_ptr prewhere_actions; diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index 5d9cc8f508..445bb75492 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -26,7 +26,8 @@ #include #include #include -#include "Storages/ColumnsDescription.h" +#include +#include #include #include @@ -59,7 +60,7 @@ bool injectRequiredColumnsRecursively( /// stages. checkStackSize(); - auto column_in_storage = storage_columns.tryGetColumnOrSubcolumn(ColumnsDescription::AllPhysical, column_name); + auto column_in_storage = storage_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name); if (column_in_storage) { auto column_name_in_part = column_in_storage->getNameInStorage(); @@ -124,8 +125,15 @@ NameSet injectRequiredColumns(const MergeTreeMetaBase & storage, have_at_least_one_physical_column = true; continue; } + + auto name_in_storage = Nested::extractTableName(columns[i]); + if (storage_columns.has(name_in_storage) && isObject(storage_columns.get(name_in_storage).type)) + { + have_at_least_one_physical_column = true; + continue; + } /// We are going to fetch only physical columns - if (!storage_columns.hasColumnOrSubcolumn(ColumnsDescription::AllPhysical, columns[i])) + if (!storage_columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, columns[i])) throw Exception("There is no physical column or subcolumn " + columns[i] + " in table.", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); have_at_least_one_physical_column |= injectRequiredColumnsRecursively( @@ -140,7 +148,7 @@ NameSet injectRequiredColumns(const MergeTreeMetaBase & storage, if (!have_at_least_one_physical_column) { /// todo(weiping): temporarily skip low cardinality for default injected column - if (!default_injected_column.empty() && !storage_columns.getColumnOrSubcolumn(ColumnsDescription::AllPhysical, default_injected_column).getTypeInStorage()->lowCardinality()) + if (!default_injected_column.empty() && !storage_columns.getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, default_injected_column).getTypeInStorage()->lowCardinality()) { columns.push_back(default_injected_column); } @@ -298,7 +306,7 @@ void MergeTreeBlockSizePredictor::update(const Block & sample_block, const Colum MergeTreeReadTaskColumns getReadTaskColumns( const MergeTreeMetaBase & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const MergeTreeMetaBase::DataPartPtr & data_part, const Names & required_columns, const PrewhereInfoPtr & prewhere_info, @@ -309,7 +317,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( Names pre_column_names; /// inject columns required for defaults evaluation - bool should_reorder = !injectRequiredColumns(storage, metadata_snapshot, data_part, column_names, "").empty(); + bool should_reorder = !injectRequiredColumns(storage, storage_snapshot->getMetadataForQuery(), data_part, column_names, "").empty(); if (prewhere_info) { @@ -334,7 +342,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( if (pre_column_names.empty()) pre_column_names.push_back(column_names[0]); - const auto injected_pre_columns = injectRequiredColumns(storage, metadata_snapshot, data_part, pre_column_names, ""); + const auto injected_pre_columns = injectRequiredColumns(storage, storage_snapshot->getMetadataForQuery(), 
data_part, pre_column_names, ""); if (!injected_pre_columns.empty()) should_reorder = true; @@ -385,15 +393,16 @@ MergeTreeReadTaskColumns getReadTaskColumns( } MergeTreeReadTaskColumns result; + NamesAndTypesList all_columns; result.bitmap_index_pre_columns = std::move(bitmap_pre_column_names); result.bitmap_index_columns = std::move(bitmap_column_names); if (check_columns) { - const auto & columns = metadata_snapshot->getColumns(); - result.pre_columns = columns.getByNames(ColumnsDescription::All, pre_column_names, true); - result.columns = columns.getByNames(ColumnsDescription::All, column_names, true); + auto options = GetColumnsOptions(GetColumnsOptions::All).withSubcolumns().withExtendedObjects(); + result.pre_columns = storage_snapshot->getColumnsByNames(options, pre_column_names); + result.columns = storage_snapshot->getColumnsByNames(options, column_names); } else { diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index cd8fbe721f..83fc98bd48 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -103,7 +103,7 @@ struct MergeTreeReadTask MergeTreeReadTaskColumns getReadTaskColumns( const MergeTreeMetaBase & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const MergeTreeMetaBase::DataPartPtr & data_part, const Names & required_columns, const PrewhereInfoPtr & prewhere_info, diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 5b414db263..085ec25204 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -83,6 +85,7 @@ #include #include +#include #include #include @@ -557,6 +560,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) if (part_names_with_disks.empty() && wal_name_with_disks.empty()) { + resetObjectColumnsFromActiveParts(part_lock); LOG_DEBUG(log, "There are no data parts"); return; } @@ -622,9 +626,9 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) } } + resetObjectColumnsFromActiveParts({}); calculateColumnSizesImpl(); - LOG_DEBUG(log, "Loaded data parts ({} items)", data_parts_indexes.size()); } @@ -1979,6 +1983,12 @@ bool MergeTreeData::renameTempPartAndReplace( modifyPartState(part_it, DataPartState::Committed); addPartContributionToColumnSizes(part); + + if (covered_parts.empty()) + updateObjectColumns(*part_it, lock); + else + resetObjectColumnsFromActiveParts(lock); + addPartContributionToDataVolume(part); } @@ -2048,9 +2058,10 @@ bool MergeTreeData::renameTempPartInDetachDirecotry(MutableDataPartPtr & new_par return true; } -void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVector & remove, bool clear_without_timeout, DataPartsLock & /*acquired_lock*/) +void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVector & remove, bool clear_without_timeout, DataPartsLock & acquired_lock) { auto remove_time = clear_without_timeout ? 
0 : time(nullptr); + bool removed_active_part = false; for (const DataPartPtr & part : remove) { @@ -2058,6 +2069,7 @@ void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVect { removePartContributionToColumnSizes(part); removePartContributionToDataVolume(part); + removed_active_part = true; } if (part->getState() == IMergeTreeDataPart::State::Committed || clear_without_timeout) @@ -2069,11 +2081,15 @@ void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVect if (isInMemoryPart(part) && getSettings()->in_memory_parts_enable_wal) getWriteAheadLog()->dropPart(part->name); } + + if (removed_active_part) + resetObjectColumnsFromActiveParts(acquired_lock); } void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove) { auto lock = lockParts(); + bool removed_active_part = false; for (const auto & part : remove) { @@ -2081,6 +2097,9 @@ void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(con if (it_part == data_parts_by_info.end()) throw Exception("Part " + part->getNameWithState() + " not found in data_parts", ErrorCodes::LOGICAL_ERROR); + if (part->getState() == IMergeTreeDataPart::State::Committed) + removed_active_part = true; + modifyPartState(part, IMergeTreeDataPart::State::Temporary); /// Erase immediately data_parts_indexes.erase(it_part); @@ -2088,6 +2107,9 @@ void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(con if (metastore) metastore->dropPart(*this, part); } + + if (removed_active_part) + resetObjectColumnsFromActiveParts(lock); } void MergeTreeData::removePartsFromWorkingSet(const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock * acquired_lock) @@ -2187,6 +2209,8 @@ restore_covered) LOG_INFO(log, "Renaming {} to {}{} and forgetting it.", part_to_detach->relative_path, prefix, part_to_detach->name); auto lock = lockParts(); + bool removed_active_part = false; + bool restored_active_part = false; auto it_part = data_parts_by_info.find(part_to_detach->info); if (it_part == data_parts_by_info.end()) @@ -2199,6 +2223,7 @@ restore_covered) { removePartContributionToDataVolume(part); removePartContributionToColumnSizes(part); + removed_active_part = true; } modifyPartState(it_part, DataPartState::Deleting); @@ -2251,6 +2276,7 @@ restore_covered) addPartContributionToColumnSizes(*it); addPartContributionToDataVolume(*it); modifyPartState(it, DataPartState::Committed); // iterator is not invalidated here + restored_active_part = true; } pos = (*it)->info.max_block + 1; @@ -2301,6 +2327,9 @@ restore_covered) LOG_ERROR(log, "The set of parts restored in place of {} looks incomplete. There might or might not be a data loss.{}", part->name, (error_parts.empty() ? 
"" : " Suspicious parts: " + error_parts)); } } + + if (removed_active_part || restored_active_part) + resetObjectColumnsFromActiveParts(lock); } @@ -3397,7 +3426,7 @@ using PartitionIdToMaxBlock = std::unordered_map; static void selectBestProjection( const MergeTreeDataSelectExecutor & reader, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, const Names & required_columns, ProjectionCandidate & candidate, @@ -3426,7 +3455,7 @@ static void selectBestProjection( auto sum_marks = reader.estimateNumMarksToRead( projection_parts, candidate.required_columns, - metadata_snapshot, + storage_snapshot->metadata, candidate.desc->metadata, query_info, // TODO syntax_analysis_result set in index query_context, @@ -3444,8 +3473,8 @@ static void selectBestProjection( sum_marks += reader.estimateNumMarksToRead( normal_parts, required_columns, - metadata_snapshot, - metadata_snapshot, + storage_snapshot->metadata, + storage_snapshot->metadata, query_info, // TODO syntax_analysis_result set in index query_context, settings.max_threads, @@ -3462,8 +3491,9 @@ static void selectBestProjection( bool MergeTreeData::getQueryProcessingStageWithAggregateProjection( - ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot, SelectQueryInfo & query_info) const + ContextPtr query_context, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const { + const auto & metadata_snapshot = storage_snapshot->metadata; const auto & settings = query_context->getSettingsRef(); if (!settings.allow_experimental_projection_optimization || query_info.ignore_projections || query_info.is_projection_query) return false; @@ -3681,7 +3711,7 @@ bool MergeTreeData::getQueryProcessingStageWithAggregateProjection( { selectBestProjection( reader, - metadata_snapshot, + storage_snapshot, query_info, analysis_result.required_columns, candidate, @@ -3719,7 +3749,7 @@ bool MergeTreeData::getQueryProcessingStageWithAggregateProjection( { selectBestProjection( reader, - metadata_snapshot, + storage_snapshot, query_info, analysis_result.required_columns, candidate, @@ -3753,12 +3783,12 @@ bool MergeTreeData::getQueryProcessingStageWithAggregateProjection( QueryProcessingStage::Enum MergeTreeData::getQueryProcessingStage( ContextPtr query_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const { if (to_stage >= QueryProcessingStage::Enum::WithMergeableState) { - if (getQueryProcessingStageWithAggregateProjection(query_context, metadata_snapshot, query_info)) + if (getQueryProcessingStageWithAggregateProjection(query_context, storage_snapshot, query_info)) { if (query_info.projection->desc->type == ProjectionDescription::Type::Aggregate) return QueryProcessingStage::Enum::WithMergeableState; @@ -4682,6 +4712,58 @@ void MergeTreeData::removeWriteAheadLog(const String & file_name) const metastore->removeWAL(*this, file_name); } +ColumnsDescription MergeTreeData::getConcreteObjectColumns( + boost::iterator_range range, const ColumnsDescription & storage_columns) +{ + return DB::getConcreteObjectColumns( + range.begin(), range.end(), + storage_columns, [](const auto & part) -> const auto & { return part->getColumns(); }); +} + +void MergeTreeData::resetObjectColumnsFromActiveParts(const DataPartsLock & /*lock*/) +{ + auto metadata_snapshot = getInMemoryMetadataPtr(); + const auto & columns = 
metadata_snapshot->getColumns(); + if (!hasDynamicSubcolumns(columns)) + return; + + auto range = getDataPartsStateRange(DataPartState::Committed); + object_columns = getConcreteObjectColumns(range, columns); +} + +void MergeTreeData::updateObjectColumns(const DataPartPtr & part, const DataPartsLock & /*lock*/) +{ + auto metadata_snapshot = getInMemoryMetadataPtr(); + const auto & columns = metadata_snapshot->getColumns(); + if (!hasDynamicSubcolumns(columns)) + return; + + DB::updateObjectColumns(object_columns, columns, part->getColumns()); +} + +ColumnsDescription MergeTreeData::getConcreteObjectColumns( + const DataPartsVector & parts, const ColumnsDescription & storage_columns) +{ + return DB::getConcreteObjectColumns( + parts.begin(), parts.end(), + storage_columns, [](const auto & part) -> const auto & { return part->getColumns(); }); +} + +StorageSnapshotPtr MergeTreeData::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const +{ + auto snapshot_data = std::make_unique(); + + auto lock = lockParts(); + snapshot_data->parts = getDataPartsVectorUnlocked({DataPartState::Committed}, lock); + return std::make_shared(*this, metadata_snapshot, object_columns, std::move(snapshot_data)); +} + +StorageSnapshotPtr MergeTreeData::getStorageSnapshotWithoutParts(const StorageMetadataPtr & metadata_snapshot) const +{ + auto lock = lockParts(); + return std::make_shared(*this, metadata_snapshot, object_columns, std::make_unique()); +} + CurrentlySubmergingEmergingTagger::~CurrentlySubmergingEmergingTagger() { std::lock_guard lock(storage.currently_submerging_emerging_mutex); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 35590ded3b..86102c6534 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -240,21 +240,26 @@ public: ~MergeTreeData() override; bool getQueryProcessingStageWithAggregateProjection( - ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot, SelectQueryInfo & query_info) const; + ContextPtr query_context, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const; QueryProcessingStage::Enum getQueryProcessingStage( ContextPtr query_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & info) const override; static bool partsContainSameProjections(const DataPartPtr & left, const DataPartPtr & right); bool mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, ContextPtr, const StorageMetadataPtr & metadata_snapshot) const override; - + /// Load the set of data parts from disk. Call once - immediately after the object is created. void loadDataParts(bool skip_sanity_checks); + StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const override; + + /// The same as above but does not hold vector of data parts. + StorageSnapshotPtr getStorageSnapshotWithoutParts(const StorageMetadataPtr & metadata_snapshot) const override; + /** ----------------------- COMPATIBLE CODE BEGIN-------------------------- */ /* compatible with old metastore. remove this later */ @@ -483,6 +488,10 @@ public: const PartitionCommands & commands, ContextPtr query_context) override; + /// Creates description of columns of data type Object from the range of data parts. 
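
Both the public overload declared here and the protected range-based one defer to the free function DB::getConcreteObjectColumns, which folds each part's concrete Tuple into one common type per Object column. A rough usage sketch, reusing the member names introduced in this file:

    /// Recompute the unified Object columns over the active parts; e.g.
    /// Tuple(k1 Int8, k2 String) and Tuple(k1 Int8, k3 Array(Float64))
    /// widen to Tuple(k1 Int8, k2 String, k3 Array(Float64)).
    auto metadata_snapshot = getInMemoryMetadataPtr();
    auto parts = getDataPartsVector();
    object_columns = getConcreteObjectColumns(parts, metadata_snapshot->getColumns());
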
+ static ColumnsDescription getConcreteObjectColumns( + const DataPartsVector & parts, const ColumnsDescription & storage_columns); + /// Extracts MergeTreeData of other *MergeTree* storage /// and checks that their structure suitable for ALTER TABLE ATTACH PARTITION FROM /// Tables structure should be locked. @@ -604,7 +613,7 @@ protected: friend struct ReplicatedMergeTreeTableMetadata; friend class StorageReplicatedMergeTree; friend class MergeTreeDataWriter; - + MergeTreePartsMover parts_mover; std::optional totalRowsByPartitionPredicateImpl( @@ -690,6 +699,12 @@ protected: /// Moves part to specified space, used in ALTER ... MOVE ... queries bool movePartsToSpace(const DataPartsVector & parts, SpacePtr space); + /// Creates description of columns of data type Object from the range of data parts. + static ColumnsDescription getConcreteObjectColumns( + boost::iterator_range range, const ColumnsDescription & storage_columns); + void resetObjectColumnsFromActiveParts(const DataPartsLock & lock); + void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock); + /// Throttlers used in DataPartsExchange to lower maximum fetch/sends /// speed. ThrottlerPtr replicated_fetches_throttler; diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 771ceb9b1e..0f0f712fc7 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -1114,10 +1114,11 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor MergeStageProgress horizontal_stage_progress( column_sizes ? column_sizes->keyColumnsWeight() : 1.0); + auto storage_snapshot = data.getStorageSnapshot(metadata_snapshot, context); for (const auto & part : parts) { auto input = std::make_unique( - data, metadata_snapshot, part, merging_column_names, read_with_direct_io, true); + data, storage_snapshot, part, merging_column_names, read_with_direct_io, true); input->setProgressCallback( MergeProgressCallback(merge_entry, watch_prev_elapsed, horizontal_stage_progress)); @@ -1321,6 +1322,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor /*blocks_are_granules_size = */ false, context->getSettingsRef().optimize_map_column_serialization); + for (size_t column_num = 0, gathering_column_names_size = gathering_column_names.size(); column_num < gathering_column_names_size; ++column_num, ++it_name_and_type) { @@ -1335,7 +1337,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor { /// FIXME(UNIQUE KEY): set delete bitmap from snapshot auto column_part_source = std::make_shared( - data, metadata_snapshot, parts[part_num], /*delete_bitmap*/nullptr, column_names_, read_with_direct_io, + data, storage_snapshot, parts[part_num], /*delete_bitmap*/nullptr, column_names_, read_with_direct_io, /*take_column_types_from_storage*/true, /*quiet=*/ false); @@ -2113,8 +2115,7 @@ NameToNameVector MergeTreeDataMergerMutator::collectFilesForRenames( [&](const ISerialization::SubstreamPath & substream_path) { ++stream_counts[ISerialization::getFileNameForStream(column, substream_path)]; - }, - {}); + }); } NameToNameVector rename_vector; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCNCH.cpp b/src/Storages/MergeTree/MergeTreeDataPartCNCH.cpp index b2a41bab99..729cf98065 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCNCH.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCNCH.cpp @@ -1060,8 +1060,7 @@ ColumnSize 
MergeTreeDataPartCNCH::getColumnSizeImpl(const NameAndTypePair & colu auto mrk_checksum = checksums->files.find(file_name + index_granularity_info.marks_file_extension); if (mrk_checksum != checksums->files.end()) size.marks += mrk_checksum->second.file_size; - }, - {}); + }); return size; } @@ -1231,9 +1230,9 @@ void MergeTreeDataPartCNCH::preload(UInt64 preload_level, ThreadPool & pool, UIn preload_level); segments.insert(segments.end(), std::make_move_iterator(seg.begin()), std::make_move_iterator(seg.end())); }; - ISerialization::SubstreamPath substream_path; + auto serialization = getSerializationForColumn(real_column); - serialization->enumerateStreams(callback, substream_path); + serialization->enumerateStreams(callback); }; for (const NameAndTypePair & column : *columns_ptr) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index 8e9cff9263..17065a08d5 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -134,7 +134,7 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl( auto mrk_checksum = checksums->files.find(file_name + index_granularity_info.marks_file_extension); if (mrk_checksum != checksums->files.end()) size.marks += mrk_checksum->second.file_size; - }, {}); + }); return size; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index fd669f9847..b7ac5d7601 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -221,7 +221,7 @@ void writeColumnSingleGranule( serialize_settings.position_independent_encoding = true; //-V1048 serialize_settings.low_cardinality_max_dictionary_size = 0; //-V1048 - serialization->serializeBinaryBulkStatePrefix(serialize_settings, state); + serialization->serializeBinaryBulkStatePrefix(*column.column, serialize_settings, state); serialization->serializeBinaryBulkWithMultipleStreams(*column.column, from_row, number_of_rows, serialize_settings, state); serialization->serializeBinaryBulkStateSuffix(serialize_settings, state); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index b10d02a309..665e5b9d6f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -1002,7 +1002,7 @@ void MergeTreeDataPartWriterOnDisk::writeColumn( { ISerialization::SerializeBinaryBulkSettings serialize_settings; serialize_settings.getter = createStreamGetter(name_and_type, offset_columns); - serializations[name]->serializeBinaryBulkStatePrefix(serialize_settings, it->second); + serializations[name]->serializeBinaryBulkStatePrefix(column, serialize_settings, it->second); } const auto & global_settings = storage.getContext()->getSettingsRef(); @@ -1090,7 +1090,8 @@ void MergeTreeDataPartWriterOnDisk::writeColumn( if (write_final_mark) writeFinalMark(name_and_type, offset_columns, serialize_settings.path); - + + serializations[name]->enumerateStreams(finalizeStreams(name), serialize_settings.path); } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 072aaadb2e..76b6b30dfc 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -151,7 +151,7 @@ static RelativeSize 
convertAbsoluteSampleSizeToRelative(const ASTPtr & node, siz QueryPlanPtr MergeTreeDataSelectExecutor::read( const Names & column_names_to_return, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, const UInt64 max_block_size, @@ -160,6 +160,8 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( std::shared_ptr max_block_numbers_to_read) const { const auto & settings = context->getSettingsRef(); + const auto & metadata_for_reading = storage_snapshot->getMetadataForQuery(); + auto parts = data.getDataPartsVector(); if (settings.enable_ab_index_optimization) @@ -170,7 +172,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( if (!query_info.projection) { MergeTreeMetaBase::DeleteBitmapGetter delete_bitmap_getter; - if (metadata_snapshot->hasUniqueKey()) + if (metadata_for_reading->hasUniqueKey()) { /// get a consistent snapshot of delete bitmaps for query, /// otherwise concurrent upserts that modify part's delete bitmap will cause incorrect query result @@ -192,8 +194,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( parts, delete_bitmap_getter, column_names_to_return, - metadata_snapshot, - metadata_snapshot, + storage_snapshot, query_info, context, max_block_size, @@ -201,7 +202,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( max_block_numbers_to_read); if (plan->isInitialized() && settings.allow_experimental_projection_optimization && settings.force_optimize_projection - && !metadata_snapshot->projections.empty()) + && !metadata_for_reading->projections.empty()) throw Exception( "No projection is used when allow_experimental_projection_optimization = 1 and force_optimize_projection = 1", ErrorCodes::PROJECTION_NOT_USED); @@ -240,8 +241,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( projection_parts, null_getter, query_info.projection->required_columns, - metadata_snapshot, - query_info.projection->desc->metadata, + storage_snapshot, query_info, context, max_block_size, @@ -1391,8 +1391,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( MergeTreeMetaBase::DataPartsVector parts, MergeTreeMetaBase::DeleteBitmapGetter delete_bitmap_getter, const Names & column_names_to_return, - const StorageMetadataPtr & metadata_snapshot_base, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, const UInt64 max_block_size, @@ -1426,8 +1425,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( virt_column_names, data, query_info, - metadata_snapshot, - metadata_snapshot_base, + storage_snapshot, context, max_block_size, num_streams, diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h index 8fcd6c9693..9643ee4570 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h @@ -48,7 +48,7 @@ public: QueryPlanPtr read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, UInt64 max_block_size, @@ -61,8 +61,7 @@ public: MergeTreeMetaBase::DataPartsVector parts, MergeTreeMetaBase::DeleteBitmapGetter delete_bitmap_getter, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot_base, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, 
ContextPtr context, UInt64 max_block_size, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 76e10315eb..5712eb0a2f 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -41,6 +41,13 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -348,6 +355,18 @@ MergeTreeMetaBase::MutableDataPartPtr MergeTreeDataWriter::writeTempPart( Block & block = block_with_partition.block; Int64 bucket_number = block_with_partition.bucket_info.bucket_number; + auto columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); + auto storage_snapshot = data.getStorageSnapshot(metadata_snapshot, context); + + if (hasDynamicSubcolumns(metadata_snapshot->getColumns())) + { + convertDynamicColumnsToTuples(block, storage_snapshot); + } + + for (auto & column : columns) + if (column.type->hasDynamicSubcolumns()) + column.type = block.getByName(column.name).type; + static const String TMP_PREFIX = "tmp_insert_"; /// This will generate unique name in scope of current server process. @@ -456,7 +475,6 @@ MergeTreeMetaBase::MutableDataPartPtr MergeTreeDataWriter::writeTempPart( for (const auto & ttl_entry : move_ttl_entries) updateTTL(ttl_entry, move_ttl_infos, move_ttl_infos.moves_ttl[ttl_entry.result_column], block, false); - NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); ReservationPtr reservation = data.reserveSpacePreferringTTLRules( metadata_snapshot, expected_size, move_ttl_infos, time(nullptr), 0, true, nullptr, write_location); diff --git a/src/Storages/MergeTree/MergeTreeFillDeleteWithDefaultValueSource.cpp b/src/Storages/MergeTree/MergeTreeFillDeleteWithDefaultValueSource.cpp index 91dcb951b3..1d2fa5bfff 100644 --- a/src/Storages/MergeTree/MergeTreeFillDeleteWithDefaultValueSource.cpp +++ b/src/Storages/MergeTree/MergeTreeFillDeleteWithDefaultValueSource.cpp @@ -57,7 +57,7 @@ MergeTreeFillDeleteWithDefaultValueSource::MergeTreeFillDeleteWithDefaultValueSo /// Add columns because we don't want to read empty blocks injectRequiredColumns(storage, metadata_snapshot, data_part, columns_to_read, ""); - NamesAndTypesList columns_for_reader = metadata_snapshot->getColumns().getByNames(ColumnsDescription::AllPhysical, columns_to_read, false); + NamesAndTypesList columns_for_reader = metadata_snapshot->getColumns().getByNames(GetColumnsOptions::AllPhysical, columns_to_read); MergeTreeReaderSettings reader_settings = { diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index 6cee80983d..3b7a5f5a25 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -74,7 +74,7 @@ void MergeTreeIndexGranuleSet::serializeBinary(WriteBuffer & ostr) const auto serialization = type->getDefaultSerialization(); ISerialization::SerializeBinaryBulkStatePtr state; - serialization->serializeBinaryBulkStatePrefix(settings, state); + serialization->serializeBinaryBulkStatePrefix(*block.getByPosition(i).column, settings, state); serialization->serializeBinaryBulkWithMultipleStreams(*block.getByPosition(i).column, 0, size(), settings, state); serialization->serializeBinaryBulkStateSuffix(settings, state); } diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp index 139da4b3c2..2f789ef14c 100644 --- 
a/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/src/Storages/MergeTree/MergeTreePartition.cpp @@ -244,6 +244,18 @@ namespace for (roaring::Roaring64MapSetBitForwardIterator it(x); it != x.end(); ++it) applyVisitor(*this, Field(*it)); } + void operator() (const Object & x) const + { + UInt8 type = Field::Types::Object; + hash.update(type); + hash.update(x.size()); + + for (const auto & [key, value]: x) + { + hash.update(key); + applyVisitor(*this, value); + } + } }; } diff --git a/src/Storages/MergeTree/MergeTreePrefetchedReaderCNCH.cpp b/src/Storages/MergeTree/MergeTreePrefetchedReaderCNCH.cpp index dec01b7c26..c05e638cb2 100644 --- a/src/Storages/MergeTree/MergeTreePrefetchedReaderCNCH.cpp +++ b/src/Storages/MergeTree/MergeTreePrefetchedReaderCNCH.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include "MarkRange.h" #include "MergeTreeIOSettings.h" #include "MergeTreeIndexGranularityInfo.h" @@ -44,9 +45,11 @@ MergeTreePrefetchedReaderCNCH::MergeTreePrefetchedReaderCNCH( try { /// need to use columns from IMergeTreeReader to read converted subcolumns of nested columns - for (const NameAndTypePair& column : columns) - addStreams(column, profile_callback_, clock_type_, - &mocked_index_granularity_info); + for (const NameAndTypePair & column : columns) + { + auto column_in_part = getColumnFromPart(column); + addStreams(column_in_part, profile_callback_, clock_type_, &mocked_index_granularity_info); + } } catch (...) { @@ -63,9 +66,10 @@ MergeTreePrefetchedReaderCNCH::~MergeTreePrefetchedReaderCNCH() { for (const NameAndTypePair& column : columns) { - auto serialization = data_part->getSerializationForColumn(column); + auto column_in_part = getColumnFromPart(column); + auto serialization = data_part->getSerializationForColumn(column_in_part); serialization->enumerateStreams([&](const ISerialization::SubstreamPath& substream_path) { - String stream_name = ISerialization::getFileNameForStream(column, substream_path); + String stream_name = ISerialization::getFileNameForStream(column_in_part, substream_path); for (const auto& extension : {".bin", ".mrk"}) future_files->releaseSegment(stream_name + extension); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index a407c0c6ea..d1a19600ed 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -47,7 +47,7 @@ MergeTreeReadPool::MergeTreeReadPool( RangesInDataParts && parts_, MergeTreeMetaBase::DeleteBitmapGetter delete_bitmap_getter_, const MergeTreeMetaBase & data_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const SelectQueryInfo & query_info_, const bool check_columns_, const Names & column_names_, @@ -57,7 +57,7 @@ MergeTreeReadPool::MergeTreeReadPool( : backoff_settings{backoff_settings_} , backoff_state{threads_} , data{data_} - , metadata_snapshot{metadata_snapshot_} + , storage_snapshot{storage_snapshot_} , column_names{column_names_} , do_not_steal_tasks{do_not_steal_tasks_} , predict_block_size_bytes{preferred_block_size_bytes_ > 0} @@ -182,7 +182,7 @@ MergeTreeReadTaskPtr MergeTreeReadPool::getTask(const size_t min_marks_to_read, Block MergeTreeReadPool::getHeader() const { - return metadata_snapshot->getSampleBlockForColumns(column_names, data.getVirtuals(), data.getStorageID()); + return storage_snapshot->getSampleBlockForColumns(column_names); } void MergeTreeReadPool::profileFeedback(const ReadBufferFromFileBase::ProfileInfo info) @@ -229,7 +229,7 @@ 
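// The Object case added to the partition-hash visitor above tags the Field type,
// mixes in the entry count, then hashes every key and recurses into each value.
// Assuming Object is the usual sorted map of String to Field, iteration order is
// deterministic, so equal JSON values always produce the same partition hash.
// Annotated restatement of the hunk:
void operator() (const Object & x) const
{
    hash.update(UInt8(Field::Types::Object));  // type tag first, like the other Field alternatives
    hash.update(x.size());
    for (const auto & [key, value] : x)
    {
        hash.update(key);            // key bytes feed the hash
        applyVisitor(*this, value);  // recurse into the nested Field
    }
}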
std::vector MergeTreeReadPool::fillPerPartInfo( const RangesInDataParts & parts, MergeTreeMetaBase::DeleteBitmapGetter delete_bitmap_getter, const MergeTreeIndexContextPtr & index_context, const bool check_columns) { std::vector per_part_sum_marks; - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); for (const auto i : collections::range(0, parts.size())) { @@ -243,7 +243,7 @@ std::vector MergeTreeReadPool::fillPerPartInfo( per_part_sum_marks.push_back(sum_marks); auto task_columns = - getReadTaskColumns(data, metadata_snapshot, part.data_part, column_names, prewhere_info, index_context, check_columns); + getReadTaskColumns(data, storage_snapshot, part.data_part, column_names, prewhere_info, index_context, check_columns); PerPartParams params; const auto & required_column_names = task_columns.columns.getNames(); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index b6dea5f771..dc625c9a6e 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -101,7 +101,7 @@ public: MergeTreeReadPool( const size_t threads_, const size_t sum_marks_, const size_t min_marks_for_concurrent_read_, RangesInDataParts && parts_, MergeTreeMetaBase::DeleteBitmapGetter delete_bitmap_getter, - const MergeTreeMetaBase & data_, const StorageMetadataPtr & metadata_snapshot_, + const MergeTreeMetaBase & data_, const StorageSnapshotPtr & storage_snapshot_, const SelectQueryInfo & query_info_, const bool check_columns_, const Names & column_names_, const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, @@ -126,7 +126,7 @@ private: const RangesInDataParts & parts, const size_t min_marks_for_concurrent_read); const MergeTreeMetaBase & data; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; const Names column_names; bool do_not_steal_tasks; bool predict_block_size_bytes; diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 80b47942c6..953f627ed4 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -170,6 +170,15 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( if (name_and_type->name == "_part_row_number") continue; + if (name_and_type->isSubcolumn()) + { + auto storage_column_from_part = getColumnFromPart( + {name_and_type->getNameInStorage(), name_and_type->getTypeInStorage()}); + + if (!storage_column_from_part.type->tryGetSubcolumnType(name_and_type->getSubcolumnName())) + continue; + } + NameAndTypePair column_from_part = getColumnFromPart(*name_and_type); auto position = compact_part->getColumnPositionWithoutMap(column_from_part.name); diff --git a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp index 94bf04d206..29e31a40fa 100644 --- a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -71,19 +72,6 @@ MergeTreeReaderInMemory::MergeTreeReaderInMemory( } } -static ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & name_and_type) -{ - auto storage_name = name_and_type.getNameInStorage(); - if (!block.has(storage_name)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Not found column '{}' in block", storage_name); - - const auto & column = 
block.getByName(storage_name).column; - if (name_and_type.isSubcolumn()) - return name_and_type.getTypeInStorage()->getSubcolumn(name_and_type.getSubcolumnName(), *column); - - return column; -} - size_t MergeTreeReaderInMemory::readRows(size_t from_mark, size_t current_task_last_mark, size_t from_row, size_t max_rows_to_read, Columns& res_columns) { diff --git a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp index 5cb35b85a7..1168c9f159 100644 --- a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp @@ -35,7 +35,7 @@ namespace ErrorCodes MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const MergeTreeMetaBase::DataPartPtr & owned_data_part_, ImmutableDeleteBitmapPtr delete_bitmap_, UInt64 max_block_size_rows_, @@ -53,8 +53,8 @@ MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( bool quiet) : MergeTreeBaseSelectProcessor{ - metadata_snapshot_->getSampleBlockForColumns(required_columns_, storage_.getVirtuals(), storage_.getStorageID()), - storage_, metadata_snapshot_, query_info_, std::move(actions_settings), max_block_size_rows_, + storage_snapshot_->getSampleBlockForColumns(required_columns_), + storage_, storage_snapshot_, query_info_, std::move(actions_settings), max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, required_columns{std::move(required_columns_)}, @@ -94,7 +94,7 @@ try if (all_mark_ranges.empty()) return true; - task_columns = getReadTaskColumns(storage, metadata_snapshot, data_part, required_columns, prewhere_info, index_context, check_columns); + task_columns = getReadTaskColumns(storage, storage_snapshot, data_part, required_columns, prewhere_info, index_context, check_columns); /// will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & column_names = task_columns.columns.getNames();
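// The static getColumnFromBlock deleted in the MergeTreeReaderInMemory hunk above
// resolved subcolumn reads (e.g. `json.key`) against the stored column; a hedged
// restatement of its behaviour, which presumably now comes from the shared header
// the same hunk adds:
ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & name_and_type)
{
    auto storage_name = name_and_type.getNameInStorage();
    if (!block.has(storage_name))
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Not found column '{}' in block", storage_name);

    const auto & column = block.getByName(storage_name).column;
    if (name_and_type.isSubcolumn())
        return name_and_type.getTypeInStorage()->getSubcolumn(name_and_type.getSubcolumnName(), *column);
    return column;
}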
@@ -106,7 +106,7 @@ try auto size_predictor = (preferred_block_size_bytes == 0) ? nullptr - : std::make_unique(data_part, ordered_names, metadata_snapshot->getSampleBlock()); + : std::make_unique(data_part, ordered_names, storage_snapshot->metadata->getSampleBlock()); task = std::make_unique( data_part, delete_bitmap, mark_ranges_for_task, part_index_in_query, ordered_names, column_name_set, diff --git a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h index 91e220352f..1cdac55457 100644 --- a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h @@ -39,7 +39,7 @@ class MergeTreeReverseSelectProcessor : public MergeTreeBaseSelectProcessor public: MergeTreeReverseSelectProcessor( const MergeTreeMetaBase & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot_, const MergeTreeMetaBase::DataPartPtr & owned_data_part, ImmutableDeleteBitmapPtr delete_bitmap, UInt64 max_block_size_rows, diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index ac8fe0d1a9..a8536618c9 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -35,7 +35,7 @@ namespace ErrorCodes MergeTreeSelectProcessor::MergeTreeSelectProcessor( const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const MergeTreeMetaBase::DataPartPtr & owned_data_part_, ImmutableDeleteBitmapPtr delete_bitmap_, UInt64 max_block_size_rows_, @@ -53,8 +53,8 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( bool quiet) : MergeTreeBaseSelectProcessor{ - metadata_snapshot_->getSampleBlockForColumns(required_columns_, storage_.getVirtuals(), storage_.getStorageID()), - storage_, metadata_snapshot_, query_info_, std::move(actions_settings), max_block_size_rows_, + storage_snapshot_->getSampleBlockForColumns(required_columns_), + storage_, storage_snapshot_, query_info_, std::move(actions_settings), max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, required_columns{std::move(required_columns_)}, @@ -92,12 +92,12 @@ try is_first_task = false; task_columns = getReadTaskColumns( - storage, metadata_snapshot, data_part, + storage, storage_snapshot, data_part, required_columns, prewhere_info, index_context, check_columns); auto size_predictor = (preferred_block_size_bytes == 0) ?
nullptr - : std::make_unique(data_part, ordered_names, metadata_snapshot->getSampleBlock()); + : std::make_unique(data_part, ordered_names, storage_snapshot->metadata->getSampleBlock()); /// will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & column_names = task_columns.columns.getNames(); diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.h b/src/Storages/MergeTree/MergeTreeSelectProcessor.h index a7fb09c213..19b0c494a0 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.h @@ -39,7 +39,7 @@ class MergeTreeSelectProcessor : public MergeTreeBaseSelectProcessor public: MergeTreeSelectProcessor( const MergeTreeMetaBase & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot_, const MergeTreeMetaBase::DataPartPtr & owned_data_part, ImmutableDeleteBitmapPtr delete_bitmap, UInt64 max_block_size_rows, diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index f47782f470..68ee40b3f7 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -25,7 +25,7 @@ #include #include #include -#include "Core/Defines.h" +#include namespace DB { @@ -36,20 +36,29 @@ namespace ErrorCodes MergeTreeSequentialSource::MergeTreeSequentialSource( const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, MergeTreeMetaBase::DataPartPtr data_part_, Names columns_to_read_, bool read_with_direct_io_, bool take_column_types_from_storage, bool quiet, CnchMergePrefetcher::PartFutureFiles* future_files) - : MergeTreeSequentialSource(storage_, metadata_snapshot_, - data_part_, data_part_->getDeleteBitmap(), columns_to_read_, read_with_direct_io_, - take_column_types_from_storage, quiet, future_files) {} + : MergeTreeSequentialSource( + storage_, + storage_snapshot_, + data_part_, + data_part_->getDeleteBitmap(), + columns_to_read_, + read_with_direct_io_, + take_column_types_from_storage, + quiet, + future_files) +{ +} MergeTreeSequentialSource::MergeTreeSequentialSource( const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, MergeTreeMetaBase::DataPartPtr data_part_, ImmutableDeleteBitmapPtr delete_bitmap_, Names columns_to_read_, @@ -58,10 +67,9 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( bool quiet, CnchMergePrefetcher::PartFutureFiles* future_files, BitEngineReadType bitengine_read_type) - : SourceWithProgress(metadata_snapshot_->getSampleBlockForColumns( - columns_to_read_, storage_.getVirtuals(), storage_.getStorageID(), bitengine_read_type)) + : SourceWithProgress(storage_snapshot_->getSampleBlockForColumns(columns_to_read_, {}, bitengine_read_type)) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , data_part(std::move(data_part_)) , delete_bitmap(std::move(delete_bitmap_)) , columns_to_read(std::move(columns_to_read_)) @@ -73,12 +81,14 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( addTotalRowsApprox(data_part->rows_count - num_deletes); /// Add columns because we don't want to read empty blocks - injectRequiredColumns(storage, metadata_snapshot, data_part, columns_to_read, + injectRequiredColumns(storage, storage_snapshot->metadata, data_part, columns_to_read, future_files == nullptr ? 
"" : future_files->getFixedInjectedColumn()); NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) { - columns_for_reader = metadata_snapshot->getColumns().getByNames(ColumnsDescription::AllPhysical, columns_to_read, false); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); + columns_for_reader = storage_snapshot->getColumnsByNames(options, columns_to_read); + if (bitengine_read_type != BitEngineReadType::ONLY_SOURCE) columns_for_reader = columns_for_reader.addTypes(columns_for_reader.getNames(), bitengine_read_type); } @@ -116,14 +126,14 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( reader_settings.convert_nested_to_subcolumns = true; reader = std::make_unique( - data_part, columns_for_reader, metadata_snapshot, nullptr, + data_part, columns_for_reader, storage_snapshot->metadata, nullptr, MarkRanges{MarkRange(0, data_part->getMarksCount())}, reader_settings, future_files ); } else { - reader = data_part->getReader(columns_for_reader, metadata_snapshot, + reader = data_part->getReader(columns_for_reader, storage_snapshot->metadata, MarkRanges{MarkRange(0, data_part->getMarksCount())}, /* uncompressed_cache = */ nullptr, mark_cache.get(), reader_settings, nullptr, {}, {}, internal_progress_callback); } diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.h b/src/Storages/MergeTree/MergeTreeSequentialSource.h index 9c853ab839..18089f7306 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.h +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.h @@ -38,7 +38,7 @@ public: /// the columns you want to read. MergeTreeSequentialSource( const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, MergeTreeMetaBase::DataPartPtr data_part_, Names columns_to_read_, bool read_with_direct_io_, @@ -48,7 +48,7 @@ public: MergeTreeSequentialSource( const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, MergeTreeMetaBase::DataPartPtr data_part_, ImmutableDeleteBitmapPtr delete_bitmap_, Names columns_to_read_, @@ -72,7 +72,7 @@ protected: private: const MergeTreeMetaBase & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; /// Data part will not be removed if the pointer owns it MergeTreeMetaBase::DataPartPtr data_part; diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index a669143e0d..73c51885aa 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -517,7 +517,6 @@ enum StealingCacheMode : UInt64 M(String, cnch_table_uuid, "", "Used for CloudMergeTree to get uuid of Cnch Table for ingestion task, like Kafka", 0) \ \ M(String, remote_storage_type, "hdfs", "Table's storage type[deprcated]", 0) \ - \ /** BitEngine related settings */ \ M(UInt64, bitengine_split_index, 0, "Copatible setting for split BitEngine dict data, no real use", 0) \ M(Float, bitengine_encode_loss_rate, 0.1, "The threshold that BitEngine discard some data and no exception will be thrown when encoding", 0) \ @@ -528,6 +527,10 @@ enum StealingCacheMode : UInt64 M(Bool, enable_hybrid_allocation, false, "Whether or not enable hybrid allocation, default disabled", 0) \ M(UInt64, min_rows_per_vp, 2000000, "Minimum size of a virtual part", 0) \ M(Float, part_to_vw_size_ratio, 0.1, "Part to vw worker size's ration", 0) \ + /** JSON related settings start*/ \ + 
M(UInt64, json_subcolumns_threshold, 1000, "Max number of json sub columns", 0) \ + M(UInt64, json_partial_schema_assemble_batch_size, 100, "Batch size to assemble dynamic object column schema", 0) \ + /** JSON related settings end*/ \ \ /// Settings that should not change after the creation of a table. #define APPLY_FOR_IMMUTABLE_MERGE_TREE_SETTINGS(M) \ diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp index 780bdedbe2..e525c006d4 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp @@ -40,7 +40,7 @@ MergeTreeThreadSelectBlockInputProcessor::MergeTreeThreadSelectBlockInputProcess size_t preferred_block_size_bytes_, size_t preferred_max_column_in_block_size_bytes_, const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const bool use_uncompressed_cache_, const SelectQueryInfo & query_info_, ExpressionActionsSettings actions_settings, @@ -48,7 +48,7 @@ MergeTreeThreadSelectBlockInputProcessor::MergeTreeThreadSelectBlockInputProcess const Names & virt_column_names_) : MergeTreeBaseSelectProcessor{ - pool_->getHeader(), storage_, metadata_snapshot_, query_info_, std::move(actions_settings), max_block_size_rows_, + pool_->getHeader(), storage_, storage_snapshot_, query_info_, std::move(actions_settings), max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, thread{thread_}, diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h index f0165f7e89..0c558ef2be 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h +++ b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h @@ -43,7 +43,7 @@ public: size_t preferred_block_size_bytes_, size_t preferred_max_column_in_block_size_bytes_, const MergeTreeMetaBase & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const bool use_uncompressed_cache_, const SelectQueryInfo & query_info_, ExpressionActionsSettings actions_settings, diff --git a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h index ce4bba1658..87ef663a47 100644 --- a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h +++ b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h @@ -31,6 +31,7 @@ #include #include +#include namespace ErrorCodes { @@ -46,10 +47,24 @@ class StorageFromMergeTreeDataPart final : public shared_ptr_helper; public: String getName() const override { return "FromMergeTreeDataPart"; } + + StorageSnapshotPtr getStorageSnapshot( + const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const override + { + const auto & storage_columns = metadata_snapshot->getColumns(); + if (!hasDynamicSubcolumns(storage_columns)) + return std::make_shared(*this, metadata_snapshot); + + auto object_columns = getConcreteObjectColumns( + parts.begin(), parts.end(), + storage_columns, [](const auto & part) -> const auto & { return part->getColumns(); }); + + return std::make_shared(*this, metadata_snapshot, object_columns); + } Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & 
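// A hedged consumer sketch for the two JSON settings introduced above; the accessor
// spelling is assumed from the usual MergeTreeSettings macro expansion, and `data`
// and `num_subcolumns` are illustrative names only.
const auto merge_tree_settings = data.getSettings();
bool over_threshold = num_subcolumns > merge_tree_settings->json_subcolumns_threshold;  // cap on materialised JSON subcolumns
size_t batch_size = merge_tree_settings->json_partial_schema_assemble_batch_size;       // parts per schema-assembly batch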
storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -69,8 +84,7 @@ public: parts, delete_bitmap_getter, column_names, - metadata_snapshot, - metadata_snapshot, + storage_snapshot, query_info, context, max_block_size, @@ -83,6 +97,8 @@ public: bool supportsPrewhere() const override { return true; } bool supportsIndexForIn() const override { return true; } + + bool supportsDynamicSubcolumns() const override { return true; } bool mayBenefitFromIndexForIn( const ASTPtr & left_in_operand, ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot) const override diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index c682eec6f7..97d021fcd0 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -153,8 +153,7 @@ void genCompactMapChecksums( checksums_data.files[file_name] = checksum_compact_mrk_file(disk, path + map_file_name, file_offset, file_size); } - }, - {}); + }); } } } @@ -207,8 +206,7 @@ void genMapChecksums( [&](const ISerialization::SubstreamPath & substream_path) { String file_name = ISerialization::getFileNameForStream(implicit_column, substream_path) + ".bin"; checksums_data.files[file_name] = checksum_compressed_file(disk, path + file_name); - }, - {}); + }); } } } @@ -312,8 +310,7 @@ IMergeTreeDataPart::Checksums checkDataPart( { String projection_file_name = ISerialization::getFileNameForStream(projection_column, substream_path) + ".bin"; checksums_data.files[projection_file_name] = checksum_compressed_file(disk, projection_path + projection_file_name); - }, - {}); + }); } } @@ -394,7 +391,7 @@ IMergeTreeDataPart::Checksums checkDataPart( { String file_name = ISerialization::getFileNameForStream(column, substream_path) + ".bin"; checksums_data.files[file_name] = checksum_compressed_file(disk, path + file_name); - }, {}); + }); } } else diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp index 90103cc568..c297e1f580 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp @@ -264,7 +264,7 @@ bool StorageMaterializedPostgreSQL::needRewriteQueryWithFinal(const Names & colu Pipe StorageMaterializedPostgreSQL::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context_, QueryProcessingStage::Enum processed_stage, @@ -273,7 +273,7 @@ Pipe StorageMaterializedPostgreSQL::read( { auto materialized_table_lock = lockForShare(String(), context_->getSettingsRef().lock_acquire_timeout); auto nested_table = getNested(); - return readFinalFromNestedStorage(nested_table, column_names, metadata_snapshot, + return readFinalFromNestedStorage(nested_table, column_names, query_info, context_, processed_stage, max_block_size, num_streams); } diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h index becb4f6ba1..26d82a7b1c 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h @@ -86,7 +86,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context_, QueryProcessingStage::Enum 
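// The recurring rewrite in these hunks, shown once in isolation: validation and
// header construction move onto the snapshot, which already knows the table's
// virtual columns and StorageID, so callers stop passing them explicitly.
storage_snapshot->check(column_names);                                    // was metadata_snapshot->check(column_names, getVirtuals(), getStorageID())
Block header = storage_snapshot->getSampleBlockForColumns(column_names);  // was metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID())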
processed_stage, diff --git a/src/Storages/RabbitMQ/RabbitMQBlockInputStream.cpp b/src/Storages/RabbitMQ/RabbitMQBlockInputStream.cpp index 6c3d3a53c2..5657fc4c04 100644 --- a/src/Storages/RabbitMQ/RabbitMQBlockInputStream.cpp +++ b/src/Storages/RabbitMQ/RabbitMQBlockInputStream.cpp @@ -15,22 +15,21 @@ namespace DB RabbitMQBlockInputStream::RabbitMQBlockInputStream( StorageRabbitMQ & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, ContextPtr context_, const Names & columns, size_t max_block_size_, bool ack_in_suffix_) : storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , context(context_) , column_names(columns) , max_block_size(max_block_size_) , ack_in_suffix(ack_in_suffix_) - , non_virtual_header(metadata_snapshot->getSampleBlockNonMaterialized()) + , non_virtual_header(storage_snapshot->metadata->getSampleBlockNonMaterialized()) , sample_block(non_virtual_header) - , virtual_header(metadata_snapshot->getSampleBlockForColumns( - {"_exchange_name", "_channel_id", "_delivery_tag", "_redelivered", "_message_id", "_timestamp"}, - storage.getVirtuals(), storage.getStorageID())) + , virtual_header(storage_snapshot->getSampleBlockForColumns( + {"_exchange_name", "_channel_id", "_delivery_tag", "_redelivered", "_message_id", "_timestamp"})) { for (const auto & column : virtual_header) sample_block.insert(column); diff --git a/src/Storages/RabbitMQ/RabbitMQBlockInputStream.h b/src/Storages/RabbitMQ/RabbitMQBlockInputStream.h index 5ce1c96bf3..e6a7d16572 100644 --- a/src/Storages/RabbitMQ/RabbitMQBlockInputStream.h +++ b/src/Storages/RabbitMQ/RabbitMQBlockInputStream.h @@ -14,7 +14,7 @@ class RabbitMQBlockInputStream : public IBlockInputStream public: RabbitMQBlockInputStream( StorageRabbitMQ & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, ContextPtr context_, const Names & columns, size_t max_block_size_, @@ -37,7 +37,7 @@ public: private: StorageRabbitMQ & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; ContextPtr context; Names column_names; const size_t max_block_size; diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index 159fefe056..4a79dc9486 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -594,7 +594,7 @@ void StorageRabbitMQ::unbindExchange() Pipe StorageRabbitMQ::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /* query_info */, ContextPtr local_context, QueryProcessingStage::Enum /* processed_stage */, @@ -607,7 +607,7 @@ Pipe StorageRabbitMQ::read( if (num_created_consumers == 0) return {}; - auto sample_block = metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + auto sample_block = storage_snapshot->getSampleBlockForColumns(column_names); auto modified_context = addSettings(local_context); auto block_size = getMaxBlockSize(); @@ -627,7 +627,7 @@ Pipe StorageRabbitMQ::read( for (size_t i = 0; i < num_created_consumers; ++i) { auto rabbit_stream = std::make_shared( - *this, metadata_snapshot, modified_context, column_names, block_size); + *this, storage_snapshot, modified_context, column_names, block_size); auto converting_stream = std::make_shared( rabbit_stream, sample_block, ConvertingBlockInputStream::MatchColumnsMode::Name); @@ 
-952,6 +952,7 @@ bool StorageRabbitMQ::streamToViews() auto block_io = interpreter.execute(); auto metadata_snapshot = getInMemoryMetadataPtr(); + auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot, rabbitmq_context); auto column_names = block_io.out->getHeader().getNames(); auto sample_block = metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); @@ -964,7 +965,7 @@ bool StorageRabbitMQ::streamToViews() for (size_t i = 0; i < num_created_consumers; ++i) { auto stream = std::make_shared( - *this, metadata_snapshot, rabbitmq_context, column_names, block_size, false); + *this, storage_snapshot, rabbitmq_context, column_names, block_size, false); streams.emplace_back(stream); // Limit read batch to maximum block size to allow DDL diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.h b/src/Storages/RabbitMQ/StorageRabbitMQ.h index a02270459a..8f341af906 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.h +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.h @@ -40,7 +40,7 @@ public: /// Always return virtual columns in addition to required columns Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/ReadFinalForExternalReplicaStorage.cpp b/src/Storages/ReadFinalForExternalReplicaStorage.cpp index 36a40beca3..f4d34ff13e 100644 --- a/src/Storages/ReadFinalForExternalReplicaStorage.cpp +++ b/src/Storages/ReadFinalForExternalReplicaStorage.cpp @@ -27,7 +27,6 @@ bool needRewriteQueryWithFinalForStorage(const Names & column_names, const Stora Pipe readFinalFromNestedStorage( StoragePtr nested_storage, const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -36,7 +35,7 @@ Pipe readFinalFromNestedStorage( { NameSet column_names_set = NameSet(column_names.begin(), column_names.end()); auto lock = nested_storage->lockForShare(context->getCurrentQueryId(), context->getSettingsRef().lock_acquire_timeout); - const StorageMetadataPtr & nested_metadata = nested_storage->getInMemoryMetadataPtr(); + const auto & nested_metadata = nested_storage->getInMemoryMetadataPtr(); Block nested_header = nested_metadata->getSampleBlock(); ColumnWithTypeAndName & sign_column = nested_header.getByPosition(nested_header.columns() - 2); @@ -55,7 +54,8 @@ Pipe readFinalFromNestedStorage( filter_column_name = expressions->children.back()->getColumnName(); } - Pipe pipe = nested_storage->read(require_columns_name, nested_metadata, query_info, context, processed_stage, max_block_size, num_streams); + auto nested_snapshot = nested_storage->getStorageSnapshot(nested_metadata, context); + Pipe pipe = nested_storage->read(require_columns_name, nested_snapshot, query_info, context, processed_stage, max_block_size, num_streams); pipe.addTableLock(lock); if (!expressions->children.empty() && !pipe.empty()) diff --git a/src/Storages/ReadFinalForExternalReplicaStorage.h b/src/Storages/ReadFinalForExternalReplicaStorage.h index f09a115919..860a66a7ee 100644 --- a/src/Storages/ReadFinalForExternalReplicaStorage.h +++ b/src/Storages/ReadFinalForExternalReplicaStorage.h @@ -18,7 +18,6 @@ bool needRewriteQueryWithFinalForStorage(const Names & column_names, const Stora Pipe readFinalFromNestedStorage( StoragePtr nested_storage, const Names & column_names, - const StorageMetadataPtr & 
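// Where a call site still needs raw table metadata (sample blocks, column
// descriptions, settings changes), it stays reachable through the snapshot, which
// is why signatures such as readFinalFromNestedStorage above can drop their
// StorageMetadataPtr parameter outright. Minimal sketch:
Block full_header = storage_snapshot->metadata->getSampleBlock();
const auto & columns_description = storage_snapshot->metadata->getColumns();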
/*metadata_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/RemoteFile/IStorageCloudFile.cpp b/src/Storages/RemoteFile/IStorageCloudFile.cpp index 9f8a9d8e69..97f5536f49 100644 --- a/src/Storages/RemoteFile/IStorageCloudFile.cpp +++ b/src/Storages/RemoteFile/IStorageCloudFile.cpp @@ -381,7 +381,7 @@ IStorageCloudFile::IStorageCloudFile( Pipe IStorageCloudFile::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr query_context, QueryProcessingStage::Enum processed_stage, @@ -390,14 +390,14 @@ Pipe IStorageCloudFile::read( { LOG_TRACE(log, " CloudFile column_names size = {}", column_names.size()); QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe(QueryPlanOptimizationSettings::fromContext(query_context), BuildQueryPipelineSettings::fromContext(query_context)); } void IStorageCloudFile::read( DB::QueryPlan & query_plan, const DB::Names & column_names, - const DB::StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, DB::SelectQueryInfo & query_info, DB::ContextPtr query_context, QueryProcessingStage::Enum /*processed_stage*/, @@ -410,11 +410,11 @@ void IStorageCloudFile::read( return; Names real_column_names = column_names; - NamesAndTypesList available_real_columns = metadata_snapshot->getColumns().getAllPhysical(); + NamesAndTypesList available_real_columns = storage_snapshot->metadata->getColumns().getAllPhysical(); if (real_column_names.empty()) real_column_names.push_back(ExpressionActions::getSmallestColumn(available_real_columns)); - metadata_snapshot->check(real_column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(real_column_names); auto cloud_file_source = std::make_unique( client, @@ -423,7 +423,7 @@ void IStorageCloudFile::read( getVirtuals(), getStorageID(), query_info, - metadata_snapshot, + storage_snapshot->metadata, query_context, max_block_size, num_streams, diff --git a/src/Storages/RemoteFile/IStorageCloudFile.h b/src/Storages/RemoteFile/IStorageCloudFile.h index a32d5a4e54..443f6ddeeb 100644 --- a/src/Storages/RemoteFile/IStorageCloudFile.h +++ b/src/Storages/RemoteFile/IStorageCloudFile.h @@ -33,7 +33,7 @@ public: virtual Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -43,7 +43,7 @@ public: virtual void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/RemoteFile/IStorageCnchFile.cpp b/src/Storages/RemoteFile/IStorageCnchFile.cpp index 6f021f1a92..31cd68b271 100644 --- a/src/Storages/RemoteFile/IStorageCnchFile.cpp +++ b/src/Storages/RemoteFile/IStorageCnchFile.cpp @@ -108,7 +108,7 @@ IStorageCnchFile::IStorageCnchFile( Pipe IStorageCnchFile::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, 
ContextPtr query_context, QueryProcessingStage::Enum processed_stage, @@ -116,7 +116,7 @@ Pipe IStorageCnchFile::read( unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(query_context), BuildQueryPipelineSettings::fromContext(query_context)); } @@ -124,7 +124,7 @@ Pipe IStorageCnchFile::read( void IStorageCnchFile::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr query_context, QueryProcessingStage::Enum processed_stage, @@ -133,7 +133,7 @@ void IStorageCnchFile::read( { tryUpdateFSClient(query_context); - auto prepare_result = prepareReadContext(column_names, metadata_snapshot, query_info, query_context); + auto prepare_result = prepareReadContext(column_names, storage_snapshot->metadata, query_info, query_context); /// If no parts to read from - execute locally, must make sure that all stages are executed /// because CnchMergeTree is a high order storage @@ -152,7 +152,7 @@ void IStorageCnchFile::read( // todo(jiashuo): table function hasn't supported distributed query if (arguments.is_function_table || settings.resourcesAssignType() == StorageResourcesAssignType::SERVER_LOCAL) { - readByLocal(prepare_result.file_parts, query_plan, column_names, metadata_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); + readByLocal(prepare_result.file_parts, query_plan, column_names, storage_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); return; } @@ -176,6 +176,8 @@ void IStorageCnchFile::read( ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory( header, + {}, + storage_snapshot, processed_stage, StorageID::createEmpty(), /// Don't check whether table exists in cnch-worker scalars, @@ -411,10 +413,10 @@ void IStorageCnchFile::collectResource(const ContextPtr & query_context, const F } QueryProcessingStage::Enum IStorageCnchFile::getQueryProcessingStage( - ContextPtr query_context, QueryProcessingStage::Enum stage, const StorageMetadataPtr & storage_metadata, SelectQueryInfo & query_info) const + ContextPtr query_context, QueryProcessingStage::Enum stage, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const { if (arguments.is_function_table || settings.resourcesAssignType() == StorageResourcesAssignType::SERVER_LOCAL) - return IStorage::getQueryProcessingStage(query_context, stage, storage_metadata, query_info); + return IStorage::getQueryProcessingStage(query_context, stage, storage_snapshot, query_info); const auto & local_settings = query_context->getSettingsRef(); diff --git a/src/Storages/RemoteFile/IStorageCnchFile.h b/src/Storages/RemoteFile/IStorageCnchFile.h index c157410624..a598746a97 100644 --- a/src/Storages/RemoteFile/IStorageCnchFile.h +++ b/src/Storages/RemoteFile/IStorageCnchFile.h @@ -27,7 +27,7 @@ public: virtual Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr query_context, QueryProcessingStage::Enum processed_stage, @@ -37,7 +37,7 @@ public: virtual void read( QueryPlan & query_plan, 
const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr query_context, QueryProcessingStage::Enum processed_stage, @@ -51,7 +51,7 @@ public: FileDataPartsCNCHVector parts, QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr query_context, QueryProcessingStage::Enum processed_stage, @@ -82,8 +82,8 @@ public: virtual void tryUpdateFSClient(const ContextPtr & /*query_context*/) { } QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr query_context, QueryProcessingStage::Enum stage, const StorageMetadataPtr & storage_metadata, SelectQueryInfo & query_info) const override; - + getQueryProcessingStage(ContextPtr query_context, QueryProcessingStage::Enum stage, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const override; + bool supportsOptimizer() const override { return true; } bool supportsDistributedRead() const override { return true; } StorageID prepareTableRead(const Names & output_columns, SelectQueryInfo & query_info, ContextPtr local_context) override; diff --git a/src/Storages/RemoteFile/StorageCnchHDFS.cpp b/src/Storages/RemoteFile/StorageCnchHDFS.cpp index d885978455..ed27c75422 100644 --- a/src/Storages/RemoteFile/StorageCnchHDFS.cpp +++ b/src/Storages/RemoteFile/StorageCnchHDFS.cpp @@ -126,16 +126,24 @@ void StorageCnchHDFS::readByLocal( FileDataPartsCNCHVector parts, QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr query_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { - auto storage = StorageCloudHDFS::create(getContext(), getStorageID(), metadata_snapshot->getColumns(), metadata_snapshot->getConstraints(), file_list, metadata_snapshot->getSettingsChanges(), arguments, settings); + auto storage = StorageCloudHDFS::create( + getContext(), + getStorageID(), + storage_snapshot->metadata->getColumns(), + storage_snapshot->metadata->getConstraints(), + file_list, + storage_snapshot->metadata->getSettingsChanges(), + arguments, + settings); storage->loadDataParts(parts); - storage->read(query_plan, column_names, metadata_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); + storage->read(query_plan, column_names, storage_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); } diff --git a/src/Storages/RemoteFile/StorageCnchHDFS.h b/src/Storages/RemoteFile/StorageCnchHDFS.h index 788935ebcc..f32c7b482c 100644 --- a/src/Storages/RemoteFile/StorageCnchHDFS.h +++ b/src/Storages/RemoteFile/StorageCnchHDFS.h @@ -21,7 +21,7 @@ public: FileDataPartsCNCHVector parts, QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr query_context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/RemoteFile/StorageCnchS3.cpp b/src/Storages/RemoteFile/StorageCnchS3.cpp index 9ca9f154cb..f9bc1f08f7 100644 --- a/src/Storages/RemoteFile/StorageCnchS3.cpp +++ b/src/Storages/RemoteFile/StorageCnchS3.cpp @@ -99,16 +99,16 @@ void StorageCnchS3::readByLocal( FileDataPartsCNCHVector parts, QueryPlan & query_plan, const Names & column_names, 
- const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr query_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { - auto storage = StorageCloudS3::create(query_context, getStorageID(), metadata_snapshot->getColumns(), metadata_snapshot->getConstraints(), file_list, metadata_snapshot->getSettingsChanges(), arguments, settings, config); + auto storage = StorageCloudS3::create(getContext(), getStorageID(), storage_snapshot->metadata->getColumns(), storage_snapshot->metadata->getConstraints(), file_list, storage_snapshot->metadata->getSettingsChanges(), arguments, settings, config); storage->loadDataParts(parts); - return storage->read(query_plan, column_names, metadata_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); + return storage->read(query_plan, column_names, storage_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); } Strings StorageCnchS3::readFileList() diff --git a/src/Storages/RemoteFile/StorageCnchS3.h b/src/Storages/RemoteFile/StorageCnchS3.h index c74f3dbd3c..f1171fa87c 100644 --- a/src/Storages/RemoteFile/StorageCnchS3.h +++ b/src/Storages/RemoteFile/StorageCnchS3.h @@ -23,7 +23,7 @@ public: FileDataPartsCNCHVector parts, QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr query_context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.cpp index 4900e17ad9..7c78686a6f 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.cpp +++ b/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.cpp @@ -17,13 +17,13 @@ namespace ErrorCodes EmbeddedRocksDBBlockInputStream::EmbeddedRocksDBBlockInputStream( StorageEmbeddedRocksDB & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, size_t max_block_size_) : storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , max_block_size(max_block_size_) { - sample_block = metadata_snapshot->getSampleBlock(); + sample_block = storage_snapshot->metadata->getSampleBlock(); primary_key_pos = sample_block.getPositionByName(storage.primary_key); } diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.h b/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.h index ddff1fc2e8..ee1a814fa8 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.h +++ b/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace rocksdb @@ -20,7 +21,7 @@ class EmbeddedRocksDBBlockInputStream : public IBlockInputStream public: EmbeddedRocksDBBlockInputStream( - StorageEmbeddedRocksDB & storage_, const StorageMetadataPtr & metadata_snapshot_, size_t max_block_size_); + StorageEmbeddedRocksDB & storage_, const StorageSnapshotPtr & storage_snapshot_, size_t max_block_size_); String getName() const override { return "EmbeddedRocksDB"; } Block getHeader() const override { return sample_block; } @@ -28,7 +29,7 @@ public: private: StorageEmbeddedRocksDB & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; const size_t max_block_size; Block sample_block; diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp 
b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 70cc173e38..2e7667bef8 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -161,14 +161,14 @@ class EmbeddedRocksDBSource : public SourceWithProgress public: EmbeddedRocksDBSource( const StorageEmbeddedRocksDB & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, FieldVectorPtr keys_, FieldVector::const_iterator begin_, FieldVector::const_iterator end_, const size_t max_block_size_) - : SourceWithProgress(metadata_snapshot_->getSampleBlock()) + : SourceWithProgress(storage_snapshot_->metadata->getSampleBlock()) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , keys(std::move(keys_)) , begin(begin_) , end(end_) @@ -192,7 +192,7 @@ public: std::vector serialized_keys(num_keys); std::vector slices_keys(num_keys); - const auto & sample_block = metadata_snapshot->getSampleBlock(); + const auto & sample_block = storage_snapshot->metadata->getSampleBlock(); const auto & key_column = sample_block.getByName(storage.primary_key); auto columns = sample_block.cloneEmptyColumns(); size_t primary_key_pos = sample_block.getPositionByName(storage.primary_key); @@ -235,7 +235,7 @@ public: private: const StorageEmbeddedRocksDB & storage; - const StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; FieldVectorPtr keys; FieldVector::const_iterator begin; FieldVector::const_iterator end; @@ -285,24 +285,24 @@ void StorageEmbeddedRocksDB::initDb() Pipe StorageEmbeddedRocksDB::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); FieldVectorPtr keys; bool all_scan = false; - auto primary_key_data_type = metadata_snapshot->getSampleBlock().getByName(primary_key).type; + auto primary_key_data_type = storage_snapshot->metadata->getSampleBlock().getByName(primary_key).type; std::tie(keys, all_scan) = getFilterKeys(primary_key, primary_key_data_type, query_info); if (all_scan) { auto reader = std::make_shared( - *this, metadata_snapshot, max_block_size); + *this, storage_snapshot, max_block_size); return Pipe(std::make_shared(reader)); } else @@ -327,7 +327,7 @@ Pipe StorageEmbeddedRocksDB::read( size_t end = num_keys * (thread_idx + 1) / num_threads; pipes.emplace_back(std::make_shared( - *this, metadata_snapshot, keys, keys->begin() + begin, keys->begin() + end, max_block_size)); + *this, storage_snapshot, keys, keys->begin() + begin, keys->begin() + end, max_block_size)); } return Pipe::unitePipes(std::move(pipes)); } diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index aa81bc4d35..ff6ffad3b3 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -27,7 +27,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 84867d6ff1..ebdee7efc0 100644 --- 
a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -155,10 +155,11 @@ StorageBuffer::StorageBuffer( class BufferSource : public SourceWithProgress { public: - BufferSource(const Names & column_names_, StorageBuffer::Buffer & buffer_, const StorageBuffer & storage, const StorageMetadataPtr & metadata_snapshot) + BufferSource(const Names & column_names_, StorageBuffer::Buffer & buffer_, const StorageSnapshotPtr & storage_snapshot) : SourceWithProgress( - metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) - , column_names_and_types(metadata_snapshot->getColumns().getByNames(ColumnsDescription::All, column_names_, true)) + storage_snapshot->getSampleBlockForColumns(column_names_)) + , column_names_and_types(storage_snapshot->getColumnsByNames( + GetColumnsOptions(GetColumnsOptions::All).withSubcolumns(), column_names_)) , buffer(buffer_) {} String getName() const override { return "Buffer"; } @@ -205,7 +206,7 @@ private: QueryProcessingStage::Enum StorageBuffer::getQueryProcessingStage( ContextPtr local_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr &, + const StorageSnapshotPtr &, SelectQueryInfo & query_info) const { if (destination_id) @@ -215,7 +216,8 @@ QueryProcessingStage::Enum StorageBuffer::getQueryProcessingStage( if (destination.get() == this) throw Exception("Destination table is myself. Read will cause infinite loop.", ErrorCodes::INFINITE_LOOP); - return destination->getQueryProcessingStage(local_context, to_stage, destination->getInMemoryMetadataPtr(), query_info); + const auto & destination_metadata = destination->getInMemoryMetadataPtr(); + return destination->getQueryProcessingStage(local_context, to_stage, destination->getStorageSnapshot(destination_metadata, local_context), query_info); } return QueryProcessingStage::FetchColumns; @@ -224,7 +226,7 @@ QueryProcessingStage::Enum StorageBuffer::getQueryProcessingStage( Pipe StorageBuffer::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -232,7 +234,7 @@ Pipe StorageBuffer::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); @@ -241,13 +243,15 @@ Pipe StorageBuffer::read( void StorageBuffer::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { + const auto & metadata_snapshot = storage_snapshot->metadata; + if (destination_id) { auto destination = DatabaseCatalog::instance().getTable(destination_id, local_context); @@ -258,13 +262,14 @@ void StorageBuffer::read( auto destination_lock = destination->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); auto destination_metadata_snapshot = destination->getInMemoryMetadataPtr(); + auto destination_snapshot = 
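// Column lookups are now composed from fluent GetColumnsOptions, as the Buffer
// source above and the sequential source earlier in this patch do; a hedged recap
// of the two combinations the patch uses:
auto with_subcolumns = GetColumnsOptions(GetColumnsOptions::All).withSubcolumns();                 // BufferSource: everything, incl. subcolumns
auto physical_extended = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects();  // sequential read: physical columns, Object types extended
auto columns = storage_snapshot->getColumnsByNames(with_subcolumns, column_names);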
destination->getStorageSnapshot(destination_metadata_snapshot, local_context); const bool dst_has_same_structure = std::all_of(column_names.begin(), column_names.end(), [metadata_snapshot, destination_metadata_snapshot](const String& column_name) { const auto & dest_columns = destination_metadata_snapshot->getColumns(); const auto & our_columns = metadata_snapshot->getColumns(); - auto dest_columm = dest_columns.tryGetColumnOrSubcolumn(ColumnsDescription::AllPhysical, column_name); - return dest_columm && dest_columm->type->equals(*our_columns.getColumnOrSubcolumn(ColumnsDescription::AllPhysical, column_name).type); + auto dest_column = dest_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name); + return dest_column && dest_column->type->equals(*our_columns.getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name).type); }); if (dst_has_same_structure @@ -274,7 +279,7 @@ void StorageBuffer::read( /// The destination table has the same structure of the requested columns and we can simply read blocks from there. destination->read( - query_plan, column_names, destination_metadata_snapshot, query_info, + query_plan, column_names, destination_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); } else @@ -309,7 +314,7 @@ void StorageBuffer::read( else { destination->read( - query_plan, columns_intersection, destination_metadata_snapshot, query_info, + query_plan, columns_intersection, destination_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); if (query_plan.isInitialized()) @@ -366,7 +371,7 @@ void StorageBuffer::read( Pipes pipes_from_buffers; pipes_from_buffers.reserve(num_shards); for (auto & buf : buffers) - pipes_from_buffers.emplace_back(std::make_shared(column_names, buf, *this, metadata_snapshot)); + pipes_from_buffers.emplace_back(std::make_shared(column_names, buf, storage_snapshot)); pipe_from_buffers = Pipe::unitePipes(std::move(pipes_from_buffers)); } diff --git a/src/Storages/StorageBuffer.h b/src/Storages/StorageBuffer.h index d0cf98425f..22c89db22d 100644 --- a/src/Storages/StorageBuffer.h +++ b/src/Storages/StorageBuffer.h @@ -59,11 +59,11 @@ public: std::string getName() const override { return "Buffer"; } QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -73,7 +73,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageCloudMergeTree.cpp b/src/Storages/StorageCloudMergeTree.cpp index 22fa5df58e..1390a83568 100644 --- a/src/Storages/StorageCloudMergeTree.cpp +++ b/src/Storages/StorageCloudMergeTree.cpp @@ -108,7 +108,7 @@ StorageCloudMergeTree::~StorageCloudMergeTree() void StorageCloudMergeTree::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info,
ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -116,13 +116,13 @@ void StorageCloudMergeTree::read( unsigned num_streams) { if (auto plan = MergeTreeDataSelectExecutor(*this).read( - column_names, metadata_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage)) + column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage)) query_plan = std::move(*plan); } Pipe StorageCloudMergeTree::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -130,7 +130,7 @@ Pipe StorageCloudMergeTree::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); } @@ -362,12 +362,12 @@ ASTs StorageCloudMergeTree::convertBucketNumbersToAstLiterals(const ASTPtr where QueryProcessingStage::Enum StorageCloudMergeTree::getQueryProcessingStage( ContextPtr query_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const { if (to_stage >= QueryProcessingStage::Enum::WithMergeableState) { - if (getQueryProcessingStageWithAggregateProjection(query_context, metadata_snapshot, query_info)) + if (getQueryProcessingStageWithAggregateProjection(query_context, storage_snapshot, query_info)) { if (query_info.projection->desc->type == ProjectionDescription::Type::Aggregate) return QueryProcessingStage::Enum::WithMergeableState; @@ -443,8 +443,9 @@ static void selectBestProjection( } bool StorageCloudMergeTree::getQueryProcessingStageWithAggregateProjection( - ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot, SelectQueryInfo & query_info) const + ContextPtr query_context, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const { + const auto & metadata_snapshot = storage_snapshot->metadata; const auto & settings = query_context->getSettingsRef(); if (!settings.allow_experimental_projection_optimization || query_info.ignore_projections || query_info.is_projection_query) return false; diff --git a/src/Storages/StorageCloudMergeTree.h b/src/Storages/StorageCloudMergeTree.h index 0699dc34a3..8f37785652 100644 --- a/src/Storages/StorageCloudMergeTree.h +++ b/src/Storages/StorageCloudMergeTree.h @@ -60,7 +60,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -70,7 +70,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -100,8 +100,10 @@ public: CloudMergeTreeDedupWorker * tryGetDedupWorker() { return dedup_worker.get(); } CloudMergeTreeDedupWorker * getDedupWorker(); - QueryProcessingStage::Enum 
getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; - bool getQueryProcessingStageWithAggregateProjection(ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot, SelectQueryInfo & query_info) const; + QueryProcessingStage::Enum getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; + bool getQueryProcessingStageWithAggregateProjection(ContextPtr query_context, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const; + + void resetObjectColumns(const ColumnsDescription & object_columns_) { object_columns = object_columns_; } protected: MutationCommands getFirstAlterMutationCommandsForPart(const DataPartPtr & part) const override; diff --git a/src/Storages/StorageCnchMergeTree.cpp b/src/Storages/StorageCnchMergeTree.cpp index 780d369356..cee8755ba4 100644 --- a/src/Storages/StorageCnchMergeTree.cpp +++ b/src/Storages/StorageCnchMergeTree.cpp @@ -86,6 +86,9 @@ #include #include #include +#include +#include +#include namespace ProfileEvents @@ -217,7 +220,7 @@ void StorageCnchMergeTree::loadMutations() } QueryProcessingStage::Enum StorageCnchMergeTree::getQueryProcessingStage( - ContextPtr local_context, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const + ContextPtr local_context, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const { const auto & settings = local_context->getSettingsRef(); if (auto worker_group = local_context->tryGetCurrentWorkerGroup()) @@ -242,7 +245,7 @@ void StorageCnchMergeTree::shutdown() Pipe StorageCnchMergeTree::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -250,7 +253,7 @@ Pipe StorageCnchMergeTree::read( unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); } @@ -258,14 +261,14 @@ Pipe StorageCnchMergeTree::read( void StorageCnchMergeTree::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - auto prepare_result = prepareReadContext(column_names, metadata_snapshot, query_info, local_context); + auto prepare_result = prepareReadContext(column_names, storage_snapshot->metadata, query_info, local_context); Block header = InterpreterSelectQuery(query_info.query, local_context, SelectQueryOptions(processed_stage)).getSampleBlock(); auto worker_group = local_context->getCurrentWorkerGroup(); @@ -289,20 +292,24 @@ void StorageCnchMergeTree::read( Pipe pipe(std::make_shared(std::move(fetch_column_header))); /// Stage 2: (partial) aggregation and projection if any auto query = getBasicSelectQuery(query_info.query); - InterpreterSelectQuery(query, local_context, std::move(pipe), 
SelectQueryOptions(processed_stage)).buildQueryPlan(query_plan); - return; + // Join queries are not supported here, fall through to distributed execution + if (const auto & select = query->as<ASTSelectQuery>(); select && !select->join()) + { + InterpreterSelectQuery(query, local_context, std::move(pipe), SelectQueryOptions(processed_stage)).buildQueryPlan(query_plan); + return; + } } + auto modified_query_ast = query_info.query->clone(); + const Scalars & scalars = local_context->hasQueryContext() ? local_context->getQueryContext()->getScalars() : Scalars{}; + ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory( + header, {}, storage_snapshot, processed_stage, StorageID{"system", "one"}, scalars, false, local_context->getExternalTables()); + LOG_TRACE(log, "Original query before rewrite: {}", queryToString(query_info.query)); - auto modified_query_ast = rewriteSelectQuery(query_info.query, getDatabaseName(), prepare_result.local_table_name); + modified_query_ast = rewriteSelectQuery(modified_query_ast, getDatabaseName(), prepare_result.local_table_name); LOG_TRACE(log, "After query rewrite: {}", queryToString(modified_query_ast)); - const Scalars & scalars = local_context->hasQueryContext() ? local_context->getQueryContext()->getScalars() : Scalars{}; - - ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory( - header, processed_stage, StorageID{"system", "one"}, scalars, false, local_context->getExternalTables()); - ClusterProxy::executeQuery(query_plan, select_stream_factory, log, modified_query_ast, local_context, worker_group); if (!query_plan.isInitialized()) @@ -316,7 +323,8 @@ PrepareContextResult StorageCnchMergeTree::prepareReadContext( if (local_context->getServerType() == ServerType::cnch_server && txn && txn->isReadOnly()) local_context->getCnchTransactionCoordinator().touchActiveTimestampByTable(getStorageID(), txn); - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + auto storage_snapshot = getStorageSnapshot(metadata_snapshot, local_context); + storage_snapshot->check(column_names); auto worker_group = local_context->getCurrentWorkerGroup(); healthCheckForWorkerGroup(local_context, worker_group); @@ -349,7 +357,8 @@ PrepareContextResult StorageCnchMergeTree::prepareReadContext( String local_table_name = getCloudTableName(local_context); auto bucket_numbers = getRequiredBucketNumbers(query_info, local_context); - collectResource(local_context, parts, local_table_name, bucket_numbers); + + collectResource(local_context, parts, local_table_name, bucket_numbers, storage_snapshot); return {std::move(local_table_name), std::move(parts), {}, {}}; } @@ -1586,7 +1595,10 @@ void StorageCnchMergeTree::collectResource( ContextPtr local_context, ServerDataPartsVector & parts, const String & local_table_name, - const std::set & required_bucket_numbers) + const std::set & required_bucket_numbers, + const StorageSnapshotPtr & storage_snapshot, + WorkerEngineType /*engine_type*/, + bool replicated) { auto cnch_resource = local_context->getCnchServerResource(); auto create_table_query = getCreateQueryForCloudTable(getCreateTableSql(), local_table_name, local_context); @@ -1597,6 +1609,14 @@ void StorageCnchMergeTree::collectResource( // setVirtualPartSize(local_context, parts, worker_group->getReadWorkers().size()); cnch_resource->addDataParts(getStorageUUID(), parts, required_bucket_numbers); + + if (storage_snapshot && !storage_snapshot->object_columns.empty()) + cnch_resource->addDynamicObjectSchema(getStorageUUID(), storage_snapshot->object_columns); + +
if (replicated) + { + cnch_resource->setResourceReplicated(getStorageUUID(), replicated); + } } void StorageCnchMergeTree::sendPreloadTasks(ContextPtr local_context, ServerDataPartsVector parts, bool enable_parts_sync_preload, UInt64 parts_preload_level, UInt64 ts) @@ -3082,9 +3102,10 @@ ServerDataPartsVector StorageCnchMergeTree::selectPartsByPartitionCommand(Contex select->setExpression(ASTSelectQuery::Expression::WHERE, std::move(where)); auto metadata_snapshot = getInMemoryMetadataPtr(); + auto storage_snapshot = getStorageSnapshot(metadata_snapshot, local_context); /// So this step will throws if WHERE expression contains columns not in partition key, and it's a good thing TreeRewriterResult syntax_analyzer_result( - metadata_snapshot->partition_key.sample_block.getNamesAndTypesList(), shared_from_this(), metadata_snapshot, true); + metadata_snapshot->partition_key.sample_block.getNamesAndTypesList(), shared_from_this(), storage_snapshot, true); auto analyzed_result = TreeRewriter(local_context).analyzeSelect(query, std::move(syntax_analyzer_result)); query_info.query = std::move(query); query_info.syntax_analyzer_result = std::move(analyzed_result); @@ -3256,6 +3277,26 @@ void StorageCnchMergeTree::mutate(const MutationCommands & commands, ContextPtr } } +void StorageCnchMergeTree::resetObjectColumns(ContextPtr query_context) +{ + object_columns = object_schemas.assembleSchema(query_context, getInMemoryMetadataPtr()); +} + +void StorageCnchMergeTree::appendObjectPartialSchema(const TxnTimestamp & txn_id, ObjectPartialSchema partial_schema) +{ + object_schemas.appendPartialSchema(txn_id, partial_schema); +} + +void StorageCnchMergeTree::resetObjectSchemas(const ObjectAssembledSchema & assembled_schema, const ObjectPartialSchemas & partial_schemas) +{ + object_schemas.reset(assembled_schema, partial_schemas); +} + +void StorageCnchMergeTree::refreshAssembledSchema(const ObjectAssembledSchema & assembled_schema, std::vector txn_ids) +{ + object_schemas.refreshAssembledSchema(assembled_schema, txn_ids); +} + std::unique_ptr StorageCnchMergeTree::getDefaultSettings() const { return std::make_unique(getContext()->getMergeTreeSettings()); diff --git a/src/Storages/StorageCnchMergeTree.h b/src/Storages/StorageCnchMergeTree.h index 33db29e112..86aaa39680 100644 --- a/src/Storages/StorageCnchMergeTree.h +++ b/src/Storages/StorageCnchMergeTree.h @@ -21,7 +21,9 @@ #include #include #include -#include "Catalog/DataModelPartWrapper_fwd.h" +#include +#include +#include namespace DB { @@ -44,6 +46,7 @@ public: bool supportsPrewhere() const override { return true; } bool supportsIndexForIn() const override { return true; } bool supportsMapImplicitColumn() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } bool supportsTrivialCount() const override { return true; } /// Whether support DELETE FROM. We only support for Unique MergeTree for now. 
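// ----------------------------------------------------------------------------
// Editor's note: every read-path change in this patch applies one mechanical
// migration: callers stop passing a bare StorageMetadataPtr and instead build
// a StorageSnapshot, which bundles the metadata with per-query state (dynamic
// Object columns, pinned data, virtual columns). A minimal sketch of the new
// calling convention, assuming a hypothetical StoragePtr named `table` that is
// not part of this patch:
//
//     auto metadata_snapshot = table->getInMemoryMetadataPtr();
//     auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot, local_context);
//
//     /// was: metadata_snapshot->check(column_names, getVirtuals(), getStorageID());
//     storage_snapshot->check(column_names);
//
//     /// was: metadata_snapshot->getSampleBlockForColumns(column_names, ...);
//     Block header = storage_snapshot->getSampleBlockForColumns(column_names);
//
//     table->read(query_plan, column_names, storage_snapshot, query_info,
//                 local_context, processed_stage, max_block_size, num_streams);
// ----------------------------------------------------------------------------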
@@ -58,14 +61,14 @@ public: bool isRemote() const override { return true; } QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; void startup() override; void shutdown() override; Pipe read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*local_context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -75,7 +78,7 @@ public: void read( QueryPlan & query_plan, const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*local_context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -208,6 +211,11 @@ public: PrunedPartitions getPrunedPartitions(const SelectQueryInfo & query_info, const Names & column_names_to_return, ContextPtr local_context) const ; + void resetObjectColumns(ContextPtr query_context); + + void appendObjectPartialSchema(const TxnTimestamp & txn_id, ObjectPartialSchema partial_schema); + void resetObjectSchemas(const ObjectAssembledSchema & assembled_schema, const ObjectPartialSchemas & partial_schemas); + void refreshAssembledSchema(const ObjectAssembledSchema & assembled_schema, std::vector txn_ids); void checkColumnsValidity(const ColumnsDescription & columns, const ASTPtr & new_settings = nullptr) const override; /// parse bucket number set from where clause, only works for single-key cluster by @@ -231,6 +239,11 @@ private: // Relative path to auxility storage disk root String relative_auxility_storage_path; + /// Current description of columns of data type Object. + /// It changes only when set of parts is changed and is + /// protected by @data_parts_mutex. + ObjectSchemas object_schemas; + CheckResults checkDataCommon(const ASTPtr & query, ContextPtr local_context, ServerDataPartsVector & parts) const; /** @@ -262,7 +275,10 @@ private: ContextPtr local_context, ServerDataPartsVector & parts, const String & local_table_name, - const std::set & required_bucket_numbers = {}); + const std::set & required_bucket_numbers = {}, + const StorageSnapshotPtr & storage_snapshot = nullptr, + WorkerEngineType engine_type = WorkerEngineType::CLOUD, + bool replicated = false); /// NOTE: No need to implement this for CnchMergeTree as data processing is on CloudMergeTree. 
MutationCommands getFirstAlterMutationCommandsForPart(const DataPartPtr &) const override { return {}; } diff --git a/src/Storages/StorageDictionary.cpp b/src/Storages/StorageDictionary.cpp index f4f9206622..6e02686429 100644 --- a/src/Storages/StorageDictionary.cpp +++ b/src/Storages/StorageDictionary.cpp @@ -184,7 +184,7 @@ void StorageDictionary::checkTableCanBeDetached() const Pipe StorageDictionary::read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr local_context, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/StorageDictionary.h b/src/Storages/StorageDictionary.h index a8b57f94b1..70a7c4f60d 100644 --- a/src/Storages/StorageDictionary.h +++ b/src/Storages/StorageDictionary.h @@ -47,7 +47,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 29407dc82a..972af1c5e5 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include @@ -74,6 +76,7 @@ #include #include #include +#include #include #include @@ -305,9 +308,9 @@ void replaceConstantExpressions( ContextPtr context, const NamesAndTypesList & columns, ConstStoragePtr storage, - const StorageMetadataPtr & metadata_snapshot) + const StorageSnapshotPtr & storage_snapshot) { - auto syntax_result = TreeRewriter(context).analyze(node, columns, storage, metadata_snapshot); + auto syntax_result = TreeRewriter(context).analyze(node, columns, storage, storage_snapshot); Block block_with_constants = KeyCondition::getBlockWithConstants(node, syntax_result, context); InDepthNodeVisitor visitor(block_with_constants); @@ -394,6 +397,30 @@ std::optional getOptimizedQueryProcessingStage(const return QueryProcessingStage::Complete; } +static bool requiresObjectColumns(const ColumnsDescription & all_columns, ASTPtr query) +{ + if (!hasDynamicSubcolumns(all_columns)) + return false; + + if (!query) + return true; + + RequiredSourceColumnsVisitor::Data columns_context; + RequiredSourceColumnsVisitor(columns_context).visit(query); + + auto required_columns = columns_context.requiredColumns(); + for (const auto & required_column : required_columns) + { + auto name_in_storage = Nested::splitName(required_column).first; + auto column_in_storage = all_columns.tryGetPhysical(name_in_storage); + + if (column_in_storage && column_in_storage->type->hasDynamicSubcolumns()) + return true; + } + + return false; +} + size_t getClusterQueriedNodes(const Settings & settings, const ClusterPtr & cluster) { size_t num_local_shards = cluster->getLocalShardCount(); @@ -531,7 +558,7 @@ StorageDistributed::StorageDistributed( QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage( ContextPtr local_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const { const auto & settings = local_context->getSettingsRef(); @@ -543,7 +570,7 @@ QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage( /// (Anyway it will be calculated in the read()) if (getClusterQueriedNodes(settings, cluster) 
> 1 && settings.optimize_skip_unused_shards) { - ClusterPtr optimized_cluster = getOptimizedCluster(local_context, metadata_snapshot, query_info.query); + ClusterPtr optimized_cluster = getOptimizedCluster(local_context, storage_snapshot, query_info.query); if (optimized_cluster) { LOG_DEBUG(log, "Skipping irrelevant shards - the query will be sent to the following shards of the cluster (shard numbers): {}", @@ -606,7 +633,7 @@ QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage( Pipe StorageDistributed::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -614,7 +641,7 @@ Pipe StorageDistributed::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); @@ -623,7 +650,7 @@ Pipe StorageDistributed::read( void StorageDistributed::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -650,19 +677,29 @@ void StorageDistributed::read( const Scalars & scalars = local_context->hasQueryContext() ? local_context->getQueryContext()->getScalars() : Scalars{}; bool has_virtual_shard_num_column = std::find(column_names.begin(), column_names.end(), "_shard_num") != column_names.end(); - if (has_virtual_shard_num_column && !isVirtualColumn("_shard_num", metadata_snapshot)) + if (has_virtual_shard_num_column && !isVirtualColumn("_shard_num", storage_snapshot->metadata)) has_virtual_shard_num_column = false; + + const auto & snapshot_data = assert_cast(*storage_snapshot->data); - ClusterProxy::SelectStreamFactory select_stream_factory = remote_table_function_ptr - ? ClusterProxy::SelectStreamFactory( - header, processed_stage, remote_table_function_ptr, scalars, has_virtual_shard_num_column, local_context->getExternalTables()) - : ClusterProxy::SelectStreamFactory( - header, - processed_stage, - StorageID{remote_database, remote_table}, - scalars, - has_virtual_shard_num_column, - local_context->getExternalTables()); + ClusterProxy::SelectStreamFactory select_stream_factory = remote_table_function_ptr ? 
ClusterProxy::SelectStreamFactory( + header, + snapshot_data.objects_by_shard, + storage_snapshot, + processed_stage, + remote_table_function_ptr, + scalars, + has_virtual_shard_num_column, + local_context->getExternalTables()) + : ClusterProxy::SelectStreamFactory( + header, + snapshot_data.objects_by_shard, + storage_snapshot, + processed_stage, + StorageID{remote_database, remote_table}, + scalars, + has_virtual_shard_num_column, + local_context->getExternalTables()); ClusterProxy::executeQuery(query_plan, select_stream_factory, log, modified_query_ast, local_context, query_info, @@ -1057,7 +1094,7 @@ ClusterPtr StorageDistributed::getCluster() const } ClusterPtr StorageDistributed::getOptimizedCluster( - ContextPtr local_context, const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query_ptr) const + ContextPtr local_context, const StorageSnapshotPtr & storage_snapshot, const ASTPtr & query_ptr) const { ClusterPtr cluster = getCluster(); const Settings & settings = local_context->getSettingsRef(); @@ -1066,7 +1103,7 @@ ClusterPtr StorageDistributed::getOptimizedCluster( if (has_sharding_key && sharding_key_is_usable) { - ClusterPtr optimized = skipUnusedShards(cluster, query_ptr, metadata_snapshot, local_context); + ClusterPtr optimized = skipUnusedShards(cluster, query_ptr, storage_snapshot, local_context); if (optimized) return optimized; } @@ -1122,7 +1159,7 @@ IColumn::Selector StorageDistributed::createSelector(const ClusterPtr cluster, c ClusterPtr StorageDistributed::skipUnusedShards( ClusterPtr cluster, const ASTPtr & query_ptr, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, ContextPtr local_context) const { const auto & select = query_ptr->as<ASTSelectQuery &>(); @@ -1142,7 +1179,7 @@ ClusterPtr StorageDistributed::skipUnusedShards( condition_ast = select.prewhere() ? select.prewhere()->clone() : select.where()->clone(); } - replaceConstantExpressions(condition_ast, local_context, metadata_snapshot->getColumns().getAll(), shared_from_this(), metadata_snapshot); + replaceConstantExpressions(condition_ast, local_context, storage_snapshot->metadata->getColumns().getAll(), shared_from_this(), storage_snapshot); size_t limit = local_context->getSettingsRef().optimize_skip_unused_shards_limit; if (!limit || limit > SSIZE_MAX) @@ -1313,6 +1350,35 @@ void StorageDistributed::delayInsertOrThrowIfNeeded() const } } +StorageSnapshotPtr StorageDistributed::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const +{ + return getStorageSnapshotForQuery(metadata_snapshot, nullptr, query_context); +} + +StorageSnapshotPtr StorageDistributed::getStorageSnapshotForQuery( + const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query, ContextPtr /*query_context*/) const +{ + /// If the query doesn't use columns of type Object, don't deduce + /// concrete types for them, because that requires an extra round trip.
+ auto snapshot_data = std::make_unique(); + if (!requiresObjectColumns(metadata_snapshot->getColumns(), query)) + return std::make_shared(*this, metadata_snapshot, ColumnsDescription{}, std::move(snapshot_data)); + + snapshot_data->objects_by_shard = getExtendedObjectsOfRemoteTables( + *getCluster(), + StorageID{remote_database, remote_table}, + metadata_snapshot->getColumns(), + getContext()); + + auto object_columns = DB::getConcreteObjectColumns( + snapshot_data->objects_by_shard.begin(), + snapshot_data->objects_by_shard.end(), + metadata_snapshot->getColumns(), + [](const auto & shard_num_and_columns) -> const auto & { return shard_num_and_columns.second; }); + + return std::make_shared(*this, metadata_snapshot, object_columns, std::move(snapshot_data)); +} + void registerStorageDistributed(StorageFactory & factory) { factory.registerStorage("Distributed", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 255ce2efce..88692619ee 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -80,17 +81,30 @@ public: bool supportsFinal() const override { return true; } bool supportsPrewhere() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } StoragePolicyPtr getStoragePolicy(IStorage::StorageLocation location) const override; bool isRemote() const override { return true; } bool supportsMapImplicitColumn() const override { return true; } + /// Snapshot for StorageDistributed contains descriptions + /// of columns of type Object for each shard at the moment + /// of the start of query. 
+ struct SnapshotData : public StorageSnapshot::Data + { + ColumnsDescriptionByShardNum objects_by_shard; + }; + + StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const override; + StorageSnapshotPtr getStorageSnapshotForQuery( + const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query, ContextPtr /*query_context*/) const override; + QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -100,7 +114,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -207,9 +221,9 @@ private: /// Apply the following settings: /// - optimize_skip_unused_shards /// - force_optimize_skip_unused_shards - ClusterPtr getOptimizedCluster(ContextPtr, const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query_ptr) const; + ClusterPtr getOptimizedCluster(ContextPtr, const StorageSnapshotPtr & storage_snapshot, const ASTPtr & query_ptr) const; ClusterPtr - skipUnusedShards(ClusterPtr cluster, const ASTPtr & query_ptr, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) const; + skipUnusedShards(ClusterPtr cluster, const ASTPtr & query_ptr, const StorageSnapshotPtr & storage_snapshot, ContextPtr context) const; size_t getRandomShardIndex(const Cluster::ShardsInfo & shards); diff --git a/src/Storages/StorageExternalDistributed.cpp b/src/Storages/StorageExternalDistributed.cpp index f20e49fe23..f4a7b39417 100644 --- a/src/Storages/StorageExternalDistributed.cpp +++ b/src/Storages/StorageExternalDistributed.cpp @@ -177,7 +177,7 @@ StorageExternalDistributed::StorageExternalDistributed( Pipe StorageExternalDistributed::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -189,7 +189,7 @@ Pipe StorageExternalDistributed::read( { pipes.emplace_back(shard->read( column_names, - metadata_snapshot, + storage_snapshot, query_info, context, processed_stage, diff --git a/src/Storages/StorageExternalDistributed.h b/src/Storages/StorageExternalDistributed.h index c85276c09d..1e0947d43b 100644 --- a/src/Storages/StorageExternalDistributed.h +++ b/src/Storages/StorageExternalDistributed.h @@ -32,7 +32,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index bcaa7cce18..a26c2edb90 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -293,26 +293,26 @@ public: static Block getBlockForSource( const StorageFilePtr & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & 
storage_snapshot, const ColumnsDescription & columns_description, const FilesInfoPtr & files_info) { if (storage->isColumnOriented()) - return metadata_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical(), storage->getVirtuals(), storage->getStorageID()); + return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); else - return getHeader(metadata_snapshot, files_info->need_path_column, files_info->need_file_column); + return getHeader(storage_snapshot->metadata, files_info->need_path_column, files_info->need_file_column); } StorageFileSource( std::shared_ptr storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, ContextPtr context_, UInt64 max_block_size_, FilesInfoPtr files_info_, ColumnsDescription columns_description_) - : SourceWithProgress(getBlockForSource(storage_, metadata_snapshot_, columns_description_, files_info_)) + : SourceWithProgress(getBlockForSource(storage_, storage_snapshot_, columns_description_, files_info_)) , storage(std::move(storage_)) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , files_info(std::move(files_info_)) , columns_description(std::move(columns_description_)) , context(context_) @@ -403,8 +403,8 @@ public: auto get_block_for_format = [&]() -> Block { if (storage->isColumnOriented()) - return metadata_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); - return metadata_snapshot->getSampleBlock(); + return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); + return storage_snapshot->metadata->getSampleBlock(); }; auto format = FormatFactory::instance().getInput( @@ -465,7 +465,7 @@ public: private: std::shared_ptr storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; FilesInfoPtr files_info; String current_path; Block sample_block; @@ -486,7 +486,7 @@ private: Pipe StorageFile::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -501,7 +501,7 @@ Pipe StorageFile::read( if (paths.size() == 1 && !fs::exists(paths[0])) { if (context->getSettingsRef().engine_file_empty_if_not_exists) - return Pipe(std::make_shared(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()))); + return Pipe(std::make_shared(storage_snapshot->getSampleBlockForColumns(column_names))); else throw Exception("File " + paths[0] + " doesn't exist", ErrorCodes::FILE_DOESNT_EXIST); } @@ -537,12 +537,12 @@ Pipe StorageFile::read( { if (isColumnOriented()) return ColumnsDescription{ - metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()).getNamesAndTypesList()}; + storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()}; else - return metadata_snapshot->getColumns(); + return storage_snapshot->metadata->getColumns(); }; pipes.emplace_back(std::make_shared( - this_ptr, metadata_snapshot, context, max_block_size, files_info, get_columns_for_format())); + this_ptr, storage_snapshot, context, max_block_size, files_info, get_columns_for_format())); } return Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 843cd40582..cbd4169562 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -22,7 +22,7 @@ public: 
Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageGenerateRandom.cpp b/src/Storages/StorageGenerateRandom.cpp index ba19b7c525..d807ad1725 100644 --- a/src/Storages/StorageGenerateRandom.cpp +++ b/src/Storages/StorageGenerateRandom.cpp @@ -510,19 +510,19 @@ void registerStorageGenerateRandom(StorageFactory & factory) Pipe StorageGenerateRandom::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); Pipes pipes; pipes.reserve(num_streams); - const ColumnsDescription & our_columns = metadata_snapshot->getColumns(); + const ColumnsDescription & our_columns = storage_snapshot->metadata->getColumns(); Block block_header; for (const auto & name : column_names) { diff --git a/src/Storages/StorageGenerateRandom.h b/src/Storages/StorageGenerateRandom.h index 61fd68cb80..390b2cef46 100644 --- a/src/Storages/StorageGenerateRandom.h +++ b/src/Storages/StorageGenerateRandom.h @@ -17,7 +17,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index dc0863dee6..1935fce2f2 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include #include @@ -449,7 +451,7 @@ Block StorageInMemoryMetadata::getSampleBlockForColumns( for (const auto & name : column_names) { - auto column = getColumns().tryGetColumnOrSubcolumn(ColumnsDescription::All, name); + auto column = getColumns().tryGetColumnOrSubcolumn(GetColumnsOptions::All, name); if (column) { auto column_name = column->name; @@ -710,18 +712,6 @@ namespace using UniqueStrings = google::sparsehash::dense_hash_set; #endif - String listOfColumns(const NamesAndTypesList & available_columns) - { - WriteBufferFromOwnString ss; - for (auto it = available_columns.begin(); it != available_columns.end(); ++it) - { - if (it != available_columns.begin()) - ss << ", "; - ss << it->name; - } - return ss.str(); - } - NamesAndTypesMap getColumnsMap(const NamesAndTypesList & columns) { NamesAndTypesMap res; @@ -739,6 +729,35 @@ namespace strings.set_empty_key(StringRef()); return strings; } + + /* + * This function checks compatibility of enums. It returns true if: + * 1. Both types are enums. + * 2. The first type can represent all possible values of the second one. + * 3. Both types require the same amount of memory. 
+ */ + bool isCompatibleEnumTypes(const IDataType * lhs, const IDataType * rhs) + { + if (IDataTypeEnum const * enum_type = dynamic_cast<IDataTypeEnum const *>(lhs)) + { + if (!enum_type->contains(*rhs)) + return false; + return enum_type->getMaximumSizeOfValueInMemory() == rhs->getMaximumSizeOfValueInMemory(); + } + return false; + } +} + +String listOfColumns(const NamesAndTypesList & available_columns) +{ + WriteBufferFromOwnString ss; + for (auto it = available_columns.begin(); it != available_columns.end(); ++it) + { + if (it != available_columns.begin()) + ss << ", "; + ss << it->name; + } + return ss.str(); } void StorageInMemoryMetadata::check(const Names & column_names, const NamesAndTypesList & virtuals, const StorageID & storage_id) const @@ -762,7 +781,7 @@ void StorageInMemoryMetadata::check(const Names & column_names, const NamesAndTy if (func_columns.contains(name)) continue; - bool has_column = getColumns().hasColumnOrSubcolumn(ColumnsDescription::AllPhysical, name) || virtuals_map.count(name); + bool has_column = getColumns().hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, name) || virtuals_map.count(name); if (!has_column) { @@ -801,7 +820,9 @@ void StorageInMemoryMetadata::check(const NamesAndTypesList & provided_columns) "There is no column with name " + column.name + ". There are columns: " + listOfColumns(available_columns), ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); - if (!column.type->equals(*it->second)) + if (!it->second->hasDynamicSubcolumns() + && !column.type->equals(*it->second) + && !isCompatibleEnumTypes(it->second, column.type.get())) throw Exception( "Type mismatch for column " + column.name + ". Column has type " + it->second->getName() + ", got type " + column.type->getName(), @@ -844,7 +865,9 @@ void StorageInMemoryMetadata::check(const NamesAndTypesList & provided_columns, "There is no column with name " + name + ". There are columns: " + listOfColumns(available_columns), ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); - if (!it->second->equals(*jt->second)) + if (!it->second->hasDynamicSubcolumns() + && !it->second->equals(*jt->second) + && !isCompatibleEnumTypes(jt->second, it->second)) throw Exception( "Type mismatch for column " + name + ". Column has type " + jt->second->getName() + ", got type " + it->second->getName(), ErrorCodes::TYPE_MISMATCH); @@ -885,7 +908,9 @@ void StorageInMemoryMetadata::check(const Block & block, bool need_all) const "There is no column with name " + column.name + ". There are columns: " + listOfColumns(available_columns), ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); - if (!column.type->equals(*it->second)) + if (!it->second->hasDynamicSubcolumns() + && !column.type->equals(*it->second) + && !isCompatibleEnumTypes(it->second, column.type.get())) throw Exception( "Type mismatch for column " + column.name + ". 
Column has type " + it->second->getName() + ", got type " + column.type->getName(), diff --git a/src/Storages/StorageInMemoryMetadata.h b/src/Storages/StorageInMemoryMetadata.h index 6c1aee9961..028fb4f999 100644 --- a/src/Storages/StorageInMemoryMetadata.h +++ b/src/Storages/StorageInMemoryMetadata.h @@ -353,4 +353,5 @@ struct StorageInMemoryMetadata using StorageMetadataPtr = std::shared_ptr; using MultiVersionStorageMetadataPtr = MultiVersion; +String listOfColumns(const NamesAndTypesList & available_columns); } diff --git a/src/Storages/StorageInput.cpp b/src/Storages/StorageInput.cpp index fdde5a119a..dfcad4ea04 100644 --- a/src/Storages/StorageInput.cpp +++ b/src/Storages/StorageInput.cpp @@ -75,7 +75,7 @@ void StorageInput::setPipe(Pipe pipe_) Pipe StorageInput::read( const Names & /*column_names*/, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -89,7 +89,7 @@ Pipe StorageInput::read( { /// Send structure to the client. query_context->initializeInput(shared_from_this()); - return Pipe(std::make_shared(query_context, metadata_snapshot->getSampleBlock())); + return Pipe(std::make_shared(query_context, storage_snapshot->metadata->getSampleBlock())); } if (pipe.empty()) diff --git a/src/Storages/StorageInput.h b/src/Storages/StorageInput.h index 53a6809d0f..32fc43520a 100644 --- a/src/Storages/StorageInput.h +++ b/src/Storages/StorageInput.h @@ -41,7 +41,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index 932791bebd..584457aebc 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -567,16 +567,16 @@ private: // TODO: multiple stream read and index read Pipe StorageJoin::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); - Block source_sample_block = metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + Block source_sample_block = storage_snapshot->getSampleBlockForColumns(column_names); return Pipe(std::make_shared(join, rwlock, max_block_size, source_sample_block)); } diff --git a/src/Storages/StorageJoin.h b/src/Storages/StorageJoin.h index d2a88bcfa4..76b9099339 100644 --- a/src/Storages/StorageJoin.h +++ b/src/Storages/StorageJoin.h @@ -49,7 +49,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index b0e386a54c..3e742304ed 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -429,7 +429,7 @@ void LogBlockOutputStream::writeData(const NameAndTypePair & name_and_type, cons settings.getter = createStreamGetter(name_and_type, written_streams); if (serialize_states.count(name) == 
0) - serialization->serializeBinaryBulkStatePrefix(settings, serialize_states[name]); + serialization->serializeBinaryBulkStatePrefix(column, settings, serialize_states[name]); serialization->enumerateStreams([&] (const ISerialization::SubstreamPath & path) { @@ -668,19 +668,20 @@ static std::chrono::seconds getLockTimeout(ContextPtr context) Pipe StorageLog::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); auto lock_timeout = getLockTimeout(context); loadMarks(lock_timeout); - auto all_columns = metadata_snapshot->getColumns().getByNames(ColumnsDescription::All, column_names, true); + auto options = GetColumnsOptions(GetColumnsOptions::All).withSubcolumns(); + auto all_columns = storage_snapshot->getColumnsByNames(options, column_names); all_columns = Nested::convertToSubcolumns(all_columns); std::shared_lock lock(rwlock, lock_timeout); @@ -689,7 +690,7 @@ Pipe StorageLog::read( Pipes pipes; - const Marks & marks = getMarksWithRealRowCount(metadata_snapshot); + const Marks & marks = getMarksWithRealRowCount(storage_snapshot->metadata); size_t marks_size = marks.size(); if (num_streams > marks_size) diff --git a/src/Storages/StorageLog.h b/src/Storages/StorageLog.h index 6fea00edef..87ac537695 100644 --- a/src/Storages/StorageLog.h +++ b/src/Storages/StorageLog.h @@ -27,7 +27,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageMaterializeMySQL.cpp b/src/Storages/StorageMaterializeMySQL.cpp index 59e7cfaf0a..9eeab2e3bc 100644 --- a/src/Storages/StorageMaterializeMySQL.cpp +++ b/src/Storages/StorageMaterializeMySQL.cpp @@ -63,7 +63,7 @@ bool StorageMaterializeMySQL::needRewriteQueryWithFinal(const Names & column_nam Pipe StorageMaterializeMySQL::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -71,7 +71,7 @@ Pipe StorageMaterializeMySQL::read( unsigned int num_streams) { /// If the background synchronization thread has exception. 
- return nested_storage->read(column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams); + return nested_storage->read(column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); } BlockOutputStreamPtr StorageMaterializeMySQL::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) diff --git a/src/Storages/StorageMaterializeMySQL.h b/src/Storages/StorageMaterializeMySQL.h index 2d81d3e116..898574d569 100644 --- a/src/Storages/StorageMaterializeMySQL.h +++ b/src/Storages/StorageMaterializeMySQL.h @@ -48,7 +48,7 @@ public: bool needRewriteQueryWithFinal(const Names & column_names) const override; Pipe read( - const Names & column_names, const StorageMetadataPtr & metadata_snapshot, SelectQueryInfo & query_info, + const Names & column_names, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) override; BlockOutputStreamPtr write(const ASTPtr &, const StorageMetadataPtr &, ContextPtr) override; diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index b9c3d2a3aa..2f332f35a7 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -169,15 +169,16 @@ StorageMaterializedView::StorageMaterializedView( QueryProcessingStage::Enum StorageMaterializedView::getQueryProcessingStage( ContextPtr local_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr &, + const StorageSnapshotPtr &, SelectQueryInfo & query_info) const { - return getTargetTable()->getQueryProcessingStage(local_context, to_stage, getTargetTable()->getInMemoryMetadataPtr(), query_info); + const auto & target_metadata = getTargetTable()->getInMemoryMetadataPtr(); + return getTargetTable()->getQueryProcessingStage(local_context, to_stage, getTargetTable()->getStorageSnapshot(target_metadata, local_context), query_info); } Pipe StorageMaterializedView::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -185,7 +186,7 @@ Pipe StorageMaterializedView::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); @@ -194,7 +195,7 @@ Pipe StorageMaterializedView::read( void StorageMaterializedView::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -204,15 +205,16 @@ void StorageMaterializedView::read( auto storage = getTargetTable(); auto lock = storage->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); auto target_metadata_snapshot = storage->getInMemoryMetadataPtr(); + auto target_storage_snapshot = storage->getStorageSnapshot(target_metadata_snapshot, local_context); if 
(query_info.order_optimizer) query_info.input_order_info = query_info.order_optimizer->getInputOrder(target_metadata_snapshot, local_context); - storage->read(query_plan, column_names, target_metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + storage->read(query_plan, column_names, target_storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); if (query_plan.isInitialized()) { - auto mv_header = getHeaderForProcessingStage(*this, column_names, metadata_snapshot, query_info, local_context, processed_stage); + auto mv_header = getHeaderForProcessingStage(column_names, storage_snapshot, query_info, local_context, processed_stage); auto target_header = query_plan.getCurrentDataStream().header; /// No need to convert columns that does not exists in MV diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index 532189f03c..d9189b092b 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -90,7 +90,7 @@ public: virtual bool supportsOptimizer() const override { return true; } QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr & , SelectQueryInfo &) const override; StoragePtr getTargetTable() const; StoragePtr tryGetTargetTable() const; @@ -103,7 +103,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -113,7 +113,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index b4a8c34321..69f848b2be 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -1,10 +1,13 @@ #include #include #include +#include +#include #include #include +#include #include #include #include @@ -30,13 +33,13 @@ public: MemorySource( Names column_names_, - const StorageMemory & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, std::shared_ptr data_, std::shared_ptr> parallel_execution_index_, InitializerFunc initializer_func_ = {}) - : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) - , column_names_and_types(metadata_snapshot->getColumns().getByNames(ColumnsDescription::All, column_names_, true)) + : SourceWithProgress(storage_snapshot->getSampleBlockForColumns(column_names_)) + , column_names_and_types(storage_snapshot->getColumnsByNames( + GetColumnsOptions(GetColumnsOptions::All).withSubcolumns().withExtendedObjects(), column_names_)) , data(data_) , parallel_execution_index(parallel_execution_index_) , initializer_func(std::move(initializer_func_)) @@ -62,21 +65,21 @@ protected: } const Block & src = (*data)[current_index]; + Columns columns; - columns.reserve(columns.size()); + size_t num_columns = column_names_and_types.size(); + columns.reserve(num_columns); - /// Add only required columns to `res`. 
- for (const auto & elem : column_names_and_types) + auto name_and_type = column_names_and_types.begin(); + for (size_t i = 0; i < num_columns; ++i) { - auto current_column = src.getByName(elem.getNameInStorage()).column; - current_column = current_column->decompress(); - - if (elem.isSubcolumn()) - columns.emplace_back(elem.getTypeInStorage()->getSubcolumn(elem.getSubcolumnName(), *current_column)); - else - columns.emplace_back(std::move(current_column)); + columns.emplace_back(tryGetColumnFromBlock(src, *name_and_type)); + ++name_and_type; } + fillMissingColumns(columns, src.rows(), column_names_and_types, /*metadata_snapshot=*/ nullptr); + assert(std::all_of(columns.begin(), columns.end(), [](const auto & column) { return column != nullptr; })); + return Chunk(std::move(columns), src.rows()); } @@ -106,29 +109,36 @@ class MemoryBlockOutputStream : public IBlockOutputStream public: MemoryBlockOutputStream( StorageMemory & storage_, - const StorageMetadataPtr & metadata_snapshot_) + const StorageMetadataPtr & metadata_snapshot_, + ContextPtr context) : storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_.getStorageSnapshot(metadata_snapshot_, context)) { } - Block getHeader() const override { return metadata_snapshot->getSampleBlock(); } + Block getHeader() const override { return storage_snapshot->metadata->getSampleBlock(); } void write(const Block & block) override { - metadata_snapshot->check(block, true); + storage_snapshot->metadata->check(block, true); + + auto flatten_block = block.cloneWithColumns(block.getColumns()); + if (!storage_snapshot->object_columns.empty()) + { + convertDynamicColumnsToTuples(flatten_block, storage_snapshot); + } if (storage.compress) { Block compressed_block; - for (const auto & elem : block) + for (const auto & elem : flatten_block) compressed_block.insert({ elem.column->compress(), elem.type, elem.name }); new_blocks.emplace_back(compressed_block); } else { - new_blocks.emplace_back(block); + new_blocks.emplace_back(flatten_block); } } @@ -157,7 +167,7 @@ private: Blocks new_blocks; StorageMemory & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; }; @@ -181,16 +191,37 @@ StorageMemory::StorageMemory( } +StorageSnapshotPtr StorageMemory::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const +{ + auto snapshot_data = std::make_unique(); + snapshot_data->blocks = data.get(); + + if (!hasDynamicSubcolumns(metadata_snapshot->getColumns())) + return std::make_shared(*this, metadata_snapshot, ColumnsDescription{}, std::move(snapshot_data)); + + auto object_columns = getConcreteObjectColumns( + snapshot_data->blocks->begin(), + snapshot_data->blocks->end(), + metadata_snapshot->getColumns(), + [](const auto & block) -> const auto & { return block.getColumnsWithTypeAndName(); }); + + return std::make_shared(*this, metadata_snapshot, object_columns, std::move(snapshot_data)); +} + + Pipe StorageMemory::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t /*max_block_size*/, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); + + const auto & snapshot_data = assert_cast(*storage_snapshot->data); + auto current_data = snapshot_data.blocks; if 
(delay_read_for_global_subqueries) { @@ -204,17 +235,15 @@ Pipe StorageMemory::read( return Pipe(std::make_shared<MemorySource>( column_names, - *this, - metadata_snapshot, + storage_snapshot, nullptr /* data */, nullptr /* parallel execution index */, - [this](std::shared_ptr & data_to_initialize) + [current_data](std::shared_ptr & data_to_initialize) { - data_to_initialize = data.get(); + data_to_initialize = current_data; })); } - auto current_data = data.get(); size_t size = current_data->size(); if (num_streams > size) @@ -226,16 +255,16 @@ Pipe StorageMemory::read( for (size_t stream = 0; stream < num_streams; ++stream) { - pipes.emplace_back(std::make_shared<MemorySource>(column_names, *this, metadata_snapshot, current_data, parallel_execution_index)); + pipes.emplace_back(std::make_shared<MemorySource>(column_names, storage_snapshot, current_data, parallel_execution_index)); } return Pipe::unitePipes(std::move(pipes)); } -BlockOutputStreamPtr StorageMemory::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/) +BlockOutputStreamPtr StorageMemory::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) { - return std::make_shared<MemoryBlockOutputStream>(*this, metadata_snapshot); + return std::make_shared<MemoryBlockOutputStream>(*this, metadata_snapshot, context); } diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index df99293cf4..75d7586a0f 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -11,6 +11,7 @@ #include #include +#include <Interpreters/Context_fwd.h> namespace DB { @@ -30,9 +31,18 @@ public: size_t getSize() const { return data.get()->size(); } + /// Snapshot for StorageMemory contains the current set of blocks + /// at the moment the query starts. + struct SnapshotData : public StorageSnapshot::Data + { + std::shared_ptr blocks; + }; + + StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const override; + Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -41,6 +51,7 @@ public: bool supportsParallelInsert() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } /// Smaller blocks (e.g. 64K rows) are better for CPU cache.
bool prefersLargeBlocks() const override { return false; } diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 58a1927956..bfe21bbb8b 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -139,7 +139,7 @@ bool StorageMerge::mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, Cont QueryProcessingStage::Enum StorageMerge::getQueryProcessingStage( ContextPtr local_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr &, + const StorageSnapshotPtr &, SelectQueryInfo & query_info) const { /// In case of JOIN the first stage (which includes JOIN) @@ -168,7 +168,8 @@ QueryProcessingStage::Enum StorageMerge::getQueryProcessingStage( ++selected_table_size; stage_in_source_tables = std::max( stage_in_source_tables, - table->getQueryProcessingStage(local_context, to_stage, table->getInMemoryMetadataPtr(), query_info)); + table->getQueryProcessingStage(local_context, to_stage, + table->getStorageSnapshot(table->getInMemoryMetadataPtr(), local_context), query_info)); } iterator->next(); @@ -181,7 +182,7 @@ QueryProcessingStage::Enum StorageMerge::getQueryProcessingStage( Pipe StorageMerge::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -197,9 +198,9 @@ Pipe StorageMerge::read( for (const auto & column_name : column_names) { - if (column_name == "_database" && isVirtualColumn(column_name, metadata_snapshot)) + if (column_name == "_database" && isVirtualColumn(column_name, storage_snapshot->metadata)) has_database_virtual_column = true; - else if (column_name == "_table" && isVirtualColumn(column_name, metadata_snapshot)) + else if (column_name == "_table" && isVirtualColumn(column_name, storage_snapshot->metadata)) has_table_virtual_column = true; else real_column_names.push_back(column_name); @@ -212,7 +213,7 @@ Pipe StorageMerge::read( modified_context->setSetting("optimize_move_to_prewhere", Field{false}); /// What will be result structure depending on query processed stage in source tables? - Block header = getHeaderForProcessingStage(*this, column_names, metadata_snapshot, query_info, local_context, processed_stage); + Block header = getHeaderForProcessingStage(column_names, storage_snapshot, query_info, local_context, processed_stage); /** First we make list of selected tables to find out its size. * This is necessary to correctly pass the recommended number of threads to each table. 
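The hunks above show the migration pattern that recurs throughout this patch: wherever a StorageMetadataPtr used to be forwarded into an IStorage API, a StorageSnapshot is now built from that metadata plus the query context and forwarded instead. A minimal sketch of the resulting call sequence (illustration only, not part of the patch; `table` stands for any StoragePtr):

    // Build a snapshot once per query, then pass it everywhere the
    // metadata pointer used to go.
    auto metadata = table->getInMemoryMetadataPtr();
    auto snapshot = table->getStorageSnapshot(metadata, local_context);
    auto stage = table->getQueryProcessingStage(local_context, to_stage, snapshot, query_info);
    Pipe pipe = table->read(column_names, snapshot, query_info, local_context, stage, max_block_size, num_streams);

Besides the metadata, the snapshot pins the dynamic object-column schema and any storage-specific state (see StorageSnapshot::Data in the new header below) for the lifetime of the query.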
@@ -281,10 +282,11 @@ Pipe StorageMerge::read( Aliases aliases; auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto storage_columns = storage_metadata_snapshot->getColumns(); + auto nested_storage_snapshot = storage->getStorageSnapshot(storage_metadata_snapshot, local_context); if (processed_stage == QueryProcessingStage::FetchColumns && !storage_columns.getAliases().empty()) { - auto syntax_result = TreeRewriter(local_context).analyzeSelect(query_info.query, TreeRewriterResult({}, storage, storage_metadata_snapshot)); + auto syntax_result = TreeRewriter(local_context).analyzeSelect(query_info.query, TreeRewriterResult({}, storage, nested_storage_snapshot)); ASTPtr required_columns_expr_list = std::make_shared<ASTExpressionList>(); ASTPtr column_expr; @@ -314,13 +316,13 @@ Pipe StorageMerge::read( } syntax_result = TreeRewriter(local_context).analyze(required_columns_expr_list, storage_columns.getAllPhysical(), - storage, storage_metadata_snapshot); + storage, storage->getStorageSnapshot(storage_metadata_snapshot, local_context)); auto alias_actions = ExpressionAnalyzer(required_columns_expr_list, syntax_result, local_context).getActionsDAG(true); required_columns = alias_actions->getRequiredColumns().getNames(); } auto source_pipe = createSources( - storage_metadata_snapshot, + nested_storage_snapshot, query_info, processed_stage, max_block_size, @@ -348,7 +350,7 @@ Pipe StorageMerge::read( } Pipe StorageMerge::createSources( - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, const QueryProcessingStage::Enum & processed_stage, const UInt64 max_block_size, @@ -397,16 +399,16 @@ Pipe StorageMerge::createSources( } auto storage_stage - = storage->getQueryProcessingStage(modified_context, QueryProcessingStage::Complete, metadata_snapshot, modified_query_info); + = storage->getQueryProcessingStage(modified_context, QueryProcessingStage::Complete, storage_snapshot, modified_query_info); if (processed_stage <= storage_stage) { /// If there are only virtual columns in query, you must request at least one other column. if (real_column_names.empty()) - real_column_names.push_back(ExpressionActions::getSmallestColumn(metadata_snapshot->getColumns().getAllPhysical())); + real_column_names.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical())); pipe = storage->read( real_column_names, - metadata_snapshot, + storage_snapshot, modified_query_info, modified_context, processed_stage, @@ -478,7 +480,7 @@ Pipe StorageMerge::createSources( /// Subordinate tables could have different but convertible types, like numeric types of different width. /// We must return streams with a structure equal to the structure of the Merge table.
- convertingSourceStream(header, metadata_snapshot, aliases, modified_context, modified_query_info.query, pipe, processed_stage); + convertingSourceStream(header, storage_snapshot->metadata, aliases, modified_context, modified_query_info.query, pipe, processed_stage); pipe.addTableLock(struct_lock); pipe.addStorageHolder(storage); diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 467e75ec58..16ba04ffb5 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -49,11 +49,11 @@ public: bool supportsSubcolumns() const override { return true; } QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -131,7 +131,7 @@ protected: using Aliases = std::vector; Pipe createSources( - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, const QueryProcessingStage::Enum & processed_stage, UInt64 max_block_size, diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 34829e6ef7..16afdc83b2 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -203,20 +203,20 @@ StorageMergeTree::~StorageMergeTree() void StorageMergeTree::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { - if (auto plan = reader.read(column_names, metadata_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage)) + if (auto plan = reader.read(column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage)) query_plan = std::move(*plan); } Pipe StorageMergeTree::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -224,7 +224,7 @@ Pipe StorageMergeTree::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 179e5dd1b4..dd7665e61c 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -64,7 +64,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -74,7 +74,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const 
StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageMongoDB.cpp b/src/Storages/StorageMongoDB.cpp index e27d16ecc6..61bc5dd4b5 100644 --- a/src/Storages/StorageMongoDB.cpp +++ b/src/Storages/StorageMongoDB.cpp @@ -74,7 +74,7 @@ void StorageMongoDB::connectIfNotConnected() Pipe StorageMongoDB::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -83,12 +83,12 @@ Pipe StorageMongoDB::read( { connectIfNotConnected(); - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); Block sample_block; for (const String & column_name : column_names) { - auto column_data = metadata_snapshot->getColumns().getPhysical(column_name); + auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); sample_block.insert({ column_data.type, column_data.name }); } diff --git a/src/Storages/StorageMongoDB.h b/src/Storages/StorageMongoDB.h index 440353c345..d9e765db3b 100644 --- a/src/Storages/StorageMongoDB.h +++ b/src/Storages/StorageMongoDB.h @@ -33,7 +33,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageMySQL.cpp b/src/Storages/StorageMySQL.cpp index 365d5e9df7..d524002c16 100644 --- a/src/Storages/StorageMySQL.cpp +++ b/src/Storages/StorageMySQL.cpp @@ -80,17 +80,18 @@ StorageMySQL::StorageMySQL( Pipe StorageMySQL::read( const Names & column_names_, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info_, ContextPtr context_, QueryProcessingStage::Enum /*processed_stage*/, size_t /*max_block_size*/, unsigned) { - metadata_snapshot->check(column_names_, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names_); + String query = transformQueryForExternalDatabase( query_info_, - metadata_snapshot->getColumns().getOrdinary(), + storage_snapshot->metadata->getColumns().getOrdinary(), IdentifierQuotingStyle::BackticksMySQL, remote_database_name, remote_table_name, @@ -99,7 +100,7 @@ Pipe StorageMySQL::read( Block sample_block; for (const String & column_name : column_names_) { - auto column_data = metadata_snapshot->getColumns().getPhysical(column_name); + auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); WhichDataType which(column_data.type); /// Convert enum to string. 
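StorageMongoDB, StorageMySQL and StoragePostgreSQL all take the same shape after this change: validate the requested names through the snapshot, then build the per-column sample block from the snapshot's metadata. A condensed sketch of that shared read prologue, using the same names as the hunks above:

    // Assumes storage_snapshot came from getStorageSnapshot() for this storage.
    storage_snapshot->check(column_names);

    Block sample_block;
    for (const String & column_name : column_names)
    {
        auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name);
        sample_block.insert({ column_data.type, column_data.name });
    }

Note that check() now knows about virtual and dynamic object columns by itself, where the old metadata_snapshot->check(column_names, getVirtuals(), getStorageID()) had to be handed them explicitly.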
diff --git a/src/Storages/StorageMySQL.h b/src/Storages/StorageMySQL.h index 9dccb75ce5..d0f7a6032b 100644 --- a/src/Storages/StorageMySQL.h +++ b/src/Storages/StorageMySQL.h @@ -41,7 +41,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageNull.h b/src/Storages/StorageNull.h index 26b94e63cb..2124e475ad 100644 --- a/src/Storages/StorageNull.h +++ b/src/Storages/StorageNull.h @@ -23,7 +23,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo &, ContextPtr /*context*/, QueryProcessingStage::Enum /*processing_stage*/, @@ -31,7 +31,7 @@ unsigned) override { return Pipe( - std::make_shared<NullSource>(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()))); + std::make_shared<NullSource>(storage_snapshot->getSampleBlockForColumns(column_names))); } bool supportsParallelInsert() const override { return true; } diff --git a/src/Storages/StoragePostgreSQL.cpp b/src/Storages/StoragePostgreSQL.cpp index 6847ebd63f..683406939c 100644 --- a/src/Storages/StoragePostgreSQL.cpp +++ b/src/Storages/StoragePostgreSQL.cpp @@ -62,25 +62,25 @@ StoragePostgreSQL::StoragePostgreSQL( Pipe StoragePostgreSQL::read( const Names & column_names_, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info_, ContextPtr context_, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size_, unsigned) { - metadata_snapshot->check(column_names_, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names_); /// Connection is already made to the needed database, so it should not be present in the query; /// remote_table_schema is empty if it is not specified, will access only table_name.
String query = transformQueryForExternalDatabase( - query_info_, metadata_snapshot->getColumns().getOrdinary(), + query_info_, storage_snapshot->metadata->getColumns().getOrdinary(), IdentifierQuotingStyle::DoubleQuotes, remote_table_schema, remote_table_name, context_); Block sample_block; for (const String & column_name : column_names_) { - auto column_data = metadata_snapshot->getColumns().getPhysical(column_name); + auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); WhichDataType which(column_data.type); if (which.isEnum()) column_data.type = std::make_shared<DataTypeString>(); diff --git a/src/Storages/StoragePostgreSQL.h b/src/Storages/StoragePostgreSQL.h index 6ef47862b7..74b83b43cf 100644 --- a/src/Storages/StoragePostgreSQL.h +++ b/src/Storages/StoragePostgreSQL.h @@ -33,7 +33,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageProxy.h b/src/Storages/StorageProxy.h index d3a332f09c..6792a9c1cd 100644 --- a/src/Storages/StorageProxy.h +++ b/src/Storages/StorageProxy.h @@ -56,10 +56,11 @@ public: QueryProcessingStage::Enum getQueryProcessingStage( ContextPtr context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr &, + const StorageSnapshotPtr &, SelectQueryInfo & info) const override { - return getNested()->getQueryProcessingStage(context, to_stage, getNested()->getInMemoryMetadataPtr(), info); + const auto & nested_metadata = getNested()->getInMemoryMetadataPtr(); + return getNested()->getQueryProcessingStage(context, to_stage, getNested()->getStorageSnapshot(nested_metadata, context), info); } BlockInputStreams watch( @@ -75,14 +76,15 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) override { - return getNested()->read(column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams); + return getNested()->read(column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); + } BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 85494265fd..e1276dcfb2 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4480,7 +4480,7 @@ ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock StorageReplicatedMerg void StorageReplicatedMergeTree::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -4496,18 +4496,18 @@ void StorageReplicatedMergeTree::read( { auto max_added_blocks = std::make_shared<ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock>(getMaxAddedBlocks()); if (auto plan = reader.read( - column_names, metadata_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage, std::move(max_added_blocks))) + column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage,
std::move(max_added_blocks))) query_plan = std::move(*plan); return; } - if (auto plan = reader.read(column_names, metadata_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage)) + if (auto plan = reader.read(column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage)) query_plan = std::move(*plan); } Pipe StorageReplicatedMergeTree::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -4515,7 +4515,7 @@ Pipe StorageReplicatedMergeTree::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index d53be5fdf6..bdb72ddc3f 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -116,7 +116,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -126,7 +126,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 88fa78a747..2bdae85e51 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -386,7 +386,7 @@ StorageS3::StorageS3( Pipe StorageS3::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr local_context, QueryProcessingStage::Enum /*processed_stage*/, @@ -431,9 +431,9 @@ Pipe StorageS3::read( need_file_column, format_name, getName(), - metadata_snapshot->getSampleBlock(), + storage_snapshot->metadata->getSampleBlock(), local_context, - metadata_snapshot->getColumns(), + storage_snapshot->metadata->getColumns(), max_block_size, max_single_read_retries, compression_method, diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 3001fc6f67..c3d8b83d5a 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -144,7 +144,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp index cdb85fe588..d938c2f90e 100644 --- a/src/Storages/StorageS3Cluster.cpp +++ b/src/Storages/StorageS3Cluster.cpp @@ -101,7 +101,7 @@ StorageS3Cluster::StorageS3Cluster( /// The code executes on initiator Pipe StorageS3Cluster::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & 
storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -154,12 +154,12 @@ Pipe StorageS3Cluster::read( } } - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); return Pipe::unitePipes(std::move(pipes)); } QueryProcessingStage::Enum StorageS3Cluster::getQueryProcessingStage( - ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageMetadataPtr &, SelectQueryInfo &) const + ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageSnapshotPtr &, SelectQueryInfo &) const { /// Initiator executes query on remote node. if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) diff --git a/src/Storages/StorageS3Cluster.h b/src/Storages/StorageS3Cluster.h index 821765a378..a40edf34e2 100644 --- a/src/Storages/StorageS3Cluster.h +++ b/src/Storages/StorageS3Cluster.h @@ -27,11 +27,11 @@ class StorageS3Cluster : public shared_ptr_helper, public ISto public: std::string getName() const override { return "S3Cluster"; } - Pipe read(const Names &, const StorageMetadataPtr &, SelectQueryInfo &, + Pipe read(const Names &, const StorageSnapshotPtr &, SelectQueryInfo &, ContextPtr, QueryProcessingStage::Enum, size_t /*max_block_size*/, unsigned /*num_streams*/) override; QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; NamesAndTypesList getVirtuals() const override; diff --git a/src/Storages/StorageSnapshot.cpp b/src/Storages/StorageSnapshot.cpp new file mode 100644 index 0000000000..70c1282a88 --- /dev/null +++ b/src/Storages/StorageSnapshot.cpp @@ -0,0 +1,317 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_FOUND_COLUMN_IN_BLOCK; + extern const int EMPTY_LIST_OF_COLUMNS_QUERIED; + extern const int NO_SUCH_COLUMN_IN_TABLE; + extern const int COLUMN_QUERIED_MORE_THAN_ONCE; +} + +void ObjectSchemas::reset(const ObjectAssembledSchema & assembled_schema, const ObjectPartialSchemas & partial_schemas) +{ + std::lock_guard lock(partial_schema_refresh_mutex); + partial_object_schemas = partial_schemas; + + partial_object_schemas.emplace(OBJECT_GLOBAL_SCHEMA_TXN, assembled_schema); +} + +bool ObjectSchemas::isEmpty() const +{ + std::lock_guard lock(partial_schema_refresh_mutex); + return partial_object_schemas.empty(); +} + +void ObjectSchemas::appendPartialSchema(const TxnTimestamp & txn_id, ObjectPartialSchema partial_schema) +{ + std::lock_guard lock(partial_schema_refresh_mutex); + partial_object_schemas.emplace(txn_id, partial_schema); +} + +void ObjectSchemas::refreshAssembledSchema(const ObjectAssembledSchema & assembled_schema, std::vector<TxnTimestamp> txn_ids) +{ + std::lock_guard lock(partial_schema_refresh_mutex); + + partial_object_schemas[OBJECT_GLOBAL_SCHEMA_TXN] = assembled_schema; + for (const auto & txn_id : txn_ids) + { + partial_object_schemas.erase(txn_id); + } +} + +ObjectAssembledSchema ObjectSchemas::assembleSchema(const ContextPtr query_context, const StorageMetadataPtr & metadata) const +{ + std::lock_guard lock(partial_schema_refresh_mutex); + std::vector<TxnTimestamp> unfiltered_txn_ids; + std::for_each( + partial_object_schemas.begin(), partial_object_schemas.end(), [&unfiltered_txn_ids](const auto &
unfiltered_partial_schema) { + unfiltered_txn_ids.emplace_back(unfiltered_partial_schema.first); + }); + + auto committed_partial_schema_txnids = query_context->getCnchCatalog()->filterUncommittedObjectPartialSchemas(unfiltered_txn_ids); + + ObjectPartialSchemas committed_partial_schemas; + std::for_each( + committed_partial_schema_txnids.begin(), + committed_partial_schema_txnids.end(), + [this, &committed_partial_schemas](const auto & txn_id) { + auto it = partial_object_schemas.find(txn_id); + if (it != partial_object_schemas.end()) + committed_partial_schemas[txn_id] = it->second; + }); + + return DB::getConcreteObjectColumns( + committed_partial_schemas.begin(), committed_partial_schemas.end(), metadata->columns, [](const auto & partial_schema) { + return partial_schema.second; + }); +} + +void ObjectSchemas::dropAbortedPartialSchema(const TxnTimestamp & txn_id) +{ + std::lock_guard lock(partial_schema_refresh_mutex); + partial_object_schemas.erase(txn_id); +} + +void StorageSnapshot::init() +{ + for (const auto & [name, type] : storage.getVirtuals()) + virtual_columns[name] = type; + + // if (storage.hasLightweightDeletedMask()) + // system_columns[LightweightDeleteDescription::FILTER_COLUMN.name] = LightweightDeleteDescription::FILTER_COLUMN.type; +} + +NamesAndTypesList StorageSnapshot::getColumns(const GetColumnsOptions & options) const +{ + auto all_columns = getMetadataForQuery()->getColumns().get(options); + + if (options.with_extended_objects) + extendObjectColumns(all_columns, object_columns, options.with_subcolumns); + + NameSet column_names; + if (options.with_virtuals) + { + /// Virtual columns must be appended after ordinary, + /// because the user can override them. + if (!virtual_columns.empty()) + { + for (const auto & column : all_columns) + column_names.insert(column.name); + + for (const auto & [name, type] : virtual_columns) + if (!column_names.contains(name)) + all_columns.emplace_back(name, type); + } + } + + if (options.with_system_columns) + { + if (!system_columns.empty() && column_names.empty()) + { + for (const auto & column : all_columns) + column_names.insert(column.name); + } + + for (const auto & [name, type] : system_columns) + if (!column_names.contains(name)) + all_columns.emplace_back(name, type); + } + + return all_columns; +} + +NamesAndTypesList StorageSnapshot::getColumnsByNames(const GetColumnsOptions & options, const Names & names) const +{ + NamesAndTypesList res; + for (const auto & name : names) + res.push_back(getColumn(options, name)); + return res; +} + +std::optional<NameAndTypePair> StorageSnapshot::tryGetColumn(const GetColumnsOptions & options, const String & column_name) const +{ + const auto & columns = getMetadataForQuery()->getColumns(); + auto column = columns.tryGetColumn(options, column_name); + if (column && (!column->type->hasDynamicSubcolumns() || !options.with_extended_objects)) + return column; + + if (options.with_extended_objects) + { + auto object_column = object_columns.tryGetColumn(options, column_name); + if (object_column) + return object_column; + } + + if (options.with_virtuals) + { + auto it = virtual_columns.find(column_name); + if (it != virtual_columns.end()) + return NameAndTypePair(column_name, it->second); + } + + if (options.with_system_columns) + { + auto it = system_columns.find(column_name); + if (it != system_columns.end()) + return NameAndTypePair(column_name, it->second); + } + + return {}; +} + +NameAndTypePair StorageSnapshot::getColumn(const GetColumnsOptions & options, const String & column_name) const +{ + auto column =
tryGetColumn(options, column_name); + if (!column) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", column_name); + + return *column; +} + +Block StorageSnapshot::getSampleBlockForColumns( + const Names & column_names, const NameToNameMap & parameter_values, BitEngineReadType bitengine_read_type) const +{ + Block res; + + const auto & columns = getMetadataForQuery()->getColumns(); + for (const auto & column_name : column_names) + { + std::string substituted_column_name = column_name; + + /// substituted_column_name is used for parameterized views (which are created using query parameters, + /// and SELECT is executed with those query parameters substituted) + if (!parameter_values.empty()) + substituted_column_name = StorageView::replaceValueWithQueryParameter(column_name, parameter_values); + + auto column = columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, substituted_column_name); + auto object_column = object_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, substituted_column_name); + if (column && !object_column) + { + if (isBitmap64(column->type) && column->type->isBitEngineEncode() && bitengine_read_type == BitEngineReadType::ONLY_ENCODE) + column->name += BITENGINE_COLUMN_EXTENSION; + res.insert({column->type->createColumn(), column->type, column_name}); + } + else if (object_column) + { + res.insert({object_column->type->createColumn(), object_column->type, column_name}); + } + else if (auto it = virtual_columns.find(column_name); it != virtual_columns.end()) + { + /// Virtual columns must be appended after ordinary, because the user can + /// override them. + const auto & type = it->second; + res.insert({type->createColumn(), type, column_name}); + } + else + { + throw Exception( + "Column " + backQuote(column_name) + " not found in table " + + (storage.getStorageID().empty() ? "" : storage.getStorageID().getNameForLogs()), + ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + } + } + return res; +} + +ColumnsDescription StorageSnapshot::getDescriptionForColumns(const Names & column_names) const +{ + ColumnsDescription res; + const auto & columns = getMetadataForQuery()->getColumns(); + for (const auto & name : column_names) + { + auto column = columns.tryGetColumnOrSubcolumnDescription(GetColumnsOptions::All, name); + auto object_column = object_columns.tryGetColumnOrSubcolumnDescription(GetColumnsOptions::All, name); + if (column && !object_column) + { + res.add(*column, "", false, false); + } + else if (object_column) + { + res.add(*object_column, "", false, false); + } + else if (auto it = virtual_columns.find(name); it != virtual_columns.end()) + { + /// Virtual columns must be appended after ordinary, because the user can + /// override them. + const auto & type = it->second; + res.add({name, type}); + } + else + { + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, + "Column {} not found in table {}", backQuote(name), storage.getStorageID().getNameForLogs()); + } + } + + return res; +} + +namespace +{ + using DenseHashSet = google::dense_hash_set<StringRef, StringRefHash>; +} + +void StorageSnapshot::check(const Names & column_names) const +{ + const auto & columns = getMetadataForQuery()->getColumns(); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns(); + + if (column_names.empty()) + { + auto list_of_columns = listOfColumns(columns.get(options)); + throw Exception(ErrorCodes::EMPTY_LIST_OF_COLUMNS_QUERIED, + "Empty list of columns queried.
There are columns: {}", list_of_columns); + } + + DenseHashSet unique_names; + unique_names.set_empty_key(StringRef()); + + auto func_columns = metadata->getFuncColumns(); + + for (const auto & name : column_names) + { + if (isMapImplicitKey(name)) continue; + + // ignore checking functional columns + if (func_columns.contains(name)) + continue; + + bool has_column = columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, name) + || object_columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, name) + || virtual_columns.contains(name); + + if (!has_column) + { + auto list_of_columns = listOfColumns(columns.get(options)); + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, + "There is no column with name {} in table {}. There are columns: {}", + backQuote(name), storage.getStorageID().getNameForLogs(), list_of_columns); + } + + if (unique_names.count(name)) + throw Exception(ErrorCodes::COLUMN_QUERIED_MORE_THAN_ONCE, "Column {} queried more than once", name); + + unique_names.insert(name); + } +} + +DataTypePtr StorageSnapshot::getConcreteType(const String & column_name) const +{ + auto object_column = object_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, column_name); + if (object_column) + return object_column->type; + + return metadata->getColumns().get(column_name).type; +} + +} diff --git a/src/Storages/StorageSnapshot.h b/src/Storages/StorageSnapshot.h new file mode 100644 index 0000000000..3538aff426 --- /dev/null +++ b/src/Storages/StorageSnapshot.h @@ -0,0 +1,153 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class IStorage; + +struct TxnTimestampHasher +{ + size_t operator()(const TxnTimestamp & txn) const + { + return std::hash<UInt64>()(txn.toUInt64()); + } +}; + +// Dynamic object column schema related +const static TxnTimestamp OBJECT_GLOBAL_SCHEMA_TXN = TxnTimestamp{0}; +using ObjectPartialSchemaStatus = Protos::TransactionStatus; +using ObjectPartialSchemaStatuses = std::unordered_map; +using ObjectPartialSchemas = std::unordered_map; +using ObjectAssembledSchema = ColumnsDescription; +using ObjectPartialSchema = ColumnsDescription; + +struct ObjectSchemas +{ + static String serializeObjectPartialSchemaStatus(const ObjectPartialSchemaStatus & status) + { + return Protos::TransactionStatus_Name(status); + } + + static ObjectPartialSchemaStatus deserializeObjectPartialSchemaStatus(const String & status_str) + { + ObjectPartialSchemaStatus status; + Protos::TransactionStatus_Parse(status_str, &status); + return status; + } + + mutable std::mutex partial_schema_refresh_mutex; + // Also holds the assembled schema, stored under the default key OBJECT_GLOBAL_SCHEMA_TXN + ObjectPartialSchemas partial_object_schemas; + + void reset(const ObjectAssembledSchema & assembled_schema, const ObjectPartialSchemas & partial_schemas); + + bool isEmpty() const; + + void appendPartialSchema(const TxnTimestamp & txn_id, ObjectPartialSchema partial_schema); + + void refreshAssembledSchema(const ObjectAssembledSchema & assembled_schema, std::vector<TxnTimestamp> txn_ids); + + ObjectAssembledSchema assembleSchema(ContextPtr query_context, const StorageMetadataPtr & metadata) const; + + void dropAbortedPartialSchema(const TxnTimestamp & txn_id); +}; + +/// Snapshot of a storage that fixes the set of columns that can be read in a query. +/// There are 3 sources of columns: regular columns from metadata, +/// dynamic columns from object Types, and virtual columns.
+struct StorageSnapshot +{ + const IStorage & storage; + const StorageMetadataPtr metadata; + const ColumnsDescription object_columns; + + /// Additional data on which the set of columns may depend. + /// E.g. data parts in MergeTree, list of blocks in Memory, etc. + struct Data + { + virtual ~Data() = default; + }; + + using DataPtr = std::unique_ptr<Data>; + DataPtr data; + + /// Projection that is used in the query. + mutable const ProjectionDescription * projection = nullptr; + + StorageSnapshot( + const IStorage & storage_, + const StorageMetadataPtr & metadata_) + : storage(storage_), metadata(metadata_) + { + init(); + } + + StorageSnapshot( + const IStorage & storage_, + const StorageMetadataPtr & metadata_, + const ColumnsDescription & object_columns_) + : storage(storage_), metadata(metadata_), object_columns(object_columns_) + { + init(); + } + + StorageSnapshot( + const IStorage & storage_, + const StorageMetadataPtr & metadata_, + const ColumnsDescription & object_columns_, + DataPtr data_) + : storage(storage_), metadata(metadata_), object_columns(object_columns_), data(std::move(data_)) + { + init(); + } + + /// Get all available columns with types according to options. + NamesAndTypesList getColumns(const GetColumnsOptions & options) const; + + /// Get columns with types according to options only for requested names. + NamesAndTypesList getColumnsByNames(const GetColumnsOptions & options, const Names & names) const; + + /// Get column with type according to options for requested name. + std::optional<NameAndTypePair> tryGetColumn(const GetColumnsOptions & options, const String & column_name) const; + NameAndTypePair getColumn(const GetColumnsOptions & options, const String & column_name) const; + + /// Block with ordinary + materialized + aliases + virtuals + subcolumns. + Block getSampleBlockForColumns( + const Names & column_names, + const NameToNameMap & parameter_values = {}, + BitEngineReadType bitengine_read_type = BitEngineReadType::ONLY_SOURCE) const; + + ColumnsDescription getDescriptionForColumns(const Names & column_names) const; + + /// Verify that all the requested names are in the table and are set correctly: + /// list of names is not empty and the names do not repeat. + void check(const Names & column_names) const; + + DataTypePtr getConcreteType(const String & column_name) const; + + void addProjection(const ProjectionDescription * projection_) const { projection = projection_; } + + /// If we have a projection then we should use its metadata. + StorageMetadataPtr getMetadataForQuery() const { return projection ? projection->metadata : metadata; } + +private: + void init(); + + std::unordered_map<String, DataTypePtr> virtual_columns; + + /// System columns are not visible in the schema but might be persisted in the data. + /// One example of such a column is the lightweight delete mask '_row_exists'.
+ std::unordered_map<String, DataTypePtr> system_columns; +}; + +using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>; + +} diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index c7329ddaba..bcb7f38a2d 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -73,14 +73,13 @@ class StripeLogSource final : public SourceWithProgress { public: static Block getHeader( - StorageStripeLog & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const Names & column_names, IndexForNativeFormat::Blocks::const_iterator index_begin, IndexForNativeFormat::Blocks::const_iterator index_end) { if (index_begin == index_end) - return metadata_snapshot->getSampleBlockForColumns(column_names, storage.getVirtuals(), storage.getStorageID()); + return storage_snapshot->getSampleBlockForColumns(column_names); /// TODO: check if possible to always return storage.getSampleBlock() @@ -97,16 +96,16 @@ public: StripeLogSource( StorageStripeLog & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const Names & column_names, size_t max_read_buffer_size_, std::shared_ptr & index_, IndexForNativeFormat::Blocks::const_iterator index_begin_, IndexForNativeFormat::Blocks::const_iterator index_end_) : SourceWithProgress( - getHeader(storage_, metadata_snapshot_, column_names, index_begin_, index_end_)) + getHeader(storage_snapshot_, column_names, index_begin_, index_end_)) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , max_read_buffer_size(max_read_buffer_size_) , index(index_) , index_begin(index_begin_) @@ -143,7 +142,7 @@ protected: private: StorageStripeLog & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; size_t max_read_buffer_size; std::shared_ptr index; @@ -342,7 +341,7 @@ static std::chrono::seconds getLockTimeout(ContextPtr context) Pipe StorageStripeLog::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -353,7 +352,7 @@ Pipe StorageStripeLog::read( if (!lock) throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); NameSet column_names_set(column_names.begin(), column_names.end()); @@ -362,7 +361,7 @@ Pipe StorageStripeLog::read( String index_file = table_path + "index.mrk"; if (!disk->exists(index_file)) { - return Pipe(std::make_shared<NullSource>(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()))); + return Pipe(std::make_shared<NullSource>(storage_snapshot->getSampleBlockForColumns(column_names))); } CompressedReadBufferFromFile index_in(disk->readFile(index_file, {.buffer_size = 4096})); @@ -381,7 +380,7 @@ Pipe StorageStripeLog::read( std::advance(end, (stream + 1) * size / num_streams); pipes.emplace_back(std::make_shared<StripeLogSource>( - *this, metadata_snapshot, column_names, context->getSettingsRef().max_read_buffer_size, index, begin, end)); + *this, storage_snapshot, column_names, context->getSettingsRef().max_read_buffer_size, index, begin, end)); } /// We do not keep read lock directly at the time of reading, because we read ranges of data that do not change.
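The StripeLog hunks illustrate the consumer side of the new header: headers and column resolution go through the snapshot instead of through metadata plus getVirtuals()/getStorageID(). A hypothetical reader-side sketch combining the calls introduced in this patch:

    // Resolve requested names through the snapshot, including subcolumns
    // and extended (dynamic Object) columns, then build the header.
    auto options = GetColumnsOptions(GetColumnsOptions::All).withSubcolumns().withExtendedObjects();
    auto columns_to_read = storage_snapshot->getColumnsByNames(options, column_names);
    Block header = storage_snapshot->getSampleBlockForColumns(column_names);

Both calls appear verbatim in MemorySource above and in StorageTinyLog::read below; getSampleBlockForColumns() consults virtual_columns and object_columns internally, which is why the extra arguments of the old form are gone.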
diff --git a/src/Storages/StorageStripeLog.h b/src/Storages/StorageStripeLog.h index 0749273043..8ae8dda41b 100644 --- a/src/Storages/StorageStripeLog.h +++ b/src/Storages/StorageStripeLog.h @@ -27,7 +27,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageTableFunction.h b/src/Storages/StorageTableFunction.h index 859735fec5..d80d6c9c5f 100644 --- a/src/Storages/StorageTableFunction.h +++ b/src/Storages/StorageTableFunction.h @@ -78,7 +78,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -89,12 +89,12 @@ public: for (const auto & c : column_names) cnames += c + " "; auto storage = getNested(); - auto nested_metadata = storage->getInMemoryMetadataPtr(); + auto nested_metadata = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context); auto pipe = storage->read(column_names, nested_metadata, query_info, context, processed_stage, max_block_size, num_streams); if (!pipe.empty() && add_conversion) { - auto to_header = getHeaderForProcessingStage(*this, column_names, metadata_snapshot, + auto to_header = getHeaderForProcessingStage(column_names, storage_snapshot, query_info, context, processed_stage); auto convert_actions_dag = ActionsDAG::makeConvertingActions( diff --git a/src/Storages/StorageTinyLog.cpp b/src/Storages/StorageTinyLog.cpp index 5e54d8514b..31224c0ff0 100644 --- a/src/Storages/StorageTinyLog.cpp +++ b/src/Storages/StorageTinyLog.cpp @@ -331,7 +331,7 @@ void TinyLogBlockOutputStream::writeData(const NameAndTypePair & name_and_type, /// Use different WrittenStreams set, or we get nullptr for them in `serializeBinaryBulkWithMultipleStreams` WrittenStreams prefix_written_streams; settings.getter = createStreamGetter(name_and_type, prefix_written_streams); - serialization->serializeBinaryBulkStatePrefix(settings, serialize_states[name]); + serialization->serializeBinaryBulkStatePrefix(column, settings, serialize_states[name]); } settings.getter = createStreamGetter(name_and_type, written_streams); @@ -466,9 +466,8 @@ void StorageTinyLog::addFiles(const NameAndTypePair & column) } }; - ISerialization::SubstreamPath substream_path; auto serialization = type->getDefaultSerialization(); - serialization->enumerateStreams(stream_callback, substream_path); + serialization->enumerateStreams(stream_callback); } @@ -500,16 +499,17 @@ static std::chrono::seconds getLockTimeout(ContextPtr context) Pipe StorageTinyLog::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t max_block_size, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); - auto all_columns = metadata_snapshot->getColumns().getByNames(ColumnsDescription::All, column_names, true); + auto options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects(); + auto all_columns = storage_snapshot->getColumns(options); // When reading, we lock the entire storage, because we only have one file // per column and can't modify 
it concurrently. diff --git a/src/Storages/StorageTinyLog.h b/src/Storages/StorageTinyLog.h index 71763a6403..96955d7119 100644 --- a/src/Storages/StorageTinyLog.h +++ b/src/Storages/StorageTinyLog.h @@ -26,7 +26,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index f9069e8f6c..871379a885 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -207,7 +207,7 @@ std::string IStorageURLBase::getReadMethod() const std::vector> IStorageURLBase::getReadURIParams( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, @@ -218,7 +218,7 @@ std::vector> IStorageURLBase::getReadURIPara std::function IStorageURLBase::getReadPOSTDataCallback( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, @@ -230,7 +230,7 @@ std::function IStorageURLBase::getReadPOSTDataCallback( Pipe IStorageURLBase::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -238,7 +238,7 @@ Pipe IStorageURLBase::read( unsigned /*num_streams*/) { auto request_uri = uri; - auto params = getReadURIParams(column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size); + auto params = getReadURIParams(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size); for (const auto & [param, value] : params) request_uri.addQueryParameter(param, value); @@ -246,14 +246,14 @@ Pipe IStorageURLBase::read( request_uri, getReadMethod(), getReadPOSTDataCallback( - column_names, metadata_snapshot, query_info, + column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size), format_name, format_settings, getName(), - getHeaderBlock(column_names, metadata_snapshot), + getHeaderBlock(column_names, storage_snapshot), local_context, - metadata_snapshot->getColumns(), + storage_snapshot->metadata->getColumns(), max_block_size, ConnectionTimeouts::getHTTPTimeouts(local_context), chooseCompressionMethod(request_uri.getPath(), compression_method))); @@ -262,14 +262,14 @@ Pipe IStorageURLBase::read( Pipe StorageURLWithFailover::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned /*num_streams*/) { - auto params = getReadURIParams(column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size); + auto params = getReadURIParams(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size); WriteBufferFromOwnString error_message; error_message << "Detailed description:"; @@ -285,14 +285,14 @@ Pipe StorageURLWithFailover::read( request_uri, getReadMethod(), 
getReadPOSTDataCallback( - column_names, metadata_snapshot, query_info, + column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size), format_name, format_settings, getName(), - getHeaderBlock(column_names, metadata_snapshot), + getHeaderBlock(column_names, storage_snapshot), local_context, - metadata_snapshot->getColumns(), + storage_snapshot->metadata->getColumns(), max_block_size, ConnectionTimeouts::getHTTPTimeouts(local_context), chooseCompressionMethod(request_uri.getPath(), compression_method)); diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 7830629204..ad22fc6264 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -25,7 +25,7 @@ class IStorageURLBase : public IStorage public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -59,7 +59,7 @@ protected: virtual std::vector> getReadURIParams( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum & processed_stage, @@ -67,14 +67,14 @@ protected: virtual std::function getReadPOSTDataCallback( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const; private: - virtual Block getHeaderBlock(const Names & column_names, const StorageMetadataPtr & metadata_snapshot) const = 0; + virtual Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0; }; class StorageURLBlockOutputStream : public IBlockOutputStream @@ -124,9 +124,9 @@ public: return "URL"; } - Block getHeaderBlock(const Names & /*column_names*/, const StorageMetadataPtr & metadata_snapshot) const override + Block getHeaderBlock(const Names & /*column_names*/, const StorageSnapshotPtr & storage_snapshot) const override { - return metadata_snapshot->getSampleBlock(); + return storage_snapshot->metadata->getSampleBlock(); } static FormatSettings getFormatSettingsFromArgs(const StorageFactory::Arguments & args); @@ -149,7 +149,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageValues.cpp b/src/Storages/StorageValues.cpp index ace5ca3667..d0acb191e8 100644 --- a/src/Storages/StorageValues.cpp +++ b/src/Storages/StorageValues.cpp @@ -22,14 +22,14 @@ StorageValues::StorageValues( Pipe StorageValues::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t /*max_block_size*/, unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); /// Get only required columns. 
Block block; diff --git a/src/Storages/StorageValues.h b/src/Storages/StorageValues.h index 4d6d168441..c74f8eae3e 100644 --- a/src/Storages/StorageValues.h +++ b/src/Storages/StorageValues.h @@ -17,7 +17,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index fa1d844273..5b8ef56b7c 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -125,7 +125,7 @@ StorageView::StorageView( Pipe StorageView::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -133,7 +133,7 @@ Pipe StorageView::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); @@ -142,14 +142,14 @@ Pipe StorageView::read( void StorageView::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - ASTPtr current_inner_query = metadata_snapshot->getSelectQuery().inner_query; + ASTPtr current_inner_query = storage_snapshot->metadata->getSelectQuery().inner_query; if (query_info.view_query) { @@ -171,7 +171,7 @@ void StorageView::read( query_plan.addStep(std::move(materializing)); /// And also convert to expected structure. 
- const auto & expected_header = metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + const auto & expected_header = storage_snapshot->getSampleBlockForColumns(column_names); const auto & header = query_plan.getCurrentDataStream().header; const auto * select_with_union = current_inner_query->as(); @@ -234,6 +234,21 @@ void StorageView::replaceWithSubquery(ASTSelectQuery & outer_query, ASTPtr view_ child = view_query; } +String StorageView::replaceValueWithQueryParameter(const String & column_name, const NameToNameMap & parameter_values) +{ + String name = column_name; + std::string::size_type pos = 0u; + for (const auto & parameter : parameter_values) + { + if ((pos = name.find("_CAST(" + parameter.second)) != std::string::npos) + { + name = name.substr(0,pos) + parameter.first + ")"; + break; + } + } + return name; +} + ASTPtr StorageView::restoreViewName(ASTSelectQuery & select_query, const ASTPtr & view_name) { ASTTableExpression * table_expression = getFirstTableExpression(select_query); diff --git a/src/Storages/StorageView.h b/src/Storages/StorageView.h index 2dd9e8d7b1..bfa971469f 100644 --- a/src/Storages/StorageView.h +++ b/src/Storages/StorageView.h @@ -48,7 +48,7 @@ public: } Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -58,7 +58,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -72,6 +72,7 @@ public: static void replaceWithSubquery(ASTSelectQuery & outer_query, ASTPtr view_query, ASTPtr & view_name); static ASTPtr restoreViewName(ASTSelectQuery & select_query, const ASTPtr & view_name); + static String replaceValueWithQueryParameter (const String & column_name, const NameToNameMap & parameter_values); protected: StorageView( diff --git a/src/Storages/StorageXDBC.cpp b/src/Storages/StorageXDBC.cpp index 9cffc32fda..313ab36182 100644 --- a/src/Storages/StorageXDBC.cpp +++ b/src/Storages/StorageXDBC.cpp @@ -58,7 +58,7 @@ std::string StorageXDBC::getReadMethod() const std::vector> StorageXDBC::getReadURIParams( const Names & /* column_names */, - const StorageMetadataPtr & /* metadata_snapshot */, + const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, @@ -69,14 +69,14 @@ std::vector> StorageXDBC::getReadURIParams( std::function StorageXDBC::getReadPOSTDataCallback( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { String query = transformQueryForExternalDatabase(query_info, - metadata_snapshot->getColumns().getOrdinary(), + storage_snapshot->metadata->getColumns().getOrdinary(), bridge_helper->getIdentifierQuotingStyle(), remote_database_name, remote_table_name, @@ -85,7 +85,7 @@ std::function StorageXDBC::getReadPOSTDataCallback( NamesAndTypesList cols; for (const String & name : column_names) { - auto column_data = metadata_snapshot->getColumns().getPhysical(name); + auto column_data = 
storage_snapshot->metadata->getColumns().getPhysical(name); cols.emplace_back(column_data.name, column_data.type); } @@ -101,17 +101,17 @@ std::function StorageXDBC::getReadPOSTDataCallback( Pipe StorageXDBC::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); bridge_helper->startBridgeSync(); - return IStorageURLBase::read(column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + return IStorageURLBase::read(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); } BlockOutputStreamPtr StorageXDBC::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) @@ -140,9 +140,9 @@ BlockOutputStreamPtr StorageXDBC::write(const ASTPtr & /*query*/, const StorageM chooseCompressionMethod(uri.toString(), compression_method)); } -Block StorageXDBC::getHeaderBlock(const Names & column_names, const StorageMetadataPtr & metadata_snapshot) const +Block StorageXDBC::getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const { - return metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + return storage_snapshot->getSampleBlockForColumns(column_names); } std::string StorageXDBC::getName() const diff --git a/src/Storages/StorageXDBC.h b/src/Storages/StorageXDBC.h index e1697eca74..0f40c2627e 100644 --- a/src/Storages/StorageXDBC.h +++ b/src/Storages/StorageXDBC.h @@ -17,7 +17,7 @@ class StorageXDBC : public IStorageURLBase public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -48,7 +48,7 @@ private: std::vector> getReadURIParams( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum & processed_stage, @@ -56,13 +56,13 @@ private: std::function getReadPOSTDataCallback( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const override; - Block getHeaderBlock(const Names & column_names, const StorageMetadataPtr & metadata_snapshot) const override; + Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const override; }; } diff --git a/src/Storages/System/IStorageSystemOneBlock.h b/src/Storages/System/IStorageSystemOneBlock.h index e6e564f3c8..a6e8f74a0c 100644 --- a/src/Storages/System/IStorageSystemOneBlock.h +++ b/src/Storages/System/IStorageSystemOneBlock.h @@ -46,17 +46,16 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, size_t /*max_block_size*/, unsigned /*num_streams*/) override { - auto 
virtuals_names_and_types = getVirtuals(); - metadata_snapshot->check(column_names, virtuals_names_and_types, getStorageID()); + storage_snapshot->check(column_names); - Block sample_block = metadata_snapshot->getSampleBlockWithVirtuals(virtuals_names_and_types); + Block sample_block = storage_snapshot->metadata->getSampleBlockWithVirtuals(getVirtuals()); MutableColumns res_columns = sample_block.cloneEmptyColumns(); fillData(res_columns, context, query_info); diff --git a/src/Storages/System/StorageSystemCloudTables.cpp b/src/Storages/System/StorageSystemCloudTables.cpp index e09a845194..aed04ddb8f 100644 --- a/src/Storages/System/StorageSystemCloudTables.cpp +++ b/src/Storages/System/StorageSystemCloudTables.cpp @@ -220,19 +220,19 @@ private: Pipe StorageSystemCloudTables::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t max_block_size, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); /// Create a mask of what columns are needed in the result. NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block res_block; std::vector columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemCloudTables.h b/src/Storages/System/StorageSystemCloudTables.h index c174b8f9ab..6db2bb5b1f 100644 --- a/src/Storages/System/StorageSystemCloudTables.h +++ b/src/Storages/System/StorageSystemCloudTables.h @@ -17,7 +17,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemCnchKafkaTables.cpp b/src/Storages/System/StorageSystemCnchKafkaTables.cpp index af18469794..8bbebc7831 100644 --- a/src/Storages/System/StorageSystemCnchKafkaTables.cpp +++ b/src/Storages/System/StorageSystemCnchKafkaTables.cpp @@ -80,7 +80,7 @@ static ASTPtr getSelectQuery(const SelectQueryInfo & query_info) Pipe StorageSystemCnchKafkaTables::read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -93,8 +93,16 @@ Pipe StorageSystemCnchKafkaTables::read( auto select_query = getSelectQuery(query_info); Block header = materializeBlock(InterpreterSelectQuery(select_query, context, QueryProcessingStage::Complete).getSampleBlock()); + ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory( - header, QueryProcessingStage::Complete, StorageID("system", "kafka_tables"), {}, false, {}); + header, + {}, + storage_snapshot, + QueryProcessingStage::Complete, + StorageID("system", "kafka_tables"), + {}, + false, + {}); /// Set `query_info.cluster` to forward query to all instances of `server cluster` query_info.cluster = context->mockCnchServersCluster(); diff --git a/src/Storages/System/StorageSystemCnchKafkaTables.h b/src/Storages/System/StorageSystemCnchKafkaTables.h index 3402b7347f..194c978143 100644 --- a/src/Storages/System/StorageSystemCnchKafkaTables.h +++ 
b/src/Storages/System/StorageSystemCnchKafkaTables.h @@ -18,7 +18,7 @@ public: Pipe read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/System/StorageSystemCnchMaterializedMySQL.cpp b/src/Storages/System/StorageSystemCnchMaterializedMySQL.cpp index 0d143cb075..9e6c4bab52 100644 --- a/src/Storages/System/StorageSystemCnchMaterializedMySQL.cpp +++ b/src/Storages/System/StorageSystemCnchMaterializedMySQL.cpp @@ -84,7 +84,7 @@ static ASTPtr getSelectQuery(const SelectQueryInfo & query_info) Pipe StorageSystemCnchMaterializedMySQL::read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -98,7 +98,7 @@ Pipe StorageSystemCnchMaterializedMySQL::read( auto select_query = getSelectQuery(query_info); Block header = materializeBlock(InterpreterSelectQuery(select_query, context, QueryProcessingStage::Complete).getSampleBlock()); ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory( - header, QueryProcessingStage::Complete, StorageID("system", "materialized_mysql"), {}, false, {}); + header, {}, {}, QueryProcessingStage::Complete, StorageID("system", "materialized_mysql"), {}, false, {}); /// Set `query_info.cluster` to forward query to all instances of `server cluster` query_info.cluster = context->mockCnchServersCluster(); diff --git a/src/Storages/System/StorageSystemCnchMaterializedMySQL.h b/src/Storages/System/StorageSystemCnchMaterializedMySQL.h index 2e7673fddd..e95421f8f0 100644 --- a/src/Storages/System/StorageSystemCnchMaterializedMySQL.h +++ b/src/Storages/System/StorageSystemCnchMaterializedMySQL.h @@ -1,4 +1,5 @@ #pragma once +#include "Storages/StorageSnapshot.h" #include "config_core.h" #if USE_MYSQL @@ -17,7 +18,7 @@ public: Pipe read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/System/StorageSystemCnchPartsColumns.cpp b/src/Storages/System/StorageSystemCnchPartsColumns.cpp index ba04ccc9f6..547f53653b 100644 --- a/src/Storages/System/StorageSystemCnchPartsColumns.cpp +++ b/src/Storages/System/StorageSystemCnchPartsColumns.cpp @@ -192,8 +192,17 @@ void StorageSystemCnchPartsColumns::fillData(MutableColumns & res_columns, Conte Block header = materializeBlock(InterpreterSelectQuery(ast, context, QueryProcessingStage::Complete).getSampleBlock()); + auto metadata_snapshot = table->getInMemoryMetadataPtr(); + auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot, context); ClusterProxy::SelectStreamFactory stream_factory = ClusterProxy::SelectStreamFactory( - header, QueryProcessingStage::Complete, StorageID{"system", "cnch_parts_columns"}, Scalars{}, false, {}); + header, + {}, + storage_snapshot, + QueryProcessingStage::Complete, + StorageID{"system", "cnch_parts_columns"}, + Scalars{}, + false, + {}); QueryPlan query_plan; ClusterProxy::executeQuery(query_plan, stream_factory, log, ast, context, worker_group); diff --git a/src/Storages/System/StorageSystemCnchPartsInfo.cpp b/src/Storages/System/StorageSystemCnchPartsInfo.cpp index 
b71dfaa080..22cf201aa1 100644 --- a/src/Storages/System/StorageSystemCnchPartsInfo.cpp +++ b/src/Storages/System/StorageSystemCnchPartsInfo.cpp @@ -61,7 +61,7 @@ StorageSystemCnchPartsInfo::StorageSystemCnchPartsInfo(const StorageID & table_i Pipe StorageSystemCnchPartsInfo::read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -86,8 +86,16 @@ Pipe StorageSystemCnchPartsInfo::read( Block header = materializeBlock(InterpreterSelectQuery(ast, context, QueryProcessingStage::Complete).getSampleBlock()); QueryPlan query_plan; Poco::Logger * log = &Poco::Logger::get("SystemPartsInfo"); + ClusterProxy::SelectStreamFactory stream_factory = ClusterProxy::SelectStreamFactory( - header, QueryProcessingStage::Complete, StorageID{"system", "cnch_parts_info_local"}, Scalars{}, false, {}); + header, + {}, + storage_snapshot, + QueryProcessingStage::Complete, + StorageID{"system", "cnch_parts_info_local"}, + Scalars{}, + false, + {}); //set cluster in query_info query_info.cluster = context->mockCnchServersCluster(); diff --git a/src/Storages/System/StorageSystemCnchPartsInfo.h b/src/Storages/System/StorageSystemCnchPartsInfo.h index bc46c89af6..5d6e7bb176 100644 --- a/src/Storages/System/StorageSystemCnchPartsInfo.h +++ b/src/Storages/System/StorageSystemCnchPartsInfo.h @@ -32,7 +32,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemCnchPartsInfoLocal.cpp b/src/Storages/System/StorageSystemCnchPartsInfoLocal.cpp index ba6bc2d78c..6736478a5f 100644 --- a/src/Storages/System/StorageSystemCnchPartsInfoLocal.cpp +++ b/src/Storages/System/StorageSystemCnchPartsInfoLocal.cpp @@ -63,7 +63,7 @@ StorageSystemCnchPartsInfoLocal::StorageSystemCnchPartsInfoLocal(const StorageID Pipe StorageSystemCnchPartsInfoLocal::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -80,7 +80,7 @@ Pipe StorageSystemCnchPartsInfoLocal::read( if (active_tables.empty()) return {}; - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block res_block; NameSet names_set(column_names.begin(), column_names.end()); diff --git a/src/Storages/System/StorageSystemCnchPartsInfoLocal.h b/src/Storages/System/StorageSystemCnchPartsInfoLocal.h index 092e4d987a..9413d5c15b 100644 --- a/src/Storages/System/StorageSystemCnchPartsInfoLocal.h +++ b/src/Storages/System/StorageSystemCnchPartsInfoLocal.h @@ -39,7 +39,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemCnchTableInfo.cpp b/src/Storages/System/StorageSystemCnchTableInfo.cpp index e6b31cd01f..bbdf645b40 100644 --- a/src/Storages/System/StorageSystemCnchTableInfo.cpp +++ b/src/Storages/System/StorageSystemCnchTableInfo.cpp @@ -134,7 +134,7 @@ StorageSystemCnchTableInfo::StorageSystemCnchTableInfo(const 
StorageID & table_i Pipe StorageSystemCnchTableInfo::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -146,7 +146,7 @@ Pipe StorageSystemCnchTableInfo::read( throw Exception("Table system.cnch_table_info only support cnch_server", ErrorCodes::LOGICAL_ERROR); NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block res_block; std::vector columns_mask(sample_block.columns()); for (size_t i = 0, size = columns_mask.size(); i < size; ++i) diff --git a/src/Storages/System/StorageSystemCnchTableInfo.h b/src/Storages/System/StorageSystemCnchTableInfo.h index 84a03e58ee..32395acb6d 100644 --- a/src/Storages/System/StorageSystemCnchTableInfo.h +++ b/src/Storages/System/StorageSystemCnchTableInfo.h @@ -30,7 +30,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemCnchTables.cpp b/src/Storages/System/StorageSystemCnchTables.cpp index 3fab16492f..778bf67433 100644 --- a/src/Storages/System/StorageSystemCnchTables.cpp +++ b/src/Storages/System/StorageSystemCnchTables.cpp @@ -159,7 +159,7 @@ static bool matchAnyPredicate(const std::optional - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block header; std::vector columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemCnchTables.h b/src/Storages/System/StorageSystemCnchTables.h index bde77d03e1..9806adcdc3 100644 --- a/src/Storages/System/StorageSystemCnchTables.h +++ b/src/Storages/System/StorageSystemCnchTables.h @@ -30,7 +30,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemCnchTablesHistory.cpp b/src/Storages/System/StorageSystemCnchTablesHistory.cpp index ce3384d0c5..cd15095c30 100644 --- a/src/Storages/System/StorageSystemCnchTablesHistory.cpp +++ b/src/Storages/System/StorageSystemCnchTablesHistory.cpp @@ -114,7 +114,7 @@ StorageSystemCnchTablesHistory::StorageSystemCnchTablesHistory(const StorageID & Pipe StorageSystemCnchTablesHistory::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -128,7 +128,7 @@ Pipe StorageSystemCnchTablesHistory::read( NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block res_block; std::vector columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemCnchTablesHistory.h b/src/Storages/System/StorageSystemCnchTablesHistory.h index a5198be801..36d66f7e3c 100644 --- a/src/Storages/System/StorageSystemCnchTablesHistory.h +++ b/src/Storages/System/StorageSystemCnchTablesHistory.h @@ -30,7 +30,7 @@ public: Pipe read( const Names &
column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemCnchTrashItemsInfo.cpp b/src/Storages/System/StorageSystemCnchTrashItemsInfo.cpp index 4deeae798d..ea3fd05569 100644 --- a/src/Storages/System/StorageSystemCnchTrashItemsInfo.cpp +++ b/src/Storages/System/StorageSystemCnchTrashItemsInfo.cpp @@ -33,7 +33,7 @@ StorageSystemCnchTrashItemsInfo::StorageSystemCnchTrashItemsInfo(const StorageID Pipe StorageSystemCnchTrashItemsInfo::read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -59,7 +59,7 @@ Pipe StorageSystemCnchTrashItemsInfo::read( QueryPlan query_plan; Poco::Logger * log = &Poco::Logger::get("SystemTrashItemsInfo"); ClusterProxy::SelectStreamFactory stream_factory = ClusterProxy::SelectStreamFactory( - header, QueryProcessingStage::Complete, StorageID{"system", "cnch_trash_items_info_local"}, Scalars{}, false, {}); + header, {}, {}, QueryProcessingStage::Complete, StorageID{"system", "cnch_trash_items_info_local"}, Scalars{}, false, {}); //set cluster in query_info query_info.cluster = context->mockCnchServersCluster(); diff --git a/src/Storages/System/StorageSystemCnchTrashItemsInfo.h b/src/Storages/System/StorageSystemCnchTrashItemsInfo.h index 1d31b87d0d..c9e2df3260 100644 --- a/src/Storages/System/StorageSystemCnchTrashItemsInfo.h +++ b/src/Storages/System/StorageSystemCnchTrashItemsInfo.h @@ -16,7 +16,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemCnchTrashItemsInfoLocal.cpp b/src/Storages/System/StorageSystemCnchTrashItemsInfoLocal.cpp index 20b0754747..52d0d16553 100644 --- a/src/Storages/System/StorageSystemCnchTrashItemsInfoLocal.cpp +++ b/src/Storages/System/StorageSystemCnchTrashItemsInfoLocal.cpp @@ -35,7 +35,7 @@ StorageSystemCnchTrashItemsInfoLocal::StorageSystemCnchTrashItemsInfoLocal(const Pipe StorageSystemCnchTrashItemsInfoLocal::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -52,7 +52,7 @@ Pipe StorageSystemCnchTrashItemsInfoLocal::read( if (active_tables.empty()) return {}; - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block res_block; NameSet names_set(column_names.begin(), column_names.end()); diff --git a/src/Storages/System/StorageSystemCnchTrashItemsInfoLocal.h b/src/Storages/System/StorageSystemCnchTrashItemsInfoLocal.h index 8b25ac642d..a6db6ef6a8 100644 --- a/src/Storages/System/StorageSystemCnchTrashItemsInfoLocal.h +++ b/src/Storages/System/StorageSystemCnchTrashItemsInfoLocal.h @@ -17,7 +17,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git 
a/src/Storages/System/StorageSystemCnchViewTables.cpp b/src/Storages/System/StorageSystemCnchViewTables.cpp index 1fa04995ef..7cc3ac8728 100644 --- a/src/Storages/System/StorageSystemCnchViewTables.cpp +++ b/src/Storages/System/StorageSystemCnchViewTables.cpp @@ -53,7 +53,7 @@ StorageSystemCnchViewTables::StorageSystemCnchViewTables(const StorageID & table Pipe StorageSystemCnchViewTables::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -67,7 +67,7 @@ Pipe StorageSystemCnchViewTables::read( NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block header; std::vector columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemCnchViewTables.h b/src/Storages/System/StorageSystemCnchViewTables.h index 11cfe0b86c..a233b949ac 100644 --- a/src/Storages/System/StorageSystemCnchViewTables.h +++ b/src/Storages/System/StorageSystemCnchViewTables.h @@ -15,7 +15,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemColumns.cpp b/src/Storages/System/StorageSystemColumns.cpp index 37f4f43ca9..a3b45e64a5 100644 --- a/src/Storages/System/StorageSystemColumns.cpp +++ b/src/Storages/System/StorageSystemColumns.cpp @@ -241,20 +241,20 @@ private: Pipe StorageSystemColumns::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t max_block_size, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); /// Create a mask of what columns are needed in the result. 
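/// (Not part of the original patch: names_set plus the columns_mask built below select just the
/// requested subset of sample_block while preserving its column order; the same mask pattern
/// recurs in the other system tables touched by this change.)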
NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block header; std::vector columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemColumns.h b/src/Storages/System/StorageSystemColumns.h index 6a369b5b8c..fd3da1eb07 100644 --- a/src/Storages/System/StorageSystemColumns.h +++ b/src/Storages/System/StorageSystemColumns.h @@ -19,7 +19,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemDMBGJobs.cpp b/src/Storages/System/StorageSystemDMBGJobs.cpp index e1f3407b57..07801690ee 100644 --- a/src/Storages/System/StorageSystemDMBGJobs.cpp +++ b/src/Storages/System/StorageSystemDMBGJobs.cpp @@ -57,7 +57,8 @@ namespace DB CnchBGThreadType::MergeMutate, CnchBGThreadType::Consumer, CnchBGThreadType::Clustering, - CnchBGThreadType::DedupWorker + CnchBGThreadType::DedupWorker, + CnchBGThreadType::ObjectSchemaAssemble }; std::for_each(types.begin(), types.end(), diff --git a/src/Storages/System/StorageSystemDataSkippingIndices.cpp b/src/Storages/System/StorageSystemDataSkippingIndices.cpp index 38641f009e..74ea1d3aa4 100644 --- a/src/Storages/System/StorageSystemDataSkippingIndices.cpp +++ b/src/Storages/System/StorageSystemDataSkippingIndices.cpp @@ -146,18 +146,18 @@ private: Pipe StorageSystemDataSkippingIndices::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /* processed_stage */, size_t max_block_size, unsigned int /* num_streams */) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block header; std::vector columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemDataSkippingIndices.h b/src/Storages/System/StorageSystemDataSkippingIndices.h index 9c9cef6f04..1ba7f4dc38 100644 --- a/src/Storages/System/StorageSystemDataSkippingIndices.h +++ b/src/Storages/System/StorageSystemDataSkippingIndices.h @@ -16,7 +16,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemDetachedParts.cpp b/src/Storages/System/StorageSystemDetachedParts.cpp index 1a9418485e..1021f31d04 100644 --- a/src/Storages/System/StorageSystemDetachedParts.cpp +++ b/src/Storages/System/StorageSystemDetachedParts.cpp @@ -31,7 +31,7 @@ StorageSystemDetachedParts::StorageSystemDetachedParts(const StorageID & table_i Pipe StorageSystemDetachedParts::read( const Names & /* column_names */, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -41,7 +41,7 @@ Pipe StorageSystemDetachedParts::read( StoragesInfoStream stream(query_info, 
context); /// Create the result. - Block block = metadata_snapshot->getSampleBlock(); + Block block = storage_snapshot->metadata->getSampleBlock(); MutableColumns new_columns = block.cloneEmptyColumns(); while (StoragesInfo info = stream.next()) diff --git a/src/Storages/System/StorageSystemDetachedParts.h b/src/Storages/System/StorageSystemDetachedParts.h index ece9d49550..b0993d7c98 100644 --- a/src/Storages/System/StorageSystemDetachedParts.h +++ b/src/Storages/System/StorageSystemDetachedParts.h @@ -25,7 +25,7 @@ protected: Pipe read( const Names & /* column_names */, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/System/StorageSystemDisks.cpp b/src/Storages/System/StorageSystemDisks.cpp index 2d73c70791..b3088dff2d 100644 --- a/src/Storages/System/StorageSystemDisks.cpp +++ b/src/Storages/System/StorageSystemDisks.cpp @@ -50,14 +50,14 @@ StorageSystemDisks::StorageSystemDisks(const StorageID & table_id_) Pipe StorageSystemDisks::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); MutableColumnPtr col_name = ColumnString::create(); MutableColumnPtr col_id = ColumnUInt64::create(); @@ -90,7 +90,7 @@ Pipe StorageSystemDisks::read( UInt64 num_rows = res_columns.at(0)->size(); Chunk chunk(std::move(res_columns), num_rows); - return Pipe(std::make_shared(metadata_snapshot->getSampleBlock(), std::move(chunk))); + return Pipe(std::make_shared(storage_snapshot->metadata->getSampleBlock(), std::move(chunk))); } } diff --git a/src/Storages/System/StorageSystemDisks.h b/src/Storages/System/StorageSystemDisks.h index e2cb4cb2c3..cc9e6ada0b 100644 --- a/src/Storages/System/StorageSystemDisks.h +++ b/src/Storages/System/StorageSystemDisks.h @@ -22,7 +22,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemKafkaTables.cpp b/src/Storages/System/StorageSystemKafkaTables.cpp index 2a186aef67..6413400abb 100644 --- a/src/Storages/System/StorageSystemKafkaTables.cpp +++ b/src/Storages/System/StorageSystemKafkaTables.cpp @@ -63,7 +63,7 @@ StorageSystemKafkaTables::StorageSystemKafkaTables(const StorageID & table_id_) Pipe StorageSystemKafkaTables::read( const Names & /* column_names */, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/System/StorageSystemKafkaTables.h b/src/Storages/System/StorageSystemKafkaTables.h index 1365a22780..0256b58855 100644 --- a/src/Storages/System/StorageSystemKafkaTables.h +++ b/src/Storages/System/StorageSystemKafkaTables.h @@ -37,7 +37,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, 
QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemMaterializedMySQL.cpp b/src/Storages/System/StorageSystemMaterializedMySQL.cpp index c1d57c064b..ab10c0160d 100644 --- a/src/Storages/System/StorageSystemMaterializedMySQL.cpp +++ b/src/Storages/System/StorageSystemMaterializedMySQL.cpp @@ -48,7 +48,7 @@ StorageSystemMaterializedMySQL::StorageSystemMaterializedMySQL(const StorageID & Pipe StorageSystemMaterializedMySQL::read( const Names & /* column_names */, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/System/StorageSystemMaterializedMySQL.h b/src/Storages/System/StorageSystemMaterializedMySQL.h index 367f954161..2488e3564a 100644 --- a/src/Storages/System/StorageSystemMaterializedMySQL.h +++ b/src/Storages/System/StorageSystemMaterializedMySQL.h @@ -1,4 +1,5 @@ #pragma once +#include #include "config_core.h" #if USE_MYSQL @@ -16,7 +17,7 @@ public: Pipe read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/System/StorageSystemNumbers.cpp b/src/Storages/System/StorageSystemNumbers.cpp index 545f2c8be9..bfdfafd57a 100644 --- a/src/Storages/System/StorageSystemNumbers.cpp +++ b/src/Storages/System/StorageSystemNumbers.cpp @@ -124,14 +124,14 @@ StorageSystemNumbers::StorageSystemNumbers(const StorageID & table_id, bool mult Pipe StorageSystemNumbers::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo &, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); if (limit && *limit < max_block_size) { diff --git a/src/Storages/System/StorageSystemNumbers.h b/src/Storages/System/StorageSystemNumbers.h index 02a70e4d38..14c567e6c4 100644 --- a/src/Storages/System/StorageSystemNumbers.h +++ b/src/Storages/System/StorageSystemNumbers.h @@ -31,7 +31,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemOne.cpp b/src/Storages/System/StorageSystemOne.cpp index 7c28f89712..d0c7072b07 100644 --- a/src/Storages/System/StorageSystemOne.cpp +++ b/src/Storages/System/StorageSystemOne.cpp @@ -22,14 +22,14 @@ StorageSystemOne::StorageSystemOne(const StorageID & table_id_) Pipe StorageSystemOne::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo &, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); Block header{ColumnWithTypeAndName( DataTypeUInt8().createColumn(), diff --git a/src/Storages/System/StorageSystemOne.h b/src/Storages/System/StorageSystemOne.h index 9ae9063309..3c810e5f6f 100644 --- 
a/src/Storages/System/StorageSystemOne.h +++ b/src/Storages/System/StorageSystemOne.h @@ -23,7 +23,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index f790d97a08..9514a420ae 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -26,7 +26,7 @@ namespace ErrorCodes extern const int TABLE_IS_DROPPED; } -bool StorageSystemPartsBase::hasStateColumn(const Names & column_names, const StorageMetadataPtr & metadata_snapshot) const +bool StorageSystemPartsBase::hasStateColumn(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const { bool has_state_column = false; Names real_column_names; @@ -41,7 +41,7 @@ bool StorageSystemPartsBase::hasStateColumn(const Names & column_names, const St /// Do not check if only _state column is requested if (!(has_state_column && real_column_names.empty())) - metadata_snapshot->check(real_column_names, {}, getStorageID()); + storage_snapshot->check(real_column_names); return has_state_column; } @@ -235,14 +235,14 @@ StoragesInfo StoragesInfoStream::next() Pipe StorageSystemPartsBase::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - bool has_state_column = hasStateColumn(column_names, metadata_snapshot); + bool has_state_column = hasStateColumn(column_names, storage_snapshot); StoragesInfoStream stream(query_info, context); @@ -250,7 +250,7 @@ Pipe StorageSystemPartsBase::read( NameSet names_set(column_names.begin(), column_names.end()); - Block sample = metadata_snapshot->getSampleBlock(); + Block sample = storage_snapshot->metadata->getSampleBlock(); Block header; std::vector columns_mask(sample.columns()); diff --git a/src/Storages/System/StorageSystemPartsBase.h b/src/Storages/System/StorageSystemPartsBase.h index d6415f9de7..27e18e6991 100644 --- a/src/Storages/System/StorageSystemPartsBase.h +++ b/src/Storages/System/StorageSystemPartsBase.h @@ -58,7 +58,7 @@ class StorageSystemPartsBase : public IStorage public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -70,7 +70,7 @@ public: bool isSystemStorage() const override { return true; } private: - bool hasStateColumn(const Names & column_names, const StorageMetadataPtr & metadata_snapshot) const; + bool hasStateColumn(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const; protected: const FormatSettings format_settings; diff --git a/src/Storages/System/StorageSystemPersistentBGJobStatus.cpp b/src/Storages/System/StorageSystemPersistentBGJobStatus.cpp index f26d4b82c7..564b9cb767 100644 --- a/src/Storages/System/StorageSystemPersistentBGJobStatus.cpp +++ b/src/Storages/System/StorageSystemPersistentBGJobStatus.cpp @@ -44,7 +44,8 @@ namespace DB CnchBGThreadType::Clustering, CnchBGThreadType::PartGC, CnchBGThreadType::Consumer, - CnchBGThreadType::DedupWorker + 
CnchBGThreadType::DedupWorker, + CnchBGThreadType::ObjectSchemaAssemble }; std::shared_ptr catalog = context->getCnchCatalog(); diff --git a/src/Storages/System/StorageSystemReplicas.cpp b/src/Storages/System/StorageSystemReplicas.cpp index 9888b002af..de34e5a81b 100644 --- a/src/Storages/System/StorageSystemReplicas.cpp +++ b/src/Storages/System/StorageSystemReplicas.cpp @@ -58,14 +58,14 @@ StorageSystemReplicas::StorageSystemReplicas(const StorageID & table_id_) Pipe StorageSystemReplicas::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); const auto access = context->getAccess(); const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES); @@ -145,7 +145,7 @@ Pipe StorageSystemReplicas::read( col_engine = filtered_block.getByName("engine").column; } - MutableColumns res_columns = metadata_snapshot->getSampleBlock().cloneEmptyColumns(); + MutableColumns res_columns = storage_snapshot->metadata->getSampleBlock().cloneEmptyColumns(); for (size_t i = 0, size = col_database->size(); i < size; ++i) { @@ -186,8 +186,6 @@ Pipe StorageSystemReplicas::read( res_columns[col_num++]->insert(status.zookeeper_exception); } - Block header = metadata_snapshot->getSampleBlock(); - Columns fin_columns; fin_columns.reserve(res_columns.size()); @@ -201,7 +199,7 @@ Pipe StorageSystemReplicas::read( UInt64 num_rows = fin_columns.at(0)->size(); Chunk chunk(std::move(fin_columns), num_rows); - return Pipe(std::make_shared(metadata_snapshot->getSampleBlock(), std::move(chunk))); + return Pipe(std::make_shared(storage_snapshot->metadata->getSampleBlock(), std::move(chunk))); } diff --git a/src/Storages/System/StorageSystemReplicas.h b/src/Storages/System/StorageSystemReplicas.h index ffe12544b5..1a253f49e7 100644 --- a/src/Storages/System/StorageSystemReplicas.h +++ b/src/Storages/System/StorageSystemReplicas.h @@ -20,7 +20,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemStoragePolicies.cpp b/src/Storages/System/StorageSystemStoragePolicies.cpp index 48dfadd2b3..2333f8cf0f 100644 --- a/src/Storages/System/StorageSystemStoragePolicies.cpp +++ b/src/Storages/System/StorageSystemStoragePolicies.cpp @@ -37,14 +37,14 @@ StorageSystemStoragePolicies::StorageSystemStoragePolicies(const StorageID & tab Pipe StorageSystemStoragePolicies::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); MutableColumnPtr col_policy_name = ColumnString::create(); MutableColumnPtr col_volume_name = ColumnString::create(); @@ -88,7 +88,7 @@ Pipe StorageSystemStoragePolicies::read( UInt64 num_rows = res_columns.at(0)->size(); Chunk chunk(std::move(res_columns), num_rows); - 
return Pipe(std::make_shared(metadata_snapshot->getSampleBlock(), std::move(chunk))); + return Pipe(std::make_shared(storage_snapshot->metadata->getSampleBlock(), std::move(chunk))); } } diff --git a/src/Storages/System/StorageSystemStoragePolicies.h b/src/Storages/System/StorageSystemStoragePolicies.h index bb483e100e..3ec0adf9b3 100644 --- a/src/Storages/System/StorageSystemStoragePolicies.h +++ b/src/Storages/System/StorageSystemStoragePolicies.h @@ -22,7 +22,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index 0f2425f6ab..185d546bf5 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -543,20 +543,20 @@ private: Pipe StorageSystemTables::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t max_block_size, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); /// Create a mask of what columns are needed in the result. NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block res_block; std::vector columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemTables.h b/src/Storages/System/StorageSystemTables.h index 83bd7c0853..87a28dfbe8 100644 --- a/src/Storages/System/StorageSystemTables.h +++ b/src/Storages/System/StorageSystemTables.h @@ -20,7 +20,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemZeros.cpp b/src/Storages/System/StorageSystemZeros.cpp index d1456d7268..93f00d62e7 100644 --- a/src/Storages/System/StorageSystemZeros.cpp +++ b/src/Storages/System/StorageSystemZeros.cpp @@ -92,14 +92,14 @@ StorageSystemZeros::StorageSystemZeros(const StorageID & table_id_, bool multith Pipe StorageSystemZeros::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo &, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); bool use_multiple_streams = multithreaded; diff --git a/src/Storages/System/StorageSystemZeros.h b/src/Storages/System/StorageSystemZeros.h index 1f10c62045..71914861e5 100644 --- a/src/Storages/System/StorageSystemZeros.h +++ b/src/Storages/System/StorageSystemZeros.h @@ -22,7 +22,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/getStructureOfRemoteTable.cpp 
b/src/Storages/getStructureOfRemoteTable.cpp index 73abfa10bf..4f646094a4 100644 --- a/src/Storages/getStructureOfRemoteTable.cpp +++ b/src/Storages/getStructureOfRemoteTable.cpp @@ -173,4 +173,69 @@ ColumnsDescription getStructureOfRemoteTable( ErrorCodes::NO_REMOTE_SHARD_AVAILABLE); } +ColumnsDescriptionByShardNum getExtendedObjectsOfRemoteTables( + const Cluster & cluster, + const StorageID & remote_table_id, + const ColumnsDescription & storage_columns, + ContextPtr context) +{ + const auto & shards_info = cluster.getShardsInfo(); + auto query = "DESC TABLE " + remote_table_id.getFullTableName(); + + auto new_context = ClusterProxy::updateSettingsForCluster(cluster, context, context->getSettingsRef()); + new_context->setSetting("describe_extend_object_types", true); + + /// Expect only needed columns from the result of DESC TABLE. + Block sample_block + { + { ColumnString::create(), std::make_shared(), "name" }, + { ColumnString::create(), std::make_shared(), "type" }, + }; + + auto execute_query_on_shard = [&](const auto & shard_info) + { + /// Execute remote query without restrictions (because it's not real user query, but part of implementation) + RemoteQueryExecutor executor(shard_info.pool, query, sample_block, new_context); + + executor.setPoolMode(PoolMode::GET_ONE); + executor.setMainTable(remote_table_id); + + ColumnsDescription res; + while (auto block = executor.read()) + { + const auto & name_col = *block.getByName("name").column; + const auto & type_col = *block.getByName("type").column; + + size_t size = name_col.size(); + for (size_t i = 0; i < size; ++i) + { + auto name = get(name_col[i]); + auto type_name = get(type_col[i]); + + auto storage_column = storage_columns.tryGetPhysical(name); + if (storage_column && isObject(storage_column->type)) + res.add(ColumnDescription(std::move(name), DataTypeFactory::instance().get(type_name))); + } + } + + return res; + }; + + ColumnsDescriptionByShardNum columns; + for (const auto & shard_info : shards_info) + { + auto res = execute_query_on_shard(shard_info); + + /// Expect at least some columns. + /// This is a hack to handle the empty block case returned by Connection when skip_unavailable_shards is set. + if (!res.empty()) + columns.emplace(shard_info.shard_num, std::move(res)); + } + + if (columns.empty()) + throw NetException("All attempts to get table structure failed", ErrorCodes::NO_REMOTE_SHARD_AVAILABLE); + + return columns; +} + } diff --git a/src/Storages/getStructureOfRemoteTable.h b/src/Storages/getStructureOfRemoteTable.h index 3f77236c75..62f93dccf1 100644 --- a/src/Storages/getStructureOfRemoteTable.h +++ b/src/Storages/getStructureOfRemoteTable.h @@ -8,6 +8,7 @@ namespace DB { + class Context; struct StorageID; @@ -19,4 +20,14 @@ ColumnsDescription getStructureOfRemoteTable( ContextPtr context, const ASTPtr & table_func_ptr = nullptr); + +using ColumnsDescriptionByShardNum = std::unordered_map; + +/// Returns descriptions of columns of type Object for each shard. 
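+/// (Added note: the result is keyed by shard number because each shard may have materialized a
+/// different concrete set of subcolumns for the same Object column.)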
+ColumnsDescriptionByShardNum getExtendedObjectsOfRemoteTables( + const Cluster & cluster, + const StorageID & remote_table_id, + const ColumnsDescription & storage_columns, + ContextPtr context); + } diff --git a/src/Storages/tests/gtest_storage_log.cpp b/src/Storages/tests/gtest_storage_log.cpp index e8057c54a0..8e7eb4c59a 100644 --- a/src/Storages/tests/gtest_storage_log.cpp +++ b/src/Storages/tests/gtest_storage_log.cpp @@ -112,16 +112,17 @@ std::string readData(DB::StoragePtr & table, const DB::ContextPtr context) { using namespace DB; auto metadata_snapshot = table->getInMemoryMetadataPtr(); + auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot, context); Names column_names; column_names.push_back("a"); SelectQueryInfo query_info; QueryProcessingStage::Enum stage = table->getQueryProcessingStage( - context, QueryProcessingStage::Complete, metadata_snapshot, query_info); + context, QueryProcessingStage::Complete, storage_snapshot, query_info); QueryPipeline pipeline; - pipeline.init(table->read(column_names, metadata_snapshot, query_info, context, stage, 8192, 1)); + pipeline.init(table->read(column_names, storage_snapshot, query_info, context, stage, 8192, 1)); BlockInputStreamPtr in = std::make_shared(std::move(pipeline)); Block sample; diff --git a/src/Storages/tests/gtest_transform_query_for_external_database.cpp b/src/Storages/tests/gtest_transform_query_for_external_database.cpp index ceae6a8ae4..bdd454bd7a 100644 --- a/src/Storages/tests/gtest_transform_query_for_external_database.cpp +++ b/src/Storages/tests/gtest_transform_query_for_external_database.cpp @@ -104,7 +104,7 @@ static void check( SelectQueryInfo query_info; SelectQueryOptions select_options; query_info.syntax_analyzer_result - = TreeRewriter(state.context).analyzeSelect(ast, state.getColumns(), select_options, state.getTables(table_num)); + = TreeRewriter(state.context).analyzeSelect(ast, DB::TreeRewriterResult(state.getColumns()), select_options, state.getTables(table_num)); query_info.query = ast; std::string transformed_query = transformQueryForExternalDatabase( query_info, state.getColumns(), IdentifierQuotingStyle::DoubleQuotes, "test", "table", state.context); diff --git a/src/Transaction/Actions/InsertAction.cpp b/src/Transaction/Actions/InsertAction.cpp index fb0c0f95fa..2f30fe7260 100644 --- a/src/Transaction/Actions/InsertAction.cpp +++ b/src/Transaction/Actions/InsertAction.cpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace DB @@ -77,6 +78,9 @@ void InsertAction::executeV2() auto catalog = global_context.getCnchCatalog(); catalog->writeParts(table, txn_id, Catalog::CommitItems{{parts.begin(), parts.end()}, delete_bitmaps, {staged_parts.begin(), staged_parts.end()}}, false, /*preallocate_mode=*/ false); + + if (table && hasDynamicSubcolumns(table->getInMemoryMetadata().columns)) + catalog->appendObjectPartialSchema(table, txn_id, parts); } /// Post progressing @@ -89,6 +93,10 @@ void InsertAction::postCommit(TxnTimestamp commit_time) for (auto & part : parts) part->commit_time = commit_time; + // set commit flag for dynamic object column schema + if (table && hasDynamicSubcolumns(table->getInMemoryMetadata().getColumns())) + global_context.getCnchCatalog()->commitObjectPartialSchema(txn_id); + ServerPartLog::addNewParts(getContext(), ServerPartLogElement::INSERT_PART, parts, txn_id, false); } @@ -98,6 +106,10 @@ void InsertAction::abort() // skip part cache to avoid blocking by write lock of part cache for long time global_context.getCnchCatalog()->clearParts(table, 
Catalog::CommitItems{{parts.begin(), parts.end()}, delete_bitmaps, {staged_parts.begin(), staged_parts.end()}}); + // mark the partially written dynamic object column schema as aborted + if (table && hasDynamicSubcolumns(table->getInMemoryMetadata().getColumns())) + global_context.getCnchCatalog()->abortObjectPartialSchema(txn_id); + ServerPartLog::addNewParts(getContext(), ServerPartLogElement::INSERT_PART, parts, txn_id, true); } diff --git a/src/WorkerTasks/MergeTreeDataMerger.cpp b/src/WorkerTasks/MergeTreeDataMerger.cpp index 01c17854ab..1ced66cee2 100644 --- a/src/WorkerTasks/MergeTreeDataMerger.cpp +++ b/src/WorkerTasks/MergeTreeDataMerger.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -37,8 +37,11 @@ #include #include #include -#include #include +#include +#include +#include +#include namespace ProfileEvents { @@ -131,7 +134,7 @@ MergeTreeDataMerger::~MergeTreeDataMerger() } void MergeTreeDataMerger::prepareColumnNamesAndTypes( - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const MergeTreeMetaBase::MergingParams & merging_params, Names & all_column_names, Names & gathering_column_names, @@ -140,9 +143,12 @@ void MergeTreeDataMerger::prepareColumnNamesAndTypes( NamesAndTypesList & gathering_columns, NamesAndTypesList & merging_columns) { + auto metadata_snapshot = storage_snapshot->metadata; all_column_names = metadata_snapshot->getColumns().getNamesOfPhysical(); storage_columns = metadata_snapshot->getColumns().getAllPhysical(); + extendObjectColumns(storage_columns, storage_snapshot->object_columns, false); + Names sort_key_columns_vec = metadata_snapshot->getSortingKey().expression->getRequiredColumns(); std::set key_columns(sort_key_columns_vec.cbegin(), sort_key_columns_vec.cend()); for (const auto & index : metadata_snapshot->getSecondaryIndices()) @@ -314,8 +320,10 @@ MergeTreeMutableDataPartPtr MergeTreeDataMerger::mergePartsToTemporaryPartImpl( NamesAndTypesList gathering_columns; NamesAndTypesList merging_columns; + auto storage_snapshot = data.getStorageSnapshot(metadata_snapshot, context); + prepareColumnNamesAndTypes( - metadata_snapshot, + storage_snapshot, merging_params, all_column_names, gathering_column_names, @@ -424,7 +432,7 @@ MergeTreeMutableDataPartPtr MergeTreeDataMerger::mergePartsToTemporaryPartImpl( auto input = std::make_unique( data, - metadata_snapshot, + storage_snapshot, part, merging_column_names, read_with_direct_io, @@ -668,7 +676,7 @@ MergeTreeMutableDataPartPtr MergeTreeDataMerger::mergePartsToTemporaryPartImpl( auto column_part_source = std::make_shared( data, - metadata_snapshot, + storage_snapshot, source_data_parts[part_num], Names{column_name}, read_with_direct_io, diff --git a/src/WorkerTasks/MergeTreeDataMerger.h b/src/WorkerTasks/MergeTreeDataMerger.h index 4bfc3b93b0..0483e1da78 100644 --- a/src/WorkerTasks/MergeTreeDataMerger.h +++ b/src/WorkerTasks/MergeTreeDataMerger.h @@ -78,7 +78,7 @@ private: const IMergeTreeDataPart * parent_part); void prepareColumnNamesAndTypes( - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const MergeTreeMetaBase::MergingParams & merging_params, Names & all_column_names, Names & gathering_column_names, diff --git a/src/WorkerTasks/MergeTreeDataMutator.cpp b/src/WorkerTasks/MergeTreeDataMutator.cpp index 3ce211315f..b38b841cbc 100644 --- a/src/WorkerTasks/MergeTreeDataMutator.cpp +++ b/src/WorkerTasks/MergeTreeDataMutator.cpp @@ -547,8 +547,7 @@ NameToNameVector
diff --git a/src/WorkerTasks/MergeTreeDataMutator.cpp b/src/WorkerTasks/MergeTreeDataMutator.cpp
index 3ce211315f..b38b841cbc 100644
--- a/src/WorkerTasks/MergeTreeDataMutator.cpp
+++ b/src/WorkerTasks/MergeTreeDataMutator.cpp
@@ -547,8 +547,7 @@ NameToNameVector MergeTreeDataMutator::collectFilesForRenames(
             [&](const ISerialization::SubstreamPath & substream_path)
             {
                 ++stream_counts[ISerialization::getFileNameForStream(column, substream_path)];
-            },
-            {});
+            });
     }
 
     NameToNameVector rename_vector;
diff --git a/src/WorkerTasks/MergeTreeDataReclusterMutator.cpp b/src/WorkerTasks/MergeTreeDataReclusterMutator.cpp
index 3f532914a6..a0866881b0 100644
--- a/src/WorkerTasks/MergeTreeDataReclusterMutator.cpp
+++ b/src/WorkerTasks/MergeTreeDataReclusterMutator.cpp
@@ -57,7 +57,8 @@ MergeTreeMutableDataPartsVector MergeTreeDataReclusterMutator::executeOnSinglePart(
     MergeTreeMutableDataPartsVector res;
     auto metadata_snapshot = data.getInMemoryMetadataPtr();
     auto column_names = metadata_snapshot->getColumns().getNamesOfPhysical();
-    auto source = std::make_shared(data, metadata_snapshot, part, column_names, false, true);
+    auto storage_snapshot = data.getStorageSnapshot(metadata_snapshot, context);
+    auto source = std::make_shared(data, storage_snapshot, part, column_names, false, true);
     QueryPipeline pipeline;
     pipeline.init(Pipe(std::move(source)));
     pipeline.setMaxThreads(1);
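The recurring shape across these call sites (the merger, the recluster mutator, and the storage tests earlier): a reader can no longer be built from a bare metadata snapshot, because dynamic subcolumns exist only in the data, so the caller first wraps the metadata into a storage snapshot via getStorageSnapshot and hands that through. A reduced sketch of the pattern follows; Storage, MetadataSnapshot and StorageSnapshot are placeholders, not the real IStorage interface.

// Sketch of the snapshot-handoff pattern adopted in the read paths above.
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct MetadataSnapshot { std::vector<std::string> physical_columns; };

struct StorageSnapshot
{
    std::shared_ptr<const MetadataSnapshot> metadata;
    // Concrete types of Object columns, resolved from the parts at snapshot time.
    std::vector<std::string> object_columns;
};

struct Storage
{
    std::shared_ptr<const MetadataSnapshot> getInMemoryMetadataPtr() const
    {
        return std::make_shared<MetadataSnapshot>(MetadataSnapshot{{"id", "data"}});
    }

    // The snapshot pins both the table schema and the per-part object schema,
    // so concurrent inserts cannot change what this reader sees mid-query.
    std::shared_ptr<const StorageSnapshot> getStorageSnapshot(
        std::shared_ptr<const MetadataSnapshot> metadata) const
    {
        return std::make_shared<StorageSnapshot>(
            StorageSnapshot{std::move(metadata), {"data Tuple(k5 String)"}});
    }
};

void read(const std::shared_ptr<const StorageSnapshot> & snapshot)
{
    for (const auto & name : snapshot->metadata->physical_columns)
        std::cout << "reading column " << name << '\n';
}

int main()
{
    Storage table;
    auto metadata_snapshot = table.getInMemoryMetadataPtr();
    auto storage_snapshot = table.getStorageSnapshot(metadata_snapshot); // new pattern
    read(storage_snapshot); // sources receive the snapshot, not the raw metadata
}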
diff --git a/tests/queries/4_cnch_stateless/01825_type_json_1.reference b/tests/queries/4_cnch_stateless/01825_type_json_1.reference
new file mode 100644
index 0000000000..d76ca42cfd
--- /dev/null
+++ b/tests/queries/4_cnch_stateless/01825_type_json_1.reference
@@ -0,0 +1,3 @@
+1 aa bb c
+2 ee ff
+3 foo
diff --git a/tests/queries/4_cnch_stateless/01825_type_json_1.sql b/tests/queries/4_cnch_stateless/01825_type_json_1.sql
new file mode 100644
index 0000000000..ab5672be97
--- /dev/null
+++ b/tests/queries/4_cnch_stateless/01825_type_json_1.sql
@@ -0,0 +1,84 @@
+-- Tags: no-fasttest
+use test;
+set enable_optimizer = 0;
+SET allow_experimental_object_type = 1;
+
+DROP TABLE IF EXISTS t_json;
+
+CREATE TABLE t_json(id UInt64, data Object('JSON'))
+ENGINE = CnchMergeTree ORDER BY tuple();
+
+INSERT INTO t_json FORMAT JSONEachRow {"id": 1, "data": {"k1": "aa", "k2": {"k3": "bb", "k4": "c"}}} {"id": 2, "data": {"k1": "ee", "k5": "ff"}};
+INSERT INTO t_json FORMAT JSONEachRow {"id": 3, "data": {"k5":"foo"}};
+
+SELECT id, data.k1, data.k2.k3, data.k2.k4, data.k5 FROM t_json ORDER BY id;
+
+-- SELECT name, column, type
+-- FROM system.parts_columns
+-- WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data'
+-- ORDER BY name;
+
+-- SYSTEM START MERGES t_json;
+
+-- OPTIMIZE TABLE t_json FINAL;
+
+-- SELECT name, column, type
+-- FROM system.parts_columns
+-- WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data'
+-- ORDER BY name;
+
+-- SELECT '============';
+-- TRUNCATE TABLE t_json;
+
+-- INSERT INTO t_json FORMAT JSONEachRow {"id": 1, "data": {"k1":[{"k2":"aaa","k3":[{"k4":"bbb"},{"k4":"ccc"}]},{"k2":"ddd","k3":[{"k4":"eee"},{"k4":"fff"}]}]}};
+-- SELECT id, data.k1.k2, data.k1.k3.k4 FROM t_json ORDER BY id;
+
+-- SELECT name, column, type
+-- FROM system.parts_columns
+-- WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data'
+-- ORDER BY name;
+
+-- SELECT '============';
+-- TRUNCATE TABLE t_json;
+
+-- SYSTEM STOP MERGES t_json;
+
+-- INSERT INTO t_json FORMAT JSONEachRow {"id": 1, "data": {"name": "a", "value": 42 }}, {"id": 2, "data": {"name": "b", "value": 4200 }};
+
+-- SELECT id, data.name, data.value FROM t_json ORDER BY id;
+-- SELECT sum(data.value) FROM t_json;
+
+-- SELECT name, column, type
+-- FROM system.parts_columns
+-- WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data'
+-- ORDER BY name;
+
+-- INSERT INTO t_json FORMAT JSONEachRow {"id": 3, "data": {"name": "a", "value": 42.123 }};
+
+-- SELECT id, data.name, data.value FROM t_json ORDER BY id;
+
+-- SELECT name, column, type
+-- FROM system.parts_columns
+-- WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data'
+-- ORDER BY name;
+
+-- INSERT INTO t_json FORMAT JSONEachRow {"id": 4, "data": {"name": "a", "value": "some" }};
+
+-- SELECT id, data.name, data.value FROM t_json ORDER BY id;
+
+-- SELECT name, column, type
+-- FROM system.parts_columns
+-- WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data'
+-- ORDER BY name;
+
+-- SYSTEM START MERGES t_json;
+-- OPTIMIZE TABLE t_json FINAL;
+
+-- SELECT name, column, type
+-- FROM system.parts_columns
+-- WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data'
+-- ORDER BY name;
+
+DROP TABLE IF EXISTS t_json;
+
+-- CREATE TABLE t_json(id UInt64, data Object('JSON')) ENGINE = Log; -- { serverError 44 }
diff --git a/tests/queries/4_cnch_stateless_no_tenant/45008_type_list_for_optimizer.sql b/tests/queries/4_cnch_stateless_no_tenant/45008_type_list_for_optimizer.sql
index 31bc700c00..ee06dc1722 100644
--- a/tests/queries/4_cnch_stateless_no_tenant/45008_type_list_for_optimizer.sql
+++ b/tests/queries/4_cnch_stateless_no_tenant/45008_type_list_for_optimizer.sql
@@ -34,6 +34,9 @@
 insert into optimizer_unsupported values ('AggregateFunction')('SimpleAggregateFunction');
 insert into optimizer_unsupported values ('Map')('Set')('Nested')('Nothing')('Array')('BitMap64')('BitMap32')('Tuple')('SketchBinary')('HllSketchBinary')('Base64ToBinary');
 -- graph type is not supported
 insert into optimizer_unsupported values ('MultiPolygon')('Point')('Polygon')('Ring');
+
+-- dynamic object type is not supported
+insert into optimizer_unsupported values ('JSON')('Object');
 insert into optimizer_unsupported values ('BigString');
 
 select '*** the following types are newly added, please contact optimizer team to determine if optimizer should support them';
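Both the InsertAction hooks earlier in this patch and the optimizer blacklist above gate on the same question: does the table contain a column with a dynamic Object type? A minimal stand-in for that kind of check, assuming a simplified column model; this hasDynamicSubcolumns is illustrative only and is not the project's implementation.

// Sketch: the type predicate that gates the partial-schema hooks and the
// optimizer's unsupported-type list. Simplified stand-in, not the real code.
#include <iostream>
#include <string>
#include <vector>

struct ColumnDescription { std::string name; std::string type; };

bool isDynamicObjectType(const std::string & type)
{
    // Object('JSON') and its JSON alias have no fixed physical layout.
    return type.rfind("Object", 0) == 0 || type == "JSON";
}

bool hasDynamicSubcolumns(const std::vector<ColumnDescription> & columns)
{
    for (const auto & column : columns)
        if (isDynamicObjectType(column.type))
            return true;
    return false;
}

int main()
{
    std::vector<ColumnDescription> columns{{"id", "UInt64"}, {"data", "Object('JSON')"}};
    std::cout << std::boolalpha << hasDynamicSubcolumns(columns) << '\n'; // true
}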