Merge branch 'cnch_hualloc_test' into 'cnch-dev'

feat(clickhousech@m-3348167131): cnch-hualloc

See merge request dp/ClickHouse!20751
# Conflicts:
#	build_bin.sh
#	src/Common/config.h.in
#	src/Storages/System/attachSystemTables.cpp
This commit is contained in:
fanqi1909 2024-05-13 14:26:20 +08:00
parent 076fec9944
commit 981165eb0f
29 changed files with 3053 additions and 11 deletions

View File

@ -20,6 +20,7 @@ export CMAKE_FLAGS="-DCMAKE_INSTALL_PREFIX=../output -DCMAKE_BUILD_TYPE=${CMAKE_
CMAKE_FLAGS="-DCMAKE_INSTALL_PREFIX=../output ${CMAKE_FLAGS}"
CMAKE_FLAGS="-DCMAKE_BUILD_TYPE=${CUSTOM_CMAKE_BUILD_TYPE:-RelWithDebInfo} $CMAKE_FLAGS"
CMAKE_FLAGS="-DENABLE_BREAKPAD=ON $CMAKE_FLAGS" # enable minidump
CMAKE_FLAGS="-DENABLE_HUALLOC=OFF ${CMAKE_FLAGS}"
[[ -n "$CUSTOM_SANITIZE" ]] && CMAKE_FLAGS="-DSANITIZE=$CUSTOM_SANITIZE $CMAKE_FLAGS"
[[ -n "$CUSTOM_MAX_LINKING_JOBS" ]] && CMAKE_FLAGS="-DPARALLEL_LINK_JOBS=${CUSTOM_MAX_LINKING_JOBS} ${CMAKE_FLAGS}"
[[ -n "$CUSTOM_MAX_COMPILE_JOBS" ]] && CMAKE_FLAGS="-DPARALLEL_COMPILE_JOBS=${CUSTOM_MAX_COMPILE_JOBS} ${CMAKE_FLAGS}"

View File

@ -436,3 +436,7 @@ endif()
if (USE_TSQUERY)
add_subdirectory(TSQuery-cmake)
endif()
# Pull in the hualloc allocator wrapper when the feature is enabled.
if (ENABLE_HUALLOC)
    add_subdirectory(hualloc-cmake)
endif ()

View File

@ -0,0 +1,18 @@
# hualloc does not work under sanitizers and is only supported on
# x86_64/aarch64/ppc64le/riscv64 Linux/FreeBSD and RelWithDebInfo macOS builds.
if (SANITIZE OR NOT (
    ((OS_LINUX OR OS_FREEBSD) AND (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE OR ARCH_RISCV64)) OR
    (OS_DARWIN AND (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO"))))
    if (ENABLE_HUALLOC)
        message (${RECONFIGURE_MESSAGE_LEVEL}
            "hualloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64, or ppc64le Linux or FreeBSD builds and RelWithDebInfo macOS builds.")
    endif ()
    set (ENABLE_HUALLOC OFF)
else ()
    option (ENABLE_HUALLOC "Enable hualloc allocator" ${ENABLE_LIBRARIES})
endif ()

if (ENABLE_HUALLOC)
    # STATUS keyword so this goes to stdout as a configure note instead of stderr.
    message (STATUS "Enable hualloc allocator")
    add_library (hualloc "${ClickHouse_SOURCE_DIR}/contrib/hualloc/hu_alloc.cpp")
endif ()

View File

@ -0,0 +1,323 @@
#include "hu_alloc.h"
#include <stdio.h>
#include <vector>
#include <set>
#include <string>
#include <iostream>
#include <sstream>
/// calloc-compatible wrapper over hu_alloc: allocates n * elem_size bytes
/// zero-filled. Returns nullptr on multiplication overflow or allocation failure.
void* hu_calloc(size_t n, size_t elem_size)
{
    // Reject requests whose byte count overflows size_t.
    const size_t total = n * elem_size;
    if (elem_size != 0 && total / elem_size != n)
        return nullptr;

    void* buf = hu_alloc(total);
    if (buf != nullptr)
        memset(buf, 0, total);
    return buf;
}
/// Aligned allocation wrapper. Alignments beyond one page are unsupported
/// and treated as fatal.
/// NOTE(review): relies on hu_alloc returning blocks of >= `align` bytes at
/// addresses aligned to `align` -- confirm against hu_alloc.h.
void* hu_alloc_aligned(size_t size, size_t align)
{
    if (align > PAGE_SIZE)
        abort();
    // Round the request up to the alignment so the block is large enough.
    const size_t request = (size < align) ? align : size;
    return hu_alloc(request);
}
/// realloc-compatible wrapper over hu_alloc/hu_free.
/// nullptr input behaves like malloc; new_size == 0 behaves like free.
/// On failure the old block is left untouched and nullptr is returned.
void* hu_realloc(void* old_ptr, size_t new_size)
{
    // realloc(nullptr, n) == malloc(n)
    if (old_ptr == nullptr)
        return hu_alloc(new_size);

    // realloc(p, 0) == free(p)
    if (new_size == 0)
    {
        hu_free(old_ptr);
        return nullptr;
    }

    void* fresh = hu_alloc(new_size);
    if (fresh == nullptr)
        return nullptr;

    // Copy the common prefix, then release the old block.
    const size_t old_size = hu_getsize(old_ptr);
    const size_t to_copy = (old_size < new_size) ? old_size : new_size;
    memcpy(fresh, old_ptr, to_copy);
    hu_free(old_ptr);
    return fresh;
}
/// Thin out-of-line wrapper over hu_free (implementation in hu_alloc.h),
/// so callers don't need the full header.
void hu_free_w(void *p)
{
    hu_free(p);
}
/// Thin out-of-line wrapper over hu_alloc.
void* hu_alloc_w(size_t sz)
{
    return hu_alloc(sz);
}
/// Ensures hualloc's global state is initialized -- presumably idempotent;
/// TODO confirm against hu_check_init in hu_alloc.h.
void hu_check_init_w()
{
    hu_check_init();
}
/// pthread entry point: periodically returns cached memory to the OS once
/// the cache exceeds twice the target. `args` points to the cache target
/// (bytes to keep cached).
/// NOTE(review): the pointed-to value must outlive this thread -- confirm
/// the caller does not pass the address of a short-lived local.
void* ReclaimThread(void *args)
{
    // keep & max can be separate for large & segment spaces
    // Reclaim interval in seconds, overridable via environment.
    // atoi never throws (the previous try/catch was dead); a value of 0 or
    // unparsable text disables reclaiming.
    int sleep = 3;
    const char * sleep_second = std::getenv("HUALLOC_CLAIM_INTERVAL");
    if (sleep_second && std::strlen(sleep_second) > 0)
        sleep = atoi(sleep_second);
    yint cached = *(yint *) args;
    if (sleep > 0)
    {
        for (;;)
        {
            Sleep(sleep * 1000);
            ui64 total_cached = LargeCached() + SegmentCached();
            // Skip the reclaim pass while the cache is below 2x the target.
            if (total_cached < cached * 2)
                continue;
            LargeReclaim(cached, ReclaimMaxReclaim);
            SegmentReclaim(cached, ReclaimMaxReclaim);
        }
    }
    // Reached only when reclaiming is disabled; the original fell off the
    // end of a non-void function here, which is undefined behavior.
    return nullptr;
}
/// Bytes currently cached in the large-block space: blocks that are free
/// but still committed. Returns 0 until the allocator is initialized.
ui64 LargeCached()
{
    if (AllocatorIsInitialized != 1)
        return 0;
    ui64 total = 0;
    for (yint g = 0; g < LARGE_GROUP_COUNT; ++g)
    {
        TLargeGroupInfo &gg = LargeGroupInfo[g];
        // A block counts as cached when it is both free and committed.
        total += _mm_popcnt_u64(gg.FreeBlockMask & gg.CommitedMask);
    }
    return total * LARGE_BLOCK_SIZE;
}
/// Bytes currently reclaimable in the segment space.
/// Returns 0 until the allocator is initialized.
ui64 SegmentCached()
{
    if (AllocatorIsInitialized != 1)
        return 0;
    ui64 total = 0;
    for (yint g = 0; g < SEGMENT_GROUP_COUNT; ++g)
    {
        TSegmentGroupInfo &gg = SegmentGroupInfo[g];
        total += _mm_popcnt_u64(gg.GoodForReclaimMask);
    }
    return total * SEGMENT_SIZE;
}
/// Giant-allocation counter (GAllocCnt) -- unit defined in hu_alloc.h;
/// presumably a count of allocations, TODO confirm.
ui64 HugeAlloc()
{
    return GAllocCnt.load();
}
/// Cumulative reclaim counter for the large-block space.
ui64 LargeReclaimed()
{
    return LargeReclaimCnt.load();
}
/// Cumulative reclaim counter for the segment space.
ui64 SegmentReclaimed()
{
    return SegmentReclaimCnt.load();
}
/* mbind Policies (mirror of <linux/mempolicy.h>, redefined locally to
   avoid a kernel-header dependency) */
#define MPOL_DEFAULT 0
#define MPOL_PREFERRED 1
#define MPOL_BIND 2
#define MPOL_INTERLEAVE 3
#define MPOL_LOCAL 4
#define MPOL_MAX 5
/// NOTE(review): 237 is the x86_64 syscall number for mbind(2); it differs
/// on other architectures -- confirm this path is only built for x86_64.
#define __NR_mbind 237
/// Minimal mbind(2) wrapper so we don't have to link against libnuma.
static long mbind_bytedance(void *start, unsigned long len, int mode,
    const unsigned long *nmask, unsigned long maxnode, unsigned flags)
{
    return syscall(__NR_mbind, (long)start, len, mode, (long)nmask,
        maxnode, flags);
}
bool hualloc_use_numa_info = false;
bool hualloc_enable_mbind = false;
int hualloc_mbind_mode = MPOL_BIND;
void (*hualloc_logger)(std::string) = nullptr;
/// Route a diagnostic line to the registered logger callback, falling back
/// to stdout when no logger has been installed yet (early startup).
void hualloc_log(std::string s)
{
    if (hualloc_logger)
        hualloc_logger(s);
    else
        printf("%s", s.c_str()); /// never pass message text as the format string
}
size_t hualloc_numa_node_count = 0;
std::unordered_map<size_t, size_t> hualloc_cpu_index_to_numa_node;
size_t hualloc_used_numa_node_count = 0;
std::unordered_map<size_t, size_t> hualloc_used_numa_nodes_to_mem_index; // node index -> mem index for node
/// Split [mem, mem + size) into per-node contiguous slices (slice width is
/// a multiple of `alignment`) and mbind() each slice to its NUMA node.
/// Logs every binding attempt with its mbind return code and errno.
/// NOTE(review): the slice width divides by hualloc_numa_node_count while
/// mem_index only enumerates the nodes this process uses -- when the
/// process is restricted to a node subset part of the range stays unbound;
/// confirm that is intended.
void mbind_memory(char *mem, size_t size, int alignment)
{
    int alignment_count = size/alignment;
    for (auto & hualloc_used_numa_node : hualloc_used_numa_nodes_to_mem_index)
    {
        int mem_index = hualloc_used_numa_node.second;
        int numa_node = hualloc_used_numa_node.first;
        char *mem_cur = mem + (alignment_count/hualloc_numa_node_count) * mem_index * alignment;
        char *mem_next = mem + (alignment_count/hualloc_numa_node_count) * (mem_index+1) * alignment;
        uint64_t mbind_mask = 1ull<<numa_node;
        int res = mbind_bytedance(mem_cur, mem_next-mem_cur, hualloc_mbind_mode, &mbind_mask, hualloc_numa_node_count+1, 0);
        std::stringstream ss;
        ss << "hualloc numa info: bind mem [" << static_cast<void*>(mem_cur) << ", " << static_cast<void*>(mem_next)
           << ") len 0x" << std::hex << mem_next-mem_cur
           << " @ index " << mem_index << " -> numa node " << numa_node << " return " << res ;
        ss << " err: " << errno << "-" << strerror(errno) << std::endl;
        hualloc_log(ss.str());
    }
}
/// Render the CPU indices mapped to `numa_node` as "[a,b,c]" (sorted).
/// Returns "[]" for a node with no CPUs -- the previous version popped the
/// opening '[' in that case and returned "]".
std::string getCpuListOfNumaNode(int numa_node)
{
    std::set<int> cpu_set;
    for (auto & item : hualloc_cpu_index_to_numa_node)
        if (item.second == static_cast<size_t>(numa_node))
            cpu_set.insert(item.first);

    std::string cpu_list = "[";
    for (auto cpu_index : cpu_set)
        cpu_list += std::to_string(cpu_index) + ",";
    /// Drop the trailing comma only when at least one CPU was appended.
    if (!cpu_set.empty())
        cpu_list.pop_back();
    cpu_list += "]";
    return cpu_list;
}
/// Record the host NUMA topology for the allocator's use.
///   max_numa_node_        -- highest node index (node count - 1)
///   numa_nodes_cpu_mask_  -- per-node CPU masks, must have max_numa_node_+1 entries
///   hualloc_enable_mbind_ -- whether mbind_memory() should actually bind pages
///   mbind_mode            -- MPOL_* policy forwarded to mbind(2)
///   logger                -- diagnostic sink (nullptr falls back to stdout)
/// Sets hualloc_use_numa_info = true only when the process affinity maps
/// onto a sane subset of the nodes; otherwise leaves NUMA awareness off.
void huallocSetNumaInfo(
    size_t max_numa_node_,
    std::vector<cpu_set_t> & numa_nodes_cpu_mask_,
    bool hualloc_enable_mbind_,
    int mbind_mode,
    void (*logger)(std::string)
)
{
    hualloc_logger = logger;
    /// max_numa_node_ is unsigned, so "<= 0" really means "== 0":
    /// single-node machines (or malformed input) keep NUMA support disabled.
    if (max_numa_node_ <= 0 || numa_nodes_cpu_mask_.size() != max_numa_node_+1)
        return;
    hualloc_enable_mbind = hualloc_enable_mbind_;
    hualloc_numa_node_count = max_numa_node_+1;
    hualloc_mbind_mode = mbind_mode;
    std::stringstream ss;
    ss << "hualloc numa info: max_numa_node: " << max_numa_node_ << ", numa_nodes_cpu_mask.size(): " << numa_nodes_cpu_mask_.size() << std::endl;
    hualloc_log(ss.str());
    /// Invert the per-node masks into cpu index -> node index.
    for (int i = 0; i < numa_nodes_cpu_mask_.size(); ++i)
    {
        cpu_set_t cpu_mask = numa_nodes_cpu_mask_[i];
        for (int cpu_index = 0; cpu_index < CPU_SETSIZE; ++cpu_index)
        {
            if (CPU_ISSET(cpu_index, &cpu_mask))
            {
                hualloc_cpu_index_to_numa_node[cpu_index] = i;
            }
        }
    }
    /// Determine which nodes this process is actually allowed to run on.
    cpu_set_t progress_cpu_mask;
    std::set<int> progress_used_numa_nodes;
    CPU_ZERO(&progress_cpu_mask);
    if (sched_getaffinity(0, sizeof(cpu_set_t), &progress_cpu_mask) == -1) {
        hualloc_log("sched_getaffinity fail");
        return;
    }
    for (int cpu_index = 0; cpu_index < CPU_SETSIZE; ++cpu_index)
    {
        if (CPU_ISSET(cpu_index, &progress_cpu_mask))
        {
            progress_used_numa_nodes.insert(hualloc_cpu_index_to_numa_node[cpu_index]);
        }
    }
    hualloc_used_numa_node_count = progress_used_numa_nodes.size();
    if (hualloc_used_numa_node_count <= 0 || hualloc_used_numa_node_count > hualloc_numa_node_count)
    {
        std::stringstream ss;
        ss << "hualloc numa info: hualloc_used_numa_node_count is " << hualloc_used_numa_node_count << ", hualloc_numa_node_count is "
           << hualloc_numa_node_count << ". Won't set hualloc_use_numa_info\n";
        hualloc_log(ss.str());
        return;
    }
    /// Assign each usable node a dense memory-slice index (0..n-1),
    /// consumed by mbind_memory() and get_thread_numa_mem_index().
    int mem_index = 0;
    for (int hualloc_used_numa_node : progress_used_numa_nodes)
    {
        std::string cpu_list = getCpuListOfNumaNode(hualloc_used_numa_node);
        std::stringstream ss;
        ss << "hualloc numa info: numa node(" << hualloc_used_numa_node << ") -> mem_index(" << mem_index << ") -> cpu list: "
           << cpu_list.c_str() << std::endl;
        hualloc_log(ss.str());
        hualloc_used_numa_nodes_to_mem_index[hualloc_used_numa_node] = mem_index;
        ++mem_index;
    }
    hualloc_use_numa_info = true;
}
/// Memory-slice index for the NUMA node of the CPU the calling thread is
/// currently running on.
/// NOTE(review): operator[] default-inserts entries for unknown CPUs/nodes
/// and is unsynchronized against concurrent writers -- assumes both maps
/// are fully populated (huallocSetNumaInfo) before threads call this.
int get_thread_numa_mem_index()
{
    int cpu = sched_getcpu();
    int numa_node_index = hualloc_cpu_index_to_numa_node[cpu];
    return hualloc_used_numa_nodes_to_mem_index[numa_node_index];
}
/// Cumulative alloc/free counters exposed for metrics.
/// NOTE(review): units (bytes vs. call counts) are defined by the atomics
/// in hu_alloc.h -- confirm there before graphing.
ui64 GetTotalLargeAlloc()
{
    return TotalLargeAlloc.load();
}
ui64 GetTotalLargeFree()
{
    return TotalLargeFree.load();
}
ui64 GetTotalSegmentAlloc()
{
    return TotalSegmentAlloc.load();
}
ui64 GetTotalSegmentFree()
{
    return TotalSegmentFree.load();
}
ui64 GetTotalGiantAlloc()
{
    return TotalGiantAlloc.load();
}
ui64 GetTotalGiantFree()
{
    return TotalGiantFree.load();
}

2037
contrib/hualloc/hu_alloc.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -138,6 +138,7 @@
#include <common/phdr_cache.h>
#include <common/scope_guard.h>
#include <Common/ChineseTokenExtractor.h>
#include <Common/HuAllocator.h>
#include <CloudServices/CnchServerClientPool.h>
@ -227,7 +228,6 @@ namespace DB::ErrorCodes
int mainEntryClickHouseServer(int argc, char ** argv)
{
DB::Server app;
if (jemallocOptionEnabled("opt.background_thread"))
{
LOG_ERROR(&app.logger(),
@ -534,6 +534,12 @@ void checkForUsersNotInMainConfig(
#endif
}
/// Logger callback handed to huallocSetNumaInfo: forwards hualloc
/// diagnostics into the server log under the "HuallocDebug" logger.
void huallocLogPrint(std::string s)
{
    static Poco::Logger * logger = &Poco::Logger::get("HuallocDebug");
    LOG_INFO(logger, s);
}
int Server::main(const std::vector<std::string> & /*args*/)
{
Poco::Logger * log = &logger();
@ -1008,6 +1014,37 @@ int Server::main(const std::vector<std::string> & /*args*/)
}
BrpcApplication::getInstance().reloadConfig(*config);
#if USE_HUALLOC
if (config->getBool("hualloc_numa_aware", false))
{
size_t max_numa_node = SystemUtils::getMaxNumaNode();
std::vector<cpu_set_t> numa_nodes_cpu_mask = SystemUtils::getNumaNodesCpuMask();
bool hualloc_enable_mbind = config->getBool("hualloc_enable_mbind", false);
int mbind_mode = config->getInt("hualloc_mbind_mode", 1);
/*
*mbind mode
#define MPOL_DEFAULT 0
#define MPOL_PREFERRED 1
#define MPOL_BIND 2
#define MPOL_INTERLEAVE 3
#define MPOL_LOCAL 4
#define MPOL_MAX 5
*/
huallocSetNumaInfo(
max_numa_node,
numa_nodes_cpu_mask,
hualloc_enable_mbind,
mbind_mode,
huallocLogPrint
);
}
double default_hualloc_cache_ratio = config->getDouble("hualloc_cache_ratio", 0.25);
LOG_INFO(log, "HuAlloc cache memory size:{}",
formatReadableSizeWithBinarySuffix(max_server_memory_usage * default_hualloc_cache_ratio));
HuAllocator<false>::InitHuAlloc(max_server_memory_usage * default_hualloc_cache_ratio);
#endif
total_memory_tracker.setHardLimit(max_server_memory_usage);
total_memory_tracker.setDescription("(total)");
total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking);

View File

@ -329,11 +329,19 @@ list (APPEND DBMS_COMMON_LIBRARIES ch_contrib::abseil_swiss_tables)
if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES)
add_library (dbms STATIC ${dbms_headers} ${dbms_sources})
target_link_libraries (dbms PRIVATE jemalloc libdivide ${DBMS_COMMON_LIBRARIES})
if (USE_HUALLOC)
target_link_libraries (dbms PRIVATE hualloc)
endif()
set (all_modules dbms)
else()
add_library (dbms SHARED ${dbms_headers} ${dbms_sources})
target_link_libraries (dbms PUBLIC ${all_modules} ${DBMS_COMMON_LIBRARIES})
target_link_libraries (clickhouse_interpreters PRIVATE jemalloc libdivide)
if (USE_HUALLOC)
target_link_libraries(clickhouse_interpreters PRIVATE hualloc)
endif()
list (APPEND all_modules dbms)
# force all split libs to be linked
if (OS_DARWIN)

View File

@ -0,0 +1,51 @@
#pragma once
#include <Columns/IColumn.h>
#include <Common/PODArray.h>
namespace DB
{
/** Allows to access internal array of fixed-size column without cast to concrete type.
* We will inherit ColumnVector and ColumnFixedString from this class instead of IColumn.
* Assumes data layout of ColumnVector, ColumnFixedString and PODArray.
*
* Why it is needed?
*
* There are some algorithms that specialize on the size of data type but doesn't care about concrete type.
* The same specialization may work for UInt64, Int64, Float64, FixedString(8), if it only does byte moving and hashing.
* To avoid code bloat and compile time increase, we can use single template instantiation for these cases
* and just static_cast pointer to some single column type (e. g. ColumnUInt64) assuming that all types have identical memory layout.
*
* But this static_cast (downcast to unrelated type) is illegal according to the C++ standard and UBSan warns about it.
* To allow functional tests to work under UBSan we have to separate some base class that will present the memory layout in explicit way,
* and we will do static_cast to this class.
*/
class ColumnFixedSizeHelper : public IColumn
{
public:
    /// Raw pointer to the first element's bytes without casting to the
    /// concrete column type. Assumes the derived column stores a
    /// PODArrayBase with exactly these template parameters immediately
    /// after the IColumn header (layout of ColumnVector/ColumnFixedString);
    /// the allocator parameter must match the build-wide PODArray default.
    template <size_t ELEMENT_SIZE>
    const char * getRawDataBegin() const
    {
        tryToFlushZeroCopyBuffer();
#if USE_HUALLOC
        return reinterpret_cast<const PODArrayBase<ELEMENT_SIZE, 4096, HuAllocator<false>, 15, 16> *>(reinterpret_cast<const char *>(this) + sizeof(*this))->raw_data();
#else
        return reinterpret_cast<const PODArrayBase<ELEMENT_SIZE, 4096, Allocator<false>, 15, 16> *>(reinterpret_cast<const char *>(this) + sizeof(*this))->raw_data();
#endif
    }
    /// Append ELEMENT_SIZE bytes from `ptr` to the underlying array, using
    /// the same memory-layout assumption as getRawDataBegin().
    template <size_t ELEMENT_SIZE>
    void insertRawData(const char * ptr)
    {
        tryToFlushZeroCopyBuffer();
#if USE_HUALLOC
        return reinterpret_cast<PODArrayBase<ELEMENT_SIZE, 4096, HuAllocator<false>, 15, 16> *>(reinterpret_cast<char *>(this) + sizeof(*this))->push_back_raw(ptr);
#else
        return reinterpret_cast<PODArrayBase<ELEMENT_SIZE, 4096, Allocator<false>, 15, 16> *>(reinterpret_cast<char *>(this) + sizeof(*this))->push_back_raw(ptr);
#endif
    }
};
}

View File

@ -29,14 +29,22 @@ public:
const char * getRawDataBegin() const
{
tryToFlushZeroCopyBuffer();
#if USE_HUALLOC
return reinterpret_cast<const PODArrayBase<ELEMENT_SIZE, 4096, HuAllocator<false>, 15, 16> *>(reinterpret_cast<const char *>(this) + sizeof(*this))->raw_data();
#else
return reinterpret_cast<const PODArrayBase<ELEMENT_SIZE, 4096, Allocator<false>, 15, 16> *>(reinterpret_cast<const char *>(this) + sizeof(*this))->raw_data();
#endif
}
template <size_t ELEMENT_SIZE>
void insertRawData(const char * ptr)
{
tryToFlushZeroCopyBuffer();
#if USE_HUALLOC
return reinterpret_cast<PODArrayBase<ELEMENT_SIZE, 4096, HuAllocator<false>, 15, 16> *>(reinterpret_cast<char *>(this) + sizeof(*this))->push_back_raw(ptr);
#else
return reinterpret_cast<PODArrayBase<ELEMENT_SIZE, 4096, Allocator<false>, 15, 16> *>(reinterpret_cast<char *>(this) + sizeof(*this))->push_back_raw(ptr);
#endif
}
};

View File

@ -8,3 +8,6 @@ class Allocator;
template <typename Base, size_t N = 64, size_t Alignment = 1>
class AllocatorWithStackMemory;
template <bool clear_memory>
class HuAllocator;

View File

@ -11,6 +11,7 @@
#include <Common/memcpySmall.h>
#include <Common/ProfileEvents.h>
#include <Common/Allocator.h>
#include <Common/HuAllocator.h>
namespace ProfileEvents
@ -38,7 +39,11 @@ private:
static constexpr size_t pad_right = 15;
/// Contiguous MemoryChunk of memory and pointer to free space inside it. Member of single-linked list.
#if USE_HUALLOC
struct alignas(16) MemoryChunk : private HuAllocator<false> /// empty base optimization
#else
struct alignas(16) MemoryChunk : private Allocator<false> /// empty base optimization
#endif
{
char * begin;
char * pos;
@ -51,7 +56,11 @@ private:
ProfileEvents::increment(ProfileEvents::ArenaAllocChunks);
ProfileEvents::increment(ProfileEvents::ArenaAllocBytes, size_);
#if USE_HUALLOC
begin = reinterpret_cast<char *>(HuAllocator<false>::alloc(size_));
#else
begin = reinterpret_cast<char *>(Allocator<false>::alloc(size_));
#endif
pos = begin;
end = begin + size_ - pad_right;
prev = prev_;
@ -66,8 +75,11 @@ private:
/// memory would stay poisoned forever. If the allocator supports
/// asan, it will correctly poison the memory by itself.
ASAN_UNPOISON_MEMORY_REGION(begin, size());
#if USE_HUALLOC
HuAllocator<false>::free(begin, size());
#else
Allocator<false>::free(begin, size());
#endif
if (prev)
delete prev;

View File

@ -83,7 +83,11 @@ protected:
/// Switches to ordinary Allocator after REAL_ALLOCATION_TRESHOLD bytes to avoid fragmentation and trash in Arena.
#if USE_HUALLOC
template <size_t REAL_ALLOCATION_TRESHOLD = 4096, typename TRealAllocator = HuAllocator<false>, typename TArenaAllocator = ArenaAllocator, size_t alignment = 0>
#else
template <size_t REAL_ALLOCATION_TRESHOLD = 4096, typename TRealAllocator = Allocator<false>, typename TArenaAllocator = ArenaAllocator, size_t alignment = 0>
#endif
class MixedArenaAllocator : private TRealAllocator
{
public:
@ -122,9 +126,13 @@ protected:
};
#if USE_HUALLOC
template <size_t alignment, size_t REAL_ALLOCATION_TRESHOLD = 4096>
using MixedAlignedArenaAllocator = MixedArenaAllocator<REAL_ALLOCATION_TRESHOLD, HuAllocator<false>, AlignedArenaAllocator<alignment>, alignment>;
#else
template <size_t alignment, size_t REAL_ALLOCATION_TRESHOLD = 4096>
using MixedAlignedArenaAllocator = MixedArenaAllocator<REAL_ALLOCATION_TRESHOLD, Allocator<false>, AlignedArenaAllocator<alignment>, alignment>;
#endif
template <size_t N = 64, typename Base = ArenaAllocator>
class ArenaAllocatorWithStackMemory : public Base

View File

@ -21,7 +21,11 @@ namespace DB
* When allocating, we take the head of the list of free blocks,
* or, if the list is empty - allocate a new block using Arena.
*/
#if USE_HUALLOC
class ArenaWithFreeLists : private HuAllocator<false>, private boost::noncopyable
#else
class ArenaWithFreeLists : private Allocator<false>, private boost::noncopyable
#endif
{
private:
/// If the block is free, then the pointer to the next free block is stored at its beginning, or nullptr, if there are no more free blocks.
@ -58,8 +62,13 @@ public:
char * alloc(const size_t size)
{
#if USE_HUALLOC
if (size > max_fixed_block_size)
return static_cast<char *>(HuAllocator<false>::alloc(size));
#else
if (size > max_fixed_block_size)
return static_cast<char *>(Allocator<false>::alloc(size));
#endif
/// find list of required size
const auto list_idx = findFreeListIndex(size);
@ -90,8 +99,13 @@ public:
void free(char * ptr, const size_t size)
{
#if USE_HUALLOC
if (size > max_fixed_block_size)
return HuAllocator<false>::free(ptr, size);
#else
if (size > max_fixed_block_size)
return Allocator<false>::free(ptr, size);
#endif
/// find list of required size
const auto list_idx = findFreeListIndex(size);

View File

@ -1,6 +1,8 @@
#pragma once
#include <Common/config.h>
#include <Common/Allocator.h>
#include <Common/HuAllocator.h>
/**
@ -8,7 +10,11 @@
* table, so it makes sense to pre-fault the pages so that page faults don't
* interrupt the resize loop. Set the allocator parameter accordingly.
*/
#if USE_HUALLOC
using HashTableAllocator = HuAllocator<true>;
#else
using HashTableAllocator = Allocator<true /* clear_memory */, true /* mmap_populate */>;
#endif
template <size_t initial_bytes = 64>
using HashTableAllocatorWithStackMemory = AllocatorWithStackMemory<HashTableAllocator, initial_bytes>;

View File

@ -0,0 +1,6 @@
#include "HuAllocator.h"
#if USE_HUALLOC
template class HuAllocator<false>;
template class HuAllocator<true>;
#endif

200
src/Common/HuAllocator.h Normal file
View File

@ -0,0 +1,200 @@
#pragma once
#include <string.h>
#ifdef NDEBUG
#define ALLOCATOR_ASLR 0
#else
#define ALLOCATOR_ASLR 1
#endif
#include <pcg_random.hpp>
#include <Common/thread_local_rng.h>
#if !defined(__APPLE__) && !defined(__FreeBSD__)
#include <malloc.h>
#endif
#include <cstdlib>
#include <algorithm>
#include <sys/mman.h>
#include <Core/Defines.h>
#include <common/getPageSize.h>
#include <Common/CurrentMemoryTracker.h>
#include <Common/CurrentMetrics.h>
#include <Common/Exception.h>
#include <Common/formatReadable.h>
#include <common/errnoToString.h>
#include <Poco/Logger.h>
#include <common/logger_useful.h>
#include <Common/config.h>
#if USE_HUALLOC
#include <hualloc/hu_alloc.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
extern const int LOGICAL_ERROR;
}
}
static constexpr size_t HUMALLOC_MIN_ALIGNMENT = 8;
/// Memory-tracked allocator backed by hualloc. Mirrors the interface of
/// the mmap/malloc-based Allocator so it can be swapped in via USE_HUALLOC.
/// clear_memory_: when true, returned memory is zero-initialized.
template <bool clear_memory_>
class HuAllocator
{
public:
    /// Allocate memory range. Accounts the bytes in the global memory
    /// tracker, then allocates; throws CANNOT_ALLOCATE_MEMORY on failure.
    void * alloc(size_t size, size_t alignment = 0)
    {
        checkSize(size);
        CurrentMemoryTracker::alloc(size);
        void * ptr = allocNoTrack(size, alignment);
        return ptr;
    }

    /// Free memory range. `size` must be the tracked allocation size.
    void free(void * buf, size_t size)
    {
        try
        {
            checkSize(size);
            freeNoTrack(buf);
            CurrentMemoryTracker::free(size);
        }
        catch (...)
        {
            DB::tryLogCurrentException("HugeAllocator::free");
            throw;
        }
    }

    /** Enlarge memory range.
      * Data from old range is moved to the beginning of new range.
      * Address of memory range could change.
      */
    void * realloc(void * buf, size_t old_size, size_t new_size, size_t alignment = 0)
    {
        checkSize(new_size);
        if (old_size == new_size)
        {
            /// nothing to do.
            /// BTW, it's not possible to change alignment while doing realloc.
        }
        else if (alignment <= HUMALLOC_MIN_ALIGNMENT)
        {
            /// Resize malloc'd memory region with no special alignment requirement.
            CurrentMemoryTracker::free(old_size);
            CurrentMemoryTracker::alloc(new_size);
            void * new_buf = hu_realloc(buf, new_size);
            if (nullptr == new_buf)
            {
                DB::throwFromErrno(
                    fmt::format("HugeAllocator: Cannot realloc from {} to {}.", ReadableSize(old_size), ReadableSize(new_size)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
            }
            buf = new_buf;
            if constexpr (clear_memory)
                if (new_size > old_size)
                    memset(reinterpret_cast<char *>(buf) + old_size, 0, new_size - old_size);
        }
        else
        {
            /// Big allocs that requires a copy. MemoryTracker is called inside 'alloc', 'free' methods.
            void * new_buf = alloc(new_size, alignment);
            memcpy(new_buf, buf, std::min(old_size, new_size));
            free(buf, old_size);
            buf = new_buf;
        }
        return buf;
    }

    /// One-time setup: initialize hualloc and start the background reclaim
    /// thread, which keeps roughly `cached / 2` bytes of cache.
    static void InitHuAlloc(size_t cached)
    {
        hu_check_init_w();
        pthread_t tid;
        /// ReclaimThread dereferences this pointer after InitHuAlloc has
        /// returned, so it must not point into this stack frame: keep the
        /// value in static storage. (The previous code passed the address
        /// of a local, leaving the thread with a dangling pointer.)
        static size_t use_cache;
        use_cache = cached / 2;
        if (use_cache == 0)
            use_cache = 1024 * (1ull << 20); /// If not set properly use 1G as default
        /// NOTE(review): the thread is neither joined nor detached; fine
        /// for a process-lifetime daemon thread, confirm intended.
        pthread_create(&tid, nullptr, ReclaimThread, &use_cache);
    }

protected:
    static constexpr size_t getStackThreshold()
    {
        return 0;
    }

    static constexpr bool clear_memory = clear_memory_;

private:
    /// Allocation without memory-tracker accounting; throws on failure.
    void * allocNoTrack(size_t size, size_t alignment)
    {
        void * buf;
        if (alignment <= HUMALLOC_MIN_ALIGNMENT)
        {
            if constexpr (clear_memory)
                buf = hu_calloc(size, 1);
            else
                buf = hu_alloc_w(size);
            if (nullptr == buf)
                DB::throwFromErrno(fmt::format("HugeAllocator: Cannot malloc {}.", ReadableSize(size)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
        }
        else
        {
            buf = hu_alloc_aligned(size, alignment);
            if (!buf)
                DB::throwFromErrno(fmt::format("Cannot allocate memory (posix_memalign) {}.", ReadableSize(size)),
                    DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY, errno);
            if constexpr (clear_memory)
                memset(buf, 0, size);
        }
        return buf;
    }

    void freeNoTrack(void * buf)
    {
        hu_free_w(buf);
    }

    /// More obvious exception in case of possible overflow (instead of just "Cannot mmap").
    void checkSize(size_t size)
    {
        if (size >= 0x8000000000000000ULL)
            throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Too large size ({}) passed to HugeAllocator. It indicates an error.", size);
    }
};
/** When using AllocatorWithStackMemory, located on the stack,
* GCC 4.9 mistakenly assumes that we can call `free` from a pointer to the stack.
* In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this.
*/
#if !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
#endif
/// Prevent implicit template instantiation of HugeAllocator
extern template class HuAllocator<false>;
extern template class HuAllocator<true>;
#if !defined(__clang__)
#pragma GCC diagnostic pop
#endif
#endif

View File

@ -5,7 +5,17 @@ namespace DB
/// Used for left padding of PODArray when empty
const char empty_pod_array[empty_pod_array_size]{};
#if USE_HUALLOC
template class PODArray<UInt8, 4096, HuAllocator<false>, 15, 16>;
template class PODArray<UInt16, 4096, HuAllocator<false>, 15, 16>;
template class PODArray<UInt32, 4096, HuAllocator<false>, 15, 16>;
template class PODArray<UInt64, 4096, HuAllocator<false>, 15, 16>;
template class PODArray<Int8, 4096, HuAllocator<false>, 15, 16>;
template class PODArray<Int16, 4096, HuAllocator<false>, 15, 16>;
template class PODArray<Int32, 4096, HuAllocator<false>, 15, 16>;
template class PODArray<Int64, 4096, HuAllocator<false>, 15, 16>;
#else
template class PODArray<UInt8, 4096, Allocator<false>, 15, 16>;
template class PODArray<UInt16, 4096, Allocator<false>, 15, 16>;
template class PODArray<UInt32, 4096, Allocator<false>, 15, 16>;
@ -15,5 +25,6 @@ template class PODArray<Int8, 4096, Allocator<false>, 15, 16>;
template class PODArray<Int16, 4096, Allocator<false>, 15, 16>;
template class PODArray<Int32, 4096, Allocator<false>, 15, 16>;
template class PODArray<Int64, 4096, Allocator<false>, 15, 16>;
#endif
}

View File

@ -32,6 +32,7 @@
#include <common/strong_typedef.h>
#include <Common/Allocator.h>
#include <Common/HuAllocator.h>
#include <Common/Exception.h>
#include <Common/BitHelpers.h>
#include <Common/memcpySmall.h>
@ -843,7 +844,17 @@ void swap(PODArray<T, initial_bytes, TAllocator, pad_right_, pad_left_> & lhs, P
#pragma GCC diagnostic pop
/// Prevent implicit template instantiation of PODArray for common numeric types
#if USE_HUALLOC
extern template class PODArray<UInt8, 4096, HuAllocator<false>, 15, 16>;
extern template class PODArray<UInt16, 4096, HuAllocator<false>, 15, 16>;
extern template class PODArray<UInt32, 4096, HuAllocator<false>, 15, 16>;
extern template class PODArray<UInt64, 4096, HuAllocator<false>, 15, 16>;
extern template class PODArray<Int8, 4096, HuAllocator<false>, 15, 16>;
extern template class PODArray<Int16, 4096, HuAllocator<false>, 15, 16>;
extern template class PODArray<Int32, 4096, HuAllocator<false>, 15, 16>;
extern template class PODArray<Int64, 4096, HuAllocator<false>, 15, 16>;
#else
extern template class PODArray<UInt8, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<UInt16, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<UInt32, 4096, Allocator<false>, 15, 16>;
@ -853,5 +864,5 @@ extern template class PODArray<Int8, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<Int16, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<Int32, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<Int64, 4096, Allocator<false>, 15, 16>;
#endif
}

View File

@ -6,6 +6,7 @@
#include <common/types.h>
#include <Common/Allocator_fwd.h>
#include <Common/config.h>
namespace DB
{
@ -15,6 +16,25 @@ inline constexpr size_t integerRoundUp(size_t value, size_t dividend)
return ((value + dividend - 1) / dividend) * dividend;
}
#if USE_HUALLOC
template <typename T, size_t initial_bytes = 4096,
typename TAllocator = HuAllocator<false>, size_t pad_right_ = 0,
size_t pad_left_ = 0>
class PODArray;
template <typename T, size_t initial_bytes = 4096, typename TAllocator = HuAllocator<false>>
using PaddedPODArray = PODArray<T, initial_bytes, TAllocator, 15, 16>;
/** A helper for declaring PODArray that uses inline memory.
* The initial size is set to use all the inline bytes, since using less would
* only add some extra allocation calls.
*/
template <typename T, size_t inline_bytes,
size_t rounded_bytes = integerRoundUp(inline_bytes, sizeof(T))>
using PODArrayWithStackMemory = PODArray<T, rounded_bytes,
AllocatorWithStackMemory<HuAllocator<false>, rounded_bytes, alignof(T)>>;
#else
template <typename T, size_t initial_bytes = 4096,
typename TAllocator = Allocator<false>, size_t pad_right_ = 0,
size_t pad_left_ = 0>
@ -32,5 +52,7 @@ template <typename T, size_t inline_bytes,
size_t rounded_bytes = integerRoundUp(inline_bytes, sizeof(T))>
using PODArrayWithStackMemory = PODArray<T, rounded_bytes,
AllocatorWithStackMemory<Allocator<false>, rounded_bytes, alignof(T)>>;
#endif
}

View File

@ -12,6 +12,10 @@ constexpr auto linux_numa_cpu_file_online = "/sys/devices/system/node/online";
constexpr auto linux_numa_cpu_file_possible = "/sys/devices/system/node/possible";
size_t max_numa_node = 0;
std::mutex numa_nodes_cpu_mask_mutex;
bool numa_nodes_cpu_mask_initialized = false;
std::vector<cpu_set_t> numa_nodes_cpu_mask;
size_t buffer_to_number(const std::string & buffer)
{
try
@ -50,4 +54,97 @@ __attribute__((constructor)) static void init_max_numa_node()
try_read_max_numa_nude(linux_numa_cpu_file_possible);
}
std::vector<size_t> parse_cpu_list(const std::string & cpu_list_str)
{
std::unique_ptr<DB::UInt16> lb_cache = nullptr;
DB::Int32 digit_cache = -1;
std::vector<size_t> cpu_list;
for (auto it = cpu_list_str.cbegin();; it++)
{
if (it == cpu_list_str.cend() || *it == ',')
{
if (digit_cache < 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid format of cpu_list: {}", cpu_list_str);
if (!lb_cache)
cpu_list.emplace_back(digit_cache);
else
{
auto start = *lb_cache.release();
for (int i = start; i <= digit_cache; i++)
{
cpu_list.emplace_back(i);
}
}
if (it == cpu_list_str.cend())
break;
digit_cache = -1;
}
else if (*it >= '0' && *it <= '9')
{
digit_cache = digit_cache > 0 ? digit_cache * 10 + (*it - 48) : (*it - 48);
}
else if (std::isspace(*it))
{
}
else if (*it == '-')
{
if (digit_cache < 0 || lb_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid format of cpu_list: {}", cpu_list_str);
lb_cache = std::make_unique<DB::UInt16>(digit_cache);
digit_cache = -1;
}
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid format of cpu_list: {}", cpu_list_str);
}
return cpu_list;
}
/// Populate numa_nodes_cpu_mask from /sys/devices/system/node/node*/cpulist.
/// Nodes with a missing or empty cpulist file keep an all-zero mask; parse
/// errors are swallowed deliberately (best-effort topology probing).
void init_numa_nodes_cpu_mask()
{
    numa_nodes_cpu_mask.resize(max_numa_node+1);
    for (size_t numa_node = 0; numa_node < numa_nodes_cpu_mask.size(); ++numa_node)
    {
        CPU_ZERO(&numa_nodes_cpu_mask[numa_node]);
        std::string cpu_list_path = fmt::format("/sys/devices/system/node/node{}/cpulist", numa_node);
        if (!std::filesystem::exists(cpu_list_path))
            continue;
        std::ifstream fstream(cpu_list_path);
        std::stringstream buffer;
        buffer << fstream.rdbuf();
        if (buffer.str().empty())
            continue;
        try
        {
            auto cpu_list = parse_cpu_list(buffer.str());
            for (auto cpu_index : cpu_list)
                CPU_SET(cpu_index, &numa_nodes_cpu_mask[numa_node]);
        }
        catch (std::exception &)
        {
            /// best effort: leave this node's mask empty on parse failure
        }
    }
}
/// Per-NUMA-node CPU masks, lazily parsed from /sys exactly once and cached.
/// Returns an empty vector on non-Linux platforms.
std::vector<cpu_set_t> SystemUtils::getNumaNodesCpuMask()
{
#if defined(__linux__)
    /// Always take the mutex before reading the flag: the previous unlocked
    /// fast-path read of a plain bool was a data race (undefined behavior).
    /// This is a cold path, so the extra lock is cheap.
    std::unique_lock lock(numa_nodes_cpu_mask_mutex);
    if (!numa_nodes_cpu_mask_initialized)
    {
        init_numa_nodes_cpu_mask();
        numa_nodes_cpu_mask_initialized = true;
    }
    return numa_nodes_cpu_mask;
#else
    return {};
#endif
}
}

View File

@ -62,6 +62,8 @@ struct CpuUsageInfo
size_t buffer_to_number(const std::string & buffer);
void init_numa_nodes_cpu_mask();
class SystemUtils
{
public:
@ -160,6 +162,8 @@ public:
return 0;
}
static std::vector<cpu_set_t> getNumaNodesCpuMask();
static void getCpuUsageInfo(const std::unordered_set<size_t> & cpu_nodes, std::vector<CpuUsageInfo> & cpu_usage_info_vec)
{
#if defined(__linux__)
@ -194,4 +198,7 @@ public:
#endif
}
};
std::vector<size_t> parse_cpu_list(const std::string & cpu_list_str);
}

View File

@ -33,3 +33,4 @@
#cmakedefine01 USE_SIMDJSON
#cmakedefine01 USE_RAPIDJSON
#cmakedefine01 USE_NLP
#cmakedefine01 USE_HUALLOC

View File

@ -19,3 +19,4 @@
#cmakedefine01 USE_SIMDJSON
#cmakedefine01 USE_RAPIDJSON
#cmakedefine01 USE_NLP
#cmakedefine01 USE_HUALLOC

View File

@ -49,6 +49,7 @@
#include <common/logger_useful.h>
#include <fmt/format.h>
#include <common/errnoToString.h>
#include <Common/HuAllocator.h>
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
@ -730,17 +731,37 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
Int64 amount = total_memory_tracker.get();
Int64 peak = total_memory_tracker.getPeak();
Int64 new_amount = data.resident;
[[maybe_unused]]Int64 free_memory_in_allocator_arenas = 0;
#if USE_HUALLOC
/// With hualloc, cached memory should be treated as free memory; for safety, keep 0.2 as a buffer for concurrent allocations,
/// which assumes the allocation size should be less than cached_memory * 1.2.
Int64 hualloc_cache = (SegmentCached() + LargeCached()) * 0.8;
new_amount -= hualloc_cache;
Int64 difference = new_amount - amount;
/// Log only if difference is high. This is for convenience. The threshold is arbitrary.
// if (difference >= 1048576 || difference <= -1048576)
LOG_DEBUG(&Poco::Logger::get("AsynchronousMetrics"),
"MemoryTracking: was {}, peak {}, free memory in arenas {}, hard limit will set to {}, RSS: {}, difference: {}, hualloc cache:{}",
ReadableSize(amount),
ReadableSize(peak),
ReadableSize(free_memory_in_allocator_arenas),
ReadableSize(new_amount),
ReadableSize(new_amount + hualloc_cache),
ReadableSize(difference),
ReadableSize(hualloc_cache));
#else
Int64 difference = new_amount - amount;
/// Log only if difference is high. This is for convenience. The threshold is arbitrary.
if (difference >= 1048576 || difference <= -1048576)
LOG_TRACE(&Poco::Logger::get("AsynchronousMetrics"),
"MemoryTracking: was {}, peak {}, will set to {} (RSS), difference: {}",
ReadableSize(amount),
ReadableSize(peak),
ReadableSize(new_amount),
ReadableSize(difference));
LOG_DEBUG(&Poco::Logger::get("AsynchronousMetrics"),
"MemoryTracking: was {}, peak {}, will set to {} (RSS), difference: {}",
ReadableSize(amount),
ReadableSize(peak),
ReadableSize(new_amount),
ReadableSize(difference));
#endif
total_memory_tracker.set(new_amount);
CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_amount);

View File

@ -56,6 +56,7 @@ const char * auto_config_build[]
"TZDATA_VERSION", "@TZDATA_VERSION@",
"USE_KRB5", "@USE_KRB5@",
"USE_BYTEDANCE_RDKAFKA", "@USE_BYTEDANCE_RDKAFKA@",
"USE_HUALLOC", "@USE_HUALLOC@",
nullptr, nullptr
};

View File

@ -0,0 +1,91 @@
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Storages/System/StorageSystemHuAllocStats.h>
#include <Processors/Sources/SourceFromSingleChunk.h>
#include <Processors/Pipe.h>
#include <Core/NamesAndTypes.h>
#include <Common/Exception.h>
#include <common/logger_useful.h>
#include <Common/formatReadable.h>
#include <fmt/core.h>
#include "config.h"
#if USE_HUALLOC
# include <hualloc/hu_alloc.h>
#endif
namespace DB
{
/// Registers the in-memory metadata (column names and types) for the
/// system.hualloc_stats table. The storage holds no data of its own;
/// rows are produced on demand in read().
StorageSystemHuAllocStats::StorageSystemHuAllocStats(const StorageID & table_id_)
    : IStorage(table_id_)
{
    ColumnsDescription description;
    for (const auto & name_and_type : getNamesAndTypes())
        description.add(ColumnDescription(name_and_type.name, name_and_type.type));

    StorageInMemoryMetadata metadata;
    metadata.setColumns(description);
    setInMemoryMetadata(metadata);
}
/// Column layout of system.hualloc_stats; read() fills values in exactly
/// this order, so the two must stay in sync.
NamesAndTypesList StorageSystemHuAllocStats::getNamesAndTypes()
{
    NamesAndTypesList names_and_types;
    names_and_types.emplace_back("GiantAlloc", std::make_shared<DataTypeUInt64>());
    names_and_types.emplace_back("LargeReclaim", std::make_shared<DataTypeUInt64>());
    names_and_types.emplace_back("SegmentReclaim", std::make_shared<DataTypeUInt64>());
    /// Cached sizes are pre-formatted as human-readable strings (e.g. "1.00 GiB").
    names_and_types.emplace_back("LargeCached", std::make_shared<DataTypeString>());
    names_and_types.emplace_back("SegmentCached", std::make_shared<DataTypeString>());
    names_and_types.emplace_back("LargeAllocate", std::make_shared<DataTypeUInt64>());
    names_and_types.emplace_back("LargeFree", std::make_shared<DataTypeUInt64>());
    names_and_types.emplace_back("SegmentAllocate", std::make_shared<DataTypeUInt64>());
    names_and_types.emplace_back("SegmentFree", std::make_shared<DataTypeUInt64>());
    names_and_types.emplace_back("GiantAllocate", std::make_shared<DataTypeUInt64>());
    names_and_types.emplace_back("GiantFree", std::make_shared<DataTypeUInt64>());
    return names_and_types;
}
/// Produces a single-chunk pipe with one row of current hualloc counters.
/// When the server is built without hualloc (USE_HUALLOC off), the chunk is
/// empty (zero rows) and a log message records why.
Pipe StorageSystemHuAllocStats::read(
    const Names & column_names,
    const StorageSnapshotPtr & storage_snapshot,
    SelectQueryInfo &,
    ContextPtr /*context*/,
    QueryProcessingStage::Enum /*processed_stage*/,
    const size_t /*max_block_size*/,
    const unsigned /*num_streams*/)
{
    storage_snapshot->check(column_names);

    auto header = storage_snapshot->getMetadataForQuery()->getSampleBlockWithVirtuals(getVirtuals());
    MutableColumns res_columns = header.cloneEmptyColumns();

#if USE_HUALLOC
    /// Append one value per column; order must match getNamesAndTypes().
    size_t pos = 0;
    auto emit = [&](const auto & value) { res_columns.at(pos++)->insert(value); };

    emit(HugeAlloc());
    emit(LargeReclaimed());
    emit(SegmentReclaimed());
    emit(formatReadableSizeWithBinarySuffix(LargeCached()));
    emit(formatReadableSizeWithBinarySuffix(SegmentCached()));
    emit(GetTotalLargeAlloc());
    emit(GetTotalLargeFree());
    emit(GetTotalSegmentAlloc());
    emit(GetTotalSegmentFree());
    emit(GetTotalGiantAlloc());
    emit(GetTotalGiantFree());
#else
    LOG_INFO(&Poco::Logger::get("StorageSystemHuAllocStats"), "HuAlloc is not enabled");
#endif // USE_HUALLOC

    /// All columns have the same length; the first one determines the row count
    /// (1 with hualloc, 0 without).
    const UInt64 num_rows = res_columns.at(0)->size();
    Chunk chunk(std::move(res_columns), num_rows);
    return Pipe(std::make_shared<SourceFromSingleChunk>(std::move(header), std::move(chunk)));
}
}

View File

@ -0,0 +1,35 @@
#pragma once
#include <Storages/IStorage.h>
namespace DB
{
class Context;
/// Read-only system table ("system.hualloc_stats") exposing statistics of the
/// hualloc allocator (reclaim counts, cached bytes, alloc/free totals).
/// When the server is built without hualloc (USE_HUALLOC off), the table
/// still attaches but read() yields no rows.
class StorageSystemHuAllocStats final : public shared_ptr_helper<StorageSystemHuAllocStats>, public IStorage
{
friend struct shared_ptr_helper<StorageSystemHuAllocStats>;
public:
explicit StorageSystemHuAllocStats(const StorageID & table_id_);
/// Engine name reported by introspection (SHOW CREATE, system.tables).
std::string getName() const override { return "SystemHuAllocStats"; }
/// Static column schema; also used by the constructor to build the metadata.
static NamesAndTypesList getNamesAndTypes();
/// Builds a single-chunk pipe with current allocator counters; most query
/// parameters are ignored since the result is always one (or zero) rows.
Pipe read(
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
SelectQueryInfo & query_info,
ContextPtr context,
QueryProcessingStage::Enum processed_stage,
size_t max_block_size,
unsigned num_streams) override;
bool isSystemStorage() const override { return true; }
// bool supportsTransactions() const override { return true; }
};
}

View File

@ -19,6 +19,8 @@
* All Bytedance's Modifications are Copyright (2023) Bytedance Ltd. and/or its affiliates.
*/
#include <Storages/System/StorageSystemHuAllocStats.h>
#include <Databases/IDatabase.h>
#include <Storages/System/attachSystemTables.h>
#include <Storages/System/attachSystemTablesImpl.h>
@ -277,6 +279,7 @@ void attachSystemTablesServer(IDatabase & system_database, bool has_zookeeper)
attach<StorageSystemPersistentBGJobStatus>(system_database, "persistent_bg_job_status");
attach<StorageSystemGlobalGCManager>(system_database, "global_gc_manager");
attach<StorageSystemLockMap>(system_database, "lock_map");
attach<StorageSystemHuAllocStats>( system_database, "hualloc_stats");
attach<StorageSystemWorkers>(system_database, "workers");
attach<StorageSystemWorkerGroups>(system_database, "worker_groups");

View File

@ -9,6 +9,11 @@ endif()
# ULID support flag for the generated config headers: set only when the
# contrib target was actually configured into the build.
if (TARGET ch_contrib::ulid)
set(USE_ULID 1)
endif()
# BLAKE3 support flag for the generated config headers: available only when
# the Rust blake3 bridge target was built.
if (TARGET ch_rust::blake3)
set(USE_BLAKE3 1)
endif()
# hualloc support flag for the generated config headers (config.h.in /
# config_core.h.in): defined only when the feature is enabled AND the
# contrib target exists (see contrib/hualloc-cmake).
if (ENABLE_HUALLOC AND TARGET hualloc)
    set (USE_HUALLOC 1)
endif ()