Skip to content

Commit

Permalink
Merge branch 'main' into gluten-1367
Browse files Browse the repository at this point in the history
  • Loading branch information
leesf authored Apr 17, 2023
2 parents 73b6800 + 7fedc2a commit 0787a06
Show file tree
Hide file tree
Showing 161 changed files with 29,445 additions and 89 deletions.
89 changes: 0 additions & 89 deletions .github/workflows/clickhouse_be.yml

This file was deleted.

5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,8 @@ cpp/**/benchmarks/**/*
cpp/**/example/**/*
!cpp/**/example/**/
!cpp/**/example/**/*.*

/cpp-ch/ClickHouse/*
/cpp-ch/cmake-build-*
/cpp-ch/local-engine/tests/testConfig.h
/cpp-ch/local-engine/proto/substrait/
51 changes: 51 additions & 0 deletions cpp-ch/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
cmake_minimum_required(VERSION 3.20)

# Directory holding the ClickHouse sources. Defaults to a "ClickHouse" directory
# next to the top-level CMakeLists.txt; override with -DCH_SOURCE_DIR=<path>.
set(CH_SOURCE_DIR ${CMAKE_SOURCE_DIR}/ClickHouse CACHE STRING "ClickHouse source dir")

project(libch LANGUAGES C CXX ASM)

# An empty glob result means the directory is missing or has no entries.
file(GLOB clickhouse_files "${CH_SOURCE_DIR}/*")

if (NOT clickhouse_files)
    if ("${CH_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}/ClickHouse")
        # Default location: fetch the sources automatically.
        find_package(Git REQUIRED)

        # execute_process() does not go through a shell, so "cd dir && cmd" cannot
        # be expressed in one COMMAND. Each step is its own call, uses
        # WORKING_DIRECTORY instead of "cd", and aborts the configure on failure.
        execute_process(
            COMMAND "${GIT_EXECUTABLE}" clone --recursive --depth 1 https://github.com/Kyligence/ClickHouse.git "${CH_SOURCE_DIR}"
            OUTPUT_VARIABLE download_ch
            COMMAND_ERROR_IS_FATAL ANY)
        execute_process(
            COMMAND "${GIT_EXECUTABLE}" pull
            WORKING_DIRECTORY "${CH_SOURCE_DIR}"
            OUTPUT_VARIABLE download_ch
            COMMAND_ERROR_IS_FATAL ANY)
        execute_process(
            COMMAND "${GIT_EXECUTABLE}" submodule update --init --recursive --force --depth 1
            WORKING_DIRECTORY "${CH_SOURCE_DIR}"
            OUTPUT_VARIABLE download_ch
            COMMAND_ERROR_IS_FATAL ANY)
    else()
        # A user-supplied CH_SOURCE_DIR is never populated automatically; stop
        # the configure step and tell the user how to create the checkout.
        message(FATAL_ERROR
            "clickhouse ${CH_SOURCE_DIR} is missing or empty. to fix try run:\n"
            "  git clone --recursive --depth 1 https://github.com/Kyligence/ClickHouse.git ${CH_SOURCE_DIR}")
    endif()
endif()

# Make the substrait proto directory from gluten-core visible inside local-engine
# through a symbolic link. file(CREATE_LINK) is used instead of spawning "ln -s":
# it needs no external tool and, with no RESULT argument, a failure stops the
# configure step instead of being ignored.
if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/local-engine/proto/substrait")
    file(CREATE_LINK
        "${CMAKE_CURRENT_SOURCE_DIR}/../gluten-core/src/main/resources/substrait/proto/substrait"
        "${CMAKE_CURRENT_SOURCE_DIR}/local-engine/proto/substrait"
        SYMBOLIC)
endif ()

# Link local-engine into the ClickHouse tree as utils/extern-local-engine.
if (NOT EXISTS "${CH_SOURCE_DIR}/utils/extern-local-engine/")
    file(CREATE_LINK
        "${CMAKE_CURRENT_SOURCE_DIR}/local-engine"
        "${CH_SOURCE_DIR}/utils/extern-local-engine"
        SYMBOLIC)
endif ()

# Build tree used for the nested ClickHouse configure/build.
set(CH_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/build")

# Configure ClickHouse with the extern local engine enabled, then build its "ch"
# target. The two steps are separate COMMANDs (run in order) rather than one
# hand-quoted "bash -c" string, and VERBATIM makes argument escaping independent
# of the platform shell. ${CMAKE_COMMAND} is the cmake that configured this tree.
#
# The declared OUTPUT is only a rule name: neither command creates a file called
# _build_ch.
add_custom_command(
    OUTPUT _build_ch
    COMMAND
        "${CMAKE_COMMAND}"
        "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
        "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
        "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
        -DENABLE_PROTOBUF=ON
        -DENABLE_TESTS=OFF
        -DENABLE_JEMALLOC=ON
        -DENABLE_MULTITARGET_CODE=ON
        -DENABLE_EXTERN_LOCAL_ENGINE=ON
        -S "${CH_SOURCE_DIR}"
        -G Ninja
        -B "${CH_BINARY_DIR}"
    COMMAND
        "${CMAKE_COMMAND}" --build "${CH_BINARY_DIR}" --target ch
    USES_TERMINAL
    VERBATIM
)

# Entry point used by "cmake --build <dir> --target build_ch"; part of ALL.
add_custom_target(build_ch ALL DEPENDS _build_ch)
38 changes: 38 additions & 0 deletions cpp-ch/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# How to build
项目依赖ClickHouse,有两种选择,一种是自动下载ClickHouse源码到当前目录,
另一种手动clone ClickHouse代码(https://github.com/Kyligence/ClickHouse.git)并通过cmake参数CH_SOURCE_DIR指定。

代码开发推荐使用外部ClickHouse源码,下面说明如何绑定外部ClickHouse项目进行编译。
## 克隆ClickHouse
```shell
export CH_SOURCE_DIR=/tmp/ClickHouse #可以选择自己的目录
git clone https://github.com/Kyligence/ClickHouse.git ${CH_SOURCE_DIR}
cd ${CH_SOURCE_DIR}
# 更新submodule
git submodule update --init --recursive --depth 1
```

## 编译CH依赖
构建target build_ch,生成所有的静态链接库依赖。并刷新cmake build目录。
```shell
export GLUTEN_SOURCE=$(pwd)
cmake -G Ninja -S ${GLUTEN_SOURCE}/cpp-ch -B ${GLUTEN_SOURCE}/cpp-ch/build_ch -DCH_SOURCE_DIR=${CH_SOURCE_DIR} -DCMAKE_C_COMPILER=clang-15 -DCMAKE_CXX_COMPILER=clang++-15 -DCMAKE_BUILD_TYPE=Release
cmake --build ${GLUTEN_SOURCE}/cpp-ch/build_ch --target build_ch
```

动态链接库位于`cpp-ch/build/utils/extern-local-engine/libch.so`

# 模块拆分原理
local_engine目录与ClickHouse项目通过软链接的方式关联,cpp-ch下的cmake会在ClickHouse创建一个utils/extern-local-engine的软链接。
整体开发方式可以与之前保持一致,只是git提交需要通过gluten项目完成。
新增cmake option:
* ENABLE_EXTERN_LOCAL_ENGINE=ON 在导入ClickHouse时指定启用extern-local-engine

# 一些问题
启动时需要指定LD_PRELOAD={path of libch.so},目的是让libch.so内的jemalloc最先被加载。

```shell
spark-submit --conf spark.executorEnv.LD_PRELOAD=/path/to/your/library
```

# 新的Jenkins CI
https://cicd-aws.kyligence.com/job/Gluten/job/gluten-ci/
公共只读账号:gluten/hN2xX3uQ4m
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#include <AggregateFunctions/AggregateFunctionPartialMerge.h>
#include <AggregateFunctions/AggregateFunctionCombinatorFactory.h>
#include <DataTypes/DataTypeAggregateFunction.h>


using namespace DB;

/// Declarations of the error codes thrown by the PartialMerge combinator in this file.
/// They are only declared here (extern); the definitions are not part of this file.
namespace DB
{
namespace ErrorCodes
{
/// Thrown when the argument is not an AggregateFunction(...) type or names a different function.
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
/// Thrown when the combinator is not given exactly one argument.
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
}

namespace local_engine
{

namespace
{

class AggregateFunctionCombinatorPartialMerge final : public IAggregateFunctionCombinator
{
public:
String getName() const override { return "PartialMerge"; }

DataTypes transformArguments(const DataTypes & arguments) const override
{
if (arguments.size() != 1)
throw Exception("Incorrect number of arguments for aggregate function with " + getName() + " suffix", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

const DataTypePtr & argument = arguments[0];

const DataTypeAggregateFunction * function = typeid_cast<const DataTypeAggregateFunction *>(argument.get());
if (!function)
throw Exception("Illegal type " + argument->getName() + " of argument for aggregate function with " + getName() + " suffix"
+ " must be AggregateFunction(...)", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

const DataTypeAggregateFunction * function2 = typeid_cast<const DataTypeAggregateFunction *>(function->getArgumentsDataTypes()[0].get());
if (function2) {
return transformArguments(function->getArgumentsDataTypes());
}
return function->getArgumentsDataTypes();
}

AggregateFunctionPtr transformAggregateFunction(
const AggregateFunctionPtr & nested_function,
const AggregateFunctionProperties &,
const DataTypes & arguments,
const Array & params) const override
{
DataTypePtr & argument = const_cast<DataTypePtr &>(arguments[0]);

const DataTypeAggregateFunction * function = typeid_cast<const DataTypeAggregateFunction *>(argument.get());
if (!function)
throw Exception("Illegal type " + argument->getName() + " of argument for aggregate function with " + getName() + " suffix"
+ " must be AggregateFunction(...)", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

while (nested_function->getName() != function->getFunctionName()) {
argument = function->getArgumentsDataTypes()[0];
function = typeid_cast<const DataTypeAggregateFunction *>(function->getArgumentsDataTypes()[0].get());
if (!function)
throw Exception("Illegal type " + argument->getName() + " of argument for aggregate function with " + getName() + " suffix"
+ " must be AggregateFunction(...)", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}

if (nested_function->getName() != function->getFunctionName())
throw Exception("Illegal type " + argument->getName() + " of argument for aggregate function with " + getName() + " suffix"
+ ", because it corresponds to different aggregate function: " + function->getFunctionName() + " instead of " + nested_function->getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

return std::make_shared<AggregateFunctionPartialMerge>(nested_function, argument, params);
}
};

}

/// Adds the "PartialMerge" combinator to the given combinator factory.
void registerAggregateFunctionCombinatorPartialMerge(AggregateFunctionCombinatorFactory & factory)
{
    auto partial_merge_combinator = std::make_shared<AggregateFunctionCombinatorPartialMerge>();
    factory.registerCombinator(partial_merge_combinator);
}

}
Loading

0 comments on commit 0787a06

Please sign in to comment.