Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CI] Build and package Treelite with OpenMP #353

Merged
merged 18 commits into from
Feb 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ jobs:
- os: osx
env: TASK=python_coverage_test
osx_image: xcode10.2
# Build Python wheels for MacOS Intel and Apple Silicon
# pypa/cibuildwheel is used, hence CIBW prefix
- os: osx
env: TASK=python_wheels CIBW_PLATFORM_ID=macosx_x86_64
osx_image: xcode12.5
- os: osx
env: TASK=python_wheels CIBW_PLATFORM_ID=macosx_arm64
osx_image: xcode12.5

# dependent brew packages
addons:
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ if(MSVC)
endif()

option(TEST_COVERAGE "C++ test coverage" OFF)
option(USE_OPENMP "Use OpenMP" ON)
option(BUILD_CPP_TEST "Build C++ tests" OFF)
option(BUILD_STATIC_LIBS "Build static libs, in addition to dynamic libs" OFF)
option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON)
Expand Down
13 changes: 10 additions & 3 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,24 @@ jobs:
- script: tests/ci_build/ci_build.sh cpu tests/ci_build/build_via_cmake.sh
displayName: 'Building Treelite...'
- script: |
TAG=manylinux2014_x86_64
tests/ci_build/ci_build.sh cpu bash -c "cd python/ && python setup.py bdist_wheel --universal"
tests/ci_build/ci_build.sh auditwheel_x86_64 auditwheel repair --only-plat --plat ${TAG} python/dist/*.whl
rm -v python/dist/*.whl
mv -v wheelhouse/*.whl python/dist/
tests/ci_build/ci_build.sh cpu python tests/ci_build/rename_whl.py python/dist/*.whl $(Build.SourceVersion) ${TAG}
displayName: 'Packaging Python wheel for Treelite...'
- task: PublishPipelineArtifact@0
inputs:
artifactName: 'python_linux_whl'
targetPath: 'python/dist/'
- script: |
TAG=manylinux2014_x86_64
tests/ci_build/ci_build.sh cpu bash -c "cd runtime/python/ && python setup.py bdist_wheel --universal"
tests/ci_build/ci_build.sh auditwheel_x86_64 auditwheel repair --only-plat --plat ${TAG} runtime/python/dist/*.whl
rm -v runtime/python/dist/*.whl
mv -v wheelhouse/*.whl runtime/python/dist/
tests/ci_build/ci_build.sh cpu python tests/ci_build/rename_whl.py runtime/python/dist/*.whl $(Build.SourceVersion) ${TAG}
displayName: 'Packaging Python wheel for Treelite runtime...'
- task: PublishPipelineArtifact@1
inputs:
Expand Down Expand Up @@ -210,9 +220,6 @@ jobs:
- script: python -m pytest -v --fulltrace tests/python/test_basic.py
displayName: 'Running Python tests...'
- script: |
TAG=manylinux2014_x86_64
python tests/ci_build/rename_whl.py main/*.whl $(Build.SourceVersion) ${TAG}
python tests/ci_build/rename_whl.py runtime/*.whl $(Build.SourceVersion) ${TAG}
python -m awscli s3 cp main/*.whl s3://treelite-wheels/ --acl public-read || true
python -m awscli s3 cp runtime/*.whl s3://treelite-wheels/ --acl public-read || true
displayName: 'Uploading Python wheels...'
Expand Down
5 changes: 4 additions & 1 deletion cmake/TreeliteConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

include(CMakeFindDependencyMacro)

find_dependency(Threads)
set(USE_OPENMP @USE_OPENMP@)
if(USE_OPENMP)
find_dependency(OpenMP)
endif()

if(NOT TARGET treelite::treelite)
include(${CMAKE_CURRENT_LIST_DIR}/TreeliteTargets.cmake)
Expand Down
7 changes: 7 additions & 0 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ def build(self, src_dir, build_dir, generator, build_tool=None):
"""Build the core library with CMake."""
cmake_cmd = ['cmake', src_dir, generator]

# Flag for cross-compiling for Apple Silicon
# We use an environment variable because it's the only way to pass down custom flags
# through the cibuildwheel package, which otherwise calls the `python setup.py bdist_wheel`
# command.
if 'CIBW_TARGET_OSX_ARM64' in os.environ:
cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")

self.logger.info('Run CMake command: %s', str(cmake_cmd))
subprocess.check_call(cmake_cmd, cwd=build_dir)

Expand Down
7 changes: 7 additions & 0 deletions runtime/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,13 @@ def build(self, src_dir, build_dir, generator, build_tool=None, use_omp=1):
"""Build the runtime with CMake."""
cmake_cmd = ['cmake', src_dir, generator]

# Flag for cross-compiling for Apple Silicon
# We use an environment variable because it's the only way to pass down custom flags
# through the cibuildwheel package, which otherwise calls the `python setup.py bdist_wheel`
# command.
if 'CIBW_TARGET_OSX_ARM64' in os.environ:
cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")

self.logger.info('Run CMake command: %s', str(cmake_cmd))
subprocess.check_call(cmake_cmd, cwd=build_dir)

Expand Down
17 changes: 15 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,16 @@ endif(UNIX)

add_library(objtreelite_common OBJECT) # Component shared by both main package and runtime

find_package(Threads REQUIRED)
if(USE_OPENMP)
if (APPLE)
# Require CMake 3.16+ on Mac OSX, as previous versions of CMake had trouble locating
# OpenMP on Mac. See https://github.com/dmlc/xgboost/pull/5146#issuecomment-568312706
cmake_minimum_required(VERSION 3.16)
endif (APPLE)
find_package(OpenMP REQUIRED)
else()
message(STATUS "Disabling OpenMP")
endif()

if(ENABLE_ALL_WARNINGS)
foreach(target objtreelite objtreelite_runtime objtreelite_runtime)
Expand All @@ -24,14 +33,17 @@ foreach(lib objtreelite objtreelite_runtime objtreelite_common)
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>)
target_link_libraries(${lib} PUBLIC Threads::Threads)
if(MSVC)
target_compile_options(${lib} PRIVATE /MP)
target_compile_definitions(${lib} PRIVATE -DNOMINMAX)
target_compile_options(${lib} PRIVATE /utf-8 -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
else()
target_compile_options(${lib} PRIVATE -funroll-loops)
endif()
if(USE_OPENMP)
target_link_libraries(${lib} PUBLIC OpenMP::OpenMP_CXX)
target_compile_definitions(${lib} PRIVATE -DTREELITE_OPENMP_SUPPORT)
endif()
if(TEST_COVERAGE)
if(MSVC)
message(FATAL_ERROR "Test coverage not available on Windows")
Expand Down Expand Up @@ -101,6 +113,7 @@ target_sources(objtreelite
${PROJECT_SOURCE_DIR}/include/treelite/frontend.h
${PROJECT_SOURCE_DIR}/include/treelite/frontend_impl.h
${PROJECT_SOURCE_DIR}/include/treelite/gtil.h
${PROJECT_SOURCE_DIR}/include/treelite/omp.h
${PROJECT_SOURCE_DIR}/include/treelite/optional.h
${PROJECT_SOURCE_DIR}/include/treelite/thread_local.h
${PROJECT_SOURCE_DIR}/include/treelite/tree.h
Expand Down
12 changes: 7 additions & 5 deletions src/annotator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,9 @@ inline void ComputeBranchLoopImpl(
size_t num_col = dmat->num_col;
ElementType missing_value = dmat->missing_value;
bool nan_missing = treelite::math::CheckNAN(missing_value);
treelite::threading_utils::ParallelFor(rbegin, rend, nthread,
[&](std::size_t rid, std::size_t thread_id) {
auto sched = treelite::threading_utils::ParallelSchedule::Static();
treelite::threading_utils::ParallelFor(rbegin, rend, nthread, sched,
[&](std::size_t rid, int thread_id) {
const ElementType* row = &dmat->data[rid * num_col];
const size_t off = dmat->num_col * thread_id;
const size_t off2 = count_row_ptr[ntree] * thread_id;
Expand Down Expand Up @@ -107,8 +108,9 @@ inline void ComputeBranchLoopImpl(
std::vector<Entry<ElementType>> inst(nthread * dmat->num_col, {-1});
size_t ntree = model.trees.size();
TREELITE_CHECK_LE(rbegin, rend);
treelite::threading_utils::ParallelFor(rbegin, rend, nthread,
[&](std::size_t rid, std::size_t thread_id) {
auto sched = treelite::threading_utils::ParallelSchedule::Static();
treelite::threading_utils::ParallelFor(rbegin, rend, nthread, sched,
[&](std::size_t rid, int thread_id) {
const size_t off = dmat->num_col * thread_id;
const size_t off2 = count_row_ptr[ntree] * thread_id;
const size_t ibegin = dmat->row_ptr[rid];
Expand Down Expand Up @@ -192,7 +194,7 @@ AnnotateImpl(

count_row_ptr = {0};
const size_t ntree = model.trees.size();
const int max_thread = static_cast<int>(std::thread::hardware_concurrency());
const int max_thread = static_cast<int>(threading_utils::MaxNumThread());
nthread = (nthread == 0) ? max_thread : std::min(nthread, max_thread);
for (const treelite::Tree<ThresholdType, LeafOutputType>& tree : model.trees) {
count_row_ptr.push_back(count_row_ptr.back() + tree.num_nodes);
Expand Down
159 changes: 109 additions & 50 deletions src/threading_utils/parallel_for.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,75 +7,134 @@
#ifndef TREELITE_THREADING_UTILS_PARALLEL_FOR_H_
#define TREELITE_THREADING_UTILS_PARALLEL_FOR_H_

#include <treelite/omp.h>
#include <treelite/logging.h>
#include <future>
#include <thread>
#include <algorithm>
#include <vector>
#include <type_traits>
#include <exception>
#include <mutex>
#include <cstddef>

namespace treelite {
namespace threading_utils {

template <typename IndexType>
std::vector<IndexType> ComputeWorkRange(IndexType begin, IndexType end, std::size_t nthread);
/*!
 * \brief OMP Exception class catches, saves and rethrows exception from OMP blocks
 *
 * Only the first exception captured by Run() is kept; exceptions thrown by later calls are
 * discarded. Exceptions of any type are captured, not only those derived from
 * std::exception, so that nothing propagates out of the enclosing OpenMP block.
 */
class OMPException {
 private:
  // exception_ptr member to store the first captured exception (empty if none)
  std::exception_ptr omp_exception_;
  // mutex to be acquired during catch to set the exception_ptr
  std::mutex mutex_;

 public:
  /*!
   * \brief Parallel OMP blocks should be placed within Run to save exception
   * \param f callable to invoke; taken by value
   * \param params arguments passed to f; taken by value
   */
  template <typename Function, typename... Parameters>
  void Run(Function f, Parameters... params) {
    try {
      f(params...);
    } catch (...) {
      // Catch everything: an exception type unrelated to std::exception must not be
      // allowed to leave the OpenMP structured block either.
      std::lock_guard<std::mutex> lock(mutex_);
      if (!omp_exception_) {
        omp_exception_ = std::current_exception();
      }
    }
  }

  /*!
   * \brief should be called from the main thread to rethrow the exception
   *
   * Does nothing if no exception has been captured.
   */
  void Rethrow() {
    if (this->omp_exception_) {
      std::rethrow_exception(this->omp_exception_);
    }
  }
};

/*!
 * \brief Query the thread limit reported by the OpenMP runtime
 * \return the value returned by omp_get_max_threads()
 */
inline int MaxNumThread() {
  const int max_threads = omp_get_max_threads();
  return max_threads;
}

/*!
 * \brief Selects the OpenMP loop schedule, with an optional chunk size
 */
struct ParallelSchedule {
  // Kind of schedule requested
  enum {
    kAuto,
    kDynamic,
    kStatic,
    kGuided,
  } sched;
  // Chunk size; defaults to 0
  std::size_t chunk{0};

  /*! \brief Schedule of kind kAuto; chunk stays at its default of 0 */
  static ParallelSchedule Auto() {
    ParallelSchedule result{kAuto};
    return result;
  }

  /*! \brief Schedule of kind kDynamic with chunk size n (0 by default) */
  static ParallelSchedule Dynamic(std::size_t n = 0) {
    ParallelSchedule result{kDynamic, n};
    return result;
  }

  /*! \brief Schedule of kind kStatic with chunk size n (0 by default) */
  static ParallelSchedule Static(std::size_t n = 0) {
    ParallelSchedule result{kStatic, n};
    return result;
  }

  /*! \brief Schedule of kind kGuided; chunk stays at its default of 0 */
  static ParallelSchedule Guided() {
    ParallelSchedule result{kGuided};
    return result;
  }
};

template <typename IndexType, typename FuncType>
void ParallelFor(IndexType begin, IndexType end, std::size_t nthread, FuncType func) {
inline void ParallelFor(IndexType begin, IndexType end, int nthread, ParallelSchedule sched,
FuncType func) {
TREELITE_CHECK_GT(nthread, 0) << "nthread must be positive";
TREELITE_CHECK_LE(nthread, std::thread::hardware_concurrency())
<< "nthread cannot exceed " << std::thread::hardware_concurrency();
TREELITE_CHECK_LE(nthread, MaxNumThread()) << "nthread cannot exceed " << MaxNumThread();
if (begin == end) {
return;
}
/* Divide the range [begin, end) equally among the threads.
* The i-th thread gets the range [work_range[i], work_range[i+1]). */
std::vector<IndexType> work_range = ComputeWorkRange(begin, end, nthread);

// Launch (nthread - 1) threads, as the main thread should also perform work.
std::vector<std::future<void>> async_tasks;
for (std::size_t thread_id = 1; thread_id < nthread; ++thread_id) {
async_tasks.push_back(std::async(std::launch::async, [&work_range, &func, thread_id]() {
const IndexType begin_ = work_range[thread_id];
const IndexType end_ = work_range[thread_id + 1];
for (IndexType i = begin_; i < end_; ++i) {
func(i, thread_id);
#if defined(_MSC_VER)
// msvc doesn't support unsigned integer as openmp index.
using OmpInd = std::conditional_t<std::is_signed<IndexType>::value, IndexType, std::int64_t>;
#else
using OmpInd = IndexType;
#endif

OMPException exc;
switch (sched.sched) {
case ParallelSchedule::kAuto: {
#pragma omp parallel for num_threads(nthread)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
break;
}
case ParallelSchedule::kDynamic: {
if (sched.chunk == 0) {
#pragma omp parallel for num_threads(nthread) schedule(dynamic)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
} else {
#pragma omp parallel for num_threads(nthread) schedule(dynamic, sched.chunk)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
}));
}
break;
}
{
const IndexType begin_ = work_range[0];
const IndexType end_ = work_range[1];
for (IndexType i = begin_; i < end_; ++i) {
func(i, 0);
case ParallelSchedule::kStatic: {
if (sched.chunk == 0) {
#pragma omp parallel for num_threads(nthread) schedule(static)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
} else {
#pragma omp parallel for num_threads(nthread) schedule(static, sched.chunk)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
}
break;
}
// Join threads
for (auto& task : async_tasks) {
task.get();
case ParallelSchedule::kGuided: {
#pragma omp parallel for num_threads(nthread) schedule(guided)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
break;
}
}

template <typename IndexType>
std::vector<IndexType> ComputeWorkRange(IndexType begin, IndexType end, std::size_t nthread) {
TREELITE_CHECK_GE(end, 0) << "end must be 0 or greater";
TREELITE_CHECK_GE(begin, 0) << "begin must be 0 or greater";
TREELITE_CHECK_GE(end, begin) << "end cannot be less than begin";
TREELITE_CHECK_GT(nthread, 0) << "nthread must be positive";
IndexType num_elem = end - begin;
const IndexType portion = num_elem / nthread + !!(num_elem % nthread);
// integer division, rounded-up

std::vector<IndexType> work_range(nthread + 1);
work_range[0] = begin;
IndexType acc = begin;
for (std::size_t i = 0; i < nthread; ++i) {
acc += portion;
work_range[i + 1] = std::min(acc, end);
}
TREELITE_CHECK_EQ(work_range[nthread], end);

return work_range;
exc.Rethrow();
}

} // namespace threading_utils
Expand Down
15 changes: 15 additions & 0 deletions tests/ci_build/Dockerfile.auditwheel_x86_64
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FROM quay.io/pypa/manylinux2014_x86_64

# Install lightweight sudo (not bound to TTY)
ENV GOSU_VERSION 1.10
RUN set -ex; \
curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true

# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/

WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]
Loading