Skip to content

Commit

Permalink
[CI] Build and package Treelite with OpenMP (#353)
Browse files Browse the repository at this point in the history
- Rewrite `ParallelFor` using OpenMP constructs.
- Link Treelite libs with OpenMP runtime lib.
- On MacOS, bundle libomp (OpenMP runtime) with Treelite. This ensures that Treelite does not crash randomly. See dmlc/xgboost#7621 for a full explanation.
- Build PyPI wheel targeting Apple Silicon. Closes #350
- On Linux, run `auditwheel repair` command to vendor (bundle) `libgomp.so` inside the Python wheel. This is required by the Python packaging standard.
  • Loading branch information
hcho3 authored Feb 3, 2022
1 parent ab699e7 commit f815a58
Show file tree
Hide file tree
Showing 13 changed files with 291 additions and 125 deletions.
8 changes: 8 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ jobs:
- os: osx
env: TASK=python_coverage_test
osx_image: xcode10.2
# Build Python wheels for MacOS Intel and Apple Silicon
# pypa/cibuildwheel is used, hence CIBW prefix
- os: osx
env: TASK=python_wheels CIBW_PLATFORM_ID=macosx_x86_64
osx_image: xcode12.5
- os: osx
env: TASK=python_wheels CIBW_PLATFORM_ID=macosx_arm64
osx_image: xcode12.5

# dependent brew packages
addons:
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ if(MSVC)
endif()

option(TEST_COVERAGE "C++ test coverage" OFF)
option(USE_OPENMP "Use OpenMP" ON)
option(BUILD_CPP_TEST "Build C++ tests" OFF)
option(BUILD_STATIC_LIBS "Build static libs, in addition to dynamic libs" OFF)
option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON)
Expand Down
13 changes: 10 additions & 3 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,24 @@ jobs:
- script: tests/ci_build/ci_build.sh cpu tests/ci_build/build_via_cmake.sh
displayName: 'Building Treelite...'
- script: |
TAG=manylinux2014_x86_64
tests/ci_build/ci_build.sh cpu bash -c "cd python/ && python setup.py bdist_wheel --universal"
tests/ci_build/ci_build.sh auditwheel_x86_64 auditwheel repair --only-plat --plat ${TAG} python/dist/*.whl
rm -v python/dist/*.whl
mv -v wheelhouse/*.whl python/dist/
tests/ci_build/ci_build.sh cpu python tests/ci_build/rename_whl.py python/dist/*.whl $(Build.SourceVersion) ${TAG}
displayName: 'Packaging Python wheel for Treelite...'
- task: PublishPipelineArtifact@0
inputs:
artifactName: 'python_linux_whl'
targetPath: 'python/dist/'
- script: |
TAG=manylinux2014_x86_64
tests/ci_build/ci_build.sh cpu bash -c "cd runtime/python/ && python setup.py bdist_wheel --universal"
tests/ci_build/ci_build.sh auditwheel_x86_64 auditwheel repair --only-plat --plat ${TAG} runtime/python/dist/*.whl
rm -v runtime/python/dist/*.whl
mv -v wheelhouse/*.whl runtime/python/dist/
tests/ci_build/ci_build.sh cpu python tests/ci_build/rename_whl.py runtime/python/dist/*.whl $(Build.SourceVersion) ${TAG}
displayName: 'Packaging Python wheel for Treelite runtime...'
- task: PublishPipelineArtifact@1
inputs:
Expand Down Expand Up @@ -210,9 +220,6 @@ jobs:
- script: python -m pytest -v --fulltrace tests/python/test_basic.py
displayName: 'Running Python tests...'
- script: |
TAG=manylinux2014_x86_64
python tests/ci_build/rename_whl.py main/*.whl $(Build.SourceVersion) ${TAG}
python tests/ci_build/rename_whl.py runtime/*.whl $(Build.SourceVersion) ${TAG}
python -m awscli s3 cp main/*.whl s3://treelite-wheels/ --acl public-read || true
python -m awscli s3 cp runtime/*.whl s3://treelite-wheels/ --acl public-read || true
displayName: 'Uploading Python wheels...'
Expand Down
5 changes: 4 additions & 1 deletion cmake/TreeliteConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

include(CMakeFindDependencyMacro)

find_dependency(Threads)
set(USE_OPENMP @USE_OPENMP@)
if(USE_OPENMP)
find_dependency(OpenMP)
endif()

if(NOT TARGET treelite::treelite)
include(${CMAKE_CURRENT_LIST_DIR}/TreeliteTargets.cmake)
Expand Down
7 changes: 7 additions & 0 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ def build(self, src_dir, build_dir, generator, build_tool=None):
"""Build the core library with CMake."""
cmake_cmd = ['cmake', src_dir, generator]

# Flag for cross-compiling for Apple Silicon
# We use environment variable because it's the only way to pass down custom flags
# through the cibuildwheel package, which otherwise calls `python setup.py bdist_wheel`
# command.
if 'CIBW_TARGET_OSX_ARM64' in os.environ:
cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")

self.logger.info('Run CMake command: %s', str(cmake_cmd))
subprocess.check_call(cmake_cmd, cwd=build_dir)

Expand Down
7 changes: 7 additions & 0 deletions runtime/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,13 @@ def build(self, src_dir, build_dir, generator, build_tool=None, use_omp=1):
"""Build the runtime with CMake."""
cmake_cmd = ['cmake', src_dir, generator]

# Flag for cross-compiling for Apple Silicon
# We use environment variable because it's the only way to pass down custom flags
# through the cibuildwheel package, which otherwise calls `python setup.py bdist_wheel`
# command.
if 'CIBW_TARGET_OSX_ARM64' in os.environ:
cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")

self.logger.info('Run CMake command: %s', str(cmake_cmd))
subprocess.check_call(cmake_cmd, cwd=build_dir)

Expand Down
17 changes: 15 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,16 @@ endif(UNIX)

add_library(objtreelite_common OBJECT) # Component shared by both main package and runtime

find_package(Threads REQUIRED)
if(USE_OPENMP)
if (APPLE)
# Require CMake 3.16+ on Mac OSX, as previous versions of CMake had trouble locating
# OpenMP on Mac. See https://github.com/dmlc/xgboost/pull/5146#issuecomment-568312706
cmake_minimum_required(VERSION 3.16)
endif (APPLE)
find_package(OpenMP REQUIRED)
else()
message(STATUS "Disabling OpenMP")
endif()

if(ENABLE_ALL_WARNINGS)
foreach(target objtreelite objtreelite_runtime objtreelite_runtime)
Expand All @@ -24,14 +33,17 @@ foreach(lib objtreelite objtreelite_runtime objtreelite_common)
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>)
target_link_libraries(${lib} PUBLIC Threads::Threads)
if(MSVC)
target_compile_options(${lib} PRIVATE /MP)
target_compile_definitions(${lib} PRIVATE -DNOMINMAX)
target_compile_options(${lib} PRIVATE /utf-8 -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
else()
target_compile_options(${lib} PRIVATE -funroll-loops)
endif()
if(USE_OPENMP)
target_link_libraries(${lib} PUBLIC OpenMP::OpenMP_CXX)
target_compile_definitions(${lib} PRIVATE -DTREELITE_OPENMP_SUPPORT)
endif()
if(TEST_COVERAGE)
if(MSVC)
message(FATAL_ERROR "Test coverage not available on Windows")
Expand Down Expand Up @@ -101,6 +113,7 @@ target_sources(objtreelite
${PROJECT_SOURCE_DIR}/include/treelite/frontend.h
${PROJECT_SOURCE_DIR}/include/treelite/frontend_impl.h
${PROJECT_SOURCE_DIR}/include/treelite/gtil.h
${PROJECT_SOURCE_DIR}/include/treelite/omp.h
${PROJECT_SOURCE_DIR}/include/treelite/optional.h
${PROJECT_SOURCE_DIR}/include/treelite/thread_local.h
${PROJECT_SOURCE_DIR}/include/treelite/tree.h
Expand Down
12 changes: 7 additions & 5 deletions src/annotator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,9 @@ inline void ComputeBranchLoopImpl(
size_t num_col = dmat->num_col;
ElementType missing_value = dmat->missing_value;
bool nan_missing = treelite::math::CheckNAN(missing_value);
treelite::threading_utils::ParallelFor(rbegin, rend, nthread,
[&](std::size_t rid, std::size_t thread_id) {
auto sched = treelite::threading_utils::ParallelSchedule::Static();
treelite::threading_utils::ParallelFor(rbegin, rend, nthread, sched,
[&](std::size_t rid, int thread_id) {
const ElementType* row = &dmat->data[rid * num_col];
const size_t off = dmat->num_col * thread_id;
const size_t off2 = count_row_ptr[ntree] * thread_id;
Expand Down Expand Up @@ -107,8 +108,9 @@ inline void ComputeBranchLoopImpl(
std::vector<Entry<ElementType>> inst(nthread * dmat->num_col, {-1});
size_t ntree = model.trees.size();
TREELITE_CHECK_LE(rbegin, rend);
treelite::threading_utils::ParallelFor(rbegin, rend, nthread,
[&](std::size_t rid, std::size_t thread_id) {
auto sched = treelite::threading_utils::ParallelSchedule::Static();
treelite::threading_utils::ParallelFor(rbegin, rend, nthread, sched,
[&](std::size_t rid, int thread_id) {
const size_t off = dmat->num_col * thread_id;
const size_t off2 = count_row_ptr[ntree] * thread_id;
const size_t ibegin = dmat->row_ptr[rid];
Expand Down Expand Up @@ -192,7 +194,7 @@ AnnotateImpl(

count_row_ptr = {0};
const size_t ntree = model.trees.size();
const int max_thread = static_cast<int>(std::thread::hardware_concurrency());
const int max_thread = static_cast<int>(threading_utils::MaxNumThread());
nthread = (nthread == 0) ? max_thread : std::min(nthread, max_thread);
for (const treelite::Tree<ThresholdType, LeafOutputType>& tree : model.trees) {
count_row_ptr.push_back(count_row_ptr.back() + tree.num_nodes);
Expand Down
159 changes: 109 additions & 50 deletions src/threading_utils/parallel_for.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,75 +7,134 @@
#ifndef TREELITE_THREADING_UTILS_PARALLEL_FOR_H_
#define TREELITE_THREADING_UTILS_PARALLEL_FOR_H_

#include <treelite/omp.h>
#include <treelite/logging.h>
#include <future>
#include <thread>
#include <algorithm>
#include <vector>
#include <type_traits>
#include <exception>
#include <mutex>
#include <cstddef>

namespace treelite {
namespace threading_utils {

template <typename IndexType>
std::vector<IndexType> ComputeWorkRange(IndexType begin, IndexType end, std::size_t nthread);
/*!
 * \brief Captures the first exception thrown inside an OpenMP parallel region so that
 *        it can be rethrown on the calling thread after the region has finished.
 *
 * An exception must not escape the structured block of an OpenMP construct, so each
 * loop body is executed through Run() and the caller invokes Rethrow() once the
 * parallel loop is over.
 */
class OMPException {
 private:
  // First exception captured by Run(); stays null until something is thrown
  std::exception_ptr omp_exception_;
  // Guards omp_exception_ against concurrent writes from several threads
  std::mutex mutex_;

 public:
  /*!
   * \brief Invoke f(params...) and store any exception it throws instead of propagating it
   * \param f callable to execute
   * \param params arguments passed to f
   *
   * Only the first exception seen is kept; later ones are dropped.
   */
  template <typename Function, typename... Parameters>
  void Run(Function f, Parameters... params) {
    try {
      f(params...);
    } catch (...) {
      // Catch every exception type, not only those derived from std::exception:
      // anything that leaves this function would escape the OpenMP region.
      std::lock_guard<std::mutex> lock(mutex_);
      if (!omp_exception_) {
        omp_exception_ = std::current_exception();
      }
    }
  }

  /*!
   * \brief Rethrow the stored exception, if any. Should be called from the main thread
   *        after the parallel region; does nothing when no exception was captured.
   */
  void Rethrow() {
    if (omp_exception_) {
      std::rethrow_exception(omp_exception_);
    }
  }
};

/*!
 * \brief Query the thread limit reported by the OpenMP runtime
 * \return the value of omp_get_max_threads()
 */
inline int MaxNumThread() {
  const int max_threads = omp_get_max_threads();
  return max_threads;
}

/*!
 * \brief Selects the OpenMP loop schedule, plus an optional chunk size.
 *
 * Instances are normally obtained from the static factories below. A chunk of 0
 * is the default value used when the caller does not give one.
 */
struct ParallelSchedule {
  // Schedule kind
  enum {
    kAuto,
    kDynamic,
    kStatic,
    kGuided,
  } sched;
  // Chunk size; 0 unless set explicitly through Dynamic(n) or Static(n)
  std::size_t chunk{0};

  /*! \brief Schedule of kind kAuto with chunk 0 */
  static ParallelSchedule Auto() {
    return {kAuto};
  }

  /*! \brief Schedule of kind kDynamic with chunk n */
  static ParallelSchedule Dynamic(std::size_t n = 0) {
    return {kDynamic, n};
  }

  /*! \brief Schedule of kind kStatic with chunk n */
  static ParallelSchedule Static(std::size_t n = 0) {
    return {kStatic, n};
  }

  /*! \brief Schedule of kind kGuided with chunk 0 */
  static ParallelSchedule Guided() {
    return {kGuided};
  }
};

template <typename IndexType, typename FuncType>
void ParallelFor(IndexType begin, IndexType end, std::size_t nthread, FuncType func) {
inline void ParallelFor(IndexType begin, IndexType end, int nthread, ParallelSchedule sched,
FuncType func) {
TREELITE_CHECK_GT(nthread, 0) << "nthread must be positive";
TREELITE_CHECK_LE(nthread, std::thread::hardware_concurrency())
<< "nthread cannot exceed " << std::thread::hardware_concurrency();
TREELITE_CHECK_LE(nthread, MaxNumThread()) << "nthread cannot exceed " << MaxNumThread();
if (begin == end) {
return;
}
/* Divide the range [begin, end) equally among the threads.
* The i-th thread gets the range [work_range[i], work_range[i+1]). */
std::vector<IndexType> work_range = ComputeWorkRange(begin, end, nthread);

// Launch (nthread - 1) threads, as the main thread should also perform work.
std::vector<std::future<void>> async_tasks;
for (std::size_t thread_id = 1; thread_id < nthread; ++thread_id) {
async_tasks.push_back(std::async(std::launch::async, [&work_range, &func, thread_id]() {
const IndexType begin_ = work_range[thread_id];
const IndexType end_ = work_range[thread_id + 1];
for (IndexType i = begin_; i < end_; ++i) {
func(i, thread_id);
#if defined(_MSC_VER)
// msvc doesn't support unsigned integer as openmp index.
using OmpInd = std::conditional_t<std::is_signed<IndexType>::value, IndexType, std::int64_t>;
#else
using OmpInd = IndexType;
#endif

OMPException exc;
switch (sched.sched) {
case ParallelSchedule::kAuto: {
#pragma omp parallel for num_threads(nthread)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
break;
}
case ParallelSchedule::kDynamic: {
if (sched.chunk == 0) {
#pragma omp parallel for num_threads(nthread) schedule(dynamic)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
} else {
#pragma omp parallel for num_threads(nthread) schedule(dynamic, sched.chunk)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
}));
}
break;
}
{
const IndexType begin_ = work_range[0];
const IndexType end_ = work_range[1];
for (IndexType i = begin_; i < end_; ++i) {
func(i, 0);
case ParallelSchedule::kStatic: {
if (sched.chunk == 0) {
#pragma omp parallel for num_threads(nthread) schedule(static)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
} else {
#pragma omp parallel for num_threads(nthread) schedule(static, sched.chunk)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
}
break;
}
// Join threads
for (auto& task : async_tasks) {
task.get();
case ParallelSchedule::kGuided: {
#pragma omp parallel for num_threads(nthread) schedule(guided)
for (OmpInd i = begin; i < end; ++i) {
exc.Run(func, i, omp_get_thread_num());
}
break;
}
}

template <typename IndexType>
std::vector<IndexType> ComputeWorkRange(IndexType begin, IndexType end, std::size_t nthread) {
TREELITE_CHECK_GE(end, 0) << "end must be 0 or greater";
TREELITE_CHECK_GE(begin, 0) << "begin must be 0 or greater";
TREELITE_CHECK_GE(end, begin) << "end cannot be less than begin";
TREELITE_CHECK_GT(nthread, 0) << "nthread must be positive";
IndexType num_elem = end - begin;
const IndexType portion = num_elem / nthread + !!(num_elem % nthread);
// integer division, rounded-up

std::vector<IndexType> work_range(nthread + 1);
work_range[0] = begin;
IndexType acc = begin;
for (std::size_t i = 0; i < nthread; ++i) {
acc += portion;
work_range[i + 1] = std::min(acc, end);
}
TREELITE_CHECK_EQ(work_range[nthread], end);

return work_range;
exc.Rethrow();
}

} // namespace threading_utils
Expand Down
15 changes: 15 additions & 0 deletions tests/ci_build/Dockerfile.auditwheel_x86_64
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# manylinux2014 image used by CI to run `auditwheel repair` on the built wheels
FROM quay.io/pypa/manylinux2014_x86_64

# Install lightweight sudo (not bound to TTY)
# Use the key=value form of ENV; the whitespace-separated form is legacy syntax.
ENV GOSU_VERSION=1.10
# --fail makes curl exit with an error on an HTTP failure instead of saving the
# error page as the gosu binary; `gosu nobody true` then smoke-tests the download.
RUN set -ex; \
    curl --fail -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
    chmod +x /usr/local/bin/gosu && \
    gosu nobody true

# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/

WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]
Loading

0 comments on commit f815a58

Please sign in to comment.