Skip to content

Commit

Permalink
Remove OpenMP dependency (#300)
Browse files Browse the repository at this point in the history
  • Loading branch information
hcho3 authored Jul 16, 2021
1 parent 59217af commit 45b803f
Show file tree
Hide file tree
Showing 9 changed files with 258 additions and 60 deletions.
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ if(MSVC)
endif()

option(TEST_COVERAGE "C++ test coverage" OFF)
option(USE_OPENMP "Use OpenMP" ON)
option(BUILD_CPP_TEST "Build C++ tests" OFF)
option(BUILD_STATIC_LIBS "Build static libs, in addition to dynamic libs" OFF)
option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON)
Expand Down
6 changes: 2 additions & 4 deletions cmake/TreeliteConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@

include(CMakeFindDependencyMacro)

set(USE_OPENMP @USE_OPENMP@)
if(USE_OPENMP)
find_dependency(OpenMP)
endif()
find_dependency(Threads)

if(NOT TARGET treelite::treelite)
include(${CMAKE_CURRENT_LIST_DIR}/TreeliteTargets.cmake)
endif()
Expand Down
18 changes: 3 additions & 15 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,7 @@ endif(UNIX)

add_library(objtreelite_common OBJECT) # Component shared by both main package and runtime

if(USE_OPENMP)
if (APPLE)
# Require CMake 3.16+ on Mac OSX, as previous versions of CMake had trouble locating
# OpenMP on Mac. See https://github.com/dmlc/xgboost/pull/5146#issuecomment-568312706
cmake_minimum_required(VERSION 3.16)
endif (APPLE)
find_package(OpenMP REQUIRED)
else()
message(STATUS "Disabling OpenMP")
endif()
find_package(Threads REQUIRED)

if(ENABLE_ALL_WARNINGS)
foreach(target objtreelite objtreelite_runtime objtreelite_runtime)
Expand All @@ -33,17 +24,14 @@ foreach(lib objtreelite objtreelite_runtime objtreelite_common)
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>)
target_link_libraries(${lib} PUBLIC Threads::Threads)
if(MSVC)
target_compile_options(${lib} PRIVATE /MP)
target_compile_definitions(${lib} PRIVATE -DNOMINMAX)
target_compile_options(${lib} PRIVATE /utf-8 -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
else()
target_compile_options(${lib} PRIVATE -funroll-loops)
endif()
if(USE_OPENMP)
target_link_libraries(${lib} PUBLIC OpenMP::OpenMP_CXX)
target_compile_definitions(${lib} PRIVATE -DTREELITE_OPENMP_SUPPORT)
endif()
if(TEST_COVERAGE)
if(MSVC)
message(FATAL_ERROR "Test coverage not available on Windows")
Expand Down Expand Up @@ -98,6 +86,7 @@ target_sources(objtreelite
gtil/predict.cc
gtil/pred_transform.h
gtil/pred_transform.cc
threading_utils/parallel_for.h
annotator.cc
filesystem.cc
optable.cc
Expand All @@ -112,7 +101,6 @@ target_sources(objtreelite
${PROJECT_SOURCE_DIR}/include/treelite/frontend.h
${PROJECT_SOURCE_DIR}/include/treelite/frontend_impl.h
${PROJECT_SOURCE_DIR}/include/treelite/gtil.h
${PROJECT_SOURCE_DIR}/include/treelite/omp.h
${PROJECT_SOURCE_DIR}/include/treelite/optional.h
${PROJECT_SOURCE_DIR}/include/treelite/thread_local.h
${PROJECT_SOURCE_DIR}/include/treelite/tree.h
Expand Down
33 changes: 13 additions & 20 deletions src/annotator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@
#include <treelite/logging.h>
#include <treelite/annotator.h>
#include <treelite/math.h>
#include <treelite/omp.h>
#include <rapidjson/istreamwrapper.h>
#include <rapidjson/ostreamwrapper.h>
#include <rapidjson/writer.h>
#include <rapidjson/document.h>
#include <limits>
#include <thread>
#include <cstdint>
#include "threading_utils/parallel_for.h"

namespace {

Expand Down Expand Up @@ -73,18 +74,14 @@ inline void ComputeBranchLoopImpl(
std::vector<Entry<ElementType>> inst(nthread * dmat->num_col, {-1});
const size_t ntree = model.trees.size();
TREELITE_CHECK_LE(rbegin, rend);
TREELITE_CHECK_LT(static_cast<int64_t>(rend), std::numeric_limits<int64_t>::max());
const size_t num_col = dmat->num_col;
const ElementType missing_value = dmat->missing_value;
const bool nan_missing = treelite::math::CheckNAN(missing_value);
const auto rbegin_i = static_cast<int64_t>(rbegin);
const auto rend_i = static_cast<int64_t>(rend);
#pragma omp parallel for schedule(static) num_threads(nthread)
for (int64_t rid = rbegin_i; rid < rend_i; ++rid) {
const int tid = omp_get_thread_num();
treelite::threading_utils::ParallelFor(rbegin, rend, nthread,
[&](std::size_t rid, std::size_t thread_id) {
const ElementType* row = &dmat->data[rid * num_col];
const size_t off = dmat->num_col * tid;
const size_t off2 = count_row_ptr[ntree] * tid;
const size_t off = dmat->num_col * thread_id;
const size_t off2 = count_row_ptr[ntree] * thread_id;
for (size_t j = 0; j < num_col; ++j) {
if (treelite::math::CheckNAN(row[j])) {
TREELITE_CHECK(nan_missing)
Expand All @@ -99,7 +96,7 @@ inline void ComputeBranchLoopImpl(
for (size_t j = 0; j < num_col; ++j) {
inst[off + j].missing = -1;
}
}
});
}

template <typename ElementType, typename ThresholdType, typename LeafOutputType>
Expand All @@ -110,14 +107,10 @@ inline void ComputeBranchLoopImpl(
std::vector<Entry<ElementType>> inst(nthread * dmat->num_col, {-1});
const size_t ntree = model.trees.size();
TREELITE_CHECK_LE(rbegin, rend);
TREELITE_CHECK_LT(static_cast<int64_t>(rend), std::numeric_limits<int64_t>::max());
const auto rbegin_i = static_cast<int64_t>(rbegin);
const auto rend_i = static_cast<int64_t>(rend);
#pragma omp parallel for schedule(static) num_threads(nthread)
for (int64_t rid = rbegin_i; rid < rend_i; ++rid) {
const int tid = omp_get_thread_num();
const size_t off = dmat->num_col * tid;
const size_t off2 = count_row_ptr[ntree] * tid;
treelite::threading_utils::ParallelFor(rbegin, rend, nthread,
[&](std::size_t rid, std::size_t thread_id) {
const size_t off = dmat->num_col * thread_id;
const size_t off2 = count_row_ptr[ntree] * thread_id;
const size_t ibegin = dmat->row_ptr[rid];
const size_t iend = dmat->row_ptr[rid + 1];
for (size_t i = ibegin; i < iend; ++i) {
Expand All @@ -129,7 +122,7 @@ inline void ComputeBranchLoopImpl(
for (size_t i = ibegin; i < iend; ++i) {
inst[off + dmat->col_ind[i]].missing = -1;
}
}
});
}

template <typename ElementType>
Expand Down Expand Up @@ -199,7 +192,7 @@ AnnotateImpl(

count_row_ptr = {0};
const size_t ntree = model.trees.size();
const int max_thread = omp_get_max_threads();
const int max_thread = static_cast<int>(std::thread::hardware_concurrency());
nthread = (nthread == 0) ? max_thread : std::min(nthread, max_thread);
for (const treelite::Tree<ThresholdType, LeafOutputType>& tree : model.trees) {
count_row_ptr.push_back(count_row_ptr.back() + tree.num_nodes);
Expand Down
84 changes: 84 additions & 0 deletions src/threading_utils/parallel_for.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*!
* Copyright (c) 2021 by Contributors
* \file parallel_for.h
* \brief Implementation of parallel for loop
* \author Hyunsu Cho
*/
#ifndef TREELITE_THREADING_UTILS_PARALLEL_FOR_H_
#define TREELITE_THREADING_UTILS_PARALLEL_FOR_H_

#include <treelite/logging.h>
#include <future>
#include <thread>
#include <algorithm>
#include <vector>
#include <cstddef>

namespace treelite {
namespace threading_utils {

/*!
 * \brief Split the range [begin, end) into nthread contiguous chunks.
 *        Declared here because ParallelFor uses it; defined later in this header.
 */
template <typename IndexType>
std::vector<IndexType> ComputeWorkRange(IndexType begin, IndexType end, std::size_t nthread);

/*!
 * \brief Invoke func(i, thread_id) once for every index i in [begin, end), with the range
 *        divided into contiguous chunks that are processed concurrently.
 *
 * Chunk 0 is processed on the calling thread; every other non-empty chunk is processed by a
 * task launched with std::async(std::launch::async, ...). The call returns only after all
 * chunks have been processed. An exception thrown by func inside a launched task is re-thrown
 * to the caller when that task's future is collected.
 *
 * \param begin first index of the range (inclusive)
 * \param end last index of the range (exclusive)
 * \param nthread number of chunks / workers; must be positive, and must not exceed
 *                std::thread::hardware_concurrency() when that value is known (non-zero)
 * \param func callable taking (IndexType index, std::size_t thread_id); thread_id is the index
 *             of the chunk that contains the index, in the range [0, nthread)
 */
template <typename IndexType, typename FuncType>
void ParallelFor(IndexType begin, IndexType end, std::size_t nthread, FuncType func) {
  TREELITE_CHECK_GT(nthread, 0) << "nthread must be positive";
  // hardware_concurrency() returns 0 when the value is not computable. In that case there is
  // no meaningful upper bound to enforce, and comparing against 0 would reject every call.
  const unsigned int max_thread = std::thread::hardware_concurrency();
  if (max_thread > 0) {
    TREELITE_CHECK_LE(nthread, max_thread) << "nthread cannot exceed " << max_thread;
  }
  if (begin == end) {
    return;
  }
  /* Divide the range [begin, end) equally among the threads.
   * The i-th thread gets the range [work_range[i], work_range[i+1]). */
  std::vector<IndexType> work_range = ComputeWorkRange(begin, end, nthread);

  // Launch at most (nthread - 1) tasks, as the calling thread also performs work.
  // async_tasks is declared after work_range, so on any exit path (including an exception
  // thrown by func on the calling thread) the futures are destroyed -- and therefore waited
  // on -- before work_range, which the tasks reference, goes away.
  std::vector<std::future<void>> async_tasks;
  async_tasks.reserve(nthread - 1);
  for (std::size_t thread_id = 1; thread_id < nthread; ++thread_id) {
    if (work_range[thread_id] == work_range[thread_id + 1]) {
      // The chunk size is rounded up, so trailing chunks may be empty; no task is needed.
      continue;
    }
    async_tasks.push_back(std::async(std::launch::async, [&work_range, &func, thread_id]() {
      const IndexType begin_ = work_range[thread_id];
      const IndexType end_ = work_range[thread_id + 1];
      for (IndexType i = begin_; i < end_; ++i) {
        func(i, thread_id);
      }
    }));
  }
  {
    // Chunk 0 runs on the calling thread
    const IndexType begin_ = work_range[0];
    const IndexType end_ = work_range[1];
    for (IndexType i = begin_; i < end_; ++i) {
      func(i, 0);
    }
  }
  // Wait for the launched tasks; get() also re-throws any exception raised inside a task
  for (auto& task : async_tasks) {
    task.get();
  }
}

/*!
 * \brief Split the range [begin, end) into nthread contiguous chunks.
 *
 * Every chunk holds ceil((end - begin) / nthread) elements, except that chunks are truncated
 * at end; as a result, trailing chunks may be shorter or empty.
 *
 * \param begin first index of the range (inclusive); must be 0 or greater
 * \param end last index of the range (exclusive); must be no less than begin
 * \param nthread number of chunks; must be positive
 * \return vector of (nthread + 1) boundaries; the i-th chunk is
 *         [work_range[i], work_range[i+1]), with work_range[0] == begin and
 *         work_range[nthread] == end
 */
template <typename IndexType>
std::vector<IndexType> ComputeWorkRange(IndexType begin, IndexType end, std::size_t nthread) {
  TREELITE_CHECK_GE(end, 0) << "end must be 0 or greater";
  TREELITE_CHECK_GE(begin, 0) << "begin must be 0 or greater";
  TREELITE_CHECK_GE(end, begin) << "end cannot be less than begin";
  TREELITE_CHECK_GT(nthread, 0) << "nthread must be positive";
  const IndexType num_elem = end - begin;
  const IndexType portion = num_elem / nthread + !!(num_elem % nthread);
    // integer division, rounded-up

  std::vector<IndexType> work_range(nthread + 1);
  work_range[0] = begin;
  IndexType acc = begin;
  for (std::size_t i = 0; i < nthread; ++i) {
    // Advance by one portion, but never past end. Clamping the step (rather than the sum)
    // keeps acc within [begin, end], so it cannot overflow IndexType even when end is close
    // to the largest representable value.
    acc += std::min(portion, static_cast<IndexType>(end - acc));
    work_range[i + 1] = acc;
  }
  TREELITE_CHECK_EQ(work_range[nthread], end);

  return work_range;
}

} // namespace threading_utils
} // namespace treelite

#endif // TREELITE_THREADING_UTILS_PARALLEL_FOR_H_
3 changes: 2 additions & 1 deletion tests/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ target_sources(treelite_cpp_test
test_serializer.cc
test_frontend.cc
test_compiler_param.cc
test_threading_utils.cc
)

target_include_directories(treelite_cpp_test
PRIVATE ../../src/frontend
PRIVATE ../../src/frontend ../../src/
)

msvc_use_static_runtime()
137 changes: 137 additions & 0 deletions tests/cpp/test_threading_utils.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*!
* Copyright (c) 2021 by Contributors
* \file test_threading_utils.cc
* \author Hyunsu Cho
* \brief C++ tests for threading utilities
*/
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <vector>
#include <algorithm>
#include <limits>
#include <thread>
#include <random>
#include <cstddef>
#include <cstdint>
#include "threading_utils/parallel_for.h"

namespace {

/*! \brief Helper producing random integers and reals for the property-based tests below */
class RandomGenerator {
 public:
  RandomGenerator()
    : rng_(std::random_device()()),
      int_dist_(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max()),
      real_dist_(0.0, 1.0) {}

  /*!
   * \brief Draw an integer from the half-open interval [low, high)
   * \param low lower bound (inclusive)
   * \param high upper bound (exclusive); must be greater than low
   */
  int64_t DrawInteger(int64_t low, int64_t high) {
    TREELITE_CHECK_LT(low, high);
    const int64_t span = high - low;
    // The remainder carries the sign of the raw draw, so it lies strictly between
    // -span and span. Negative remainders are folded back by counting down from high.
    const int64_t rem = int_dist_(rng_) % span;
    const int64_t ret = (rem < 0) ? (high + rem) : (low + rem);
    TREELITE_CHECK_GE(ret, low);
    TREELITE_CHECK_LT(ret, high);
    return ret;
  }

  /*!
   * \brief Draw a real number by scaling a draw from real_dist_ to start at low
   *        and span (high - low)
   * \param low lower bound
   * \param high upper bound; must be greater than low
   */
  double DrawReal(double low, double high) {
    TREELITE_CHECK_LT(low, high);
    const double width = high - low;
    const double unit = real_dist_(rng_);
    return unit * width + low;
  }

 private:
  std::mt19937 rng_;
  std::uniform_int_distribution<int64_t> int_dist_;
  std::uniform_real_distribution<double> real_dist_;
};

} // namespace anonymous

namespace treelite {
namespace threading_utils {

TEST(ThreadingUtils, ComputeWorkRange) {
  /* Invalid arguments must be rejected with treelite::Error */
  EXPECT_THROW(ComputeWorkRange(0, 100, 0), treelite::Error);      // zero threads
  EXPECT_THROW(ComputeWorkRange(-100, 100, 3), treelite::Error);   // negative begin
  EXPECT_THROW(ComputeWorkRange(-200, -100, 3), treelite::Error);  // negative begin and end
  EXPECT_THROW(ComputeWorkRange(200, 100, 3), treelite::Error);    // end < begin

  /* Property-based testing with randomly generated parameters */
  RandomGenerator rng;

  constexpr int kNumTrial = 200;
  for (int trial = 0; trial < kNumTrial; ++trial) {
    const int64_t begin = rng.DrawInteger(0, 10000);
    const auto nthread = static_cast<std::size_t>(rng.DrawInteger(1, 100));
    const int64_t end = rng.DrawInteger(begin, 10000);
    const auto range = ComputeWorkRange(begin, end, nthread);
    // The boundaries must start at begin, finish at end, and never decrease
    EXPECT_EQ(range.size(), nthread + 1);
    EXPECT_EQ(range[0], begin);
    EXPECT_EQ(range[nthread], end);
    for (std::size_t k = 1; k <= nthread; ++k) {
      EXPECT_GE(range[k], range[k - 1]);
    }
  }
  // Test the case with begin == end: every boundary must collapse to the same value
  constexpr int kNumEmptyTrial = 10;
  for (int trial = 0; trial < kNumEmptyTrial; ++trial) {
    const int64_t begin = rng.DrawInteger(0, 10000);
    const int64_t end = begin;
    const auto nthread = static_cast<std::size_t>(rng.DrawInteger(1, 100));
    const auto range = ComputeWorkRange(begin, end, nthread);
    EXPECT_EQ(range.size(), nthread + 1);
    EXPECT_EQ(range[0], begin);
    EXPECT_EQ(range[nthread], begin);
    for (std::size_t k = 1; k <= nthread; ++k) {
      EXPECT_EQ(range[k], range[k - 1]);
    }
  }
}

TEST(ThreadingUtils, ParallelFor) {
  /* Test error handling */
  const int max_thread = static_cast<int>(std::thread::hardware_concurrency());

  auto dummy_func = [](int, std::size_t) {};
  EXPECT_THROW(ParallelFor(0, 100, 0, dummy_func), treelite::Error);       // zero threads
  EXPECT_THROW(ParallelFor(200, 100, 3, dummy_func), treelite::Error);     // end < begin
  EXPECT_THROW(ParallelFor(-100, 100, 3, dummy_func), treelite::Error);    // negative begin
  EXPECT_THROW(ParallelFor(-200, -100, 3, dummy_func), treelite::Error);   // negative range
  EXPECT_THROW(ParallelFor(10, 20, 3 * max_thread, dummy_func), treelite::Error);
    // more threads than the hardware reports

  /* Property-based testing with randomly generated parameters */
  constexpr int kVectorLength = 10000;
  // Values at or above this threshold mark entries of c that were never written to.
  // Since a[i] + b[i] always lies in [-11, 11), the two cases cannot be confused.
  constexpr double kDummyLow = 100.0;
  RandomGenerator rng;
  std::vector<double> a(kVectorLength);
  std::vector<double> b(kVectorLength);
  std::generate_n(a.begin(), kVectorLength, [&rng]() { return rng.DrawReal(-1.0, 1.0); });
  std::generate_n(b.begin(), kVectorLength, [&rng]() { return rng.DrawReal(-10.0, 10.0); });

  constexpr int kNumTrial = 200;
  for (int trial = 0; trial < kNumTrial; ++trial) {
    std::vector<double> c(kVectorLength);
    // Fill c with dummy values
    std::generate_n(c.begin(), kVectorLength,
                    [&rng, kDummyLow]() { return rng.DrawReal(kDummyLow, 200.0); });

    // Compute c := a + b on range [begin, end)
    const int64_t begin = rng.DrawInteger(0, kVectorLength);
    const auto nthread = static_cast<std::size_t>(rng.DrawInteger(1, max_thread + 1));
    const int64_t end = rng.DrawInteger(begin, kVectorLength);

    ParallelFor(begin, end, nthread, [&a, &b, &c](int64_t i, std::size_t) {
      c[i] = a[i] + b[i];
    });

    // Entries inside [begin, end) must hold the sum. The operands are doubles, so compare
    // at double precision rather than narrowing to float.
    for (int64_t i = begin; i < end; ++i) {
      EXPECT_DOUBLE_EQ(c[i], a[i] + b[i]) << ", at index " << i;
    }
    // Entries outside [begin, end) must still hold their dummy values
    for (int64_t i = 0; i < begin; ++i) {
      EXPECT_GE(c[i], kDummyLow) << ", at index " << i;
    }
    for (int64_t i = end; i < kVectorLength; ++i) {
      EXPECT_GE(c[i], kDummyLow) << ", at index " << i;
    }
  }
}

} // namespace threading_utils
} // namespace treelite
Loading

0 comments on commit 45b803f

Please sign in to comment.