Skip to content

Commit

Permalink
Merge pull request #1244 from stan-dev/perf-regression
Browse files Browse the repository at this point in the history
Revert "Merge pull request #1212 from stan-dev/feature/faster-ad-tls-v5"
  • Loading branch information
rok-cesnovar authored May 18, 2019
2 parents dd45af4 + db1a002 commit f49f224
Show file tree
Hide file tree
Showing 18 changed files with 145 additions and 362 deletions.
16 changes: 1 addition & 15 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,6 @@ pipeline {
skipDefaultCheckout()
preserveStashes(buildCount: 7)
}
environment {
STAN_NUM_THREADS = '4'
}
stages {
stage('Kill previous builds') {
when {
Expand Down Expand Up @@ -228,17 +225,6 @@ pipeline {
runTestsWin("test/unit")
}
}
stage('Windows Threading') {
agent { label 'windows' }
steps {
deleteDirWin()
unstash 'MathSetup'
bat "echo CXX=${env.CXX} -Werror > make/local"
bat "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
runTestsWin("test/unit -f thread")
runTestsWin("test/unit -f map_rect")
}
}
}
}
stage('Additional merge tests') {
Expand All @@ -250,7 +236,7 @@ pipeline {
deleteDir()
unstash 'MathSetup'
sh "echo CXX=${GCC} >> make/local"
sh "echo CXXFLAGS=-DSTAN_THREADS >> make/local"
sh "echo CPPFLAGS=-DSTAN_THREADS >> make/local"
runTests("test/unit")
}
post { always { retry(3) { deleteDir() } } }
Expand Down
1 change: 0 additions & 1 deletion stan/math/prim/mat/functor/map_rect.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <stan/math/prim/arr/err/check_matching_sizes.hpp>
#include <stan/math/prim/mat/fun/dims.hpp>
#include <stan/math/prim/mat/fun/typedefs.hpp>
#include <stan/math/prim/scal/meta/return_type.hpp>

#define STAN_REGISTER_MAP_RECT(CALLID, FUNCTOR)

Expand Down
73 changes: 70 additions & 3 deletions stan/math/prim/mat/functor/map_rect_concurrent.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@

#include <stan/math/prim/mat/fun/typedefs.hpp>

#include <stan/math/prim/scal/meta/return_type.hpp>
#include <stan/math/prim/mat/functor/map_rect_reduce.hpp>
#include <stan/math/prim/mat/functor/map_rect_combine.hpp>
#include <stan/math/prim/scal/err/invalid_argument.hpp>
#include <boost/lexical_cast.hpp>

#include <cstdlib>
#include <vector>
#include <thread>
#include <future>
#include <cstdlib>

namespace stan {
namespace math {
Expand Down Expand Up @@ -75,7 +77,72 @@ map_rect_concurrent(
const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&
job_params,
const std::vector<std::vector<double>>& x_r,
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr);
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr) {
typedef map_rect_reduce<F, T_shared_param, T_job_param> ReduceF;
typedef map_rect_combine<F, T_shared_param, T_job_param> CombineF;

const int num_jobs = job_params.size();
const vector_d shared_params_dbl = value_of(shared_params);
std::vector<std::future<std::vector<matrix_d>>> futures;

auto execute_chunk = [&](int start, int size) -> std::vector<matrix_d> {
const int end = start + size;
std::vector<matrix_d> chunk_f_out;
chunk_f_out.reserve(size);
for (int i = start; i != end; i++)
chunk_f_out.push_back(ReduceF()(
shared_params_dbl, value_of(job_params[i]), x_r[i], x_i[i], msgs));
return chunk_f_out;
};

int num_threads = get_num_threads(num_jobs);
int num_jobs_per_thread = num_jobs / num_threads;
futures.emplace_back(
std::async(std::launch::deferred, execute_chunk, 0, num_jobs_per_thread));

#ifdef STAN_THREADS
if (num_threads > 1) {
const int num_big_threads = num_jobs % num_threads;
const int first_big_thread = num_threads - num_big_threads;
for (int i = 1, job_start = num_jobs_per_thread, job_size = 0;
i < num_threads; ++i, job_start += job_size) {
job_size = i >= first_big_thread ? num_jobs_per_thread + 1
: num_jobs_per_thread;
futures.emplace_back(
std::async(std::launch::async, execute_chunk, job_start, job_size));
}
}
#endif

// collect results
std::vector<int> world_f_out;
world_f_out.reserve(num_jobs);
matrix_d world_output(0, 0);

int offset = 0;
for (std::size_t i = 0; i < futures.size(); ++i) {
const std::vector<matrix_d>& chunk_result = futures[i].get();
if (i == 0)
world_output.resize(chunk_result[0].rows(),
num_jobs * chunk_result[0].cols());

for (const auto& job_result : chunk_result) {
const int num_job_outputs = job_result.cols();
world_f_out.push_back(num_job_outputs);

if (world_output.cols() < offset + num_job_outputs)
world_output.conservativeResize(Eigen::NoChange,
2 * (offset + num_job_outputs));

world_output.block(0, offset, world_output.rows(), num_job_outputs)
= job_result;

offset += num_job_outputs;
}
}
CombineF combine(shared_params, job_params);
return combine(world_output, world_f_out);
}

} // namespace internal
} // namespace math
Expand Down
1 change: 0 additions & 1 deletion stan/math/rev/core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#include <stan/math/rev/core/build_vari_array.hpp>
#include <stan/math/rev/core/chainable_alloc.hpp>
#include <stan/math/rev/core/chainablestack.hpp>
#include <stan/math/rev/core/init_chainablestack.hpp>
#include <stan/math/rev/core/ddv_vari.hpp>
#include <stan/math/rev/core/dv_vari.hpp>
#include <stan/math/rev/core/dvd_vari.hpp>
Expand Down
132 changes: 37 additions & 95 deletions stan/math/rev/core/autodiffstackstorage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,98 +7,42 @@
namespace stan {
namespace math {

// Internal macro used to modify global pointer definition to the
// global AD instance.
#ifdef STAN_THREADS
// Whenever STAN_THREADS is set a TLS keyword is used. For reasons
// explained below we use the GNU compiler extension __thread if
// supported by the compiler while the generic thread_local C++11
// keyword is used otherwise.
#ifdef __GNUC__
#define STAN_THREADS_DEF __thread
#else
#define STAN_THREADS_DEF thread_local
#endif
#else
// In case STAN_THREADS is not set, then no modifier is needed.
#define STAN_THREADS_DEF
#endif

/**
* This struct always provides access to the autodiff stack using
* the singleton pattern. Read warnings below!
*
* The singleton <code>instance_</code> is a global static pointer,
* which is thread local (TLS) if the STAN_THREADS preprocess variable
* is defined.
* Provides a thread_local singleton if needed. Read warnings below!
* For performance reasons the singleton is a global static for the
* case of no threading which is returned by a function. This design
* should allow the compiler to apply necessary inlining to get
* maximal performance. However, this design suffers from "the static
* init order fiasco"[0]. Anywhere this is used, we must be
* absolutely positive that it doesn't matter when the singleton will
* get initialized relative to other static variables. In exchange,
* we get a more performant singleton pattern for the non-threading
* case. In the threading case we use the defacto standard C++11
* singleton pattern relying on a function wrapping a static local
* variable. This standard pattern is expected to be well supported
* by the major compilers (as its standard), but it does incur some
* performance penalty. There has been some discussion on this; see
* [1] and [2] and the discussions those PRs link to as well.
*
* The use of a pointer is motivated by performance reasons for the
* threading case. When a TLS is used, initialization with a constant
* expression at compile time is required for fast access to the
* TLS. As the autodiff storage struct is non-POD, its initialization
* is a dynamic expression at compile time. These dynamic expressions
* are wrapped, in the TLS case, by a TLS wrapper function which slows
* down its access. Using a pointer instead allows to initialize at
* compile time to <code>nullptr</code>, which is a compile time
* constant. In this case, the compiler avoids the use of a TLS
* wrapper function.
*
* For performance reasons we use the __thread keyword on compilers
* which support it. The __thread keyword is a GNU compiler-specific
* (gcc, clang, Intel) extension which requires initialization with a
* compile time constant expression. The C++11 keyword thread_local
* does allow for constant and dynamic initialization of the
* TLS. Thus, only the __thread keyword gurantees that constant
* initialization and it's implied speedup, is used.
*
* The initialzation of the AD instance at run-time is handled by the
* lifetime of a AutodiffStackSingleton object. More specifically, the
* first instance of the AutodiffStackSingleton object will initialize
* the AD instance and take ownership (it is the only one instance
* with the private member own_instance_ being true). Thus, whenever
* the first instance of the AutodiffStackSingleton object gets
* destructed, the AD tape will be destructed as well. Within
* stan-math the initialization of the AD instance for the main thread
* of the program is handled by instantiating the singleton once in
* the init_chainablestack.hpp file. Whenever STAN_THREADS is defined
* then all created child threads must instantiate a
* AutodiffStackSingleton object within the child thread before
* accessing the AD system in order to initialize the TLS AD tape
* within the child thread.
*
* The design of a globally held (optionally TLS) pointer, which is
* globally initialized, allows the compiler to apply necessary
* inlining to get maximal performance. However, the design suffers
* from "the static init order fiasco"[0]. Whenever the static init
* order fiasco occurs, the C++ client of the library may instantiate
* a AutodiffStackSingleton object at the adequate code position prior
* to any AD tape access to ensure proper initialization order. In
* exchange, we get a more performant singleton pattern with automatic
* initialization of the AD stack for the main thread. There has been
* some discussion on earlier designs using the Mayer singleton
* approach; see [1] and [2] and the discussions those PRs link to as
* well.
* These are thread_local only if the user asks for it with
* -DSTAN_THREADS. This is primarily because Apple clang compilers
* before 2016 don't support thread_local and the additional
* performance cost. We have proposed removing support for those[3],
* and at that time we should evaluate the performance of a switch to
* thread_local. If there is no loss in performance, we can remove
* this ifdef.
*
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
* [1] https://github.com/stan-dev/math/pull/840
* [2] https://github.com/stan-dev/math/pull/826
* [3]
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
* [4] https://github.com/stan-dev/math/pull/1135
*/
template <typename ChainableT, typename ChainableAllocT>
struct AutodiffStackSingleton {
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
AutodiffStackSingleton_t;

AutodiffStackSingleton() : own_instance_(init()) {}
~AutodiffStackSingleton() {
if (own_instance_) {
delete instance_;
instance_ = nullptr;
}
}

struct AutodiffStackStorage {
AutodiffStackStorage &operator=(const AutodiffStackStorage &) = delete;

Expand All @@ -113,32 +57,30 @@ struct AutodiffStackSingleton {
std::vector<size_t> nested_var_alloc_stack_starts_;
};

AutodiffStackSingleton() = delete;
explicit AutodiffStackSingleton(AutodiffStackSingleton_t const &) = delete;
AutodiffStackSingleton &operator=(const AutodiffStackSingleton_t &) = delete;

static inline constexpr AutodiffStackStorage &instance() {
return *instance_;
static inline AutodiffStackStorage &instance() {
#ifdef STAN_THREADS
thread_local static AutodiffStackStorage instance_;
#endif
return instance_;
}

private:
static bool init() {
if (!instance_) {
instance_ = new AutodiffStackStorage();
return true;
}
return false;
}
#ifndef STAN_THREADS

static STAN_THREADS_DEF AutodiffStackStorage *instance_;
const bool own_instance_;
private:
static AutodiffStackStorage instance_;
#endif
};

#ifndef STAN_THREADS
template <typename ChainableT, typename ChainableAllocT>
STAN_THREADS_DEF
typename AutodiffStackSingleton<ChainableT,
ChainableAllocT>::AutodiffStackStorage
*AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
= nullptr;
typename AutodiffStackSingleton<ChainableT,
ChainableAllocT>::AutodiffStackStorage
AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
#endif

} // namespace math
} // namespace stan
Expand Down
18 changes: 0 additions & 18 deletions stan/math/rev/core/init_chainablestack.hpp

This file was deleted.

1 change: 0 additions & 1 deletion stan/math/rev/mat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@
#include <stan/math/rev/mat/functor/integrate_ode_adams.hpp>
#include <stan/math/rev/mat/functor/integrate_ode_bdf.hpp>
#include <stan/math/rev/mat/functor/integrate_dae.hpp>
#include <stan/math/rev/mat/functor/map_rect_concurrent.hpp>
#include <stan/math/rev/mat/functor/map_rect_reduce.hpp>

#endif
Loading

0 comments on commit f49f224

Please sign in to comment.