Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Merge pull request #1212 from stan-dev/feature/faster-ad-tls-v5" #1244

Merged
merged 1 commit into from
May 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 1 addition & 15 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,6 @@ pipeline {
skipDefaultCheckout()
preserveStashes(buildCount: 7)
}
environment {
STAN_NUM_THREADS = '4'
}
stages {
stage('Kill previous builds') {
when {
Expand Down Expand Up @@ -228,17 +225,6 @@ pipeline {
runTestsWin("test/unit")
}
}
stage('Windows Threading') {
agent { label 'windows' }
steps {
deleteDirWin()
unstash 'MathSetup'
bat "echo CXX=${env.CXX} -Werror > make/local"
bat "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
runTestsWin("test/unit -f thread")
runTestsWin("test/unit -f map_rect")
}
}
}
}
stage('Additional merge tests') {
Expand All @@ -250,7 +236,7 @@ pipeline {
deleteDir()
unstash 'MathSetup'
sh "echo CXX=${GCC} >> make/local"
sh "echo CXXFLAGS=-DSTAN_THREADS >> make/local"
sh "echo CPPFLAGS=-DSTAN_THREADS >> make/local"
runTests("test/unit")
}
post { always { retry(3) { deleteDir() } } }
Expand Down
1 change: 0 additions & 1 deletion stan/math/prim/mat/functor/map_rect.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <stan/math/prim/arr/err/check_matching_sizes.hpp>
#include <stan/math/prim/mat/fun/dims.hpp>
#include <stan/math/prim/mat/fun/typedefs.hpp>
#include <stan/math/prim/scal/meta/return_type.hpp>

#define STAN_REGISTER_MAP_RECT(CALLID, FUNCTOR)

Expand Down
73 changes: 70 additions & 3 deletions stan/math/prim/mat/functor/map_rect_concurrent.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@

#include <stan/math/prim/mat/fun/typedefs.hpp>

#include <stan/math/prim/scal/meta/return_type.hpp>
#include <stan/math/prim/mat/functor/map_rect_reduce.hpp>
#include <stan/math/prim/mat/functor/map_rect_combine.hpp>
#include <stan/math/prim/scal/err/invalid_argument.hpp>
#include <boost/lexical_cast.hpp>

#include <cstdlib>
#include <vector>
#include <thread>
#include <future>
#include <cstdlib>

namespace stan {
namespace math {
Expand Down Expand Up @@ -75,7 +77,72 @@ map_rect_concurrent(
const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&
job_params,
const std::vector<std::vector<double>>& x_r,
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr);
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr) {
typedef map_rect_reduce<F, T_shared_param, T_job_param> ReduceF;
typedef map_rect_combine<F, T_shared_param, T_job_param> CombineF;

const int num_jobs = job_params.size();
const vector_d shared_params_dbl = value_of(shared_params);
std::vector<std::future<std::vector<matrix_d>>> futures;

auto execute_chunk = [&](int start, int size) -> std::vector<matrix_d> {
const int end = start + size;
std::vector<matrix_d> chunk_f_out;
chunk_f_out.reserve(size);
for (int i = start; i != end; i++)
chunk_f_out.push_back(ReduceF()(
shared_params_dbl, value_of(job_params[i]), x_r[i], x_i[i], msgs));
return chunk_f_out;
};

int num_threads = get_num_threads(num_jobs);
int num_jobs_per_thread = num_jobs / num_threads;
futures.emplace_back(
std::async(std::launch::deferred, execute_chunk, 0, num_jobs_per_thread));

#ifdef STAN_THREADS
if (num_threads > 1) {
const int num_big_threads = num_jobs % num_threads;
const int first_big_thread = num_threads - num_big_threads;
for (int i = 1, job_start = num_jobs_per_thread, job_size = 0;
i < num_threads; ++i, job_start += job_size) {
job_size = i >= first_big_thread ? num_jobs_per_thread + 1
: num_jobs_per_thread;
futures.emplace_back(
std::async(std::launch::async, execute_chunk, job_start, job_size));
}
}
#endif

// collect results
std::vector<int> world_f_out;
world_f_out.reserve(num_jobs);
matrix_d world_output(0, 0);

int offset = 0;
for (std::size_t i = 0; i < futures.size(); ++i) {
const std::vector<matrix_d>& chunk_result = futures[i].get();
if (i == 0)
world_output.resize(chunk_result[0].rows(),
num_jobs * chunk_result[0].cols());

for (const auto& job_result : chunk_result) {
const int num_job_outputs = job_result.cols();
world_f_out.push_back(num_job_outputs);

if (world_output.cols() < offset + num_job_outputs)
world_output.conservativeResize(Eigen::NoChange,
2 * (offset + num_job_outputs));

world_output.block(0, offset, world_output.rows(), num_job_outputs)
= job_result;

offset += num_job_outputs;
}
}
CombineF combine(shared_params, job_params);
return combine(world_output, world_f_out);
}

} // namespace internal
} // namespace math
Expand Down
1 change: 0 additions & 1 deletion stan/math/rev/core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#include <stan/math/rev/core/build_vari_array.hpp>
#include <stan/math/rev/core/chainable_alloc.hpp>
#include <stan/math/rev/core/chainablestack.hpp>
#include <stan/math/rev/core/init_chainablestack.hpp>
#include <stan/math/rev/core/ddv_vari.hpp>
#include <stan/math/rev/core/dv_vari.hpp>
#include <stan/math/rev/core/dvd_vari.hpp>
Expand Down
132 changes: 37 additions & 95 deletions stan/math/rev/core/autodiffstackstorage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,98 +7,42 @@
namespace stan {
namespace math {

// Internal macro used to modify global pointer definition to the
// global AD instance.
#ifdef STAN_THREADS
// Whenever STAN_THREADS is set a TLS keyword is used. For reasons
// explained below we use the GNU compiler extension __thread if
// supported by the compiler while the generic thread_local C++11
// keyword is used otherwise.
#ifdef __GNUC__
#define STAN_THREADS_DEF __thread
#else
#define STAN_THREADS_DEF thread_local
#endif
#else
// In case STAN_THREADS is not set, then no modifier is needed.
#define STAN_THREADS_DEF
#endif

/**
* This struct always provides access to the autodiff stack using
* the singleton pattern. Read warnings below!
*
* The singleton <code>instance_</code> is a global static pointer,
* which is thread local (TLS) if the STAN_THREADS preprocess variable
* is defined.
* Provides a thread_local singleton if needed. Read warnings below!
* For performance reasons the singleton is a global static for the
* case of no threading which is returned by a function. This design
* should allow the compiler to apply necessary inlining to get
* maximal performance. However, this design suffers from "the static
* init order fiasco"[0]. Anywhere this is used, we must be
* absolutely positive that it doesn't matter when the singleton will
* get initialized relative to other static variables. In exchange,
* we get a more performant singleton pattern for the non-threading
* case. In the threading case we use the defacto standard C++11
* singleton pattern relying on a function wrapping a static local
* variable. This standard pattern is expected to be well supported
* by the major compilers (as its standard), but it does incur some
* performance penalty. There has been some discussion on this; see
* [1] and [2] and the discussions those PRs link to as well.
*
* The use of a pointer is motivated by performance reasons for the
* threading case. When a TLS is used, initialization with a constant
* expression at compile time is required for fast access to the
* TLS. As the autodiff storage struct is non-POD, its initialization
* is a dynamic expression at compile time. These dynamic expressions
* are wrapped, in the TLS case, by a TLS wrapper function which slows
* down its access. Using a pointer instead allows to initialize at
* compile time to <code>nullptr</code>, which is a compile time
* constant. In this case, the compiler avoids the use of a TLS
* wrapper function.
*
* For performance reasons we use the __thread keyword on compilers
* which support it. The __thread keyword is a GNU compiler-specific
* (gcc, clang, Intel) extension which requires initialization with a
* compile time constant expression. The C++11 keyword thread_local
* does allow for constant and dynamic initialization of the
* TLS. Thus, only the __thread keyword gurantees that constant
* initialization and it's implied speedup, is used.
*
* The initialzation of the AD instance at run-time is handled by the
* lifetime of a AutodiffStackSingleton object. More specifically, the
* first instance of the AutodiffStackSingleton object will initialize
* the AD instance and take ownership (it is the only one instance
* with the private member own_instance_ being true). Thus, whenever
* the first instance of the AutodiffStackSingleton object gets
* destructed, the AD tape will be destructed as well. Within
* stan-math the initialization of the AD instance for the main thread
* of the program is handled by instantiating the singleton once in
* the init_chainablestack.hpp file. Whenever STAN_THREADS is defined
* then all created child threads must instantiate a
* AutodiffStackSingleton object within the child thread before
* accessing the AD system in order to initialize the TLS AD tape
* within the child thread.
*
* The design of a globally held (optionally TLS) pointer, which is
* globally initialized, allows the compiler to apply necessary
* inlining to get maximal performance. However, the design suffers
* from "the static init order fiasco"[0]. Whenever the static init
* order fiasco occurs, the C++ client of the library may instantiate
* a AutodiffStackSingleton object at the adequate code position prior
* to any AD tape access to ensure proper initialization order. In
* exchange, we get a more performant singleton pattern with automatic
* initialization of the AD stack for the main thread. There has been
* some discussion on earlier designs using the Mayer singleton
* approach; see [1] and [2] and the discussions those PRs link to as
* well.
* These are thread_local only if the user asks for it with
* -DSTAN_THREADS. This is primarily because Apple clang compilers
* before 2016 don't support thread_local and the additional
* performance cost. We have proposed removing support for those[3],
* and at that time we should evaluate the performance of a switch to
* thread_local. If there is no loss in performance, we can remove
* this ifdef.
*
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
* [1] https://github.com/stan-dev/math/pull/840
* [2] https://github.com/stan-dev/math/pull/826
* [3]
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
* [4] https://github.com/stan-dev/math/pull/1135
*/
template <typename ChainableT, typename ChainableAllocT>
struct AutodiffStackSingleton {
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
AutodiffStackSingleton_t;

AutodiffStackSingleton() : own_instance_(init()) {}
~AutodiffStackSingleton() {
if (own_instance_) {
delete instance_;
instance_ = nullptr;
}
}

struct AutodiffStackStorage {
AutodiffStackStorage &operator=(const AutodiffStackStorage &) = delete;

Expand All @@ -113,32 +57,30 @@ struct AutodiffStackSingleton {
std::vector<size_t> nested_var_alloc_stack_starts_;
};

AutodiffStackSingleton() = delete;
explicit AutodiffStackSingleton(AutodiffStackSingleton_t const &) = delete;
AutodiffStackSingleton &operator=(const AutodiffStackSingleton_t &) = delete;

static inline constexpr AutodiffStackStorage &instance() {
return *instance_;
static inline AutodiffStackStorage &instance() {
#ifdef STAN_THREADS
thread_local static AutodiffStackStorage instance_;
#endif
return instance_;
}

private:
static bool init() {
if (!instance_) {
instance_ = new AutodiffStackStorage();
return true;
}
return false;
}
#ifndef STAN_THREADS

static STAN_THREADS_DEF AutodiffStackStorage *instance_;
const bool own_instance_;
private:
static AutodiffStackStorage instance_;
#endif
};

#ifndef STAN_THREADS
template <typename ChainableT, typename ChainableAllocT>
STAN_THREADS_DEF
typename AutodiffStackSingleton<ChainableT,
ChainableAllocT>::AutodiffStackStorage
*AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
= nullptr;
typename AutodiffStackSingleton<ChainableT,
ChainableAllocT>::AutodiffStackStorage
AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
#endif

} // namespace math
} // namespace stan
Expand Down
18 changes: 0 additions & 18 deletions stan/math/rev/core/init_chainablestack.hpp

This file was deleted.

1 change: 0 additions & 1 deletion stan/math/rev/mat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@
#include <stan/math/rev/mat/functor/integrate_ode_adams.hpp>
#include <stan/math/rev/mat/functor/integrate_ode_bdf.hpp>
#include <stan/math/rev/mat/functor/integrate_dae.hpp>
#include <stan/math/rev/mat/functor/map_rect_concurrent.hpp>
#include <stan/math/rev/mat/functor/map_rect_reduce.hpp>

#endif
Loading