stan-dev · rok-cesnovar · May 18, 2019 · May 16, 2019
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -49,9 +49,6 @@ pipeline {
         skipDefaultCheckout()
         preserveStashes(buildCount: 7)
     }
-    environment {
-        STAN_NUM_THREADS = '4'
-    }
     stages {
         stage('Kill previous builds') {
             when {
@@ -228,17 +225,6 @@ pipeline {
                         runTestsWin("test/unit")
                     }
                 }
-                stage('Windows Threading') {
-                    agent { label 'windows' }
-                    steps {
-                        deleteDirWin()
-                        unstash 'MathSetup'
-                        bat "echo CXX=${env.CXX} -Werror > make/local"
-                        bat "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
-                        runTestsWin("test/unit -f thread")
-                        runTestsWin("test/unit -f map_rect")
-                    }
-                }
             }
         }
         stage('Additional merge tests') {
@@ -250,7 +236,7 @@ pipeline {
                         deleteDir()
                         unstash 'MathSetup'
                         sh "echo CXX=${GCC} >> make/local"
-                        sh "echo CXXFLAGS=-DSTAN_THREADS >> make/local"
+                        sh "echo CPPFLAGS=-DSTAN_THREADS >> make/local"
                         runTests("test/unit")
                     }
                     post { always { retry(3) { deleteDir() } } }

diff --git a/stan/math/prim/mat/functor/map_rect.hpp b/stan/math/prim/mat/functor/map_rect.hpp
@@ -4,7 +4,6 @@
 #include <stan/math/prim/arr/err/check_matching_sizes.hpp>
 #include <stan/math/prim/mat/fun/dims.hpp>
 #include <stan/math/prim/mat/fun/typedefs.hpp>
-#include <stan/math/prim/scal/meta/return_type.hpp>
 
 #define STAN_REGISTER_MAP_RECT(CALLID, FUNCTOR)
 

diff --git a/stan/math/prim/mat/functor/map_rect_concurrent.hpp b/stan/math/prim/mat/functor/map_rect_concurrent.hpp
@@ -3,13 +3,15 @@
 
 #include <stan/math/prim/mat/fun/typedefs.hpp>
 
-#include <stan/math/prim/scal/meta/return_type.hpp>
+#include <stan/math/prim/mat/functor/map_rect_reduce.hpp>
+#include <stan/math/prim/mat/functor/map_rect_combine.hpp>
 #include <stan/math/prim/scal/err/invalid_argument.hpp>
 #include <boost/lexical_cast.hpp>
 
-#include <cstdlib>
 #include <vector>
 #include <thread>
+#include <future>
+#include <cstdlib>
 
 namespace stan {
 namespace math {
@@ -75,7 +77,72 @@ map_rect_concurrent(
     const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&
         job_params,
     const std::vector<std::vector<double>>& x_r,
-    const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr);
+    const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr) {
+  typedef map_rect_reduce<F, T_shared_param, T_job_param> ReduceF;
+  typedef map_rect_combine<F, T_shared_param, T_job_param> CombineF;
+
+  const int num_jobs = job_params.size();
+  const vector_d shared_params_dbl = value_of(shared_params);
+  std::vector<std::future<std::vector<matrix_d>>> futures;
+
+  auto execute_chunk = [&](int start, int size) -> std::vector<matrix_d> {
+    const int end = start + size;
+    std::vector<matrix_d> chunk_f_out;
+    chunk_f_out.reserve(size);
+    for (int i = start; i != end; i++)
+      chunk_f_out.push_back(ReduceF()(
+          shared_params_dbl, value_of(job_params[i]), x_r[i], x_i[i], msgs));
+    return chunk_f_out;
+  };
+
+  int num_threads = get_num_threads(num_jobs);
+  int num_jobs_per_thread = num_jobs / num_threads;
+  futures.emplace_back(
+      std::async(std::launch::deferred, execute_chunk, 0, num_jobs_per_thread));
+
+#ifdef STAN_THREADS
+  if (num_threads > 1) {
+    const int num_big_threads = num_jobs % num_threads;
+    const int first_big_thread = num_threads - num_big_threads;
+    for (int i = 1, job_start = num_jobs_per_thread, job_size = 0;
+         i < num_threads; ++i, job_start += job_size) {
+      job_size = i >= first_big_thread ? num_jobs_per_thread + 1
+                                       : num_jobs_per_thread;
+      futures.emplace_back(
+          std::async(std::launch::async, execute_chunk, job_start, job_size));
+    }
+  }
+#endif
+
+  // collect results
+  std::vector<int> world_f_out;
+  world_f_out.reserve(num_jobs);
+  matrix_d world_output(0, 0);
+
+  int offset = 0;
+  for (std::size_t i = 0; i < futures.size(); ++i) {
+    const std::vector<matrix_d>& chunk_result = futures[i].get();
+    if (i == 0)
+      world_output.resize(chunk_result[0].rows(),
+                          num_jobs * chunk_result[0].cols());
+
+    for (const auto& job_result : chunk_result) {
+      const int num_job_outputs = job_result.cols();
+      world_f_out.push_back(num_job_outputs);
+
+      if (world_output.cols() < offset + num_job_outputs)
+        world_output.conservativeResize(Eigen::NoChange,
+                                        2 * (offset + num_job_outputs));
+
+      world_output.block(0, offset, world_output.rows(), num_job_outputs)
+          = job_result;
+
+      offset += num_job_outputs;
+    }
+  }
+  CombineF combine(shared_params, job_params);
+  return combine(world_output, world_f_out);
+}
 
 }  // namespace internal
 }  // namespace math

diff --git a/stan/math/rev/core.hpp b/stan/math/rev/core.hpp
@@ -5,7 +5,6 @@
 #include <stan/math/rev/core/build_vari_array.hpp>
 #include <stan/math/rev/core/chainable_alloc.hpp>
 #include <stan/math/rev/core/chainablestack.hpp>
-#include <stan/math/rev/core/init_chainablestack.hpp>
 #include <stan/math/rev/core/ddv_vari.hpp>
 #include <stan/math/rev/core/dv_vari.hpp>
 #include <stan/math/rev/core/dvd_vari.hpp>

diff --git a/stan/math/rev/core/autodiffstackstorage.hpp b/stan/math/rev/core/autodiffstackstorage.hpp
@@ -7,98 +7,42 @@
 namespace stan {
 namespace math {
 
-// Internal macro used to modify global pointer definition to the
-// global AD instance.
-#ifdef STAN_THREADS
-// Whenever STAN_THREADS is set a TLS keyword is used. For reasons
-// explained below we use the GNU compiler extension __thread if
-// supported by the compiler while the generic thread_local C++11
-// keyword is used otherwise.
-#ifdef __GNUC__
-#define STAN_THREADS_DEF __thread
-#else
-#define STAN_THREADS_DEF thread_local
-#endif
-#else
-// In case STAN_THREADS is not set, then no modifier is needed.
-#define STAN_THREADS_DEF
-#endif
-
 /**
- * This struct always provides access to the autodiff stack using
- * the singleton pattern. Read warnings below!
- *
- * The singleton <code>instance_</code> is a global static pointer,
- * which is thread local (TLS) if the STAN_THREADS preprocess variable
- * is defined.
+ * Provides a thread_local singleton if needed. Read warnings below!
+ * For performance reasons the singleton is a global static for the
+ * case of no threading which is returned by a function. This design
+ * should allow the compiler to apply necessary inlining to get
+ * maximal performance. However, this design suffers from "the static
+ * init order fiasco"[0].  Anywhere this is used, we must be
+ * absolutely positive that it doesn't matter when the singleton will
+ * get initialized relative to other static variables.  In exchange,
+ * we get a more performant singleton pattern for the non-threading
+ * case. In the threading case we use the defacto standard C++11
+ * singleton pattern relying on a function wrapping a static local
+ * variable. This standard pattern is expected to be well supported
+ * by the major compilers (as its standard), but it does incur some
+ * performance penalty.  There has been some discussion on this; see
+ * [1] and [2] and the discussions those PRs link to as well.
  *
- * The use of a pointer is motivated by performance reasons for the
- * threading case. When a TLS is used, initialization with a constant
- * expression at compile time is required for fast access to the
- * TLS. As the autodiff storage struct is non-POD, its initialization
- * is a dynamic expression at compile time. These dynamic expressions
- * are wrapped, in the TLS case, by a TLS wrapper function which slows
- * down its access. Using a pointer instead allows to initialize at
- * compile time to <code>nullptr</code>, which is a compile time
- * constant. In this case, the compiler avoids the use of a TLS
- * wrapper function.
- *
- * For performance reasons we use the __thread keyword on compilers
- * which support it. The __thread keyword is a GNU compiler-specific
- * (gcc, clang, Intel) extension which requires initialization with a
- * compile time constant expression. The C++11 keyword thread_local
- * does allow for constant and dynamic initialization of the
- * TLS. Thus, only the __thread keyword gurantees that constant
- * initialization and it's implied speedup, is used.
- *
- * The initialzation of the AD instance at run-time is handled by the
- * lifetime of a AutodiffStackSingleton object. More specifically, the
- * first instance of the AutodiffStackSingleton object will initialize
- * the AD instance and take ownership (it is the only one instance
- * with the private member own_instance_ being true). Thus, whenever
- * the first instance of the AutodiffStackSingleton object gets
- * destructed, the AD tape will be destructed as well. Within
- * stan-math the initialization of the AD instance for the main thread
- * of the program is handled by instantiating the singleton once in
- * the init_chainablestack.hpp file. Whenever STAN_THREADS is defined
- * then all created child threads must instantiate a
- * AutodiffStackSingleton object within the child thread before
- * accessing the AD system in order to initialize the TLS AD tape
- * within the child thread.
- *
- * The design of a globally held (optionally TLS) pointer, which is
- * globally initialized, allows the compiler to apply necessary
- * inlining to get maximal performance. However, the design suffers
- * from "the static init order fiasco"[0]. Whenever the static init
- * order fiasco occurs, the C++ client of the library may instantiate
- * a AutodiffStackSingleton object at the adequate code position prior
- * to any AD tape access to ensure proper initialization order. In
- * exchange, we get a more performant singleton pattern with automatic
- * initialization of the AD stack for the main thread. There has been
- * some discussion on earlier designs using the Mayer singleton
- * approach; see [1] and [2] and the discussions those PRs link to as
- * well.
+ * These are thread_local only if the user asks for it with
+ * -DSTAN_THREADS. This is primarily because Apple clang compilers
+ * before 2016 don't support thread_local and the additional
+ * performance cost. We have proposed removing support for those[3],
+ * and at that time we should evaluate the performance of a switch to
+ * thread_local.  If there is no loss in performance, we can remove
+ * this ifdef.
  *
  * [0] https://isocpp.org/wiki/faq/ctors#static-init-order
  * [1] https://github.com/stan-dev/math/pull/840
  * [2] https://github.com/stan-dev/math/pull/826
  * [3]
  * http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
- * [4] https://github.com/stan-dev/math/pull/1135
  */
 template <typename ChainableT, typename ChainableAllocT>
 struct AutodiffStackSingleton {
   typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
       AutodiffStackSingleton_t;
 
-  AutodiffStackSingleton() : own_instance_(init()) {}
-  ~AutodiffStackSingleton() {
-    if (own_instance_) {
-      delete instance_;
-      instance_ = nullptr;
-    }
-  }
-
   struct AutodiffStackStorage {
     AutodiffStackStorage &operator=(const AutodiffStackStorage &) = delete;
 
@@ -113,32 +57,30 @@ struct AutodiffStackSingleton {
     std::vector<size_t> nested_var_alloc_stack_starts_;
   };
 
+  AutodiffStackSingleton() = delete;
   explicit AutodiffStackSingleton(AutodiffStackSingleton_t const &) = delete;
   AutodiffStackSingleton &operator=(const AutodiffStackSingleton_t &) = delete;
 
-  static inline constexpr AutodiffStackStorage &instance() {
-    return *instance_;
+  static inline AutodiffStackStorage &instance() {
+#ifdef STAN_THREADS
+    thread_local static AutodiffStackStorage instance_;
+#endif
+    return instance_;
   }
 
- private:
-  static bool init() {
-    if (!instance_) {
-      instance_ = new AutodiffStackStorage();
-      return true;
-    }
-    return false;
-  }
+#ifndef STAN_THREADS
 
-  static STAN_THREADS_DEF AutodiffStackStorage *instance_;
-  const bool own_instance_;
+ private:
+  static AutodiffStackStorage instance_;
+#endif
 };
 
+#ifndef STAN_THREADS
 template <typename ChainableT, typename ChainableAllocT>
-STAN_THREADS_DEF
-    typename AutodiffStackSingleton<ChainableT,
-                                    ChainableAllocT>::AutodiffStackStorage
-        *AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
-    = nullptr;
+typename AutodiffStackSingleton<ChainableT,
+                                ChainableAllocT>::AutodiffStackStorage
+    AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
+#endif
 
 }  // namespace math
 }  // namespace stan

diff --git a/stan/math/rev/core/init_chainablestack.hpp b/stan/math/rev/core/init_chainablestack.hpp
diff --git a/stan/math/rev/mat.hpp b/stan/math/rev/mat.hpp
@@ -68,7 +68,6 @@
 #include <stan/math/rev/mat/functor/integrate_ode_adams.hpp>
 #include <stan/math/rev/mat/functor/integrate_ode_bdf.hpp>
 #include <stan/math/rev/mat/functor/integrate_dae.hpp>
-#include <stan/math/rev/mat/functor/map_rect_concurrent.hpp>
 #include <stan/math/rev/mat/functor/map_rect_reduce.hpp>
 
 #endif