From 62b29550d1e7c2f96067b7934af0e372744a531c Mon Sep 17 00:00:00 2001 From: tbbdev Date: Thu, 15 Sep 2016 14:57:37 +0300 Subject: [PATCH] Committing Intel(R) TBB 4.4 Update 1 source code --- CHANGES | 50 ++++++ README.md | 4 +- build/android.inc | 2 + build/linux.gcc.inc | 2 +- build/windows.icl.inc | 3 + doc/Release_Notes.txt | 11 ++ doc/html/a00241.html | 18 +- doc/html/a00256.html | 34 ++-- examples/common/toolset.props | 2 +- examples/graph/index.html | 2 + examples/index.html | 2 +- .../fractal/Makefile | 16 +- .../fractal/Makefile.windows | 0 .../fractal/fractal.cpp | 116 ++++++++----- .../fractal/fractal.h | 69 ++++++-- .../fractal/fractal_video.h | 30 +++- .../fractal/index.html | 30 +++- .../fractal/main.cpp | 0 .../fractal/msvs/fractal.sln | 0 .../fractal/msvs/fractal.vcxproj | 0 .../fractal/msvs/gui.ico | Bin .../fractal/msvs/gui.rc | 0 .../fractal/msvs/resource.h | 0 .../fractal/msvs/small.ico | Bin .../xcode/fractal.xcodeproj/project.pbxproj | 0 .../xcschemes/tbbExample.xcscheme | 0 .../{task_priority => task_arena}/index.html | 0 include/tbb/concurrent_vector.h | 12 +- include/tbb/flow_graph.h | 2 +- include/tbb/internal/_flow_graph_node_impl.h | 2 +- include/tbb/machine/gcc_generic.h | 65 ++++++- include/tbb/parallel_for_each.h | 66 +++++-- include/tbb/task.h | 4 + include/tbb/tbb_config.h | 33 ++-- include/tbb/tbb_stddef.h | 2 +- index.src.html => index.html | 35 ++-- jni/Application.mk | 4 +- src/perf/time_hash_map_fill.cpp | 6 +- src/perf/time_parallel_for_each.cpp | 70 ++++++++ src/rml/test/test_rml_mixed.cpp | 2 +- src/tbb/arena.cpp | 158 ++++++++++------- src/tbb/arena.h | 16 +- src/tbb/cache_aligned_allocator.cpp | 24 +-- src/tbb/concurrent_vector.cpp | 25 ++- src/tbb/governor.cpp | 57 +++--- src/tbb/governor.h | 2 - src/tbb/market.cpp | 13 +- src/tbb/market.h | 2 +- src/tbb/scheduler.cpp | 12 +- src/tbb/scheduler.h | 7 +- src/tbb/task_group_context.cpp | 17 +- src/tbb/tbb_misc.cpp | 6 +- src/tbbmalloc/backend.cpp | 42 ++++- src/tbbmalloc/frontend.cpp | 35 ++-- src/tbbmalloc/proxy.cpp | 63 ++++--- src/tbbmalloc/tbbmalloc_internal.h | 59 +++++-- src/test/harness.h | 2 +- src/test/harness_allocator_overload.h | 39 +++++ src/test/harness_defs.h | 2 +- src/test/harness_iterator.h | 8 +- src/test/harness_tsx.h | 8 +- src/test/test_allocator.h | 32 ++++ src/test/test_atomic.cpp | 2 +- src/test/test_malloc_atexit.cpp | 10 +- src/test/test_malloc_overload.cpp | 33 ++-- src/test/test_malloc_pools.cpp | 137 ++++++++------- src/test/test_task_arena.cpp | 162 ++++++++++++++++-- src/test/test_task_group.cpp | 13 ++ src/test/test_task_priority.cpp | 19 ++ src/test/test_tbb_fork.cpp | 28 ++- src/test/test_tbb_version.cpp | 2 +- 71 files changed, 1259 insertions(+), 470 deletions(-) rename examples/{task_priority => task_arena}/fractal/Makefile (76%) rename examples/{task_priority => task_arena}/fractal/Makefile.windows (100%) rename examples/{task_priority => task_arena}/fractal/fractal.cpp (74%) rename examples/{task_priority => task_arena}/fractal/fractal.h (61%) rename examples/{task_priority => task_arena}/fractal/fractal_video.h (71%) rename examples/{task_priority => task_arena}/fractal/index.html (77%) rename examples/{task_priority => task_arena}/fractal/main.cpp (100%) rename examples/{task_priority => task_arena}/fractal/msvs/fractal.sln (100%) rename examples/{task_priority => task_arena}/fractal/msvs/fractal.vcxproj (100%) rename examples/{task_priority => task_arena}/fractal/msvs/gui.ico (100%) rename examples/{task_priority => task_arena}/fractal/msvs/gui.rc (100%) 
rename examples/{task_priority => task_arena}/fractal/msvs/resource.h (100%) rename examples/{task_priority => task_arena}/fractal/msvs/small.ico (100%) rename examples/{task_priority => task_arena}/fractal/xcode/fractal.xcodeproj/project.pbxproj (100%) rename examples/{task_priority => task_arena}/fractal/xcode/fractal.xcodeproj/xcshareddata/xcschemes/tbbExample.xcscheme (100%) rename examples/{task_priority => task_arena}/index.html (100%) rename index.src.html => index.html (71%) create mode 100644 src/perf/time_parallel_for_each.cpp create mode 100644 src/test/harness_allocator_overload.h diff --git a/CHANGES b/CHANGES index f538787744..942f7df5e3 100644 --- a/CHANGES +++ b/CHANGES @@ -2,9 +2,49 @@ The list of most significant changes made over time in Intel(R) Threading Building Blocks (Intel(R) TBB). +Intel TBB 4.4 Update 1 +TBB_INTERFACE_VERSION == 9001 + +Changes (w.r.t. Intel TBB 4.4): + +- Added support for Microsoft* Visual Studio* 2015. +- Intel TBB no longer performs dynamic replacement of memory allocation + functions for Microsoft Visual Studio 2005 and earlier versions. +- For GCC 4.7 and higher, the intrinsics-based platform isolation layer + uses __atomic_* built-ins instead of the legacy __sync_* ones. + This change is inspired by a contribution from Mathieu Malaterre. +- Improvements in task_arena: + Several application threads may join a task_arena and execute tasks + simultaneously. The amount of concurrency reserved for application + threads at task_arena construction can be set to any value between + 0 and the arena concurrency limit. +- The fractal example was modified to demonstrate class task_arena + and moved to examples/task_arena/fractal. + +Bugs fixed: + +- Fixed a deadlock during destruction of task_scheduler_init objects + when one of destructors is set to wait for worker threads. +- Added a workaround for a possible crash on OS X* when dynamic memory + allocator replacement (libtbbmalloc_proxy) is used and memory is + released during application startup. +- Usage of mutable functors with task_group::run_and_wait() and + task_arena::enqueue() is disabled. An attempt to pass a functor + which operator()() is not const will produce compilation errors. +- Makefiles and environment scripts now properly recognize GCC 5.0 and + higher. + +Open-source contributions integrated: + +- Improved performance of parallel_for_each for inputs allowing random + access, by Raf Schietekat. + +------------------------------------------------------------------------ Intel TBB 4.4 TBB_INTERFACE_VERSION == 9000 +Changes (w.r.t. Intel TBB 4.3 Update 6): + - The following features are now fully supported: tbb::flow::composite_node; additional policies of tbb::flow::graph_node::reset(). @@ -60,6 +100,7 @@ Intel TBB 4.3 Update 6 TBB_INTERFACE_VERSION == 8006 Changes (w.r.t. Intel TBB 4.3 Update 5): + - Supported zero-copy realloc for objects >1MB under Linux* via mremap system call. - C++11 move-aware insert and emplace methods have been added to @@ -67,6 +108,7 @@ Changes (w.r.t. Intel TBB 4.3 Update 5): - install_name is set to @rpath/ on OS X*. Preview Features: + - Added template class async_node to the flow graph API. It allows a flow graph to communicate with an external activity managed by the user or another runtime. @@ -75,6 +117,7 @@ Preview Features: - extract() method of graph nodes now takes no arguments. Bugs fixed: + - concurrent_unordered_{set,map} behaves correctly for degenerate hashes. 
- Fixed a race condition in the memory allocator that may lead to @@ -85,9 +128,11 @@ Intel TBB 4.3 Update 5 TBB_INTERFACE_VERSION == 8005 Changes (w.r.t. Intel TBB 4.3 Update 4): + - Added add_ref_count() method of class tbb::task. Preview Features: + - Added class global_control for application-wide control of allowed parallelism and thread stack size. - memory_pool_allocator now throws the std::bad_alloc exception on @@ -96,6 +141,7 @@ Preview Features: std::bad_alloc to std::invalid_argument and std::runtime_error. Bugs fixed: + - scalable_allocator now throws the std::bad_alloc exception on allocation failure. - Fixed a race condition in the memory allocator that may lead to @@ -104,6 +150,7 @@ Bugs fixed: might be unable to modify the number of worker threads. Open-source contributions integrated: + - (Added but not enabled) push_front() method of class tbb::task_list by Raf Schietekat. @@ -112,6 +159,7 @@ Intel TBB 4.3 Update 4 TBB_INTERFACE_VERSION == 8004 Changes (w.r.t. Intel TBB 4.3 Update 3): + - Added a C++11 variadic constructor for enumerable_thread_specific. The arguments from this constructor are used to construct thread-local values. @@ -123,6 +171,7 @@ Changes (w.r.t. Intel TBB 4.3 Update 3): concurrent unordered containers. Preview Features: + - Interface-breaking change: typedefs changed for node predecessor and successor lists, affecting copy_predecessors and copy_successors methods. @@ -132,6 +181,7 @@ Preview Features: automatically using the node port with index 0 for an edge. Open-source contributions integrated: + - Draft code for enumerable_thread_specific constructor with multiple arguments (see above) by Adrien Guinet. - Fix for GCC invocation on IBM* Blue Gene* diff --git a/README.md b/README.md index 8bce84b8bb..19088d19c4 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Intel(R) Threading Building Blocks 4.4 +# Intel(R) Threading Building Blocks 4.4 Update 1 Intel(R) Threading Building Blocks (Intel(R) TBB) lets you easily write parallel C++ programs that take full advantage of multicore performance, that are portable, composable and have future-proof scalability. @@ -8,7 +8,7 @@ Here are the latest [Changes] (CHANGES) and [Release Notes] (doc/Release_Notes.txt) (contains system requirements and known issues). ## Licensing -Intel(R) TBB 4.4 is licensed under [GPLv2] (COPYING) with the runtime exception. +Intel(R) TBB 4.4 Update 1 is licensed under [GPLv2] (COPYING) with the runtime exception. 
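The "mutable functors" change listed in CHANGES above means that a functor passed to task_group::run_and_wait() or task_arena::enqueue() now needs a const operator(); a non-const operator() produces a compilation error. A minimal sketch of a conforming functor (the names here are illustrative, not taken from the patch):

```cpp
#include "tbb/task_group.h"

// Hypothetical functor, used only for illustration.
struct Work {
    // operator() must be const: with this update, passing a functor whose
    // operator() is not const to task_group::run_and_wait() or
    // task_arena::enqueue() no longer compiles
    // (see the function_task change in include/tbb/task.h below).
    void operator()() const { /* do the work */ }
};

int main() {
    tbb::task_group g;
    Work w;
    g.run_and_wait( w );   // accepted: Work::operator() is const
    return 0;
}
```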
## Documentation * Intel(R) TBB [tutorial] (https://software.intel.com/en-us/tbb-tutorial) diff --git a/build/android.inc b/build/android.inc index fbe68a7642..061e308bde 100644 --- a/build/android.inc +++ b/build/android.inc @@ -33,6 +33,8 @@ ifneq ("command line","$(origin arch)") export COMPILER_VERSION := ICC: $(shell icc -V &1 | grep 'Version') ifneq (,$(findstring IA-32, $(COMPILER_VERSION))) export arch:=ia32 + else ifneq (,$(findstring Intel(R) 64, $(COMPILER_VERSION))) + export arch:=intel64 else $(error "No support for Android in $(COMPILER_VERSION)") endif diff --git a/build/linux.gcc.inc b/build/linux.gcc.inc index 4b7122bd62..9d93cfc179 100644 --- a/build/linux.gcc.inc +++ b/build/linux.gcc.inc @@ -49,7 +49,7 @@ ifneq (,$(shell gcc -dumpversion | egrep "^(4\.[2-9]|[5-9])")) endif # gcc 4.8 and later support RTM intrinsics, but require command line switch to enable them -ifneq (,$(shell gcc -dumpversion | egrep "^4\.[8-9]")) +ifneq (,$(shell gcc -dumpversion | egrep "^(4\.[8-9]|[5-9])")) RTM_KEY = -mrtm endif diff --git a/build/windows.icl.inc b/build/windows.icl.inc index d5047d4ed1..687516860e 100644 --- a/build/windows.icl.inc +++ b/build/windows.icl.inc @@ -126,6 +126,9 @@ endif ifeq ($(VCCOMPAT_FLAG),) VCCOMPAT_FLAG := $(if $(findstring vc12, $(VCVERSION)),/Qvc12) endif +ifeq ($(VCCOMPAT_FLAG),) + VCCOMPAT_FLAG := $(if $(findstring vc14, $(VCVERSION)),/Qvc14) +endif ifeq ($(VCCOMPAT_FLAG),) $(error VC version not detected correctly: $(VCVERSION) ) endif diff --git a/doc/Release_Notes.txt b/doc/Release_Notes.txt index 661de4d90f..cb795750a3 100644 --- a/doc/Release_Notes.txt +++ b/doc/Release_Notes.txt @@ -106,6 +106,11 @@ Library Issues of Microsoft* Visual C++ 10.0 runtime (msvcp100d.dll) in order to run. + - For applications linked with the debug version of Microsoft* + Universal CRT (ucrtbased.dll, used since Microsoft Visual C++ + 14.0) dynamic replacement of memory management functions + is not supported. + - If an application uses static MSVCRT libraries or the Intel TBB library built with static MSVCRT (vc_mt variant), and throws an exception from a functor passed to task_group::run_and_wait(), @@ -157,6 +162,12 @@ Library Issues 4.8.2, and 4.9.2), the destructor of a task_group might not throw missing_wait exception. + - On OS X* 10.11 some examples might fail to run via makefiles in + case System Integrity Protection is enabled. In such case + instead of `make ` use the following command: + `run_cmd="DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH" make ` + or run executables directly. + ------------------------------------------------------------------------ Copyright (C) 2005-2015 Intel Corporation. All Rights Reserved. diff --git a/doc/html/a00241.html b/doc/html/a00241.html index 5af0b4cc89..9d9328168f 100644 --- a/doc/html/a00241.html +++ b/doc/html/a00241.html @@ -401,10 +401,10 @@  Parallel iteration over range with affinity_partitioner and user-supplied context. More...
 
parallel_for_each
-template<typename InputIterator , typename Function > -void parallel_for_each (InputIterator first, InputIterator last, const Function &f, task_group_context &context) - Calls function f for all items from [first, last) interval using user-supplied context. More...
-  +template<typename Iterator , typename Function > +void parallel_for_each (Iterator first, Iterator last, const Function &f, task_group_context &context) + Calls function f for all items from [first, last) interval using user-supplied context. More...
+  template<typename Range , typename Function > void parallel_for_each (Range &rng, const Function &f, task_group_context &context)  Calls function f for all items from rng using user-supplied context. More...
@@ -413,11 +413,11 @@ void parallel_for_each (const Range &rng, const Function &f, task_group_context &context)  Calls function f for all items from const rng user-supplied context. More...
  - -template<typename InputIterator , typename Function > -void parallel_for_each (InputIterator first, InputIterator last, const Function &f) - Uses default context.
-  + +template<typename Iterator , typename Function > +void parallel_for_each (Iterator first, Iterator last, const Function &f) + Uses default context.
template<typename Range , typename Function > void parallel_for_each (Range &rng, const Function &f) diff --git a/doc/html/a00256.html b/doc/html/a00256.html index 17f8841ef2..d248af6b83 100644 --- a/doc/html/a00256.html +++ b/doc/html/a00256.html @@ -131,10 +131,10 @@ - - - - + + + + @@ -143,11 +143,11 @@ - - - - + + + + @@ -558,8 +558,6 @@

Function Documentation

Parallel iteration over a range, with optional addition of more work.

-

Referenced by tbb::parallel_for_each().

- @@ -904,22 +902,22 @@

Function Documentation

- +
-template<typename InputIterator , typename Function >
+template<typename Iterator , typename Function >

parallel_for_each

template<typename InputIterator , typename Function >
void tbb::parallel_for_each (InputIterator first, InputIterator last, const Function &f, task_group_context &context)
 Calls function f for all items from [first, last) interval using user-supplied context. More...
 
template<typename Iterator , typename Function >
void tbb::parallel_for_each (Iterator first, Iterator last, const Function &f, task_group_context &context)
 Calls function f for all items from [first, last) interval using user-supplied context. More...
 
template<typename Range , typename Function >
void tbb::parallel_for_each (Range &rng, const Function &f, task_group_context &context)
 Calls function f for all items from rng using user-supplied context. More...
void tbb::parallel_for_each (const Range &rng, const Function &f, task_group_context &context)
 Calls function f for all items from const rng user-supplied context. More...
 
-template<typename InputIterator , typename Function >
void tbb::parallel_for_each (InputIterator first, InputIterator last, const Function &f)
 Uses default context.
 
+template<typename Iterator , typename Function >
void tbb::parallel_for_each (Iterator first, Iterator last, const Function &f)
 Uses default context.
 
template<typename Range , typename Function >
void tbb::parallel_for_each (Range &rng, const Function &f)
- + - + @@ -944,8 +942,6 @@

Function Documentation

Calls function f for all items from [first, last) interval using user-supplied context.

-

References tbb::parallel_do().

-

Referenced by tbb::parallel_for_each().

@@ -984,7 +980,7 @@

Function Documentation

Calls function f for all items from rng using user-supplied context.

-

References tbb::parallel_for_each().

+

References tbb::parallel_for_each().

@@ -1022,7 +1018,7 @@

Function Documentation

Calls function f for all items from const rng user-supplied context.

-

References tbb::parallel_for_each().

+

References tbb::parallel_for_each().
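A minimal usage sketch of the parallel_for_each overloads documented above, assuming C++11 lambdas are available (the containers and values are illustrative): with this update, iterators are dispatched by category, so random-access inputs go through a blocked_range/parallel_for based implementation (see include/tbb/parallel_for_each.h below), while other iterator types keep the original parallel_do path.

```cpp
#include <vector>
#include <list>
#include "tbb/parallel_for_each.h"

int main() {
    // Random-access iterators: handled by the new parallel_for based implementation.
    std::vector<float> vec( 10000, 1.0f );
    tbb::parallel_for_each( vec.begin(), vec.end(), []( float& x ) { x += 1.0f; } );

    // Non-random-access iterators: still handled via parallel_do.
    std::list<float> lst( 1000, 1.0f );
    tbb::parallel_for_each( lst.begin(), lst.end(), []( float& x ) { x += 1.0f; } );
    return 0;
}
```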

diff --git a/examples/common/toolset.props b/examples/common/toolset.props index a99e238c52..1c2c2fb99a 100644 --- a/examples/common/toolset.props +++ b/examples/common/toolset.props @@ -1,9 +1,9 @@  - Intel C++ Compiler XE 14.0 Intel C++ Compiler 15.0 [Intel(R) System Studio] Intel C++ Compiler XE 15.0 + Intel C++ Compiler 16.0 true diff --git a/examples/graph/index.html b/examples/graph/index.html index df37e757ad..71cc09aeac 100644 --- a/examples/graph/index.html +++ b/examples/graph/index.html @@ -14,6 +14,8 @@

Directories

A simplistic example of a collection of digital logic gates that can be easily composed into larger circuits.
som
A simple example of a Kohonen Self-Organizing Map using cancellation. +
fgbzip2 +
A parallel implementation of bzip2 block-sorting file compressor.
diff --git a/examples/index.html b/examples/index.html index 8eeb038bb1..b947bcd138 100644 --- a/examples/index.html +++ b/examples/index.html @@ -26,7 +26,7 @@

Directories

Examples using raw task interface.
task_group
Examples using task_group interface. -
task_priority +
task_arena
Examples using the task priority feature.
test_all
Examples that test all the parts of the package. diff --git a/examples/task_priority/fractal/Makefile b/examples/task_arena/fractal/Makefile similarity index 76% rename from examples/task_priority/fractal/Makefile rename to examples/task_arena/fractal/Makefile index 953dae54f9..b4fdf48a54 100644 --- a/examples/task_priority/fractal/Makefile +++ b/examples/task_arena/fractal/Makefile @@ -41,10 +41,12 @@ all: release test resources: ifeq ($(UI),mac) - mkdir -p $(APPRES)/en.lproj $(NAME).app/Contents/MacOS - cp ../../common/gui/xcode/tbbExample/Info.plist $(NAME).app/Contents + mkdir -p $(NAME).app/Contents/{MacOS,Resources/en.lproj} + cat ../../common/gui/xcode/tbbExample/Info.plist | sed -e "s/tbbExample/$(NAME)/" > $(NAME).app/Contents/Info.plist + cat ../../common/gui/xcode/tbbExample/launcher.sh | sed -e "s/tbbExample/$(NAME)/" > $(NAME).app/Contents/MacOS/launcher.sh + chmod +x $(NAME).app/Contents/MacOS/launcher.sh cp ../../common/gui/xcode/tbbExample/PkgInfo $(NAME).app/Contents - cp ../../common/gui/xcode/tbbExample/en.lproj/* $(APPRES)/en.lproj + cp ../../common/gui/xcode/tbbExample/en.lproj/* $(NAME).app/Contents/Resources/en.lproj endif # OS X* release: $(SOURCES) resources @@ -52,12 +54,20 @@ ifeq ($(UI),mac) $(CXX_UI) -O2 -DNDEBUG $(CXXFLAGS) -c $(MACUISOURCES) endif # OS X* $(CXX) -O2 -DNDEBUG $(CXXFLAGS) -o $(EXE) $(SOURCES) $(MACUIOBJS) -ltbb $(LIBS) +ifeq ($(UI),mac) + cp ../../../build/*_release/libtbb.dylib $(NAME).app/Contents/Resources + install_name_tool -change libtbb.dylib @executable_path/../Resources/libtbb.dylib $(EXE) +endif debug: resources ifeq ($(UI),mac) $(CXX_UI) -g -O0 -DTBB_USE_DEBUG $(CXXFLAGS) -c $(MACUISOURCES) endif # OS X* $(CXX) -g -O0 -DTBB_USE_DEBUG $(CXXFLAGS) -o $(EXE) $(SOURCES) $(MACUIOBJS) -ltbb_debug $(LIBS) +ifeq ($(UI),mac) + cp ../../../build/*_debug/libtbb_debug.dylib $(NAME).app/Contents/Resources + install_name_tool -change libtbb_debug.dylib @executable_path/../Resources/libtbb_debug.dylib $(EXE) +endif clean: $(RM) $(EXE) *.o *.d diff --git a/examples/task_priority/fractal/Makefile.windows b/examples/task_arena/fractal/Makefile.windows similarity index 100% rename from examples/task_priority/fractal/Makefile.windows rename to examples/task_arena/fractal/Makefile.windows diff --git a/examples/task_priority/fractal/fractal.cpp b/examples/task_arena/fractal/fractal.cpp similarity index 74% rename from examples/task_priority/fractal/fractal.cpp rename to examples/task_arena/fractal/fractal.cpp index 3dc2d20c1b..03f02987ec 100644 --- a/examples/task_priority/fractal/fractal.cpp +++ b/examples/task_arena/fractal/fractal.cpp @@ -23,19 +23,23 @@ #include "tbb/parallel_for.h" #include "tbb/blocked_range2d.h" #include "tbb/task_scheduler_init.h" +#include "tbb/task_arena.h" +#include "tbb/task_group.h" #include "tbb/tick_count.h" -#include "tbb/compat/thread" #include #include +// Included for __TBB_CPP11_LAMBDAS_PRESENT definition +#include "tbb/tbb_config.h" + video *v; extern bool silent; extern bool schedule_auto; extern int grain_size; -color_t fractal::calc_one_pixel(int x0, int y0) { - int iter; +color_t fractal::calc_one_pixel( int x0, int y0 ) const { + unsigned int iter; double fx0, fy0, xtemp, x, y, mu; color_t color; @@ -46,11 +50,13 @@ color_t fractal::calc_one_pixel(int x0, int y0) { fy0 = fy0 / magn + cy; iter = 0; x = 0; y = 0; + mu = 0; - while (((x*x + y*y) <= 4) && (iter < max_iterations)) { + while (((x*x + y*y) <= 4) && (iter < max_iterations)) { xtemp = x*x - y*y + fx0; y = 2*x*y + fy0; x = xtemp; + mu += 
exp(-sqrt(x*x+y*y)); iter++; } @@ -60,18 +66,6 @@ color_t fractal::calc_one_pixel(int x0, int y0) { return color; } - // compute again but with exponent calculation at each iteration - // it's all for coloring point outside the mandelbrot set - iter = 0; x = 0; y = 0; - mu = 0; - while (((x*x + y*y) <= 4) && (iter < max_iterations)) { - xtemp = x*x - y*y + fx0; - y = 2*x*y + fy0; - x = xtemp; - mu += exp(-sqrt(x*x+y*y)); - iter++; - } - int b = (int)(256*mu); int g = (b/8); int r = (g/16); @@ -85,7 +79,7 @@ color_t fractal::calc_one_pixel(int x0, int y0) { } void fractal::clear() { - drawing_area area( off_x, off_y, size_x, size_y, dm) ; + drawing_area area( off_x, off_y, size_x, size_y, dm ) ; // fill the rendering area with black color for (int y=0; y(0, size_y, grain_size, 0, size_x, grain_size ), - fractal_body(*this), tbb::auto_partitioner(), context); + body, tbb::auto_partitioner(), context); else tbb::parallel_for( tbb::blocked_range2d(0, size_y, grain_size, 0, size_x, grain_size ), - fractal_body(*this), tbb::simple_partitioner(), context); + body, tbb::simple_partitioner(), context); } void fractal::run( tbb::task_group_context &context ) { clear(); + context.reset(); render( context ); } -bool fractal::check_point( int x, int y ) { - return x >= off_x && x <= off_x+size_x && +bool fractal::check_point( int x, int y ) const { + return x >= off_x && x <= off_x+size_x && y >= off_y && y <= off_y+size_y; } @@ -177,17 +177,10 @@ void fractal_group::calc_fractal( int num ) { } } -void fg_thread_func(fractal_group *fg) { - // initialize the task scheduler for the second thread - tbb::task_scheduler_init init( fg->get_num_threads() ); - // calculate the second fractal - fg->calc_fractal( 1 ); -} - void fractal_group::set_priorities() { // set the high priority for the active area and the normal priority for another area context[active].set_priority( tbb::priority_high ); - context[active^1].set_priority( tbb::priority_normal ); + context[active^1].set_priority( tbb::priority_low ); } void fractal_group::switch_priorities( int new_active ) { @@ -197,11 +190,39 @@ void fractal_group::switch_priorities( int new_active ) { draw_borders(); } -void fractal_group::set_num_frames_at_least(int n) { +void fractal_group::set_num_frames_at_least( int n ) { if ( num_frames[0]join(); - delete fg_thread; + arena.execute( [&] { gr.wait(); } ); +#else + arena.execute( arena_body_wait( gr ) ); +#endif } delete[] context; @@ -233,7 +269,7 @@ void fractal_group::draw_borders() { f1.draw_border( active==1 ); } -fractal_group::fractal_group( const drawing_memory &_dm, int _num_threads, int _max_iterations, int _num_frames ) : f0(_dm), f1(_dm), num_threads(_num_threads) { +fractal_group::fractal_group( const drawing_memory &_dm, int _num_threads, unsigned int _max_iterations, int _num_frames ) : f0(_dm), f1(_dm), num_threads(_num_threads) { // set rendering areas f0.size_x = f1.size_x = _dm.sizex/2-4; f0.size_y = f1.size_y = _dm.sizey-4; @@ -251,7 +287,7 @@ fractal_group::fractal_group( const drawing_memory &_dm, int _num_threads, int _ num_frames[0] = num_frames[1] = _num_frames; } -void fractal_group::mouse_click(int x, int y) { +void fractal_group::mouse_click( int x, int y ) { // assumption that the point is not inside any fractal area int new_active = -1; diff --git a/examples/task_priority/fractal/fractal.h b/examples/task_arena/fractal/fractal.h similarity index 61% rename from examples/task_priority/fractal/fractal.h rename to examples/task_arena/fractal/fractal.h index 92b3d82e8c..12f178eaf3 100644 
--- a/examples/task_priority/fractal/fractal.h +++ b/examples/task_arena/fractal/fractal.h @@ -37,13 +37,14 @@ class fractal { //! Fractal properties float cx, cy; float magn; - int max_iterations; + float step; + unsigned int max_iterations; //! Drawing memory object for rendering const drawing_memory &dm; //! One pixel calculation routine - color_t calc_one_pixel(int x, int y); + color_t calc_one_pixel( int x, int y ) const; //! Clears the fractal area void clear(); //! Draws the border around the fractal area @@ -51,11 +52,11 @@ class fractal { //! Renders the fractal void render( tbb::task_group_context &context ); //! Check if the point is inside the fractal area - bool check_point( int x, int y); + bool check_point( int x, int y ) const; public: //! Constructor - fractal( const drawing_memory &dm ) : dm(dm) { + fractal( const drawing_memory &dm ) : step(0.2), dm(dm) { #if _MSC_VER && _WIN64 && !__INTEL_COMPILER // Workaround for MSVC x64 compiler issue volatile int i=0; @@ -64,18 +65,29 @@ class fractal { //! Runs the fractal calculation void run( tbb::task_group_context &context ); //! Renders the fractal rectangular area - void render_rect(int x0, int y0, int x1, int y1); + void render_rect( int x0, int y0, int x1, int y1 ) const; + + void move_up() { cy += step; } + void move_down() { cy -= step; } + void move_left() { cx += step; } + void move_right(){ cx -= step; } + + void zoom_in() { magn *= 2.; step /= 2.; } + void zoom_out(){ magn /= 2.; step *= 2.; } + + void quality_inc() { max_iterations += max_iterations/2; } + void quality_dec() { max_iterations -= max_iterations/2; } friend class fractal_group; }; //! The group of fractals class fractal_group { - //! Fractals defenition + //! Fractals definition fractal f0, f1; //! Number of frames to calculate tbb::atomic num_frames[2]; - //! Task group contexts to manage prioroties + //! Task group contexts to manage priorities tbb::task_group_context *context; //! Border type enumeration @@ -96,19 +108,56 @@ class fractal_group { public: //! Constructor - fractal_group( const drawing_memory &_dm, int num_threads = tbb::task_scheduler_init::automatic, int max_iterations = 100000, int num_frames = 1); + fractal_group( const drawing_memory &_dm, + int num_threads = tbb::task_scheduler_init::automatic, + unsigned int max_iterations = 100000, int num_frames = 1 ); //! Run calculation void run( bool create_second_fractal=true ); //! Mouse event handler - void mouse_click(int x, int y); + void mouse_click( int x, int y ); //! Fractal calculation routine void calc_fractal( int num ); //! Get number of threads int get_num_threads() const { return num_threads; } //! Reset the number of frames to be not less than the given value - void set_num_frames_at_least(int n); + void set_num_frames_at_least( int n ); //! Switches the priorities of two fractals void switch_priorities( int new_active=-1 ); + //! Get active fractal + fractal& get_active_fractal() { return active ? 
f1 : f0; } + + void active_fractal_zoom_in() { + get_active_fractal().zoom_in(); + context[active].cancel_group_execution(); + } + void active_fractal_zoom_out() { + get_active_fractal().zoom_out(); + context[active].cancel_group_execution(); + } + void active_fractal_quality_inc() { + get_active_fractal().quality_inc(); + context[active].cancel_group_execution(); + } + void active_fractal_quality_dec() { + get_active_fractal().quality_dec(); + context[active].cancel_group_execution(); + } + void active_fractal_move_up() { + get_active_fractal().move_up(); + context[active].cancel_group_execution(); + } + void active_fractal_move_down() { + get_active_fractal().move_down(); + context[active].cancel_group_execution(); + } + void active_fractal_move_left() { + get_active_fractal().move_left(); + context[active].cancel_group_execution(); + } + void active_fractal_move_right() { + get_active_fractal().move_right(); + context[active].cancel_group_execution(); + } }; #endif /* FRACTAL_H_ */ diff --git a/examples/task_priority/fractal/fractal_video.h b/examples/task_arena/fractal/fractal_video.h similarity index 71% rename from examples/task_priority/fractal/fractal_video.h rename to examples/task_arena/fractal/fractal_video.h index 02eda48714..d56d03fc7d 100644 --- a/examples/task_priority/fractal/fractal_video.h +++ b/examples/task_arena/fractal/fractal_video.h @@ -43,13 +43,31 @@ class fractal_video : public video void on_key( int key ) { switch ( key&0xff ) { - case 27: - running = false; break; - case ' ': // space - if( fg ) fg->switch_priorities(); - default: - if( fg ) fg->set_num_frames_at_least(20); + case esc_key: + running = false; break; + case ' ': // space + if( fg ) fg->switch_priorities(); break; + + case 'q': + if( fg ) fg->active_fractal_zoom_in(); break; + case 'e': + if( fg ) fg->active_fractal_zoom_out(); break; + + case 'r': + if( fg ) fg->active_fractal_quality_inc(); break; + case 'f': + if( fg ) fg->active_fractal_quality_dec(); break; + + case 'w': + if( fg ) fg->active_fractal_move_up(); break; + case 'a': + if( fg ) fg->active_fractal_move_left(); break; + case 's': + if( fg ) fg->active_fractal_move_down(); break; + case 'd': + if( fg ) fg->active_fractal_move_right(); break; } + if( fg ) fg->set_num_frames_at_least(20); } void on_process() { diff --git a/examples/task_priority/fractal/index.html b/examples/task_arena/fractal/index.html similarity index 77% rename from examples/task_priority/fractal/index.html rename to examples/task_arena/fractal/index.html index 2c3583dd29..da80481141 100644 --- a/examples/task_priority/fractal/index.html +++ b/examples/task_arena/fractal/index.html @@ -3,7 +3,7 @@

Overview

The example calculates two classical Mandelbrot fractals with different priorities. -The application window is divided into two areas where fractals are rendered. With mouse click on an area the user can change the priority of the calculating fractal. In the clicked area the fractal priority is changed to be "high" and the priority of the other fractal is changed to "normal". +The application window is divided into two areas where fractals are rendered. With mouse click on an area the user can change the priority of the calculating fractal. In the clicked area the fractal priority is changed to be "high" and the priority of the other fractal is changed to "low". The fractal with "high" priority we will call active. The example also has the console mode but in this mode the priorities could not be changed during execution. @@ -52,6 +52,34 @@

Usage


Run it with a small fractal iterations number and the desired number of threads, e.g., fractal 4 1 10000. +

Hot keys

+The following hot keys can be used in interactive execution mode when the example is compiled with the graphical +user interface: +
+
<left mouse button> +
Make the fractal active and change its priority to high +
<space> +
Switch priorities +
<w> +
Move the active fractal up +
<a> +
Move the active fractal to the left +
<s> +
Move the active fractal down +
<d> +
Move the active fractal to the right +
<q> +
Zoom in the active fractal +
<e> +
Zoom out the active fractal +
<r> +
Increase quality (count of iterations for each pixel) of the active fractal +
<f> +
Decrease quality (count of iterations for each pixel) of the active fractal +
<esc> +
Stop execution. +
+
Up to parent directory
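Per the CHANGES entry, the fractal example above was modified to demonstrate class task_arena: each fractal runs in its own arena with an explicit concurrency limit, and the calling thread joins an arena through execute() to wait on a task_group, as in the fractal.cpp changes earlier in this patch. A minimal sketch of that pattern, assuming C++11 lambdas (the concurrency values are illustrative):

```cpp
#include "tbb/task_arena.h"
#include "tbb/task_group.h"

int main() {
    // An arena limited to 4 threads, with 1 slot reserved for application threads.
    tbb::task_arena arena( 4, 1 );
    tbb::task_group group;

    // Submit work into the arena.
    arena.execute( [&]{ group.run( []{ /* compute one frame */ } ); } );

    // The calling thread joins the arena and helps execute its tasks until the
    // task_group is done (the same pattern as in the fractal.cpp changes above).
    arena.execute( [&]{ group.wait(); } );
    return 0;
}
```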

diff --git a/examples/task_priority/fractal/main.cpp b/examples/task_arena/fractal/main.cpp similarity index 100% rename from examples/task_priority/fractal/main.cpp rename to examples/task_arena/fractal/main.cpp diff --git a/examples/task_priority/fractal/msvs/fractal.sln b/examples/task_arena/fractal/msvs/fractal.sln similarity index 100% rename from examples/task_priority/fractal/msvs/fractal.sln rename to examples/task_arena/fractal/msvs/fractal.sln diff --git a/examples/task_priority/fractal/msvs/fractal.vcxproj b/examples/task_arena/fractal/msvs/fractal.vcxproj similarity index 100% rename from examples/task_priority/fractal/msvs/fractal.vcxproj rename to examples/task_arena/fractal/msvs/fractal.vcxproj diff --git a/examples/task_priority/fractal/msvs/gui.ico b/examples/task_arena/fractal/msvs/gui.ico similarity index 100% rename from examples/task_priority/fractal/msvs/gui.ico rename to examples/task_arena/fractal/msvs/gui.ico diff --git a/examples/task_priority/fractal/msvs/gui.rc b/examples/task_arena/fractal/msvs/gui.rc similarity index 100% rename from examples/task_priority/fractal/msvs/gui.rc rename to examples/task_arena/fractal/msvs/gui.rc diff --git a/examples/task_priority/fractal/msvs/resource.h b/examples/task_arena/fractal/msvs/resource.h similarity index 100% rename from examples/task_priority/fractal/msvs/resource.h rename to examples/task_arena/fractal/msvs/resource.h diff --git a/examples/task_priority/fractal/msvs/small.ico b/examples/task_arena/fractal/msvs/small.ico similarity index 100% rename from examples/task_priority/fractal/msvs/small.ico rename to examples/task_arena/fractal/msvs/small.ico diff --git a/examples/task_priority/fractal/xcode/fractal.xcodeproj/project.pbxproj b/examples/task_arena/fractal/xcode/fractal.xcodeproj/project.pbxproj similarity index 100% rename from examples/task_priority/fractal/xcode/fractal.xcodeproj/project.pbxproj rename to examples/task_arena/fractal/xcode/fractal.xcodeproj/project.pbxproj diff --git a/examples/task_priority/fractal/xcode/fractal.xcodeproj/xcshareddata/xcschemes/tbbExample.xcscheme b/examples/task_arena/fractal/xcode/fractal.xcodeproj/xcshareddata/xcschemes/tbbExample.xcscheme similarity index 100% rename from examples/task_priority/fractal/xcode/fractal.xcodeproj/xcshareddata/xcschemes/tbbExample.xcscheme rename to examples/task_arena/fractal/xcode/fractal.xcodeproj/xcshareddata/xcschemes/tbbExample.xcscheme diff --git a/examples/task_priority/index.html b/examples/task_arena/index.html similarity index 100% rename from examples/task_priority/index.html rename to examples/task_arena/index.html diff --git a/include/tbb/concurrent_vector.h b/include/tbb/concurrent_vector.h index 484de99e75..80a57f12de 100644 --- a/include/tbb/concurrent_vector.h +++ b/include/tbb/concurrent_vector.h @@ -127,6 +127,12 @@ namespace internal { T* pointer() const { return static_cast(const_cast(array)); } }; + friend void enforce_segment_allocated(segment_value_t const& s, internal::exception_id exception = eid_bad_last_alloc){ + if(s != segment_allocated()){ + internal::throw_exception(exception); + } + } + // Segment pointer. 
class segment_t { atomic array; @@ -1153,8 +1159,9 @@ class concurrent_vector: protected internal::allocator_base, pointer internal_push_back_result(){ return g.element;} iterator return_iterator_and_dismiss(){ + pointer ptr = g.element; g.dismiss(); - return iterator(v, k, g.element); + return iterator(v, k, ptr); } }; }; @@ -1235,8 +1242,7 @@ T& concurrent_vector::internal_subscript_with_exceptions( size_type index //TODO: why not make a load of my_segment relaxed as well ? //TODO: add an assertion that my_segment[k] is properly aligned to please ITT segment_value_t segment_value = my_segment[k].template load(); - if( segment_value != segment_allocated() ) // check for correct segment pointer - internal::throw_exception(internal::eid_index_range_error); // throw std::range_error + enforce_segment_allocated(segment_value, internal::eid_index_range_error); return (segment_value.pointer())[j]; } diff --git a/include/tbb/flow_graph.h b/include/tbb/flow_graph.h index 77f86b7230..f3466388a9 100644 --- a/include/tbb/flow_graph.h +++ b/include/tbb/flow_graph.h @@ -3528,7 +3528,7 @@ class composite_node , tbb::flow::tuple > : p }; // class composite_node -#endif // __TBB_PREVIEW_COMPOSITE_NODE +#endif // __TBB_FLOW_GRAPH_CPP11_FEATURES #if __TBB_PREVIEW_ASYNC_NODE namespace internal { diff --git a/include/tbb/internal/_flow_graph_node_impl.h b/include/tbb/internal/_flow_graph_node_impl.h index a18855d5a4..f0efc907e9 100644 --- a/include/tbb/internal/_flow_graph_node_impl.h +++ b/include/tbb/internal/_flow_graph_node_impl.h @@ -851,7 +851,7 @@ namespace internal { }; // multifunction_output //composite_node -#if TBB_PREVIEW_FLOW_GRAPH_TRACE +#if TBB_PREVIEW_FLOW_GRAPH_TRACE && __TBB_FLOW_GRAPH_CPP11_FEATURES template void add_nodes_impl(CompositeType*, bool) {} diff --git a/include/tbb/machine/gcc_generic.h b/include/tbb/machine/gcc_generic.h index 257af96574..53b528176b 100644 --- a/include/tbb/machine/gcc_generic.h +++ b/include/tbb/machine/gcc_generic.h @@ -50,6 +50,9 @@ #define __TBB_ENDIANNESS __TBB_ENDIAN_DETECT #endif +#if __TBB_GCC_VERSION < 40700 +// Use __sync_* builtins + /** As this generic implementation has absolutely no information about underlying hardware, its performance most likely will be sub-optimal because of full memory fence usages where a more lightweight synchronization means (or none at all) @@ -64,10 +67,37 @@ inline T __TBB_machine_cmpswp##S( volatile void *ptr, T value, T comparand ) { \ return __sync_val_compare_and_swap(reinterpret_cast(ptr),comparand,value); \ } \ - \ inline T __TBB_machine_fetchadd##S( volatile void *ptr, T value ) { \ return __sync_fetch_and_add(reinterpret_cast(ptr),value); \ +} + +#define __TBB_USE_GENERIC_FETCH_STORE 1 + +#else +// __TBB_GCC_VERSION >= 40700; use __atomic_* builtins available since gcc 4.7 + +#define __TBB_compiler_fence() __asm__ __volatile__("": : :"memory") +// Acquire and release fence intrinsics in GCC might miss compiler fence. +// Adding it at both sides of an intrinsic, as we do not know what reordering can be made. 
+#define __TBB_acquire_consistency_helper() __TBB_compiler_fence(); __atomic_thread_fence(__ATOMIC_ACQUIRE); __TBB_compiler_fence() +#define __TBB_release_consistency_helper() __TBB_compiler_fence(); __atomic_thread_fence(__ATOMIC_RELEASE); __TBB_compiler_fence() +#define __TBB_full_memory_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST) +#define __TBB_control_consistency_helper() __TBB_acquire_consistency_helper() + +#define __TBB_MACHINE_DEFINE_ATOMICS(S,T) \ +inline T __TBB_machine_cmpswp##S( volatile void *ptr, T value, T comparand ) { \ + (void)__atomic_compare_exchange_n(reinterpret_cast(ptr), &comparand, value, \ + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \ + return comparand; \ } \ +inline T __TBB_machine_fetchadd##S( volatile void *ptr, T value ) { \ + return __atomic_fetch_add(reinterpret_cast(ptr), value, __ATOMIC_SEQ_CST); \ +} \ +inline T __TBB_machine_fetchstore##S( volatile void *ptr, T value ) { \ + return __atomic_exchange_n(reinterpret_cast(ptr), value, __ATOMIC_SEQ_CST); \ +} + +#endif // __TBB_GCC_VERSION < 40700 __TBB_MACHINE_DEFINE_ATOMICS(1,int8_t) __TBB_MACHINE_DEFINE_ATOMICS(2,int16_t) @@ -86,6 +116,13 @@ static inline intptr_t __TBB_machine_lg( uintptr_t x ) { return sizeof(x)*8 - tbb::internal::gcc_builtins::clz(x) -1 ; } + +typedef unsigned char __TBB_Flag; +typedef __TBB_atomic __TBB_Flag __TBB_atomic_flag; + +#if __TBB_GCC_VERSION < 40700 +// Use __sync_* builtins + static inline void __TBB_machine_or( volatile void *ptr, uintptr_t addend ) { __sync_fetch_and_or(reinterpret_cast(ptr),addend); } @@ -94,19 +131,35 @@ static inline void __TBB_machine_and( volatile void *ptr, uintptr_t addend ) { __sync_fetch_and_and(reinterpret_cast(ptr),addend); } +inline bool __TBB_machine_try_lock_byte( __TBB_atomic_flag &flag ) { + return __sync_lock_test_and_set(&flag,1)==0; +} -typedef unsigned char __TBB_Flag; +inline void __TBB_machine_unlock_byte( __TBB_atomic_flag &flag ) { + __sync_lock_release(&flag); +} -typedef __TBB_atomic __TBB_Flag __TBB_atomic_flag; +#else +// __TBB_GCC_VERSION >= 40700; use __atomic_* builtins available since gcc 4.7 + +static inline void __TBB_machine_or( volatile void *ptr, uintptr_t addend ) { + __atomic_fetch_or(reinterpret_cast(ptr),addend,__ATOMIC_SEQ_CST); +} + +static inline void __TBB_machine_and( volatile void *ptr, uintptr_t addend ) { + __atomic_fetch_and(reinterpret_cast(ptr),addend,__ATOMIC_SEQ_CST); +} inline bool __TBB_machine_try_lock_byte( __TBB_atomic_flag &flag ) { - return __sync_lock_test_and_set(&flag,1)==0; + return !__atomic_test_and_set(&flag,__ATOMIC_ACQUIRE); } inline void __TBB_machine_unlock_byte( __TBB_atomic_flag &flag ) { - __sync_lock_release(&flag); + __atomic_clear(&flag,__ATOMIC_RELEASE); } +#endif // __TBB_GCC_VERSION < 40700 + // Machine specific atomic operations #define __TBB_AtomicOR(P,V) __TBB_machine_or(P,V) #define __TBB_AtomicAND(P,V) __TBB_machine_and(P,V) @@ -117,7 +170,7 @@ inline void __TBB_machine_unlock_byte( __TBB_atomic_flag &flag ) { // Definition of other functions #define __TBB_Log2(V) __TBB_machine_lg(V) -#define __TBB_USE_GENERIC_FETCH_STORE 1 +// TODO: implement with __atomic_* builtins where available #define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE 1 #define __TBB_USE_GENERIC_RELAXED_LOAD_STORE 1 #define __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE 1 diff --git a/include/tbb/parallel_for_each.h b/include/tbb/parallel_for_each.h index e0527dde6f..f3d64d89a0 100644 --- a/include/tbb/parallel_for_each.h +++ b/include/tbb/parallel_for_each.h @@ -22,6 +22,7 @@ #define 
__TBB_parallel_for_each_H #include "parallel_do.h" +#include "parallel_for.h" namespace tbb { @@ -29,16 +30,59 @@ namespace tbb { namespace internal { // The class calls user function in operator() template - class parallel_for_each_body : internal::no_assign { + class parallel_for_each_body_do : internal::no_assign { const Function &my_func; public: - parallel_for_each_body(const Function &_func) : my_func(_func) {} - parallel_for_each_body(const parallel_for_each_body &_caller) : my_func(_caller.my_func) {} + parallel_for_each_body_do(const Function &_func) : my_func(_func) {} - void operator() ( typename std::iterator_traits::reference value ) const { + void operator()(typename std::iterator_traits::reference value) const { my_func(value); } }; + + // The class calls user function in operator() + template + class parallel_for_each_body_for : internal::no_assign { + const Function &my_func; + public: + parallel_for_each_body_for(const Function &_func) : my_func(_func) {} + + void operator()(tbb::blocked_range range) const { +#if __INTEL_COMPILER +#pragma ivdep +#endif + for(Iterator it = range.begin(), end = range.end(); it != end; ++it) { + my_func(*it); + } + } + }; + + template + struct parallel_for_each_impl { +#if __TBB_TASK_GROUP_CONTEXT + static void doit(Iterator first, Iterator last, const Function& f, task_group_context &context) { + internal::parallel_for_each_body_do body(f); + tbb::parallel_do(first, last, body, context); + } +#endif + static void doit(Iterator first, Iterator last, const Function& f) { + internal::parallel_for_each_body_do body(f); + tbb::parallel_do(first, last, body); + } + }; + template + struct parallel_for_each_impl { +#if __TBB_TASK_GROUP_CONTEXT + static void doit(Iterator first, Iterator last, const Function& f, task_group_context &context) { + internal::parallel_for_each_body_for body(f); + tbb::parallel_for(tbb::blocked_range(first, last), body, context); + } +#endif + static void doit(Iterator first, Iterator last, const Function& f) { + internal::parallel_for_each_body_for body(f); + tbb::parallel_for(tbb::blocked_range(first, last), body); + } + }; } // namespace internal //! @endcond @@ -48,10 +92,9 @@ namespace internal { //! Calls function f for all items from [first, last) interval using user-supplied context /** @ingroup algorithms */ #if __TBB_TASK_GROUP_CONTEXT -template -void parallel_for_each(InputIterator first, InputIterator last, const Function& f, task_group_context &context) { - internal::parallel_for_each_body body(f); - tbb::parallel_do (first, last, body, context); +template +void parallel_for_each(Iterator first, Iterator last, const Function& f, task_group_context &context) { + internal::parallel_for_each_impl::iterator_category>::doit(first, last, f, context); } //! Calls function f for all items from rng using user-supplied context @@ -70,10 +113,9 @@ void parallel_for_each(const Range& rng, const Function& f, task_group_context& #endif /* __TBB_TASK_GROUP_CONTEXT */ //! Uses default context -template -void parallel_for_each(InputIterator first, InputIterator last, const Function& f) { - internal::parallel_for_each_body body(f); - tbb::parallel_do (first, last, body); +template +void parallel_for_each(Iterator first, Iterator last, const Function& f) { + internal::parallel_for_each_impl::iterator_category>::doit(first, last, f); } //! 
Uses default context diff --git a/include/tbb/task.h b/include/tbb/task.h index a416f3e27e..2c1029183c 100644 --- a/include/tbb/task.h +++ b/include/tbb/task.h @@ -901,7 +901,11 @@ class empty_task: public task { namespace internal { template class function_task : public task { +#if __TBB_ALLOW_MUTABLE_FUNCTORS F my_func; +#else + const F my_func; +#endif /*override*/ task* execute() { my_func(); return NULL; diff --git a/include/tbb/tbb_config.h b/include/tbb/tbb_config.h index 112c768b65..8c2226fe98 100644 --- a/include/tbb/tbb_config.h +++ b/include/tbb/tbb_config.h @@ -138,13 +138,12 @@ #endif #define __TBB_STATIC_ASSERT_PRESENT (__INTEL_CXX11_MODE__ || _MSC_VER >= 1600) #define __TBB_CPP11_TUPLE_PRESENT (_MSC_VER >= 1600 || (__GXX_EXPERIMENTAL_CXX0X__ && __TBB_GCC_VERSION >= 40300)) - /**Intel C++ compiler 14.0 crashes on using __has_include. When it fixed, condition will need to be updated. **/ #if (__clang__ && __INTEL_COMPILER > 1400) + /* Older versions of Intel Compiler do not have __has_include */ #if (__has_feature(__cxx_generalized_initializers__) && __has_include()) #define __TBB_INITIALIZER_LISTS_PRESENT 1 #endif #else - /** TODO: when MSVC2013 is supported by Intel C++ compiler, it will be enabled silently by compiler, so rule will need to be updated.**/ #define __TBB_INITIALIZER_LISTS_PRESENT __INTEL_CXX11_MODE__ && __INTEL_COMPILER >= 1400 && (_MSC_VER >= 1800 || __TBB_GCC_VERSION >= 40400 || _LIBCPP_VERSION) #endif @@ -206,7 +205,7 @@ #define __TBB_STATIC_ASSERT_PRESENT (_MSC_VER >= 1600) #define __TBB_CPP11_TUPLE_PRESENT (_MSC_VER >= 1600) #define __TBB_INITIALIZER_LISTS_PRESENT (_MSC_VER >= 1800) - #define __TBB_CONSTEXPR_PRESENT 0 + #define __TBB_CONSTEXPR_PRESENT (_MSC_VER >= 1900) #define __TBB_DEFAULTED_AND_DELETED_FUNC_PRESENT (_MSC_VER >= 1800) #define __TBB_NOEXCEPT_PRESENT (_MSC_VER >= 1900) #define __TBB_CPP11_STD_BEGIN_END_PRESENT (_MSC_VER >= 1700) @@ -322,23 +321,21 @@ #endif #ifndef TBB_IMPLEMENT_CPP0X - /** By default, use C++11 classes if available **/ - #if __GNUC__==4 && __GNUC_MINOR__>=4 && __GXX_EXPERIMENTAL_CXX0X__ - #define TBB_IMPLEMENT_CPP0X 0 - #elif __clang__ && __cplusplus >= 201103L - //TODO: consider introducing separate macros for each file? - //prevent injection of corresponding tbb names into std:: namespace if native headers are present - #if __has_include() || __has_include() - #define TBB_IMPLEMENT_CPP0X 0 +/** By default, use C++11 classes if available **/ + #if __clang__ + /* Old versions of Intel Compiler do not have __has_include */ + #if (__INTEL_COMPILER && __INTEL_COMPILER <= 1400) + #define TBB_IMPLEMENT_CPP0X !(_LIBCPP_VERSION && (__cplusplus >= 201103L)) #else - #define TBB_IMPLEMENT_CPP0X 1 + #define TBB_IMPLEMENT_CPP0X (__cplusplus < 201103L || (!__has_include() && !__has_include())) #endif - #elif _MSC_VER>=1700 - #define TBB_IMPLEMENT_CPP0X 0 - #elif __STDCPP_THREADS__ - #define TBB_IMPLEMENT_CPP0X 0 + #elif __GNUC__ + #define TBB_IMPLEMENT_CPP0X (__TBB_GCC_VERSION < 40400 || !__GXX_EXPERIMENTAL_CXX0X__) + #elif _MSC_VER + #define TBB_IMPLEMENT_CPP0X (_MSC_VER < 1700) #else - #define TBB_IMPLEMENT_CPP0X 1 + // TODO: Reconsider general approach to be more reliable, e.g. 
(!(__cplusplus >= 201103L && __ STDC_HOSTED__)) + #define TBB_IMPLEMENT_CPP0X (!__STDCPP_THREADS__) #endif #endif /* TBB_IMPLEMENT_CPP0X */ @@ -628,7 +625,7 @@ #define __TBB_FORCE_64BIT_ALIGNMENT_BROKEN 0 #endif -#if __TBB_DEFAULTED_AND_DELETED_FUNC_PRESENT && __TBB_GCC_VERSION < 40700 && !defined(__INTEL_COMPILER) && !defined (__clang__) +#if __GNUC__ && !__INTEL_COMPILER && !__clang__ && __TBB_DEFAULTED_AND_DELETED_FUNC_PRESENT && __TBB_GCC_VERSION < 40700 #define __TBB_ZERO_INIT_WITH_DEFAULTED_CTOR_BROKEN 1 #endif diff --git a/include/tbb/tbb_stddef.h b/include/tbb/tbb_stddef.h index a1bc9b0629..fb66716801 100644 --- a/include/tbb/tbb_stddef.h +++ b/include/tbb/tbb_stddef.h @@ -26,7 +26,7 @@ #define TBB_VERSION_MINOR 4 // Engineering-focused interface version -#define TBB_INTERFACE_VERSION 9000 +#define TBB_INTERFACE_VERSION 9001 #define TBB_INTERFACE_VERSION_MAJOR TBB_INTERFACE_VERSION/1000 // The oldest major interface version still supported diff --git a/index.src.html b/index.html similarity index 71% rename from index.src.html rename to index.html index 1f815e53b7..d4692c7822 100644 --- a/index.src.html +++ b/index.html @@ -1,33 +1,41 @@ -

Overview

Top level directory for Intel® Threading Building Blocks. +

Common directories

+
+
doc +
Documentation for the library. +
include +
Include files required for compiling code that uses the library. +
examples +
Examples of how to use the library. +
+

Intel TBB source package

To build Intel TBB, use the top-level Makefile; see also the build directions. To port Intel TBB to a new platform, operating system or architecture, see the porting directions.

- -

Files

+

Files

Makefile -
Top-level Makefile for Intel TBB. See also the build directions. +
Top-level Makefile for Intel TBB. See also the build directions.
- -

Directories

+

Directories

-
doc -
Documentation for the library. -
include -
Include files required for compiling code that uses the library. -
examples -
Examples of how to use the library.
src
Source code for the library.
build
Internal Makefile infrastructure for Intel TBB. Do not use directly; see the build directions.
- +

Intel TBB binary package

+

Directories

+
+
bin +
Start-up scripts for sourcing library for Linux* OS and OS X*. For Windows* OS: start-up scripts and dynamic-link libraries. +
lib +
Platform-specific binary files for the library. +

Copyright © 2005-2015 Intel Corporation. All Rights Reserved. @@ -38,4 +46,3 @@

Directories

* Other names and brands may be claimed as the property of others. - diff --git a/jni/Application.mk b/jni/Application.mk index feee666c27..11c18fc82e 100644 --- a/jni/Application.mk +++ b/jni/Application.mk @@ -35,11 +35,11 @@ endif endif endif -APP_PLATFORM:=android-20 +APP_PLATFORM:=android-21 NDK_TOOLCHAIN_VERSION:=4.9 # Intel(R) C++ Compiler does not support ndk r10 version yet. -ifeq (icc,$(compiler)) +ifeq (iccx86,$(compiler)$(APP_ABI)) APP_PLATFORM:=android-9 NDK_TOOLCHAIN_VERSION:=4.8 endif diff --git a/src/perf/time_hash_map_fill.cpp b/src/perf/time_hash_map_fill.cpp index 13efc41362..a79fbcda24 100644 --- a/src/perf/time_hash_map_fill.cpp +++ b/src/perf/time_hash_map_fill.cpp @@ -108,8 +108,8 @@ struct Uniques : TesterBase { // Executes test mode for a given thread. Return value is ignored when used with timing wrappers. /*override*/ double test(int testn, int t) { - if( testn != 1 ) { // do insertions - for(int i = testn*value+t*n_items, e = testn*value+(t+1)*n_items; i < e; i++) { + if( testn == 0 ) { // do insertions + for(int i = t*n_items, e = (t+1)*n_items; i < e; i++) { Table.insert( std::make_pair(Data[i],t) ); } } else { // do last finds @@ -138,7 +138,7 @@ void execute_percent(test_sandbox &the_test, int p) { int uniques = p==100?std::numeric_limits::max() : MAX_TABLE_SIZE; ASSERT(p==100 || p <= 30, "Function is broken for %% > 30 except for 100%%"); for(int i = 0; i < input_size; i++) - Data[i] = rand()%uniques; + Data[i] = (rand()*rand())%uniques; for(int t = MinThread; t <= MaxThread; t++) the_test.factory(input_size, t); // executes the tests specified in BOX-es for given 'value' and threads the_test.report.SetRoundTitle(rounds++, "%d%%", p); diff --git a/src/perf/time_parallel_for_each.cpp b/src/perf/time_parallel_for_each.cpp new file mode 100644 index 0000000000..15a814578e --- /dev/null +++ b/src/perf/time_parallel_for_each.cpp @@ -0,0 +1,70 @@ +/* + Copyright 2005-2015 Intel Corporation. All Rights Reserved. + + This file is part of Threading Building Blocks. Threading Building Blocks is free software; + you can redistribute it and/or modify it under the terms of the GNU General Public License + version 2 as published by the Free Software Foundation. Threading Building Blocks is + distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the + implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. You should have received a copy of + the GNU General Public License along with Threading Building Blocks; if not, write to the + Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + As a special exception, you may use this file as part of a free software library without + restriction. Specifically, if other files instantiate templates or use macros or inline + functions from this file, or you compile this file and link it with other files to produce + an executable, this file does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however invalidate any other + reasons why the executable file might be covered by the GNU General Public License. 
+*/ + +#include +#include +#include +#include +#include +#include + +#include "tbb/parallel_for_each.h" +#include "tbb/tick_count.h" + +template +void foo( Type &f ) { + f += 1.0f; +} + +template +void test( std::string testName, const int N, const int numRepeats ) { + typedef typename Container::value_type Type; + Container v; + + for ( int i = 0; i < N; ++i ) { + v.push_back( static_cast(std::rand()) ); + } + + std::vector times; + times.reserve( numRepeats ); + + for ( int i = 0; i < numRepeats; ++i ) { + tbb::tick_count t0 = tbb::tick_count::now(); + tbb::parallel_for_each( v.begin(), v.end(), foo ); + tbb::tick_count t1 = tbb::tick_count::now(); + times.push_back( (t1 - t0).seconds()*1000 ); + } + + std::sort( times.begin(), times.end() ); + std::cout << "Test " << testName << std::endl + << "min " << times[times.size() / 20] << " ms " << std::endl + << "med " << times[times.size() / 2] << " ms " << std::endl + << "max " << times[times.size() - times.size() / 20 - 1] << " ms " << std::endl; +} + +int main( int argc, char* argv[] ) { + const int N = argc > 1 ? std::atoi( argv[1] ) : 10 * 1000; + const int numRepeats = argc > 2 ? std::atoi( argv[2] ) : 10; + + test< std::vector >( "std::vector", N, numRepeats ); + test< std::list >( "std::list", N / 100, numRepeats ); + + return 0; +} diff --git a/src/rml/test/test_rml_mixed.cpp b/src/rml/test/test_rml_mixed.cpp index 8636c3e35c..3b2b06a41d 100644 --- a/src/rml/test/test_rml_mixed.cpp +++ b/src/rml/test/test_rml_mixed.cpp @@ -38,7 +38,7 @@ int TestMain () { // non-deterministic. Thus dynamic_link fails on some systems when the // application changes its current directory after the library (TBB/OpenMP/...) // is loaded but before the static constructors in the library are executed. -#define CHDIR_SUPPORT_BROKEN ( ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && __GNUC_MINOR__ <= 9 ) || (__linux__ && __clang_major__ == 3 && __clang_minor__ == 5) ) +#define CHDIR_SUPPORT_BROKEN ( (__TBB_GCC_VERSION >= 40600 && __TBB_GCC_VERSION < 50200) || (__linux__ && __TBB_CLANG_VERSION == 30500) ) const int OMP_ParallelRegionSize = 16; int TBB_MaxThread = 4; // Includes master diff --git a/src/tbb/arena.cpp b/src/tbb/arena.cpp index 1fdaf89826..29a7e81df3 100644 --- a/src/tbb/arena.cpp +++ b/src/tbb/arena.cpp @@ -42,6 +42,11 @@ void generic_scheduler::attach_arena( arena* a, size_t index, bool is_master ) { my_arena_index = index; my_arena_slot = a->my_slots + index; attach_mailbox( affinity_id(index+1) ); + if ( is_master && my_inbox.is_idle_state( true ) ) { + // Master enters an arena with its own task to be executed. It means that master is not + // going to enter stealing loop and take affinity tasks. + my_inbox.set_is_idle( false ); + } #if __TBB_TASK_GROUP_CONTEXT // Context to be used by root tasks by default (if the user has not specified one). 
if( !is_master ) @@ -62,31 +67,54 @@ void generic_scheduler::attach_arena( arena* a, size_t index, bool is_master ) { #endif /* __TBB_TASK_PRIORITY */ } +inline static bool occupy_slot( generic_scheduler*& slot, generic_scheduler& s ) { + return !slot && as_atomic( slot ).compare_and_swap( &s, NULL ) == NULL; +} + +size_t arena::occupy_free_slot_in_range( generic_scheduler& s, size_t lower, size_t upper ) { + if ( lower >= upper ) return out_of_arena; + // Start search for an empty slot from the one we occupied the last time + size_t index = s.my_arena_index; + if ( index < lower || index >= upper ) index = s.my_random.get() % (upper - lower) + lower; + __TBB_ASSERT( index >= lower && index < upper, NULL ); + // Find a free slot + for ( size_t i = index; i < upper; ++i ) + if ( occupy_slot(my_slots[i].my_scheduler, s) ) return i; + for ( size_t i = lower; i < index; ++i ) + if ( occupy_slot(my_slots[i].my_scheduler, s) ) return i; + return out_of_arena; +} + +template +size_t arena::occupy_free_slot( generic_scheduler& s ) { + // Firstly, masters try to occupy reserved slots + size_t index = as_worker ? out_of_arena : occupy_free_slot_in_range( s, 0, my_num_reserved_slots ); + if ( index == out_of_arena ) { + // Secondly, all threads try to occupy all non-reserved slots + index = occupy_free_slot_in_range( s, my_num_reserved_slots, my_num_slots ); + // Likely this arena is already saturated + if ( index == out_of_arena ) + return out_of_arena; + } + + ITT_NOTIFY(sync_acquired, my_slots + index); + atomic_update( my_limit, (unsigned)(index + 1), std::less() ); + return index; +} + void arena::process( generic_scheduler& s ) { __TBB_ASSERT( is_alive(my_guard), NULL ); __TBB_ASSERT( governor::is_set(&s), NULL ); __TBB_ASSERT( !s.my_innermost_running_task, NULL ); __TBB_ASSERT( !s.my_dispatching_task, NULL ); - __TBB_ASSERT( my_num_slots != 1, NULL ); - // Start search for an empty slot from the one we occupied the last time - unsigned index = s.my_arena_index < my_num_slots ? s.my_arena_index : s.my_random.get() % (my_num_slots - 1) + 1, - end = index; - __TBB_ASSERT( index != 0, "A worker cannot occupy slot 0" ); - __TBB_ASSERT( index < my_num_slots, NULL ); + __TBB_ASSERT( my_num_slots > 1, NULL ); - // Find a vacant slot - for ( ;; ) { - if ( !my_slots[index].my_scheduler && as_atomic(my_slots[index].my_scheduler).compare_and_swap(&s, NULL ) == NULL ) - break; - if ( ++index == my_num_slots ) - index = 1; - if ( index == end ) { - // Likely this arena is already saturated - goto quit; - } - } - ITT_NOTIFY(sync_acquired, my_slots + index); + size_t index = occupy_free_slot( s ); + if ( index == out_of_arena ) + goto quit; + + __TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" ); s.attach_arena( this, index, /*is_master*/false ); #if !__TBB_FP_CONTEXT @@ -98,13 +126,27 @@ void arena::process( generic_scheduler& s ) { my_observers.notify_entry_observers( s.my_last_local_observer, /*worker=*/true ); #endif /* __TBB_SCHEDULER_OBSERVER */ - atomic_update( my_limit, index + 1, std::less() ); + // Task pool can be marked as non-empty if the worker occupies the slot left by a master. 
+ if ( s.my_arena_slot->task_pool != EmptyTaskPool ) { + __TBB_ASSERT( !s.my_innermost_running_task, NULL ); + __TBB_ASSERT( !s.my_dispatching_task, NULL ); + __TBB_ASSERT( s.my_inbox.is_idle_state(false), NULL ); + s.local_wait_for_all( *s.my_dummy_task, NULL ); + __TBB_ASSERT( s.my_inbox.is_idle_state(true), NULL ); + } for ( ;; ) { + __TBB_ASSERT( is_alive(my_guard), NULL ); + __TBB_ASSERT ( __TBB_load_relaxed(s.my_arena_slot->head) == __TBB_load_relaxed(s.my_arena_slot->tail), + "Worker cannot leave arena while its task pool is not empty" ); + __TBB_ASSERT( s.my_arena_slot->task_pool == EmptyTaskPool, "Empty task pool is not marked appropriately" ); + // This check prevents relinquishing more than necessary workers because + // of the non-atomicity of the decision making procedure + if (num_workers_active() > my_num_workers_allotted) + break; // Try to steal a task. // Passing reference count is technically unnecessary in this context, // but omitting it here would add checks inside the function. - __TBB_ASSERT( is_alive(my_guard), NULL ); task* t = s.receive_or_steal_task( s.my_dummy_task->prefix().ref_count ); if (t) { // A side effect of receive_or_steal_task is that my_innermost_running_task can be set. @@ -113,13 +155,6 @@ void arena::process( generic_scheduler& s ) { __TBB_ASSERT( !s.my_dispatching_task, NULL ); s.local_wait_for_all(*s.my_dummy_task,t); } - __TBB_ASSERT ( __TBB_load_relaxed(s.my_arena_slot->head) == __TBB_load_relaxed(s.my_arena_slot->tail), - "Worker cannot leave arena while its task pool is not empty" ); - __TBB_ASSERT( s.my_arena_slot->task_pool == EmptyTaskPool, "Empty task pool is not marked appropriately" ); - // This check prevents relinquishing more than necessary workers because - // of the non-atomicity of the decision making procedure - if (num_workers_active() > my_num_workers_allotted) - break; } #if __TBB_SCHEDULER_OBSERVER my_observers.notify_exit_observers( s.my_last_local_observer, /*worker=*/true ); @@ -148,7 +183,7 @@ void arena::process( generic_scheduler& s ) { on_thread_leaving(); } -arena::arena ( market& m, unsigned num_slots ) { +arena::arena ( market& m, unsigned num_slots, unsigned num_reserved_slots ) { __TBB_ASSERT( !my_guard, "improperly allocated arena?" ); __TBB_ASSERT( sizeof(my_slots[0]) % NFS_GetLineSize()==0, "arena::slot size not multiple of cache line size" ); __TBB_ASSERT( (uintptr_t)this % NFS_GetLineSize()==0, "arena misaligned" ); @@ -159,7 +194,8 @@ arena::arena ( market& m, unsigned num_slots ) { my_limit = 1; // Two slots are mandatory: for the master, and for 1 worker (required to support starvation resistant tasks). my_num_slots = num_slots_to_reserve(num_slots); - my_max_num_workers = num_slots-1; + my_num_reserved_slots = num_reserved_slots; + my_max_num_workers = num_slots-num_reserved_slots; my_references = 1; // accounts for the master #if __TBB_TASK_PRIORITY my_bottom_priority = my_top_priority = normalized_normal_priority; @@ -168,7 +204,7 @@ arena::arena ( market& m, unsigned num_slots ) { #if __TBB_SCHEDULER_OBSERVER my_observers.my_arena = this; #endif /* __TBB_SCHEDULER_OBSERVER */ - __TBB_ASSERT ( my_max_num_workers < my_num_slots, NULL ); + __TBB_ASSERT ( my_max_num_workers <= my_num_slots, NULL ); // Construct slots. Mark internal synchronization elements for the tools. 
for( unsigned i = 0; i < my_num_slots; ++i ) { __TBB_ASSERT( !my_slots[i].my_scheduler && !my_slots[i].task_pool, NULL ); @@ -190,7 +226,7 @@ arena::arena ( market& m, unsigned num_slots ) { #endif } -arena& arena::allocate_arena( market& m, unsigned num_slots ) { +arena& arena::allocate_arena( market& m, unsigned num_slots, unsigned num_reserved_slots ) { __TBB_ASSERT( sizeof(base_type) + sizeof(arena_slot) == sizeof(arena), "All arena data fields must go to arena_base" ); __TBB_ASSERT( sizeof(base_type) % NFS_GetLineSize() == 0, "arena slots area misaligned: wrong padding" ); __TBB_ASSERT( sizeof(mail_outbox) == NFS_MaxLineSize, "Mailbox padding is wrong" ); @@ -198,7 +234,7 @@ arena& arena::allocate_arena( market& m, unsigned num_slots ) { unsigned char* storage = (unsigned char*)NFS_Allocate( 1, n, NULL ); // Zero all slots to indicate that they are empty memset( storage, 0, n ); - return *new( storage + num_slots_to_reserve(num_slots) * sizeof(mail_outbox) ) arena(m, num_slots); + return *new( storage + num_slots_to_reserve(num_slots) * sizeof(mail_outbox) ) arena(m, num_slots, num_reserved_slots); } void arena::free_arena () { @@ -524,19 +560,18 @@ struct nested_arena_context : no_copy { generic_scheduler &my_scheduler; scheduler_state const my_orig_state; void *my_orig_ptr; - bool my_adjusting; - nested_arena_context(generic_scheduler *s, arena* a, bool needs_adjusting, bool as_worker = false) - : my_scheduler(*s), my_orig_state(*s), my_orig_ptr(NULL), my_adjusting(needs_adjusting) { - s->nested_arena_entry(a, *this, as_worker); + nested_arena_context(generic_scheduler *s, arena* a, size_t slot_index, bool as_worker) - : my_scheduler(*s), my_orig_state(*s), my_orig_ptr(NULL) { + s->nested_arena_entry(a, slot_index, *this, as_worker); } ~nested_arena_context() { my_scheduler.nested_arena_exit(*this); - (scheduler_state&)my_scheduler = my_orig_state; // restore arena settings + static_cast<scheduler_state&>(my_scheduler) = my_orig_state; // restore arena settings governor::assume_scheduler( &my_scheduler ); } }; -void generic_scheduler::nested_arena_entry(arena* a, nested_arena_context& c, bool as_worker) { __TBB_ASSERT( is_alive(a->my_guard), NULL ); if( a == my_arena ) { #if __TBB_TASK_GROUP_CONTEXT @@ -551,8 +586,10 @@ void generic_scheduler::nested_arena_entry(arena* a, nested_arena_context& c, bo my_arena->orphan_offloaded_tasks( *this ); my_offloaded_tasks = NULL; #endif /* __TBB_TASK_PRIORITY */ - attach_arena( a, /*index*/0, /*is_master*/true ); + attach_arena( a, slot_index, /*is_master*/true ); + __TBB_ASSERT( my_arena == a, NULL ); my_innermost_running_task = my_dispatching_task = as_worker? NULL : my_dummy_task; + my_is_worker = as_worker; #if __TBB_TASK_GROUP_CONTEXT // save dummy's context and replace it by arena's context c.my_orig_ptr = my_dummy_task->prefix().context; @@ -565,9 +602,8 @@ void generic_scheduler::nested_arena_entry(arena* a, nested_arena_context& c, bo #endif // TODO? ITT_NOTIFY(sync_acquired, a->my_slots + index); // TODO: it requires market to have P workers (not P-1) - // TODO: it still allows temporary oversubscription by 1 worker (due to my_max_num_workers) // TODO: a preempted worker should be excluded from assignment to other arenas e.g.
my_slack-- - if( c.my_adjusting ) my_arena->my_market->adjust_demand(*my_arena, -1); + if( !as_worker && slot_index >= my_arena->my_num_reserved_slots ) my_arena->my_market->adjust_demand(*my_arena, -1); } void generic_scheduler::nested_arena_exit(nested_arena_context& c) { @@ -577,7 +613,7 @@ void generic_scheduler::nested_arena_exit(nested_arena_context& c) { #endif return; } - if( c.my_adjusting ) my_arena->my_market->adjust_demand(*my_arena, 1); + if( !my_is_worker && my_arena_index >= my_arena->my_num_reserved_slots ) my_arena->my_market->adjust_demand(*my_arena, 1); #if __TBB_ARENA_OBSERVER my_arena->my_observers.notify_exit_observers( my_last_local_observer, /*worker=*/false ); #endif /* __TBB_SCHEDULER_OBSERVER */ @@ -587,8 +623,9 @@ void generic_scheduler::nested_arena_exit(nested_arena_context& c) { my_arena->orphan_offloaded_tasks( *this ); my_local_reload_epoch = *c.my_orig_state.my_ref_reload_epoch; #endif - // Free the master slot. TODO: support multiple masters - __TBB_store_with_release(my_arena->my_slots[0].my_scheduler, (generic_scheduler*)NULL); + // Free the master slot. + __TBB_ASSERT(my_arena->my_slots[my_arena_index].my_scheduler, "A slot is already empty"); + __TBB_store_with_release(my_arena->my_slots[my_arena_index].my_scheduler, (generic_scheduler*)NULL); my_arena->my_exit_monitors.notify_all_relaxed(); // TODO: fix concurrent monitor to use notify_one (test MultipleMastersPart4 fails) #if __TBB_TASK_GROUP_CONTEXT // restore context of dummy task @@ -617,14 +654,13 @@ namespace internal { void task_arena_base::internal_initialize( ) { governor::one_time_init(); - __TBB_ASSERT( my_master_slots <= 1, "Number of slots reserved for master can be only [0,1]"); - if( my_master_slots > 1 ) my_master_slots = 1; // TODO: make more masters bool default_concurrency_requested = false; if( my_max_concurrency < 1 ) { my_max_concurrency = (int)governor::default_num_threads(); default_concurrency_requested = true; } - arena* new_arena = market::create_arena( my_max_concurrency + 1-my_master_slots/*it's +1 slot for num_masters=0*/, + __TBB_ASSERT( my_master_slots <= (unsigned)my_max_concurrency, "Number of slots reserved for master should not exceed arena concurrency"); + arena* new_arena = market::create_arena( my_max_concurrency, my_master_slots, global_control::active_value(global_control::thread_stack_size), default_concurrency_requested ); // increases market's ref count for task_arena @@ -739,16 +775,15 @@ void task_arena_base::internal_execute( internal::delegate_base& d) const { generic_scheduler* s = governor::local_scheduler_weak(); __TBB_ASSERT(s, "Scheduler is not initialized"); // TODO: is it safe to assign slot to a scheduler which is not yet switched? - // TODO TEMP: one master, make more masters - if( s->my_arena == my_arena || (!__TBB_load_with_acquire(my_arena->my_slots[0].my_scheduler) - && as_atomic(my_arena->my_slots[0].my_scheduler).compare_and_swap(s, NULL ) == NULL) ) { + size_t index1 = s->my_arena == my_arena ? s->my_arena_index : my_arena->occupy_free_slot</*as_worker*/false>( *s ); + if ( index1 != arena::out_of_arena ) { cpu_ctl_env_helper cpu_ctl_helper; cpu_ctl_helper.set_env( __TBB_CONTEXT_ARG1(my_context) ); #if TBB_USE_EXCEPTIONS try { #endif //TODO: replace dummy tasks for workers as well to avoid using of the_dummy_context - nested_arena_context scope(s, my_arena, !my_master_slots); + nested_arena_context scope( s, my_arena, index1, /*as_worker*/false ); d(); #if TBB_USE_EXCEPTIONS } catch(...)
{ @@ -782,19 +817,18 @@ void task_arena_base::internal_execute( internal::delegate_base& d) const { my_arena->my_exit_monitors.cancel_wait(waiter); break; } - else if( !__TBB_load_with_acquire(my_arena->my_slots[0].my_scheduler) // TODO: refactor into a function? - && as_atomic(my_arena->my_slots[0].my_scheduler).compare_and_swap(s, NULL ) == NULL ) { + size_t index2 = my_arena->occupy_free_slot</*as_worker*/false>( *s ); + if( index2 != arena::out_of_arena ) { my_arena->my_exit_monitors.cancel_wait(waiter); - nested_arena_context scope(s, my_arena, !my_master_slots); + nested_arena_context scope(s, my_arena, index2, /*as_worker*/false); s->local_wait_for_all(root, NULL); #if TBB_USE_EXCEPTIONS __TBB_ASSERT( !exec_context.my_exception, NULL ); // exception can be thrown above, not deferred #endif __TBB_ASSERT( root.prefix().ref_count == 0, NULL ); break; - } else { - my_arena->my_exit_monitors.commit_wait(waiter); } + my_arena->my_exit_monitors.commit_wait(waiter); } while( __TBB_load_with_acquire(root.prefix().ref_count) == 2 ); #if TBB_USE_EXCEPTIONS // process possible exception @@ -811,8 +845,14 @@ class wait_task : public task { /*override*/ task* execute() { generic_scheduler* s = governor::local_scheduler_if_initialized(); __TBB_ASSERT( s, NULL ); - if( s->my_arena_index && s->worker_outermost_level() ) { - s->local_wait_for_all( *s->my_dummy_task, NULL ); // run remaining tasks + __TBB_ASSERT( s->master_outermost_level() || s->worker_outermost_level(), "The enqueued task can be processed only on outermost level" ); + if( s->is_worker() ) { + __TBB_ASSERT( !s->my_dispatching_task && s->my_innermost_running_task == this, NULL ); + // Mimic worker on outermost level to run remaining tasks + s->my_innermost_running_task = NULL; + s->local_wait_for_all( *s->my_dummy_task, NULL ); + __TBB_ASSERT( !s->my_dispatching_task && !s->my_innermost_running_task, NULL ); + s->my_innermost_running_task = this; } else s->my_arena->is_out_of_work(); // avoids starvation of internal_wait: issuing this task makes arena full my_signal.V(); return NULL; @@ -836,7 +876,7 @@ void task_arena_base::internal_wait() const { while( my_arena->my_pool_state != arena::SNAPSHOT_EMPTY ) { if( !__TBB_load_with_acquire(my_arena->my_slots[0].my_scheduler) // TODO TEMP: one master, make more masters && as_atomic(my_arena->my_slots[0].my_scheduler).compare_and_swap(s, NULL) == NULL ) { - nested_arena_context a(s, my_arena, !my_master_slots, true); + nested_arena_context a(s, my_arena, 0, true); s->wait_until_empty(); } else { binary_semaphore waiter; // TODO: replace by a single event notification from is_out_of_work diff --git a/src/tbb/arena.h b/src/tbb/arena.h index 7ee560afa7..4189588898 100644 --- a/src/tbb/arena.h +++ b/src/tbb/arena.h @@ -62,7 +62,7 @@ struct arena_base : padded { volatile intptr_t my_top_priority; // heavy use in stealing loop #endif /* !__TBB_TASK_PRIORITY */ - //! Maximal currently busy slot. + //! Maximal number of currently busy slots. atomic<unsigned> my_limit; // heavy use in stealing loop //! Task pool for the tasks scheduled via task::enqueue() method @@ -139,6 +139,9 @@ struct arena_base : padded { //! Number of slots in the arena unsigned my_num_slots; + //! Number of reserved slots (can be occupied only by masters) + unsigned my_num_reserved_slots; + //! Indicates if there is an oversubscribing worker created to service enqueued tasks. bool my_mandatory_concurrency; @@ -159,10 +162,10 @@ class arena: public padded<arena_base> typedef padded<arena_base> base_type; //!
Constructor - arena ( market&, unsigned max_num_workers ); + arena ( market&, unsigned max_num_workers, unsigned num_reserved_slots ); //! Allocate an instance of arena. - static arena& allocate_arena( market&, unsigned num_slots ); + static arena& allocate_arena( market&, unsigned num_slots, unsigned num_reserved_slots ); static int unsigned num_slots_to_reserve ( unsigned num_slots ) { return max(2u, num_slots); @@ -235,6 +238,13 @@ class arena: public padded<arena_base> intptr_t workers_task_node_count(); #endif + static const size_t out_of_arena = ~size_t(0); + //! Tries to occupy a slot in the arena. On success, returns the slot index; if no slot is available, returns out_of_arena. + template <bool as_worker> + size_t occupy_free_slot( generic_scheduler& s ); + //! Tries to occupy a slot in the specified range. + size_t occupy_free_slot_in_range( generic_scheduler& s, size_t lower, size_t upper ); + /** Must be the last data field */ arena_slot my_slots[1]; }; // class arena diff --git a/src/tbb/cache_aligned_allocator.cpp b/src/tbb/cache_aligned_allocator.cpp index 50054653aa..79343f5e2b 100644 --- a/src/tbb/cache_aligned_allocator.cpp +++ b/src/tbb/cache_aligned_allocator.cpp @@ -181,23 +181,23 @@ size_t NFS_GetLineSize() { void* NFS_Allocate( size_t n, size_t element_size, void* /*hint*/ ) { //TODO: make this functionality available via an adaptor over generic STL like allocator - size_t cache_line_size = NFS_LineSize; - __TBB_ASSERT( cache_line_size <= NFS_MaxLineSize, "illegal value for NFS_LineSize" ); - __TBB_ASSERT( is_power_of_two(cache_line_size), "must be power of two" ); + const size_t nfs_cache_line_size = NFS_LineSize; + __TBB_ASSERT( nfs_cache_line_size <= NFS_MaxLineSize, "illegal value for NFS_LineSize" ); + __TBB_ASSERT( is_power_of_two(nfs_cache_line_size), "must be power of two" ); size_t bytes = n*element_size; - if (bytes() != segment_allocated()) - throw_exception(eid_bad_last_alloc); // throw custom exception, because it's hard to recover correctly after segment_allocation_failed state + enforce_segment_allocated(s.load()); //it's hard to recover correctly after segment_allocation_failed state + return s; } @@ -196,8 +195,7 @@ namespace internal { const void *arg; safe_init_body(internal_array_op2 init, const void *src) : func(init), arg(src) {} void operator()(segment_t &s, void *begin, size_type n) const { - if(s.load() != segment_allocated()) - throw_exception(eid_bad_last_alloc); // throw custom exception + enforce_segment_allocated(s.load()); func( begin, arg, n ); } }; @@ -280,10 +278,11 @@ concurrent_vector_base_v3::size_type concurrent_vector_base_v3::helper::enable_s array0 = s[0].load(); } ITT_NOTIFY(sync_acquired, &s[0]); - if(array0 != segment_allocated()) { // check for segment_allocation_failed state of initial segment - publish_segment(s[k], segment_allocation_failed()); // and assign segment_allocation_failed state here - throw_exception(eid_bad_last_alloc); // throw custom exception - } + + segment_scope_guard k_segment_guard(s[k], false); + enforce_segment_allocated(array0); // initial segment should be allocated + k_segment_guard.dismiss(); + publish_segment( s[k], static_cast(array0.pointer() + segment_base(k)*element_size ) ); @@ -399,8 +398,7 @@ void concurrent_vector_base_v3::internal_assign( const concurrent_vector_base_v3 size_type b=segment_base(k); size_type new_end = b>=n ?
b : n; __TBB_ASSERT( my_early_size>new_end, NULL ); - if( my_segment[k].load() != segment_allocated()) // check vector was broken before - throw_exception(eid_bad_last_alloc); // throw custom exception + enforce_segment_allocated(my_segment[k].load()); //if vector was broken before // destructors are supposed to not throw any exceptions destroy( my_segment[k].load().pointer() + element_size*(new_end-b), my_early_size-new_end ); my_early_size = new_end; @@ -417,8 +415,8 @@ void concurrent_vector_base_v3::internal_assign( const concurrent_vector_base_v3 helper::extend_table_if_necessary(*this, k, 0); if( my_segment[k].load() == segment_not_used()) helper::enable_segment(*this, k, element_size); - else if( my_segment[k].load() != segment_allocated() ) - throw_exception(eid_bad_last_alloc); // throw custom exception + else + enforce_segment_allocated(my_segment[k].load()); size_type m = k? segment_size(k) : 2; if( m > n-b ) m = n-b; size_type a = 0; @@ -475,8 +473,7 @@ concurrent_vector_base_v3::size_type concurrent_vector_base_v3::internal_grow_to backoff.pause(); ITT_NOTIFY(sync_acquired, &s); } - if( my_segment[i].load() != segment_allocated() ) - throw_exception(eid_bad_last_alloc); + enforce_segment_allocated(my_segment[i].load()); } #if TBB_USE_DEBUG size_type capacity = internal_capacity(); diff --git a/src/tbb/governor.cpp b/src/tbb/governor.cpp index e12f266d71..50710a66a3 100644 --- a/src/tbb/governor.cpp +++ b/src/tbb/governor.cpp @@ -180,30 +180,30 @@ generic_scheduler* governor::init_scheduler_weak() { generic_scheduler* governor::init_scheduler( int num_threads, stack_size_type stack_size, bool auto_init ) { one_time_init(); - uintptr_t v = theTLS.get(); - generic_scheduler* s = tls_scheduler_of( v ); - if( v&1 ) { // TLS holds scheduler instance with arena - __TBB_ASSERT( s->my_arena, "TLS is marked for scheduler with arena" ); - s->my_ref_count += 1; - return s; - } - if( v ) { //TLS holds scheduler instance without arena - __TBB_ASSERT( !s->my_arena, "TLS is marked for scheduler without arena" ); - __TBB_ASSERT( s->my_auto_initialized, "weakly initialized scheduler is supposed to be auto-initialized" ); - s->attach_arena( market::create_arena( default_num_threads(), 0, true ), 0, /*is_master*/true ); - __TBB_ASSERT( s->my_arena_index == 0, "Master thread must occupy the first slot in its arena" ); - s->my_arena_slot->my_scheduler = s; - s->my_arena->my_default_ctx = s->default_context(); // it also transfers implied ownership - s->my_ref_count += 1; - assume_scheduler( s ); + if ( uintptr_t v = theTLS.get() ) { + generic_scheduler* s = tls_scheduler_of( v ); + if ( (v&1) == 0 ) { // TLS holds scheduler instance without arena + __TBB_ASSERT( s->my_ref_count == 1, "weakly initialized scheduler must have refcount equal to 1" ); + __TBB_ASSERT( !s->my_arena, "weakly initialized scheduler must have no arena" ); + __TBB_ASSERT( s->my_auto_initialized, "weakly initialized scheduler is supposed to be auto-initialized" ); + s->attach_arena( market::create_arena( default_num_threads(), 1, 0, true ), 0, /*is_master*/true ); + __TBB_ASSERT( s->my_arena_index == 0, "Master thread must occupy the first slot in its arena" ); + s->my_arena_slot->my_scheduler = s; + s->my_arena->my_default_ctx = s->default_context(); // it also transfers implied ownership + // Mark the scheduler as fully initialized + assume_scheduler( s ); + } + // Increment refcount only for explicit instances of task_scheduler_init. 
+ if ( !auto_init ) s->my_ref_count += 1; + __TBB_ASSERT( s->my_arena, "scheduler is not initialized fully" ); return s; } // Create new scheduler instance with arena bool default_concurrency_requested = num_threads == task_scheduler_init::automatic; if( default_concurrency_requested ) num_threads = default_num_threads(); - arena *a = market::create_arena( num_threads, stack_size, default_concurrency_requested ); - s = generic_scheduler::create_master( a ); + arena *a = market::create_arena( num_threads, 1, stack_size, default_concurrency_requested ); + generic_scheduler* s = generic_scheduler::create_master( a ); __TBB_ASSERT(s, "Somehow a local scheduler creation for a master thread failed"); __TBB_ASSERT( is_set(s), NULL ); s->my_auto_initialized = auto_init; @@ -214,20 +214,24 @@ void governor::terminate_scheduler( generic_scheduler* s, const task_scheduler_i __TBB_ASSERT( is_set(s), "Attempt to terminate non-local scheduler instance" ); if (--(s->my_ref_count)) { // can't throw exception, because this is on dtor's call chain - __TBB_ASSERT_RELEASE( !BlockingTSI || BlockingTSI!=tsi_ptr, + __TBB_ASSERT_RELEASE( BlockingTSI!=tsi_ptr, "Attempt to terminate nested scheduler in blocking mode" ); } else { + bool needs_wait_workers = false; + if ( BlockingTSI==tsi_ptr ) { + needs_wait_workers = true; + BlockingTSI = NULL; #if TBB_USE_ASSERT - if (BlockingTSI) { - __TBB_ASSERT( BlockingTSI == tsi_ptr, "For blocking termination last terminate_scheduler must be blocking." ); IsBlockingTerminationInProgress = true; - } #endif - s->cleanup_master(); + } + s->cleanup_master( needs_wait_workers ); __TBB_ASSERT( is_set(NULL), "cleanup_master has not cleared its TLS slot" ); - BlockingTSI = NULL; #if TBB_USE_ASSERT - IsBlockingTerminationInProgress = false; + if ( needs_wait_workers ) { + __TBB_ASSERT( IsBlockingTerminationInProgress, NULL ); + IsBlockingTerminationInProgress = false; + } #endif } } @@ -236,12 +240,11 @@ void governor::auto_terminate(void* arg){ generic_scheduler* s = tls_scheduler_of( uintptr_t(arg) ); // arg is equivalent to theTLS.get() if( s && s->my_auto_initialized ) { if( !--(s->my_ref_count) ) { - __TBB_ASSERT( !BlockingTSI, "Blocking auto-terminate is not supported." ); // If the TLS slot is already cleared by OS or underlying concurrency // runtime, restore its value. if( !is_set(s) ) assume_scheduler(s); - s->cleanup_master(); + s->cleanup_master( /*needs_wait_workers=*/false ); __TBB_ASSERT( is_set(NULL), "cleanup_master has not cleared its TLS slot" ); } } diff --git a/src/tbb/governor.h b/src/tbb/governor.h index cc10e89693..e45805f07f 100644 --- a/src/tbb/governor.h +++ b/src/tbb/governor.h @@ -151,8 +151,6 @@ class governor { static void initialize_rml_factory (); - static bool needsWaitWorkers () { return BlockingTSI!=NULL; } - static bool does_client_join_workers (const tbb::internal::rml::tbb_client &client); //! 
Must be called before init_scheduler diff --git a/src/tbb/market.cpp b/src/tbb/market.cpp index 8d388fdeff..fe24fd7b23 100644 --- a/src/tbb/market.cpp +++ b/src/tbb/market.cpp @@ -226,12 +226,13 @@ bool governor::does_client_join_workers (const tbb::internal::rml::tbb_client &c return ((const market&)client).must_join_workers(); } -arena* market::create_arena ( int num_slots, size_t stack_size, bool default_concurrency_requested ) { +arena* market::create_arena ( int num_slots, int num_reserved_slots, size_t stack_size, bool default_concurrency_requested ) { __TBB_ASSERT( num_slots > 0, NULL ); - market &m = global_market( num_slots-1, stack_size, default_concurrency_requested, + __TBB_ASSERT( num_reserved_slots <= num_slots, NULL ); + market &m = global_market( num_slots-num_reserved_slots, stack_size, default_concurrency_requested, /*is_public*/ true ); // increases market's public ref count - arena& a = arena::allocate_arena( m, min(num_slots, (int)m.my_num_workers_hard_limit) ); + arena& a = arena::allocate_arena( m, num_slots, num_reserved_slots ); // Add newly created arena into the existing market's list. arenas_list_mutex_type::scoped_lock lock(m.my_arenas_list_mutex); m.insert_arena_into_list(a); @@ -377,7 +378,7 @@ void market::update_allotment ( intptr_t highest_affected_priority ) { pl.workers_available = 0; arena_list_type::iterator it = pl.arenas.begin(); for ( ; it != pl.arenas.end(); ++it ) { - __TBB_ASSERT( it->my_num_workers_requested || !it->my_num_workers_allotted, NULL ); + __TBB_ASSERT( it->my_num_workers_requested >= 0 || !it->my_num_workers_allotted, NULL ); it->my_num_workers_allotted = 0; } } @@ -410,7 +411,6 @@ void market::adjust_demand ( arena& a, int delta ) { priority_level_info &pl = my_priority_levels[p]; pl.workers_requested += delta; __TBB_ASSERT( pl.workers_requested >= 0, NULL ); - __TBB_ASSERT( a.my_num_workers_requested >= 0, NULL ); if ( a.my_num_workers_requested <= 0 ) { if ( a.my_top_priority != normalized_normal_priority ) { GATHER_STATISTIC( ++governor::local_scheduler_if_initialized()->my_counters.arena_prio_resets ); @@ -431,6 +431,8 @@ void market::adjust_demand ( arena& a, int delta ) { } else if ( p > my_global_top_priority ) { __TBB_ASSERT( pl.workers_requested > 0, NULL ); + // TODO: investigate if the following invariant is always valid + __TBB_ASSERT( a.my_num_workers_requested >= 0, NULL ); update_global_top_priority(p); a.my_num_workers_allotted = min( (int)my_num_workers_soft_limit, a.my_num_workers_requested ); my_priority_levels[p - 1].workers_available = my_num_workers_soft_limit - a.my_num_workers_allotted; @@ -600,6 +602,7 @@ bool market::lower_arena_priority ( arena& a, intptr_t new_priority, uintptr_t o } bool market::update_arena_priority ( arena& a, intptr_t new_priority ) { + // TODO: do not acquire this global lock while checking arena's state. arenas_list_mutex_type::scoped_lock lock(my_arenas_list_mutex); __TBB_ASSERT( my_global_top_priority >= a.my_top_priority || a.my_num_workers_requested <= 0, NULL ); diff --git a/src/tbb/market.h b/src/tbb/market.h index 848bfcf0d0..887224ba66 100644 --- a/src/tbb/market.h +++ b/src/tbb/market.h @@ -256,7 +256,7 @@ class market : no_copy, rml::tbb_client { //! Creates an arena object /** If necessary, also creates global market instance, and boosts its ref count. Each call to create_arena() must be matched by the call to arena::free_arena(). 
**/ - static arena* create_arena ( int num_slots, size_t stack_size, bool default_concurrency_requested ); + static arena* create_arena ( int num_slots, int num_reserved_slots, size_t stack_size, bool default_concurrency_requested ); //! Removes the arena from the market's list void try_destroy_arena ( arena*, uintptr_t aba_epoch ); diff --git a/src/tbb/scheduler.cpp b/src/tbb/scheduler.cpp index 4517115369..c8ea9e19a1 100644 --- a/src/tbb/scheduler.cpp +++ b/src/tbb/scheduler.cpp @@ -1021,6 +1021,7 @@ generic_scheduler* generic_scheduler::create_worker( market& m, size_t index ) { __TBB_ASSERT(index, "workers should have index > 0"); s->my_arena_index = index; // index is not a real slot in arena yet s->my_dummy_task->prefix().ref_count = 2; + s->my_is_worker = true; governor::sign_on(s); return s; } @@ -1044,13 +1045,14 @@ generic_scheduler* generic_scheduler::create_master( arena* a ) { s->my_market->my_masters.push_front( *s ); lock.release(); #endif /* __TBB_TASK_GROUP_CONTEXT */ + s->my_is_worker = false; if( a ) { // Master thread always occupies the first slot s->attach_arena( a, /*index*/0, /*is_master*/true ); s->my_arena_slot->my_scheduler = s; a->my_default_ctx = s->default_context(); // also transfers implied ownership } - __TBB_ASSERT( !s->is_worker(), "Master thread must occupy the first slot in its arena" ); + __TBB_ASSERT( s->my_arena_index == 0, "Master thread must occupy the first slot in its arena" ); governor::sign_on(s); #if _WIN32||_WIN64 @@ -1074,7 +1076,7 @@ void generic_scheduler::cleanup_worker( void* arg, bool worker ) { s.free_scheduler(); } -void generic_scheduler::cleanup_master() { +void generic_scheduler::cleanup_master( bool needs_wait_workers ) { arena* const a = my_arena; market * const m = my_market; __TBB_ASSERT( my_market, NULL ); @@ -1122,9 +1124,7 @@ void generic_scheduler::cleanup_master() { my_arena_slot = NULL; // detached from slot free_scheduler(); // TODO: read global settings for the parameter at that point - // if workers are not joining, market can be released from on_thread_leaving(), - // so keep copy the state on local stack - bool must_join = m->join_workers = governor::needsWaitWorkers(); + m->join_workers = needs_wait_workers; if( a ) { #if __TBB_STATISTICS_EARLY_DUMP // Resetting arena to EMPTY state (as earlier TBB versions did) should not be @@ -1137,7 +1137,7 @@ void generic_scheduler::cleanup_master() { #endif a->on_thread_leaving(); } - if( must_join ) + if( needs_wait_workers ) m->wait_workers(); m->release( /*is_public*/ a != NULL ); // TODO: ideally, it should always be true } diff --git a/src/tbb/scheduler.h b/src/tbb/scheduler.h index d3a301ca37..16685af17b 100644 --- a/src/tbb/scheduler.h +++ b/src/tbb/scheduler.h @@ -91,6 +91,7 @@ struct scheduler_state { //! Pointer to market's (for workers) or current arena's (for the master) reload epoch counter. volatile uintptr_t *my_ref_reload_epoch; #endif /* __TBB_TASK_PRIORITY */ + bool my_is_worker; }; //! Work stealing task scheduler. @@ -255,7 +256,7 @@ class generic_scheduler: public scheduler static generic_scheduler* create_master( arena* a ); //! Perform necessary cleanup when a master thread stops using TBB. - void cleanup_master(); + void cleanup_master( bool needs_wait_workers ); //! Initialize a scheduler for a worker thread. 
static generic_scheduler* create_worker( market& m, size_t index ); @@ -278,7 +279,7 @@ class generic_scheduler: public scheduler void attach_arena( arena*, size_t index, bool is_master ); #if __TBB_TASK_ARENA - void nested_arena_entry( arena*, nested_arena_context &, bool as_worker ); + void nested_arena_entry( arena*, size_t, nested_arena_context &, bool as_worker ); void nested_arena_exit( nested_arena_context & ); void wait_until_empty(); #endif @@ -540,7 +541,7 @@ inline void generic_scheduler::attach_mailbox( affinity_id id ) { } inline bool generic_scheduler::is_worker() { - return my_arena_index != 0; //TODO: rework for multiple master + return my_is_worker; } inline unsigned generic_scheduler::number_of_workers_in_my_arena() { diff --git a/src/tbb/task_group_context.cpp b/src/tbb/task_group_context.cpp index 9de7f1d68b..83a009c7c9 100644 --- a/src/tbb/task_group_context.cpp +++ b/src/tbb/task_group_context.cpp @@ -467,15 +467,18 @@ void task_group_context::set_priority ( priority_t prio ) { internal::generic_scheduler* s = governor::local_scheduler_if_initialized(); if ( !s || !s->my_arena || !s->my_market->propagate_task_group_state(&task_group_context::my_priority, *this, p) ) return; - // Updating arena priority here does not eliminate necessity of checking each - // task priority and updating arena priority if necessary before the task execution. - // These checks will be necessary because: - // a) set_priority() may be invoked before any tasks from this task group are spawned; - // b) all spawned tasks from this task group are retrieved from the task pools. - // These cases create a time window when arena priority may be lowered. - s->my_market->update_arena_priority( *s->my_arena, p ); + //! TODO: the arena of the calling thread might be unrelated; // need to find out the right arena for priority update. + // The executing status check only guarantees being inside some working arena. + if ( s->my_innermost_running_task->state() == task::executing ) + // Updating arena priority here does not eliminate necessity of checking each + // task priority and updating arena priority if necessary before the task execution. + // These checks will be necessary because: + // a) set_priority() may be invoked before any tasks from this task group are spawned; + // b) all spawned tasks from this task group are retrieved from the task pools. + // These cases create a time window when arena priority may be lowered. 
+ s->my_market->update_arena_priority( *s->my_arena, p ); } priority_t task_group_context::priority () const { diff --git a/src/tbb/tbb_misc.cpp b/src/tbb/tbb_misc.cpp index 2064269a58..f4f58d36cd 100644 --- a/src/tbb/tbb_misc.cpp +++ b/src/tbb/tbb_misc.cpp @@ -196,12 +196,12 @@ bool cpu_has_speculation() { #if __TBB_TSX_AVAILABLE #if (__INTEL_COMPILER || __GNUC__ || _MSC_VER || __SUNPRO_CC) bool result = false; - const int hle_ebx_mask = 1<<4; + const int rtm_ebx_mask = 1<<11; #if _MSC_VER int info[4] = {0,0,0,0}; const int reg_ebx = 1; __cpuidex(info, 7, 0); - result = (info[reg_ebx] & hle_ebx_mask)!=0; + result = (info[reg_ebx] & rtm_ebx_mask)!=0; #elif __GNUC__ || __SUNPRO_CC int32_t reg_ebx = 0; int32_t reg_eax = 7; @@ -216,7 +216,7 @@ bool cpu_has_speculation() { #endif "edx" ); - result = (reg_ebx & hle_ebx_mask)!=0 ; + result = (reg_ebx & rtm_ebx_mask)!=0 ; #endif return result; #else diff --git a/src/tbbmalloc/backend.cpp b/src/tbbmalloc/backend.cpp index 4bbf2cdf75..86f10149a6 100644 --- a/src/tbbmalloc/backend.cpp +++ b/src/tbbmalloc/backend.cpp @@ -97,7 +97,37 @@ void HugePagesStatus::doPrintStatus(bool state, const char *stateName) fputs("\n", stderr); } -void *Backend::allocRawMem(size_t &size) const +#if CHECK_ALLOCATION_RANGE + +void Backend::UsedAddressRange::registerAlloc(uintptr_t left, uintptr_t right) +{ + MallocMutex::scoped_lock lock(mutex); + if (left < leftBound) + leftBound = left; + if (right > rightBound) + rightBound = right; + MALLOC_ASSERT(leftBound, ASSERT_TEXT); + MALLOC_ASSERT(leftBound < rightBound, ASSERT_TEXT); + MALLOC_ASSERT(leftBound <= left && right <= rightBound, ASSERT_TEXT); +} + +void Backend::UsedAddressRange::registerFree(uintptr_t left, uintptr_t right) +{ + MallocMutex::scoped_lock lock(mutex); + if (leftBound == left) { + if (rightBound == right) { + leftBound = ADDRESS_UPPER_BOUND; + rightBound = 0; + } else + leftBound = right; + } else if (rightBound == right) + rightBound = left; + MALLOC_ASSERT((!rightBound && leftBound == ADDRESS_UPPER_BOUND) + || leftBound < rightBound, ASSERT_TEXT); +} +#endif // CHECK_ALLOCATION_RANGE + +void *Backend::allocRawMem(size_t &size) { void *res = NULL; size_t allocSize; @@ -128,13 +158,15 @@ void *Backend::allocRawMem(size_t &size) const if ( res ) { size = allocSize; + if (!extMemPool->userPool()) + usedAddrRange.registerAlloc((uintptr_t)res, (uintptr_t)res+size); AtomicAdd((intptr_t&)totalMemSize, size); } return res; } -bool Backend::freeRawMem(void *object, size_t size) const +bool Backend::freeRawMem(void *object, size_t size) { bool fail; AtomicAdd((intptr_t&)totalMemSize, -size); @@ -142,6 +174,7 @@ bool Backend::freeRawMem(void *object, size_t size) const MALLOC_ASSERT(!extMemPool->fixedPool, "No free for fixed-size pools."); fail = (*extMemPool->rawFree)(extMemPool->poolId, object, size); } else { + usedAddrRange.registerFree((uintptr_t)object, (uintptr_t)object + size); hugePages.registerReleasing(object, size); fail = freeRawMemory(object, size); } @@ -980,6 +1013,7 @@ void *Backend::remap(void *ptr, size_t oldSize, size_t newSize, size_t alignment MemRegion *oldRegion = static_cast(right)->memRegion; MALLOC_ASSERT( oldRegion < ptr, ASSERT_TEXT ); + const size_t oldRegionSize = oldRegion->allocSz; if (oldRegion->type != MEMREG_ONE_BLOCK) return NULL; // we are not single in the region const size_t userOffset = (uintptr_t)ptr - (uintptr_t)oldRegion; @@ -1024,6 +1058,9 @@ void *Backend::remap(void *ptr, size_t oldSize, size_t newSize, size_t alignment header->memoryBlock = lmb; 
MALLOC_ASSERT((uintptr_t)lmb + lmb->unalignedSize >= (uintptr_t)object + lmb->objectSize, "An object must fit to the block."); + + usedAddrRange.registerFree((uintptr_t)oldRegion, (uintptr_t)oldRegion + oldRegionSize); + usedAddrRange.registerAlloc((uintptr_t)region, (uintptr_t)region + requestSize); return object; } #endif /* BACKEND_HAS_MREMAP */ @@ -1363,6 +1400,7 @@ FreeBlock *Backend::addNewRegion(size_t size, MemRegionType memRegType, bool add void Backend::init(ExtMemoryPool *extMemoryPool) { extMemPool = extMemoryPool; + usedAddrRange.init(); coalescQ.init(&bkndSync); bkndSync.init(this); } diff --git a/src/tbbmalloc/frontend.cpp b/src/tbbmalloc/frontend.cpp index bc06d658cf..09317abc82 100644 --- a/src/tbbmalloc/frontend.cpp +++ b/src/tbbmalloc/frontend.cpp @@ -2444,7 +2444,8 @@ static inline bool isSmallObject (void *ptr) /**** Check if an object was allocated by scalable_malloc ****/ static inline bool isRecognized (void* ptr) { - return isLargeObject(ptr) || isSmallObject(ptr); + return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) && + (isLargeObject(ptr) || isSmallObject(ptr)); } static inline void freeSmallObject(MemoryPool *memPool, void *object) @@ -2853,15 +2854,21 @@ extern "C" void __TBB_malloc_safer_free(void *object, void (*original_free)(void if (!object) return; - // must check 1st for large object, because small object check touches 4 pages on left, - // and it can be inaccessible - if (isLargeObject(object)) { - TLSData *tls = defaultMemPool->getTLS(/*create=*/false); - - defaultMemPool->putToLLOCache(tls, object); - } else if (isSmallObject(object)) { - freeSmallObject(defaultMemPool, object); - } else if (original_free) + // tbbmalloc can allocate object only when tbbmalloc has been initialized + if (FencedLoad(mallocInitialized) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) { + if (isLargeObject(object)) { + // must check 1st for large object, because small object check touches 4 pages on left, + // and it can be inaccessible + TLSData *tls = defaultMemPool->getTLS(/*create=*/false); + + defaultMemPool->putToLLOCache(tls, object); + return; + } else if (isSmallObject(object)) { + freeSmallObject(defaultMemPool, object); + return; + } + } + if (original_free) original_free(object); } @@ -2903,7 +2910,7 @@ extern "C" void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original if (!ptr) { tmp = internalMalloc(sz); - } else if (isRecognized(ptr)) { + } else if (FencedLoad(mallocInitialized) && isRecognized(ptr)) { if (!sz) { internalFree(ptr); return NULL; @@ -3029,7 +3036,7 @@ extern "C" void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, siz if (!ptr) { tmp = allocateAligned(defaultMemPool, size, alignment); - } else if (isRecognized(ptr)) { + } else if (FencedLoad(mallocInitialized) && isRecognized(ptr)) { if (!size) { internalFree(ptr); return NULL; @@ -3094,7 +3101,7 @@ extern "C" size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize { if (object) { // Check if the memory was allocated by scalable_malloc - if (isRecognized(object)) + if (FencedLoad(mallocInitialized) && isRecognized(object)) return internalMsize(object); else if (original_msize) return original_msize(object); @@ -3113,7 +3120,7 @@ extern "C" size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignmen { if (object) { // Check if the memory was allocated by scalable_malloc - if (isRecognized(object)) + if (FencedLoad(mallocInitialized) && isRecognized(object)) return internalMsize(object); else if (orig_aligned_msize) 
return orig_aligned_msize(object,alignment,offset); diff --git a/src/tbbmalloc/proxy.cpp b/src/tbbmalloc/proxy.cpp index 13b4569f28..438b3f0e27 100644 --- a/src/tbbmalloc/proxy.cpp +++ b/src/tbbmalloc/proxy.cpp @@ -331,38 +331,42 @@ void* __TBB_malloc_safer__aligned_realloc_##CRTLIB( void *ptr, size_t size, size return __TBB_malloc_safer_aligned_realloc( ptr, size, aligment, &func_ptrs ); \ } -// limit is 30 bytes/60 symbols per line +// limit is 30 bytes/60 symbols per line, * can be used to match any digit in bytecodes const char* known_bytecodes[] = { #if _WIN64 + #if __TBB_OVERLOAD_OLD_MSVCR "4883EC284885C974", //release free() win64 - "4883EC384885C975", //release msize() win64 "4885C974375348", //release free() 8.0.50727.42 win64 - "48894C24084883EC28BA", //debug prologue for win64 "4C8BC1488B0DA6E4040033", //win64 SDK - "4883EC284885C975", //release msize() 10.0.21003.1 win64 "48895C2408574883EC20", //release _aligned_msize() win64 + #endif + "4883EC384885C975", //release msize() 9.0 win64 + "48894C24084883EC28BA", //debug prologue for win64 + "4883EC284885C975", //release msize() 10.0.21003.1 win64 "4C894424184889542410", //debug _aligned_msize() win64 -#else - "558BEC6A018B", //debug free() & _msize() 8.0.50727.4053 win32 +#else // _WIN64 + #if __TBB_OVERLOAD_OLD_MSVCR "6A1868********E8", //release free() 8.0.50727.4053 win32 - "6A1C68********E8", //release _msize() 8.0.50727.4053 win32 - "558BEC837D08000F", //release _msize() 11.0.51106.1 win32 - "8BFF558BEC6A", //debug free() & _msize() 9.0.21022.8 win32 - "8BFF558BEC83", //debug free() & _msize() 10.0.21003.1 win32 "8BFF558BEC8B4508", //release _aligned_msize() 10.0 win32 + #endif + "6A1C68********E8", //release _msize() 8.0.50727.4053, 9.0 win32 + "558BEC6A018B", //debug free() & _msize() 11.0 win32 + "558BEC837D08000F", //release _msize() 11.0.51106.1 win32 + "8BFF558BEC6A", //debug free() & _msize() 10.0.40219.325 win32 + "8BFF558BEC83", //release free() & _msize() 10.0.40219.325 win32 "8BFF558BEC8B4510", //debug _aligned_msize() 10.0 win32 "558BEC8B451050", //debug _aligned_msize() 11.0 win32 -#endif +#endif // _WIN64 NULL }; -#define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY(CRT_VER,function_name,dbg_modifier) \ +#define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY(CRT_VER,function_name,dbg_modifier) \ ReplaceFunctionWithStore( #CRT_VER #dbg_modifier ".dll", #function_name, (FUNCPTR)__TBB_malloc_safer_##function_name##_##CRT_VER##dbg_modifier, known_bytecodes, (FUNCPTR*)&orig_##function_name##_##CRT_VER##dbg_modifier ); -#define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY_NO_FALLBACK(CRT_VER,function_name,dbg_modifier) \ +#define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY_NO_FALLBACK(CRT_VER,function_name,dbg_modifier) \ ReplaceFunctionWithStore( #CRT_VER #dbg_modifier ".dll", #function_name, (FUNCPTR)__TBB_malloc_safer_##function_name##_##CRT_VER##dbg_modifier, 0, NULL ); -#define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY_REDIRECT(CRT_VER,function_name,dest_func,dbg_modifier) \ +#define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY_REDIRECT(CRT_VER,function_name,dest_func,dbg_modifier) \ ReplaceFunctionWithStore( #CRT_VER #dbg_modifier ".dll", #function_name, (FUNCPTR)__TBB_malloc_safer_##dest_func##_##CRT_VER##dbg_modifier, 0, NULL ); @@ -381,12 +385,14 @@ const char* known_bytecodes[] = { __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_RELEASE(CRT_VER) \ __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_DEBUG(CRT_VER) +#if __TBB_OVERLOAD_OLD_MSVCR __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr70d); 
__TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr70); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr71d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr71); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr80d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr80); +#endif __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr90d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr90); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr100d); @@ -441,8 +447,6 @@ void operator_delete_arr_t(void* ptr, const std::nothrow_t&) throw() { } const char* modules_to_replace[] = { - "msvcr80d.dll", - "msvcr80.dll", "msvcr90d.dll", "msvcr90.dll", "msvcr100d.dll", @@ -451,11 +455,16 @@ const char* modules_to_replace[] = { "msvcr110.dll", "msvcr120d.dll", "msvcr120.dll", -// "ucrtbase.dll", + "ucrtbase.dll", +// "ucrtbased.dll" is not supported because of problems with _dbg functions +#if __TBB_OVERLOAD_OLD_MSVCR + "msvcr80d.dll", + "msvcr80.dll", "msvcr70d.dll", "msvcr70.dll", "msvcr71d.dll", "msvcr71.dll", +#endif #if __TBB_TODO // TODO: Try enabling replacement for non-versioned system binaries below "msvcrtd.dll", @@ -526,27 +535,29 @@ typedef wchar_t unicode_char_t; void ReplaceFunctionWithStore( const unicode_char_t *dllName, const char *funcName, FUNCPTR newFunc, const char ** opcodes, FUNCPTR* origFunc, FRR_ON_ERROR on_error = FRR_FAIL ) { - FRR_TYPE type = ReplaceFunction( dllName, funcName, newFunc, opcodes, origFunc ); - if (type == FRR_NODLL) return; - if (type != FRR_OK && on_error == FRR_FAIL) - { - fprintf(stderr, "Failed to replace function %s in module %s\n", - funcName, dllName); - exit(1); - } + FRR_TYPE res = ReplaceFunction( dllName, funcName, newFunc, opcodes, origFunc ); + + if (res == FRR_OK || res == FRR_NODLL || (res == FRR_NOFUNC && on_error == FRR_IGNORE)) + return; + + fprintf(stderr, "Failed to %s function %s in module %s\n", + res==FRR_NOFUNC? "find" : "replace", funcName, dllName); + exit(1); } void doMallocReplacement() { // Replace functions and keep backup of original code (separate for each runtime) +#if __TBB_OVERLOAD_OLD_MSVCR __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL(msvcr70) __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL(msvcr71) __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL(msvcr80) +#endif __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL(msvcr90) __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL(msvcr100) __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL(msvcr110) __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL(msvcr120) -// __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_RELEASE(ucrtbase) + __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_RELEASE(ucrtbase) // Replace functions without storing original code for (size_t j = 0; j < arrayLength(modules_to_replace); j++) { diff --git a/src/tbbmalloc/tbbmalloc_internal.h b/src/tbbmalloc/tbbmalloc_internal.h index 255f00e5a9..14b5b0585c 100644 --- a/src/tbbmalloc/tbbmalloc_internal.h +++ b/src/tbbmalloc/tbbmalloc_internal.h @@ -37,6 +37,7 @@ // TODO: *BSD also has it #define BACKEND_HAS_MREMAP __linux__ +#define CHECK_ALLOCATION_RANGE MALLOC_DEBUG || MALLOC_ZONE_OVERLOAD_ENABLED || MALLOC_UNIXLIKE_OVERLOAD_ENABLED #include "tbb/tbb_config.h" // for __TBB_LIBSTDCPP_EXCEPTION_HEADERS_BROKEN #if __TBB_LIBSTDCPP_EXCEPTION_HEADERS_BROKEN @@ -731,30 +732,64 @@ class Backend { inline bool operator()(size_t oldMaxReq, size_t requestSize) const; }; - ExtMemoryPool *extMemPool; +#if CHECK_ALLOCATION_RANGE + // Keep min and max of all addresses requested from OS, + // use it for checking memory possibly allocated by replaced allocators + // and for debugging purposes. Valid only for default memory pool. 
+ class UsedAddressRange { + static const uintptr_t ADDRESS_UPPER_BOUND = UINTPTR_MAX; + + uintptr_t leftBound, + rightBound; + MallocMutex mutex; + public: + // rightBound is zero-initialized + void init() { leftBound = ADDRESS_UPPER_BOUND; } + void registerAlloc(uintptr_t left, uintptr_t right); + void registerFree(uintptr_t left, uintptr_t right); + // as only left and right bounds are kept, we can return true + // for pointer not allocated by us, if more than single region + // was requested from OS + bool inRange(void *ptr) const { + const uintptr_t p = (uintptr_t)ptr; + return leftBound<=p && p<=rightBound; + } + }; +#else + class UsedAddressRange { + public: + void init() { } + void registerAlloc(uintptr_t, uintptr_t) {} + void registerFree(uintptr_t, uintptr_t) {} + bool inRange(void *) const { return true; } + }; +#endif + + ExtMemoryPool *extMemPool; // used for release every region on pool destroying - MemRegionList regionList; + MemRegionList regionList; - CoalRequestQ coalescQ; // queue of coalescing requests - BackendSync bkndSync; + CoalRequestQ coalescQ; // queue of coalescing requests + BackendSync bkndSync; // semaphore protecting adding more more memory from OS MemExtendingSema memExtendingSema; - size_t totalMemSize, - memSoftLimit; + size_t totalMemSize, + memSoftLimit; + UsedAddressRange usedAddrRange; // to keep 1st allocation large than requested, keep bootstrapping status enum { bootsrapMemNotDone = 0, bootsrapMemInitializing, bootsrapMemDone }; - intptr_t bootsrapMemStatus; - MallocMutex bootsrapMemStatusMutex; + intptr_t bootsrapMemStatus; + MallocMutex bootsrapMemStatusMutex; // Using of maximal observed requested size allows decrease // memory consumption for small requests and decrease fragmentation // for workloads when small and large allocation requests are mixed. // TODO: decrease, not only increase it - size_t maxRequestedSize; + size_t maxRequestedSize; FreeBlock *addNewRegion(size_t size, MemRegionType type, bool addToBin); FreeBlock *findBlockInRegion(MemRegion *region, size_t exactBlockSize); @@ -780,8 +815,8 @@ class Backend { void removeBlockFromBin(FreeBlock *fBlock); - void *allocRawMem(size_t &size) const; - bool freeRawMem(void *object, size_t size) const; + void *allocRawMem(size_t &size); + bool freeRawMem(void *object, size_t size); void putLargeBlock(LargeMemoryBlock *lmb); void releaseCachesToLimit(); @@ -820,6 +855,8 @@ class Backend { } inline size_t getMaxBinnedSize() const; + bool ptrCanBeValid(void *ptr) const { return usedAddrRange.inRange(ptr); } + #if __TBB_MALLOC_WHITEBOX_TEST size_t getTotalMemSize() const { return totalMemSize; } #endif diff --git a/src/test/harness.h b/src/test/harness.h index fd68be4ec9..d738498dbe 100644 --- a/src/test/harness.h +++ b/src/test/harness.h @@ -131,7 +131,7 @@ void print_call_stack() { #elif __SUNPRO_CC REPORT("Call stack info:\n"); printstack(fileno(stdout)); - #elif _WIN32_WINNT > 0x0501 && _MSC_VER && !__TBB_WIN8UI_SUPPORT + #elif _WIN32_WINNT > 0x0501 && _MSC_VER>=1500 && !__TBB_WIN8UI_SUPPORT const int sz = 62; // XP limitation for number of frames void *buff[sz]; int n = CaptureStackBackTrace(0, sz, buff, NULL); diff --git a/src/test/harness_allocator_overload.h b/src/test/harness_allocator_overload.h new file mode 100644 index 0000000000..28222f6033 --- /dev/null +++ b/src/test/harness_allocator_overload.h @@ -0,0 +1,39 @@ +/* + Copyright 2005-2015 Intel Corporation. All Rights Reserved. + + This file is part of Threading Building Blocks. 
Threading Building Blocks is free software; + you can redistribute it and/or modify it under the terms of the GNU General Public License + version 2 as published by the Free Software Foundation. Threading Building Blocks is + distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the + implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. You should have received a copy of + the GNU General Public License along with Threading Building Blocks; if not, write to the + Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + As a special exception, you may use this file as part of a free software library without + restriction. Specifically, if other files instantiate templates or use macros or inline + functions from this file, or you compile this file and link it with other files to produce + an executable, this file does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however invalidate any other + reasons why the executable file might be covered by the GNU General Public License. +*/ + +#ifndef tbb_test_harness_allocator_overload_H +#define tbb_test_harness_allocator_overload_H + +#include "../tbbmalloc/proxy.h" // for MALLOC_UNIXLIKE_OVERLOAD_ENABLED, MALLOC_ZONE_OVERLOAD_ENABLED +#include "tbb/tbb_config.h" // for __TBB_WIN8UI_SUPPORT + +// Skip configurations with unsupported system malloc overload: +// skip unsupported MSVCs, WIN8UI and MINGW (it doesn't define _MSC_VER), +// no support for MSVC 2015 in debug for now, +// don't use defined(_MSC_VER), because result of using defined() in macro expansion is undefined +#define MALLOC_WINDOWS_OVERLOAD_ENABLED ((_WIN32||_WIN64) && !__TBB_WIN8UI_SUPPORT && _MSC_VER >= 1500 && !(_MSC_VER == 1900 && _DEBUG)) + +// Skip configurations with unsupported system malloc overload: +// * overload via linking with -lmalloc_proxy is broken in offload, +// as the library is loaded too late in that mode, +// * LD_PRELOAD mechanism is broken in offload +#define HARNESS_SKIP_TEST ((!MALLOC_WINDOWS_OVERLOAD_ENABLED && !MALLOC_UNIXLIKE_OVERLOAD_ENABLED && !MALLOC_ZONE_OVERLOAD_ENABLED) || __TBB_MIC_OFFLOAD) + +#endif // tbb_test_harness_allocator_overload_H diff --git a/src/test/harness_defs.h b/src/test/harness_defs.h index a4bde1f3ed..78581108ef 100644 --- a/src/test/harness_defs.h +++ b/src/test/harness_defs.h @@ -123,7 +123,7 @@ #define __TBB_THROW_FROM_DTOR_BROKEN (__clang__ && (__apple_build_version__ && __apple_build_version__ < 5000279 || __TBB_CLANG_VERSION && __TBB_CLANG_VERSION < 50000)) //std::uncaught_exception is broken on some version of stdlibc++ (it returns true with no active exception) -#define __TBB_STD_UNCAUGHT_EXCEPTION_BROKEN (__linux__ && (__TBB_GCC_VERSION == 40407 || __TBB_GCC_VERSION == 40902)) +#define __TBB_STD_UNCAUGHT_EXCEPTION_BROKEN (__linux__ && (__TBB_GCC_VERSION == 40407 || __TBB_GCC_VERSION == 40802 || __TBB_GCC_VERSION == 40902)) #if __TBB_LIBSTDCPP_EXCEPTION_HEADERS_BROKEN #define _EXCEPTION_PTR_H /* prevents exception_ptr.h inclusion */ diff --git a/src/test/harness_iterator.h b/src/test/harness_iterator.h index a16dd6ab3b..a43a37b55f 100644 --- a/src/test/harness_iterator.h +++ b/src/test/harness_iterator.h @@ -91,8 +91,10 @@ class RandomIterator { T& operator* () { return *my_ptr; } RandomIterator& operator++ () { ++my_ptr; return *this; } bool operator== ( const RandomIterator& r ) { return my_ptr == r.my_ptr; 
} - difference_type operator- (const RandomIterator &r) {return my_ptr - r.my_ptr;} + bool operator!= ( const RandomIterator& r ) { return my_ptr != r.my_ptr; } + difference_type operator- (const RandomIterator &r) const {return my_ptr - r.my_ptr;} RandomIterator operator+ (difference_type n) {return RandomIterator(my_ptr + n);} + bool operator< (const RandomIterator &r) const {return my_ptr < r.my_ptr;} }; template <class T> @@ -116,8 +118,10 @@ class ConstRandomIterator { const T& operator* () { return *my_ptr; } ConstRandomIterator& operator++ () { ++my_ptr; return *this; } bool operator== ( const ConstRandomIterator& r ) { return my_ptr == r.my_ptr; } - difference_type operator- (const ConstRandomIterator &r) {return my_ptr - r.my_ptr;} + bool operator!= ( const ConstRandomIterator& r ) { return my_ptr != r.my_ptr; } + difference_type operator- (const ConstRandomIterator &r) const {return my_ptr - r.my_ptr;} ConstRandomIterator operator+ (difference_type n) {return ConstRandomIterator(my_ptr + n);} + bool operator< (const ConstRandomIterator &r) const {return my_ptr < r.my_ptr;} }; } // namespace Harness diff --git a/src/test/harness_tsx.h b/src/test/harness_tsx.h index fba4b78f87..4dc084dedb 100644 --- a/src/test/harness_tsx.h +++ b/src/test/harness_tsx.h @@ -44,8 +44,8 @@ bool have_TSX() { const int reg_ebx = 1; int old_ecx = 0; __cpuidex(info, 7, old_ecx); - result = (info[reg_ebx] & hle_ebx_mask)!=0; - if( result ) ASSERT( (info[reg_ebx] & rtm_ebx_mask)!=0, NULL ); + result = (info[reg_ebx] & rtm_ebx_mask)!=0; + if( result ) ASSERT( (info[reg_ebx] & hle_ebx_mask)!=0, NULL ); #elif __GNUC__ || __SUNPRO_CC int32_t reg_ebx = 0; int32_t reg_eax = 7; @@ -60,8 +60,8 @@ bool have_TSX() { #endif "edx" ); - result = (reg_ebx & hle_ebx_mask)!=0 ; - if( result ) ASSERT( (reg_ebx & rtm_ebx_mask)!=0, NULL ); + result = (reg_ebx & rtm_ebx_mask)!=0 ; + if( result ) ASSERT( (reg_ebx & hle_ebx_mask)!=0, NULL ); #endif return result; } diff --git a/src/test/test_allocator.h b/src/test/test_allocator.h index 61bf2fe1f0..4a5d3ecb06 100644 --- a/src/test/test_allocator.h +++ b/src/test/test_allocator.h @@ -59,6 +59,33 @@ inline char PseudoRandomValue( size_t j, size_t k ) { return char(j*3 ^ j>>4 ^ k); } +#if __APPLE__ +#include <fcntl.h> +#include <unistd.h> + +// A RAII class to disable stderr in a certain scope. It's not thread-safe. +class DisableStderr { + int stderrCopy; + static void dupToStderrAndClose(int fd) { + int ret = dup2(fd, STDERR_FILENO); // close current stderr + ASSERT(ret != -1, NULL); + ret = close(fd); + ASSERT(ret != -1, NULL); + } +public: + DisableStderr() { + int devNull = open("/dev/null", O_WRONLY); + ASSERT(devNull != -1, NULL); + stderrCopy = dup(STDERR_FILENO); + ASSERT(stderrCopy != -1, NULL); + dupToStderrAndClose(devNull); + } + ~DisableStderr() { + dupToStderrAndClose(stderrCopy); + } +}; +#endif + //! T is type and A is allocator for that type template<typename T, typename A> void TestBasic( A& a ) { @@ -138,6 +165,11 @@ void TestBasic( A& a ) { bool exception_caught = false; typename A::pointer p1 = NULL; try { +#if __APPLE__ + // On OS X*, failure to map memory results in messages to stderr; + // suppress them.
+ DisableStderr disableStderr; +#endif p1 = a.allocate(too_big); } catch ( std::bad_alloc ) { exception_caught = true; diff --git a/src/test/test_atomic.cpp b/src/test/test_atomic.cpp index cd454de930..75ef987cca 100644 --- a/src/test/test_atomic.cpp +++ b/src/test/test_atomic.cpp @@ -472,7 +472,7 @@ namespace TestConstExprInitializationOfGlobalObjectsHelper{ static_before(){ result = (static_atomic==ct_value); } \ } ; \ \ - typename tester::static_before tester::static_before_; \ + tester::static_before tester::static_before_; \ tbb::atomic tester::static_atomic(ct_value); \ \ auto_registered_tests_helper::registration tester::registered; \ diff --git a/src/test/test_malloc_atexit.cpp b/src/test/test_malloc_atexit.cpp index b2013c0eb0..b10a2c625a 100644 --- a/src/test/test_malloc_atexit.cpp +++ b/src/test/test_malloc_atexit.cpp @@ -27,12 +27,7 @@ */ #include -#include "../tbbmalloc/proxy.h" // __TBB_malloc_safer_msize -#include "tbb/tbb_config.h" // for __TBB_WIN8UI_SUPPORT - -#if !(_WIN32||_WIN64 || MALLOC_UNIXLIKE_OVERLOAD_ENABLED || MALLOC_ZONE_OVERLOAD_ENABLED) || __TBB_WIN8UI_SUPPORT || __MINGW32__ || __MINGW64__ -#define HARNESS_SKIP_TEST 1 -#endif +#include "harness_allocator_overload.h" // __TBB_malloc_safer_msize() returns 0 for unknown objects, // thus we can detect ownership @@ -156,9 +151,6 @@ int TestMain () { #ifdef _PGO_INSTRUMENT REPORT("Known issue: test_malloc_atexit hangs if compiled with -prof-genx\n"); return Harness::Skipped; -#elif __TBB_MIC_OFFLOAD - REPORT("Known issue: libmalloc_proxy.so is loaded too late in the offload mode on the target when linked via -lmalloc_proxy\n"); - return Harness::Skipped; #else ASSERT( dll_isMallocOverloaded(), "malloc was not replaced" ); ASSERT( exe_isMallocOverloaded(), "malloc was not replaced" ); diff --git a/src/test/test_malloc_overload.cpp b/src/test/test_malloc_overload.cpp index ff9137ebac..a758e7b0e8 100644 --- a/src/test/test_malloc_overload.cpp +++ b/src/test/test_malloc_overload.cpp @@ -36,22 +36,15 @@ #define _ISOC11_SOURCE 1 // to get C11 declarations for GLIBC #define HARNESS_NO_PARSE_COMMAND_LINE 1 -#include "tbb/tbb_config.h" // to get __TBB_WIN8UI_SUPPORT +#include "harness_allocator_overload.h" -#if __linux__ || __APPLE__ -#define MALLOC_REPLACEMENT_AVAILABLE 1 -#elif _WIN32 && !__MINGW32__ && !__MINGW64__ && !__TBB_WIN8UI_SUPPORT -#define MALLOC_REPLACEMENT_AVAILABLE 2 +#if MALLOC_WINDOWS_OVERLOAD_ENABLED #include "tbb/tbbmalloc_proxy.h" #endif -// LD_PRELOAD mechanism is broken in offload, no support for MSVC 2015 in debug for now -#if __TBB_MIC_OFFLOAD || !MALLOC_REPLACEMENT_AVAILABLE || (_MSC_VER >= 1900 && _DEBUG) -#define HARNESS_SKIP_TEST 1 -#endif #include "harness.h" -#if MALLOC_REPLACEMENT_AVAILABLE +#if !HARNESS_SKIP_TEST #if __ANDROID__ #include // for __ANDROID_API__ @@ -76,7 +69,7 @@ #endif #include #include -#if MALLOC_REPLACEMENT_AVAILABLE == 1 +#if MALLOC_UNIXLIKE_OVERLOAD_ENABLED || MALLOC_ZONE_OVERLOAD_ENABLED #include // for sysconf #include #endif @@ -199,9 +192,9 @@ static void scalableMallocCheckSize(void *object, size_t size) ASSERT(uintptr_t(lmb)objectSize >= size, NULL); } -#if MALLOC_REPLACEMENT_AVAILABLE == 1 +#if MALLOC_UNIXLIKE_OVERLOAD_ENABLED || MALLOC_ZONE_OVERLOAD_ENABLED ASSERT(malloc_usable_size(object) >= size, NULL); -#elif MALLOC_REPLACEMENT_AVAILABLE == 2 +#elif MALLOC_WINDOWS_OVERLOAD_ENABLED // Check that _msize works correctly ASSERT(_msize(object) >= size, NULL); ASSERT(size<8 || _aligned_msize(object,8,0) >= size, NULL); @@ -226,7 +219,7 @@ void 
CheckStdFuncOverload(void *(*malloc_p)(size_t), void *(*calloc_p)(size_t, s free_p(ptr1); } -#if MALLOC_REPLACEMENT_AVAILABLE == 1 +#if MALLOC_UNIXLIKE_OVERLOAD_ENABLED || MALLOC_ZONE_OVERLOAD_ENABLED void CheckMemalignFuncOverload(void *(*memalign_p)(size_t, size_t), void (*free_p)(void*)) @@ -259,7 +252,7 @@ void CheckPvalloc(void *(*pvalloc_p)(size_t), void (*free_p)(void*)) } } -#endif // MALLOC_REPLACEMENT_AVAILABLE +#endif // MALLOC_UNIXLIKE_OVERLOAD_ENABLED || MALLOC_ZONE_OVERLOAD_ENABLED // regression test: on OS X scalable_free() treated small aligned object, // placed in large block, as small block @@ -273,7 +266,7 @@ void CheckFreeAligned() { #if __TBB_POSIX_MEMALIGN_PRESENT int ret = posix_memalign(&ptr, align[a], sz[s]); ASSERT(!ret, NULL); -#elif MALLOC_REPLACEMENT_AVAILABLE == 2 +#elif MALLOC_WINDOWS_OVERLOAD_ENABLED ptr = _aligned_malloc(sz[s], align[a]); #endif ASSERT(is_aligned(ptr, align[a]), NULL); @@ -317,7 +310,7 @@ void TestZoneOverload() { int TestMain() { void *ptr, *ptr1; -#if MALLOC_REPLACEMENT_AVAILABLE == 1 +#if MALLOC_UNIXLIKE_OVERLOAD_ENABLED || MALLOC_ZONE_OVERLOAD_ENABLED ASSERT(dlsym(RTLD_DEFAULT, "scalable_malloc"), "Lost dependence on malloc_proxy or LD_PRELOAD was not set?"); #endif @@ -347,7 +340,7 @@ int TestMain() { free(newEnv); CheckStdFuncOverload(malloc, calloc, realloc, free); -#if MALLOC_REPLACEMENT_AVAILABLE == 1 +#if MALLOC_UNIXLIKE_OVERLOAD_ENABLED || MALLOC_ZONE_OVERLOAD_ENABLED #if __TBB_POSIX_MEMALIGN_PRESENT int ret = posix_memalign(&ptr, 1024, 3*minLargeObjectSize); @@ -385,7 +378,7 @@ int TestMain() { #endif #endif // __linux__ -#elif MALLOC_REPLACEMENT_AVAILABLE == 2 +#else // MALLOC_WINDOWS_OVERLOAD_ENABLED ptr = _aligned_malloc(minLargeObjectSize, 16); scalableMallocCheckSize(ptr, minLargeObjectSize); @@ -429,4 +422,4 @@ int TestMain() { return Harness::Done; } -#endif /* MALLOC_REPLACEMENT_AVAILABLE */ +#endif // !HARNESS_SKIP_TEST diff --git a/src/test/test_malloc_pools.cpp b/src/test/test_malloc_pools.cpp index 3b8fb06146..1dbfa97868 100644 --- a/src/test/test_malloc_pools.cpp +++ b/src/test/test_malloc_pools.cpp @@ -331,16 +331,18 @@ static void *fixedBufGetMem(intptr_t pool_id, size_t &bytes) return ((FixedPoolHeadBase*)pool_id)->useData(bytes); } -class FixedPoolRun: NoAssign { - Harness::SpinBarrier *startB; +class FixedPoolUse: NoAssign { + static Harness::SpinBarrier startB; rml::MemoryPool *pool; size_t reqSize; int iters; public: - FixedPoolRun(Harness::SpinBarrier *b, rml::MemoryPool *p, size_t sz, int it) : - startB(b), pool(p), reqSize(sz), iters(it) {} + FixedPoolUse(unsigned threads, rml::MemoryPool *p, size_t sz, int it) : + pool(p), reqSize(sz), iters(it) { + startB.initialize(threads); + } void operator()( int /*id*/ ) const { - startB->wait(); + startB.wait(); for (int i=0; i head; - - pool_create_v1((intptr_t)&head, &pol, &pool); - void *largeObj = pool_malloc(pool, MAX_OBJECT); - ASSERT(largeObj, NULL); - pool_free(pool, largeObj); - - largeObj = pool_malloc(pool, MAX_OBJECT); - ASSERT(largeObj, NULL); - pool_free(pool, largeObj); - - for (int i=0; i head; - largeObj = pool_malloc(pool, MAX_OBJECT); - ASSERT(largeObj, NULL); - pool_free(pool, largeObj); + pool_create_v1((intptr_t)&head, &pol, &pool); + { + NativeParallelFor( 1, FixedPoolUse(1, pool, MAX_OBJECT, 2) ); - // each thread asks for an MAX_OBJECT/p/2 object, - // /2 is to cover fragmentation - for (int p=MinThread; p<=MaxThread; p++) { - Harness::SpinBarrier startB(p); - NativeParallelFor( p, FixedPoolRun(&startB, pool, - MAX_OBJECT/p/2, 10000) ); - } 
- { - size_t maxSz; - int p = 512; - Harness::SpinBarrier barrier(p); + for (int i=0; i head; pool_create_v1((intptr_t)&head, &pol, &pool); int p=128; - Harness::SpinBarrier startB(p); - NativeParallelFor( p, FixedPoolRun(&startB, pool, MAX_OBJECT/p/2, 1) ); + NativeParallelFor( p, FixedPoolUse(p, pool, MAX_OBJECT/p/2, 1) ); bool ok = pool_destroy(pool); ASSERT(ok, NULL); } @@ -650,7 +649,7 @@ void TestPoolCreation() for (created=0; created0, NULL); - else ASSERT(tbb::task_arena::current_thread_index()==0, NULL); + ASSERT(tbb::task_arena::current_thread_index()<(myMaxConcurrency>1?myMaxConcurrency:2), NULL); + if(is_worker) ASSERT(tbb::task_arena::current_thread_index()>=myNumReservedSlots, NULL); } /*override*/ void on_scheduler_exit( bool is_worker ) { @@ -155,9 +157,12 @@ class ArenaObserver : public tbb::task_scheduler_observer { old_id.local() = 0; } public: - ArenaObserver(tbb::task_arena &a, int id) : tbb::task_scheduler_observer(a) { - ASSERT(id, NULL); - myId = id; + ArenaObserver(tbb::task_arena &a, int maxConcurrency, int numReservedSlots, int id) + : tbb::task_scheduler_observer(a) + , myId(id) + , myMaxConcurrency(maxConcurrency) + , myNumReservedSlots(numReservedSlots) { + ASSERT(myId, NULL); observe(true); } ~ArenaObserver () { @@ -182,9 +187,9 @@ void TestConcurrentArenas(int p) { //Harness::ConcurrencyTracker::Reset(); tbb::task_arena a1; a1.initialize(1,0); - ArenaObserver o1(a1, p*2+1); + ArenaObserver o1(a1, 1, 0, p*2+1); tbb::task_arena a2(2,1); - ArenaObserver o2(a2, p*2+2); + ArenaObserver o2(a2, 2, 1, p*2+2); Harness::SpinBarrier barrier(2); AsynchronousWork work(barrier); a1.enqueue(work); // put async work @@ -318,12 +323,30 @@ class MultipleMastersPart4 : NoAssign { } }; +class MultipleMastersPart5 : NoAssign { + tbb::task_arena &my_a; + Harness::SpinBarrier &my_b; + +public: + MultipleMastersPart5( tbb::task_arena &a, Harness::SpinBarrier &b) : my_a(a), my_b(b) {} + // NativeParallelFor's functor + void operator()(int) const { + local_id.local() = 1; + my_a.execute(*this); + } + // Arena's functor + void operator()() const { + ASSERT( local_id.local() == 1, "Unexpected thread." ); + my_b.timed_wait( 10 ); + } +}; + void TestMultipleMasters(int p) { { REMARK("multiple masters, part 1\n"); tbb::task_arena a(1,0); a.initialize(); - ArenaObserver o(a, 1); + ArenaObserver o(a, 1, 0, 1); Harness::SpinBarrier barrier1(p), barrier2(2*p+1); // each of p threads will submit two tasks signaling the barrier NativeParallelFor( p, MultipleMastersBody(a, barrier1, barrier2) ); barrier2.timed_wait(10); @@ -331,7 +354,7 @@ void TestMultipleMasters(int p) { } { REMARK("multiple masters, part 2\n"); tbb::task_arena a(2,1); - ArenaObserver o(a, 2); + ArenaObserver o(a, 2, 1, 2); Harness::SpinBarrier barrier(p+2); a.enqueue(AsynchronousWork(barrier, /*blocking=*/true)); // occupy the worker, a regression test for bug 1981 NativeParallelFor( p, MultipleMastersPart2(a, barrier) ); @@ -349,11 +372,18 @@ void TestMultipleMasters(int p) { int c = p%3? (p%2? p : 2) : 3; REMARK("multiple masters, part 4: contexts, arena(%d)\n", c); tbb::task_arena a(c, 1); - ArenaObserver o(a, c); + ArenaObserver o(a, c, 1, c); Harness::SpinBarrier barrier(c); MultipleMastersPart4 test(a, barrier); NativeParallelFor(p, test); a.debug_wait_until_empty(); + } { + // Check if multiple masters can achive maximum concurrency. 
+ REMARK("multiple masters, part 5: masters on barrier, arena(%d)\n", p); + tbb::task_arena a(p, 1); + Harness::SpinBarrier barrier(p); + MultipleMastersPart5 test(a, barrier); + NativeParallelFor(p, test); } } @@ -470,6 +500,115 @@ void TestArenaEntryConsistency() { body.test(i); } +class TestArenaMaxParallelismBody : NoAssign { + tbb::task_arena &my_a; + int my_p; + Harness::SpinBarrier *my_barrier; +public: + TestArenaMaxParallelismBody( tbb::task_arena &a, int p, Harness::SpinBarrier *b = NULL ) : my_a( a ), my_p( p ), my_barrier(b) {} + // NativeParallelFor's functor + void operator()( int ) const { + my_a.execute( *this ); + } + // Arena's functor + void operator()() const { + int idx = tbb::task_arena::current_thread_index(); + ASSERT( idx < (my_p > 1 ? my_p : 2), NULL ); + if ( my_barrier ) my_barrier->timed_wait( 10 ); + else Harness::Sleep( 10 ); + } +}; + +void TestArenaMaxParallelism( int p ) { + { + tbb::task_arena a( p, 0 ); + Harness::SpinBarrier b( p ); + TestArenaMaxParallelismBody test( a, p, &b ); + for ( int i = 1; i < p; ++i ) + a.enqueue( test ); + a.execute( test ); + a.debug_wait_until_empty(); + } + { + tbb::task_arena a( p, 1 ); + Harness::SpinBarrier b( p ); + TestArenaMaxParallelismBody test( a, p, &b ); + for ( int i = 1; i < p; ++i ) + a.enqueue( test ); + a.execute( test ); + a.debug_wait_until_empty(); + } + { + tbb::task_arena a( p, 0 ); + NativeParallelFor( 2*p, TestArenaMaxParallelismBody( a, p ) ); + a.debug_wait_until_empty(); + } + { + tbb::task_arena a( p, 1 ); + NativeParallelFor( 2*p, TestArenaMaxParallelismBody( a, p ) ); + a.debug_wait_until_empty(); + } +} + +class TestArenaReservedMasterSlotsBody : NoAssign { + tbb::task_arena &my_a; + Harness::SpinBarrier &my_barrier; + Harness::SpinBarrier &my_worker_barrier; + int my_max_concurrency; + int my_reserved_slots; +public: + TestArenaReservedMasterSlotsBody( tbb::task_arena &a, Harness::SpinBarrier &b, Harness::SpinBarrier &worker_b, int max_concurrency, int reserved_slots ) + : my_a( a ), my_barrier(b), my_worker_barrier(worker_b), my_max_concurrency(max_concurrency), my_reserved_slots(reserved_slots) {} + // NativeParallelFor's functor + void operator()( int ) const { + local_id.local() = 1; + my_a.execute( *this ); + } + // Arena's functor + void operator()() const { + int idx = tbb::task_arena::current_thread_index(); + ASSERT( idx < (my_max_concurrency > 1 ? 
my_max_concurrency : 2), NULL ); + if ( local_id.local() != 1 ) { + // Worker thread + ASSERT( idx >= my_reserved_slots, NULL ); + my_worker_barrier.timed_wait( 10 ); + } else { + ASSERT( idx < my_reserved_slots, "Masters are not supposed to occupy non-reserved slots in this test" ); + } + my_barrier.timed_wait( 10 ); + } +}; + +void TestArenaReservedMasterSlots( int p ) { + for ( int reserved_slots = 0; reserved_slots <= p; ++reserved_slots ) { + tbb::task_arena a( p, reserved_slots ); + Harness::SpinBarrier barrier(p); + Harness::SpinBarrier worker_barrier( p - reserved_slots + 1 ); + TestArenaReservedMasterSlotsBody test( a, barrier, worker_barrier, p, reserved_slots ); + for ( int i = reserved_slots; i < p; ++i ) + a.enqueue( test ); + worker_barrier.timed_wait( 10 ); + if ( reserved_slots ) + NativeParallelFor( reserved_slots, test ); + a.debug_wait_until_empty(); + ResetTLS(); + } +} + +struct test_functor_t { + void operator()() { ASSERT( false, "Non-const operator called" ); } + void operator()() const { /* library requires this overload only */ } +}; + +void TestConstantFunctorRequirement() { + tbb::task_arena a; + test_functor_t tf; + a.enqueue( tf ); +#if __TBB_TASK_PRIORITY + a.enqueue( tf, tbb::priority_normal ); +#endif +} + int TestMain () { // TODO: a workaround for temporary p-1 issue in market tbb::task_scheduler_init init_market_p_plus_one(MaxThread+1); @@ -480,7 +619,10 @@ int TestMain () { ResetTLS(); TestMultipleMasters( p ); ResetTLS(); + TestArenaMaxParallelism( p ); } TestArenaEntryConsistency(); + TestArenaReservedMasterSlots( MaxThread ); + TestConstantFunctorRequirement(); return Harness::Done; } diff --git a/src/test/test_task_group.cpp b/src/test/test_task_group.cpp index 5d60f34410..02d1c4cb73 100644 --- a/src/test/test_task_group.cpp +++ b/src/test/test_task_group.cpp @@ -810,6 +810,18 @@ void TestStructuredWait () { sg.wait(); } +struct test_functor_t { + void operator()() { ASSERT( false, "Non-const operator called" ); } + void operator()() const { /* library requires this overload only */ } +}; + +void TestConstantFunctorRequirement() { + tbb::task_group g; + test_functor_t tf; + g.run( tf ); g.wait(); + g.run_and_wait( tf ); +} + int TestMain () { REMARK ("Testing %s task_group functionality\n", TBBTEST_USE_TBB ? 
"TBB" : "PPL"); for( int p=MinThread; p<=MaxThread; ++p ) { @@ -855,6 +867,7 @@ int TestMain () { s->Release(); #endif } + TestConstantFunctorRequirement(); #if __TBB_THROW_ACROSS_MODULE_BOUNDARY_BROKEN REPORT("Known issue: exception handling tests are skipped.\n"); #endif diff --git a/src/test/test_task_priority.cpp b/src/test/test_task_priority.cpp index 033ad4cc2f..5fbda70326 100644 --- a/src/test/test_task_priority.cpp +++ b/src/test/test_task_priority.cpp @@ -538,6 +538,24 @@ void TestSetPriority() { delete g_trees[t][i]; } }//namespace test_propagation + +namespace regression { +// This is a regression test for a bug with task_group_context used from a thread that created its local scheduler but not the implicit arena +class TestTGContext { +public: + void operator() (int) const { + tbb::task_group_context ctx; + ctx.cancel_group_execution(); // initializes the local weak scheduler on the thread + ctx.set_priority(tbb::priority_high); + } +}; + +void TestTGContextOnNewThread() { + REMARK("Testing a regression for a bug with task_group_context\n"); + TestTGContext body; + NativeParallelFor(1, body); +} +}//namespace regression_priorities #endif /* __TBB_TASK_PRIORITY */ #if !__TBB_TEST_SKIP_AFFINITY @@ -573,6 +591,7 @@ int TestMain () { TestPrioritySwitchBetweenTwoMasters(); PreemptionActivatorId = 1; TestPrioritySwitchBetweenTwoMasters(); + regression::TestTGContextOnNewThread(); return Harness::Done; } diff --git a/src/test/test_tbb_fork.cpp b/src/test/test_tbb_fork.cpp index 823c51f601..4fb1934116 100644 --- a/src/test/test_tbb_fork.cpp +++ b/src/test/test_tbb_fork.cpp @@ -106,8 +106,8 @@ class RunWorkersBody : NoAssign { RunWorkersBody(bool waitWorkers) : wait_workers(waitWorkers) {} void operator()(const int /*threadID*/) const { tbb::task_scheduler_init sch(MaxThread, 0, wait_workers); - tbb::parallel_for(tbb::blocked_range(0, 10000, 1), AllocTask(), - tbb::simple_partitioner()); + tbb::parallel_for(tbb::blocked_range(0, 10000, 1), AllocTask(), + tbb::simple_partitioner()); } }; @@ -120,11 +120,35 @@ void TestBlockNonblock() } } +class RunInNativeThread : NoAssign { + bool create_tsi; +public: + RunInNativeThread(bool create_tsi_) : create_tsi(create_tsi_) {} + void operator()(const int /*threadID*/) const { + // nested TSI or auto-initialized TSI can be terminated when + // wait_workers is true (deferred TSI means auto-initialization) + tbb::task_scheduler_init tsi(create_tsi? 
2 : + tbb::task_scheduler_init::deferred); + tbb::parallel_for(tbb::blocked_range(0, 10000, 1), AllocTask(), + tbb::simple_partitioner()); + } +}; + +void TestTasksInThread() +{ + tbb::task_scheduler_init sch(2, 0, /*wait_workers=*/true); + tbb::parallel_for(tbb::blocked_range(0, 10000, 1), AllocTask(), + tbb::simple_partitioner()); + for (int i=0; i<2; i++) + NativeParallelFor(2, RunInNativeThread(/*create_tsi=*/1==i)); +} + int TestMain() { using namespace Harness; TestBlockNonblock(); + TestTasksInThread(); bool child = false; #if _WIN32||_WIN64 diff --git a/src/test/test_tbb_version.cpp b/src/test/test_tbb_version.cpp index a2f223917f..0d08b1c71d 100644 --- a/src/test/test_tbb_version.cpp +++ b/src/test/test_tbb_version.cpp @@ -238,7 +238,7 @@ int main(int argc, char *argv[] ) { void initialize_strings_vector(std::vector * vector) { vector->push_back(string_pair("TBB: VERSION\t\t4.4", required)); // check TBB_VERSION - vector->push_back(string_pair("TBB: INTERFACE VERSION\t9000", required)); // check TBB_INTERFACE_VERSION + vector->push_back(string_pair("TBB: INTERFACE VERSION\t9001", required)); // check TBB_INTERFACE_VERSION vector->push_back(string_pair("TBB: BUILD_DATE", required)); vector->push_back(string_pair("TBB: BUILD_HOST", required)); vector->push_back(string_pair("TBB: BUILD_OS", required));
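
Note (not part of the patch): the new test_task_arena.cpp cases above (MultipleMastersPart5, TestArenaMaxParallelism, TestArenaReservedMasterSlots) all exercise the same basic pattern: a task_arena is constructed with an explicit concurrency limit and a number of slots reserved for application threads, several native threads join it through execute(), and TBB workers occupy the remaining slots. A minimal sketch of that pattern outside the test harness might look as follows; the concurrency values and the loop body are illustrative assumptions, not values taken from the patch.

    #include <thread>
    #include <vector>
    #include "tbb/task_arena.h"
    #include "tbb/parallel_for.h"

    int main() {
        const int max_concurrency = 4;       // arena concurrency limit (assumed value)
        const int reserved_for_masters = 2;  // slots kept for application threads (assumed value)
        tbb::task_arena arena(max_concurrency, reserved_for_masters);

        std::vector<std::thread> masters;
        for (int i = 0; i < reserved_for_masters; ++i) {
            masters.push_back(std::thread([&arena] {
                // Each application thread joins the arena and executes work there;
                // worker threads fill the remaining (non-reserved) slots concurrently.
                arena.execute([] {
                    tbb::parallel_for(0, 1000, [](int) { /* per-index work */ });
                });
            }));
        }
        for (std::thread &t : masters)
            t.join();
        return 0;
    }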
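
Note (not part of the patch): the TestConstantFunctorRequirement cases added to test_task_arena.cpp and test_task_group.cpp check that the library calls only the const call operator of a user functor (the non-const overload deliberately asserts). From the user's side, a functor passed to task_group::run_and_wait() or task_arena::enqueue() must therefore provide operator()() const. A minimal sketch under that assumption; the name Work and its body are illustrative only.

    #include "tbb/task_group.h"
    #include "tbb/task_arena.h"

    struct Work {
        // Only the const overload is required (and invoked) by the library;
        // a functor whose operator()() is non-const will fail to compile
        // when passed to run_and_wait() or task_arena::enqueue().
        void operator()() const { /* do work */ }
    };

    int main() {
        Work w;

        tbb::task_group g;
        g.run_and_wait(w);   // compiles because Work::operator()() is const

        tbb::task_arena a;
        a.enqueue(w);        // same requirement; enqueue is fire-and-forget,
                             // so a real program would synchronize before exiting
        return 0;
    }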