From 5e356040155954695316ad694670d4931ea2a524 Mon Sep 17 00:00:00 2001
From: Dipin Hora <dipin@sendence.com>
Date: Wed, 6 Jun 2018 14:20:34 +0000
Subject: [PATCH] Fix multi-worker performance related to dynamic scheduler
 scaling

As part of performance testing Wallaroo using multiple workers,
@JONBRWN discovered a regression in both throughput and latency.
He tracked the issue down the commit that re-enabled dynamic
scheduler scaling (fc80968ba54dedc2d486c74a7c39114081465497).
NOTE: This performance issue did not exist for singler worker
runs of Wallaroo.

Some head scratching and testing led to the current commit to
resolve the multi-worker performance issue. My best guess is that
before this change the `steal` loop was dependent on a memory
access to determine if dynamic scheduler scaling needed to
suspend a thread or not as its initial check. This would lead to
somewhat erratic behavior where some times the `steal` loop would
take long while other times it wouldn't depending on how long the
memory load took. This had a follow-on impact on actor execution
because of ASIO messages because they wouldn't be picked up off
of the queue for work as quickly as they could be due to the
extra memory accesses.

This commit changes the ordering of some operations to ensure
that there is more consistent memory accesses for the loop
resulting in more consistent actor actor execution for ASIO
messages resolving the multi-worker performance issue that
@JONBRWN discovered.
---
 src/libponyrt/sched/scheduler.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/libponyrt/sched/scheduler.c b/src/libponyrt/sched/scheduler.c
index 3a461d0207..8c715977a5 100644
--- a/src/libponyrt/sched/scheduler.c
+++ b/src/libponyrt/sched/scheduler.c
@@ -543,8 +543,8 @@ static pony_actor_t* perhaps_suspend_scheduler(
   // if we're the highest active scheduler thread
   // and there are more active schedulers than the minimum requested
   // and we're not terminating
-  if ((sched == &scheduler[current_active_scheduler_count - 1])
-    && (current_active_scheduler_count > min_scheduler_count)
+  if ((current_active_scheduler_count > min_scheduler_count)
+    && (sched == &scheduler[current_active_scheduler_count - 1])
     && (!sched->terminate)
 #if defined(USE_SCHEDULER_SCALING_PTHREADS)
     // try to acquire mutex if using pthreads
@@ -677,10 +677,10 @@ static pony_actor_t* steal(scheduler_t* sched)
     //    By waiting 1 millisecond before sending a block message, we are going to
     //    delay quiescence by a small amount of time but also optimize work
     //    stealing for generating far fewer block/unblock messages.
+    uint32_t current_active_scheduler_count = get_active_scheduler_count();
+
     if (!block_sent)
     {
-      uint32_t current_active_scheduler_count = get_active_scheduler_count();
-
       // make sure thread scaling order is still valid. we should never be
       // active if the active_scheduler_count isn't larger than our index.
       pony_assert(current_active_scheduler_count > (uint32_t)sched->index);
@@ -718,9 +718,6 @@ static pony_actor_t* steal(scheduler_t* sched)
       // if we do suspend, we'll send a unblock message first to ensure cnf/ack
       // cycle works as expected
 
-      // get active scheduler count
-      uint32_t current_active_scheduler_count = get_active_scheduler_count();
-
       // make sure thread scaling order is still valid. we should never be
       // active if the active_scheduler_count isn't larger than our index.
       pony_assert(current_active_scheduler_count > (uint32_t)sched->index);