diff --git a/.release-notes/cycle-races.md b/.release-notes/cycle-races.md new file mode 100644 index 0000000000..ca77aa3ffc --- /dev/null +++ b/.release-notes/cycle-races.md @@ -0,0 +1,7 @@ +## Fix multiple races within actor/cycle detector interactions + +The Pony runtime includes an optional cycle detector that is on by default. The cycle detector runs and looks for groups of blocked actors that will have reference counts above 0 but are unable to do any more work as all members are blocked and don't have additional work to do. + +Over time, we have made a number of changes to the cycle detector to improve it's performance and mitigate its impact on running Pony programs. In the process of improving the cycle detectors performance, it has become more and more complicated. That complication led to several race conditions that existed in the interaction between actors and the cycle detector. Each of these race conditions could lead to an actor getting freed more than once, causing an application crash or an attempt to access an actor after it had been deleted. + +We've identified and fixed what we believe are all the existing race conditions in the current design. diff --git a/src/libponyrt/actor/actor.c b/src/libponyrt/actor/actor.c index 2ae4955125..810567442e 100644 --- a/src/libponyrt/actor/actor.c +++ b/src/libponyrt/actor/actor.c @@ -35,6 +35,7 @@ enum FLAG_SYSTEM = 1 << 2, FLAG_UNSCHEDULED = 1 << 3, FLAG_CD_CONTACTED = 1 << 4, + FLAG_RC_OVER_ZERO_SEEN = 1 << 5, }; enum @@ -562,6 +563,18 @@ bool ponyint_actor_run(pony_ctx_t* ctx, pony_actor_t* actor, bool polling) pony_msg_t* msg; size_t app = 0; + + // check to see at the start of a run, and most importantly, the first time + // we run if our GC is over 0. If it is 0 the first time we run, then the + // actor starts as an orphan with no references to it. There is no way that + // the cycle detector will find out about an orphan actor unless it either + // contacts the cycle detector itself (which we don't do) or the orphan + // contacts another and needs to participate in the cycle detection protocol. + // if an actors rc never goes above 0, it will be able to safely delete itself + // even in the presence of the cycle detector. + if (!actor_noblock && actor->gc.rc > 0) + set_internal_flag(actor, FLAG_RC_OVER_ZERO_SEEN); + // If we have been scheduled, the head will not be marked as empty. pony_msg_t* head = atomic_load_explicit(&actor->q.head, memory_order_acquire); @@ -578,6 +591,12 @@ bool ponyint_actor_run(pony_ctx_t* ctx, pony_actor_t* actor, bool polling) bool app_msg = handle_message(ctx, actor, msg); + // if an actors rc never goes above 0, it will be able to safely delete + // itself even in the presence of the cycle detector. This is one of two + // checks to see if that invariant is in place for a given actor. + if (!actor_noblock && actor->gc.rc > 0) + set_internal_flag(actor, FLAG_RC_OVER_ZERO_SEEN); + #ifdef USE_RUNTIMESTATS used_cpu = ponyint_sched_cpu_used(ctx); #endif @@ -643,139 +662,116 @@ bool ponyint_actor_run(pony_ctx_t* ctx, pony_actor_t* actor, bool polling) if(!has_internal_flag(actor, FLAG_BLOCKED | FLAG_SYSTEM | FLAG_BLOCKED_SENT)) { set_internal_flag(actor, FLAG_BLOCKED); - - if(!actor_noblock - && (actor->gc.rc > 0) - && !has_internal_flag(actor, FLAG_CD_CONTACTED) - ) - { - // The cycle detector (CD) doesn't know we exist so it won't try - // and reach out to us even though we're blocked, so send block message - // and set flag that the CD knows we exist now so that when we block - // in the future we will wait for the CD to reach out and ask - // if we're blocked or not. - // But, only if gc.rc > 0 because if gc.rc == 0 we are a zombie. - send_block(ctx, actor); - } - } - if ((actor->gc.rc == 0) && has_internal_flag(actor, FLAG_BLOCKED)) + if (has_internal_flag(actor, FLAG_BLOCKED)) { - // Here, we is what we know to be true: - // - // - the actor is blocked - // - the actor likely has no messages in its queue - // - there's no references to this actor - // - - // check if the actors queue is empty or not - if(ponyint_messageq_isempty(&actor->q)) + if (actor->gc.rc == 0) { - // The actors queue is empty which means this actor is a zombie - // and can be reaped. - // If the cycle detector is disabled (or it doesn't know about this actor), - // this actors queue is guaranteed to remain empty after this point as no - // other actor has references to it. - // If the cycle detector is enabled and knows about this actor, - // then it can possibly add messages onto the actors queue making - // it not safe to be destroyed. - // - // As a result, how reaping can occur depends on whether the cycle - // detector is running whether it is holding a reference to the actor - // or not. + // Here, we is what we know to be true: // - // When 'actor_noblock` is true, the cycle detector isn't running. - // this means actors won't be garbage collected unless we take special - // action. Therefore if `noblock` is on, we should garbage collect the - // actor. + // - the actor is blocked + // - the actor likely has no messages in its queue + // - there's no references to this actor // - // When the cycle detector is running, "deleting locally" is still far - // more efficient and will free up memory sooner than letting the cycle - // detector do it. However, that is only safe to delete locally if the - // cycle detector isn't holding a reference to the actor. If we have never - // sent a reference to the cycle detector (FLAG_CD_CONTACTED), then this - // is perfectly safe to do. - if (actor_noblock || !has_internal_flag(actor, FLAG_CD_CONTACTED)) + + if (actor_noblock || !has_internal_flag(actor, FLAG_RC_OVER_ZERO_SEEN)) { - // mark the queue as empty or else destroy will hang - bool empty = ponyint_messageq_markempty(&actor->q); + // When 'actor_noblock` is true, the cycle detector isn't running. + // this means actors won't be garbage collected unless we take special + // action. Therefore if `noblock` is on, we should garbage collect the + // actor + // + // When the cycle detector is running, it is still safe to locally + // delete if our RC has never been above 0 because the cycle detector + // can't possibly know about the actor's existence so, if it's message + // queue is empty, doing a local delete is safe. + if(ponyint_messageq_isempty(&actor->q)) + { + // The actors queue is empty which means this actor is a zombie + // and can be reaped. + + // mark the queue as empty or else destroy will hang + bool empty = ponyint_messageq_markempty(&actor->q); - // make sure the queue is actually empty as expected - pony_assert(empty); + // make sure the queue is actually empty as expected + pony_assert(empty); - // "Locally delete" the actor. - ponyint_actor_setpendingdestroy(actor); - ponyint_actor_final(ctx, actor); - ponyint_actor_sendrelease(ctx, actor); - ponyint_actor_destroy(actor); + // "Locally delete" the actor. + ponyint_actor_setpendingdestroy(actor); + ponyint_actor_final(ctx, actor); + ponyint_actor_sendrelease(ctx, actor); + ponyint_actor_destroy(actor); - // make sure the scheduler will not reschedule this actor - return !empty; + // make sure the scheduler will not reschedule this actor + return !empty; + } } else { // The cycle detector is running so we have to ensure that it doesn't // send a message to the actor while we're in the process of telling // it that it is safe to destroy the actor. - - // the cycle detector will not send a message if the actor is marked - // pending destroy so mark this actor as pending destroy - ponyint_actor_setpendingdestroy(actor); - - // check to make sure the messageq is still empty - // and the cycle detector didn't send a message in the meantime - if(!ponyint_messageq_isempty(&actor->q)) + // + // Before we check if our queue is empty, we need to obtain the + // "critical delete" atomic for this actor. The cycle detector will + // bail from sending any messages if it can't obtain the atomic. + // Similarly, if the actor can't obtain the atomic here, then we do not + // attempt any "I can be destroyed" operations as the cycle detector is + // in the process of sending us a message. + if (ponyint_acquire_cycle_detector_critical(actor)) { - // need to undo set pending destroy because the cycle detector - // sent the actor a message and it is not safe to destroy this - // actor as a result. - unset_sync_flag(actor, SYNC_FLAG_PENDINGDESTROY); - - // If we mark the queue as empty, then it is no longer safe to do any - // operations on this actor that aren't concurrency safe so make - // `ponyint_messageq_markempty` the last thing we do. - // Return true (i.e. reschedule immediately) if our queue isn't empty. - return !ponyint_messageq_markempty(&actor->q); + if(ponyint_messageq_isempty(&actor->q)) + { + // At this point the actors queue is empty and the cycle detector + // will not send it any more messages because we "own" the barrier + // for sending cycle detector messages to this actor. + ponyint_actor_setpendingdestroy(actor); + + // Tell cycle detector that this actor is a zombie and will not get + // any more messages/work and can be reaped. + // Mark the actor as FLAG_BLOCKED_SENT and send a BLOCKED message + // to speed up reaping otherwise waiting for the cycle detector + // to get around to asking if we're blocked could result in + // unnecessary memory growth. + // + // We're blocked, send block message telling the cycle detector + // to reap this actor (because its `rc == 0`). + // This is concurrency safe because, only the cycle detector might + // have a reference to this actor (rc is 0) so another actor can not + // send it an application message that results this actor becoming + // unblocked (which would create a race condition) and we've also + // ensured that the cycle detector will not send this actor any more + // messages (which would also create a race condition). + send_block(ctx, actor); + + // mark the queue as empty or else destroy will hang + bool empty = ponyint_messageq_markempty(&actor->q); + + // make sure the queue is actually empty as expected + pony_assert(empty); + + // "give up" critical section ownership + ponyint_release_cycle_detector_critical(actor); + + // make sure the scheduler will not reschedule this actor + return !empty; + } else { + // "give up" critical section ownership + ponyint_release_cycle_detector_critical(actor); + } + } + } + } else { + // gc is greater than 0 + if (!actor_noblock && !has_internal_flag(actor, FLAG_CD_CONTACTED)) + { + // The cycle detector is running and we've never contacted it ourselves, + // so let's it know we exist in case it is unaware. + if (ponyint_acquire_cycle_detector_critical(actor)) + { + send_block(ctx, actor); + // "give up" critical section ownership + ponyint_release_cycle_detector_critical(actor); } - - // At this point the actors queue is empty and the cycle detector - // will not send it any more messages because it is set as pending - // destroy. - - // Based on how the cycle detector protocol works, FLAG_BLOCKED_SENT - // shouldn't be set at this time. We have previously contact the - // cycle detector and sent a blocked message then we should have - // unblocked before having gotten here. - // Dipin is 99% sure this invariant applies. Sean is less certain. - // It's possible this might be a source of bug. If it is, then the - // solution is probably to make this an if that encloses the remaining - // logic in this block. - pony_assert(!has_internal_flag(actor, FLAG_BLOCKED_SENT)); - - // Tell cycle detector that this actor is a zombie and will not get - // any more messages/work and can be reaped. - // Mark the actor as FLAG_BLOCKED_SENT and send a BLOCKED message - // to speed up reaping otherwise waiting for the cycle detector - // to get around to asking if we're blocked could result in unnecessary - // memory growth. - // - // We're blocked, send block message telling the cycle detector - // to reap this actor (because its `rc == 0`). - // This is concurrency safe because, only the cycle detector might - // have a reference to this actor (rc is 0) so another actor can not - // send it an application message that results this actor becoming - // unblocked (which would create a race condition) and we've also - // ensured that the cycle detector will not send this actor any more - // messages (which would also create a race condition). - send_block(ctx, actor); - - // mark the queue as empty or else destroy will hang - bool empty = ponyint_messageq_markempty(&actor->q); - - // make sure the queue is actually empty as expected - pony_assert(empty); - - // make sure the scheduler will not reschedule this actor - return !empty; } } } @@ -1257,3 +1253,14 @@ size_t ponyint_actor_total_alloc_size(pony_actor_t* actor) + POOL_ALLOC_SIZE(pony_msg_t); } #endif + +bool ponyint_acquire_cycle_detector_critical(pony_actor_t* actor) +{ + uint8_t expected = 0; + return atomic_compare_exchange_strong_explicit(&actor->cycle_detector_critical, &expected, 1, memory_order_acq_rel, memory_order_acquire); +} + +void ponyint_release_cycle_detector_critical(pony_actor_t* actor) +{ + actor->cycle_detector_critical = 0; +} diff --git a/src/libponyrt/actor/actor.h b/src/libponyrt/actor/actor.h index 91b2c662ee..4777ec671b 100644 --- a/src/libponyrt/actor/actor.h +++ b/src/libponyrt/actor/actor.h @@ -56,6 +56,8 @@ typedef struct pony_actor_t messageq_t q; // sync flags are access from multiple scheduler threads concurrently PONY_ATOMIC(uint8_t) sync_flags; + // accessed from the cycle detector and the actor + PONY_ATOMIC(uint8_t) cycle_detector_critical; // keep things accessed by other actors on a separate cache line alignas(64) heap_t heap; // 52/104 bytes @@ -150,6 +152,10 @@ size_t ponyint_actor_total_mem_size(pony_actor_t* actor); size_t ponyint_actor_total_alloc_size(pony_actor_t* actor); #endif +bool ponyint_acquire_cycle_detector_critical(pony_actor_t* actor); + +void ponyint_release_cycle_detector_critical(pony_actor_t* actor); + PONY_EXTERN_C_END #endif diff --git a/src/libponyrt/gc/cycle.c b/src/libponyrt/gc/cycle.c index ef8332e012..dd9ed74724 100644 --- a/src/libponyrt/gc/cycle.c +++ b/src/libponyrt/gc/cycle.c @@ -16,6 +16,130 @@ #define CD_MAX_CHECK_BLOCKED 1000 +// +// About the Pony cycle detector +// +// The cycle detector exists to reap actors and cycles of actors. Not every +// Pony program needs the cycle detector. If an application contains no cycles +// amongst it's actors or is able via its shutdown mechanism to break +// those cycles then the cycle detector isn't need and is extra overhead. Such +// programs use the "ponynoblock" runtime option to turn off the cycle +// detector. +// +// Some applications do need a cycle detector in order to garbage collect +// actors. The job of the cycle detector is find groups of actors (cycles) where +// no more work is possible because all of the actors in the cycle are blocked. +// When the cycle detector finds a blocked cycle, it deletes the members of +// cycle. +// +// The cycle detector is a special actor that is implemented here in cycle.c in +// c rather than being implemented in Pony. Like other actors, it is run by +// the various schedulers. Unlike other actors, it does not participate in the +// scheduler's backpressure system as the cycle detector is marked as a "system +// actor". All system actors are not part of the backpressure system. +// +// The cycle detector has evolved over the years and is much more complicated +// now than it was in early versions of Pony. "In the beginning", every actor +// would send a message upon its creation to the cycle detector. This allowed +// the detector to track all the actors ahd use a protocol that still mostly +// exists to query actor's about there view of the world and put together a +// "omniscient" view of actor relations and find cycles of blocked actors. +// +// Early cycle detector designs had a few problems. +// +// 1. Every actor was communicating with the cycle detector which made a the +// cycle detector a bottleneck in many applications. +// 2. The cycle detector was run like other actors which meant that it often +// couldn't keep up with particularly "CD busy applications". When the cycle +// detector can't keep up, memory pressure grows from it's queue. +// 3. The cycle detector was scheduled like every other actor and would steal +// cycles from "real work actors". In some applications a non-trivial amount +// of CPU time was spent on cycle detection. +// 4. It was easy to overload the cycle detector by creating large numbers of +// actors in a short period of time. +// +// Despite these problems, the early cycle detector designs were relatively +// easy to understand and didn't contain possible race conditions. +// +// Over time, as we have evolved the cycle detector to be less likely to suffer +// from memory pressure or to be overly grabby with CPU time, it has gotten more +// and more complicated. It would be rather easy now when modifying the cycle +// detector to introduce dangerous race conditions that could result in crashing +// Pony programs. +// +// Changes that have been made to the cycle detector over time: +// +// The cycle detector is no longer run like a regular actor. Instead, it will +// only ever be run by scheduler 0. And it will only every be run every X amount +// of units (which is configurable using the `cdinternal` option). This +// scheduling change greatly dropped the performance impact of the cycle +// detector on "average" Pony programs. +// +// Early versions of the cycle detector looked at every actor that it knew about +// looking for cycles each time it ran. This could result in large amounts of +// CPU usage and severely impact performance. Performance was most impacted as +// the number of actors grew and as the number of CPUs shrank. Single threaded +// Pony programs with a lot of actors could spend a very large amount of time +// looking for cycles. +// +// Every actor informed the cycle detector of its existence via message passing +// when the actor was created. This could cause a lot of additional message +// activity. +// +// Every actor informed the cycle detector of its existence even if it was +// never blocked. This resulted in a lot of extra work as the purpose of the +// cycle detector is to find cycles of blocked actors. Despite this, all actors +// were informing the cycle detector of their existence. +// +// The relationship between individual actors and the cycle detector is now as +// follows. +// +// Actors only inform the cycle detector of their existence if they are blocked. +// +// We've introduced a concept of "orphaned" actor that at the time of their +// creation have a reference count of 0. Orphaned actors whose rc never gets +// above 0 can delete themselves locally like they would if `ponynoblock` was +// in use. +// +// Any actor that has had a reference count above 0 but currently has a +// reference count of 0 and is blocked can inform the cycle detector that it is +// blocked (and is isolated) so that the cycle detector can speed up reaping it. +// +// The three above changes can, for many applications, have a huge impact on +// performance and memory usage. However, these gains come at a cost. +// +// Early versions of the cycle detector were trivially race condition free. Now, +// it is non-trivial to keep cycle detector/actor interactions race free. +// +// Where there was once only one way for the cycle detector to become aware of +// an actor, there are now two. An actor might inform the cycle detector of +// itself when it blocks. A different actor that knows about an actor, might +// also have shared its view of the world with the cycle detector such that +// the cycle detector knows about the actor without it ever having contacted the +// cycle detector. This can result in the cycle detector having multiple +// protcol messages in flight that could be leading the deletion of an actor. +// Care must be taken in the cycle detector to not contact actors that are +// "pending deletion" as it could lead to a double-free. +// +// There are critical sections of "deletion" code that are non-atomic in both +// the actor and the cycle detector where for any actor, only the actor or +// cycle detector can be executing. If both sides were to be executing critical +// sections at the same time, a double free could result. We have an atomic +// in place on a per actor basis that is used to prevent both sides from +// executing their critical sections at the same time. If either can not +// acquire the critical section atomic, it means the other side is in their +// section and an invariant needed for safe deletion is about to be broken. +// +// If all of this sounds complicated, it is. Tread carefully. The three people +// who know the cycle detector best all approved performance improvements +// around the time of Pony 0.38.0 that introduced race conditions. +// +// We feel the additional complexity is worth it because the performance gains +// have been so large. However, we are looking at doing away with the cycle +// detector entirely and coming up with a decentralized approach to detecting +// actor cycles and reaping them. Until that time, we have the current cycle +// detector that has evolved to this state over time. + typedef struct block_msg_t { pony_msg_t msg; @@ -516,10 +640,23 @@ static void send_conf(pony_ctx_t* ctx, perceived_t* per) while((view = ponyint_viewmap_next(&per->map, &i)) != NULL) { - // only send if actor is not pending destroy to ensure - // no race conditions when an actor might have an `rc == 0` - if(!ponyint_actor_pendingdestroy(view->actor)) - pony_sendi(ctx, view->actor, ACTORMSG_CONF, per->token); + // The actor itself is also allowed to initiate a deletion of itself if + // it is blocked and has a reference count of 0. Because deletion is + // not an atomic operation, and the message queue remaining empty is an + // invariant that can't be violated if the cycle detector is sending + // a message to an actor, that actor isn't eligible to initiate getting + // reaped in a short-circuit fashion. Only the actor or the cycle + // detector can be in the "cycle detector critical" section for the actor. + if (ponyint_acquire_cycle_detector_critical(view->actor)) + { + // To avoid a race condition, we have to make sure that the actor isn't + // pending destroy before sending a message. Failure to do so could + // eventually result in a double free. + if(!ponyint_actor_pendingdestroy(view->actor)) + pony_sendi(ctx, view->actor, ACTORMSG_CONF, per->token); + + ponyint_release_cycle_detector_critical(view->actor); + } } } @@ -671,10 +808,23 @@ static void check_blocked(pony_ctx_t* ctx, detector_t* d) // if it is not already blocked if(!view->blocked) { - // only send if actor is not pending destroy to ensure - // no race conditions when an actor might have an `rc == 0` - if(!ponyint_actor_pendingdestroy(view->actor)) - pony_send(ctx, view->actor, ACTORMSG_ISBLOCKED); + // The actor itself is also allowed to initiate a deletion of itself if + // it is blocked and has a reference count of 0. Because deletion is + // not an atomic operation, and the message queue remaining empty is an + // invariant that can't be violated if the cycle detector is sending + // a message to an actor, that actor isn't eligible to initiate getting + // reaped in a short-circuit fashion. Only the actor or the cycle + // detector can be in the "cycle detector critical" section for the actor. + if (ponyint_acquire_cycle_detector_critical(view->actor)) + { + // To avoid a race condition, we have to make sure that the actor isn't + // pending destroy before sending a message. Failure to do so could + // eventually result in a double free. + if(!ponyint_actor_pendingdestroy(view->actor)) + pony_send(ctx, view->actor, ACTORMSG_ISBLOCKED); + + ponyint_release_cycle_detector_critical(view->actor); + } } // Stop if we've hit the max limit for # of actors to check