From 1b9387eddc1f52165d3243d2ace9be0c62495eea Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Sat, 20 Nov 2021 00:02:09 -0800 Subject: [PATCH] Eliminate SHAMapInnerNode lock contention: The `SHAMapInnerNode` class had a global mutex to protect the array of node children. Profiling suggested that around 4% of all attempts to lock the global would block. This commit removes that global mutex, and replaces it with a new per-node 16-way spinlock (implemented so as not to affect the size of an inner node object), effectively eliminating the lock contention. --- src/ripple/shamap/SHAMapInnerNode.h | 7 +- src/ripple/shamap/impl/SHAMapInnerNode.cpp | 105 ++++++++++++++++++--- src/ripple/shamap/impl/SHAMapSync.cpp | 3 +- src/ripple/shamap/impl/TaggedPointer.ipp | 4 +- 4 files changed, 103 insertions(+), 16 deletions(-) diff --git a/src/ripple/shamap/SHAMapInnerNode.h b/src/ripple/shamap/SHAMapInnerNode.h index db7244e7019..5f0765e9c26 100644 --- a/src/ripple/shamap/SHAMapInnerNode.h +++ b/src/ripple/shamap/SHAMapInnerNode.h @@ -27,8 +27,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -53,7 +55,8 @@ class SHAMapInnerNode final : public SHAMapTreeNode, std::uint32_t fullBelowGen_ = 0; std::uint16_t isBranch_ = 0; - static std::mutex childLock; + /** A bitlock for the children of this node, with one bit per child */ + mutable std::atomic lock_ = 0; /** Convert arrays stored in `hashesAndChildren_` so they can store the requested number of children. 
@@ -155,7 +158,7 @@ class SHAMapInnerNode final : public SHAMapTreeNode, std::shared_ptr getChild(int branch); - virtual std::shared_ptr + std::shared_ptr canonicalizeChild(int branch, std::shared_ptr node); // sync functions diff --git a/src/ripple/shamap/impl/SHAMapInnerNode.cpp b/src/ripple/shamap/impl/SHAMapInnerNode.cpp index 6a2a4504fea..c47ac3864ba 100644 --- a/src/ripple/shamap/impl/SHAMapInnerNode.cpp +++ b/src/ripple/shamap/impl/SHAMapInnerNode.cpp @@ -19,11 +19,9 @@ #include -#include #include #include #include -#include #include #include #include @@ -33,14 +31,88 @@ #include #include -#include #include -#include #include +// This is used for the _mm_pause instruction: +#include + namespace ripple { -std::mutex SHAMapInnerNode::childLock; +/** A specialized 16-way spinlock used to protect inner node branches. + + This class packs 16 separate spinlocks into a single 16-bit value. It makes + it possible to lock any one lock at once or, alternatively, all together. + + The implementation tries to use portable constructs but has to be low-level + for performance. + */ +class SpinBitlock +{ +private: + std::atomic& bits_; + std::uint16_t mask_; + +public: + SpinBitlock(std::atomic& lock) : bits_(lock), mask_(0xFFFF) + { + } + + SpinBitlock(std::atomic& lock, int index) + : bits_(lock), mask_(1 << index) + { + assert(index >= 0 && index < 16); + } + + [[nodiscard]] bool + try_lock() + { + // If we want to grab all the individual bitlocks at once we cannot + // use `fetch_or`! To see why, imagine that `lock_ == 0x0020` which + // means that the `fetch_or` would return `0x0020` but all the bits + // would already be (incorrectly!) set. Oops! 
+ std::uint16_t expected = 0; + + if (mask_ != 0xFFFF) + return (bits_.fetch_or(mask_, std::memory_order_acquire) & mask_) == + expected; + + return bits_.compare_exchange_weak( + expected, + mask_, + std::memory_order_acquire, + std::memory_order_relaxed); + } + + void + lock() + { + // Testing suggests that 99.9999% of the time this will succeed, so + // we try to optimize the fast path. + if (try_lock()) + return; + + do + { + // We try to spin for a few times: + for (int i = 0; i != 100; ++i) + { + if (try_lock()) + return; + + _mm_pause(); + } + + std::this_thread::yield(); + } while ((bits_.load(std::memory_order_relaxed) & mask_) == 0); + } + + void + unlock() + { + bits_.fetch_and(~mask_, std::memory_order_release); + } +}; SHAMapInnerNode::SHAMapInnerNode( std::uint32_t cowid, @@ -108,7 +180,10 @@ SHAMapInnerNode::clone(std::uint32_t cowid) const cloneHashes[branchNum] = thisHashes[indexNum]; }); } - std::lock_guard lock(childLock); + + SpinBitlock sl(lock_); + std::lock_guard lock(sl); + if (thisIsSparse) { int cloneChildIndex = 0; @@ -341,8 +416,11 @@ SHAMapInnerNode::getChildPointer(int branch) assert(branch >= 0 && branch < branchFactor); assert(!isEmptyBranch(branch)); - std::lock_guard lock(childLock); - return hashesAndChildren_.getChildren()[*getChildIndex(branch)].get(); + auto const index = *getChildIndex(branch); + + SpinBitlock sl(lock_, index); + std::lock_guard lock(sl); + return hashesAndChildren_.getChildren()[index].get(); } std::shared_ptr @@ -351,8 +429,11 @@ SHAMapInnerNode::getChild(int branch) assert(branch >= 0 && branch < branchFactor); assert(!isEmptyBranch(branch)); - std::lock_guard lock(childLock); - return hashesAndChildren_.getChildren()[*getChildIndex(branch)]; + auto const index = *getChildIndex(branch); + + SpinBitlock sl(lock_, index); + std::lock_guard lock(sl); + return hashesAndChildren_.getChildren()[index]; } SHAMapHash const& @@ -377,7 +458,9 @@ SHAMapInnerNode::canonicalizeChild( auto [_, hashes, children] = 
hashesAndChildren_.getHashesAndChildren(); assert(node->getHash() == hashes[childIndex]); - std::lock_guard lock(childLock); + SpinBitlock sl(lock_, childIndex); + std::lock_guard lock(sl); + if (children[childIndex]) { // There is already a node hooked up, return it diff --git a/src/ripple/shamap/impl/SHAMapSync.cpp b/src/ripple/shamap/impl/SHAMapSync.cpp index 8cd5bd704ef..1e233d55f23 100644 --- a/src/ripple/shamap/impl/SHAMapSync.cpp +++ b/src/ripple/shamap/impl/SHAMapSync.cpp @@ -583,7 +583,6 @@ SHAMap::addKnownNode( } auto const generation = f_.getFullBelowCache(ledgerSeq_)->getGeneration(); - auto newNode = SHAMapTreeNode::makeFromWire(rawNode); SHAMapNodeID iNodeID; auto iNode = root_.get(); @@ -612,6 +611,8 @@ SHAMap::addKnownNode( if (iNode == nullptr) { + auto newNode = SHAMapTreeNode::makeFromWire(rawNode); + if (!newNode || childHash != newNode->getHash()) { JLOG(journal_.warn()) << "Corrupt node received"; diff --git a/src/ripple/shamap/impl/TaggedPointer.ipp b/src/ripple/shamap/impl/TaggedPointer.ipp index d1110f49387..30bf68426b6 100644 --- a/src/ripple/shamap/impl/TaggedPointer.ipp +++ b/src/ripple/shamap/impl/TaggedPointer.ipp @@ -17,9 +17,9 @@ */ //============================================================================== -#include - +#include #include +#include #include