diff --git a/engines/ep/src/checkpoint_manager.cc b/engines/ep/src/checkpoint_manager.cc index 6e1604072f..a1f32749cc 100644 --- a/engines/ep/src/checkpoint_manager.cc +++ b/engines/ep/src/checkpoint_manager.cc @@ -836,6 +836,8 @@ std::vector CheckpointManager::getListOfCursorsToDrop() { ? *backupFound->second : *persistenceCursor; + getListOfCursorsToDropHook(); + for (const auto& pair : cursors) { const auto cursor = pair.second; // Note: Strict condition here. @@ -1734,13 +1736,23 @@ void CheckpointManager::addStats(const AddStatFn& add_stat, } void CheckpointManager::takeAndResetCursors(CheckpointManager& other) { - pCursor = other.pCursor; - persistenceCursor = pCursor.lock().get(); - for (auto& cursor : other.cursors) { - cursors[cursor.second->getName()] = cursor.second; + other.takeAndResetCursorsHook(); + + Cursor otherPCursor; + cursor_index otherCursors; + { + std::lock_guard otherLH(other.queueLock); + otherPCursor = other.pCursor; + otherCursors = std::move(other.cursors); + other.cursors.clear(); } - other.cursors.clear(); + std::lock_guard lh(queueLock); + pCursor = std::move(otherPCursor); + persistenceCursor = pCursor.lock().get(); + for (auto& cursor : otherCursors) { + cursors[cursor.second->getName()] = std::move(cursor.second); + } resetCursors(); } diff --git a/engines/ep/src/checkpoint_manager.h b/engines/ep/src/checkpoint_manager.h index 48870a646d..1ad8cf7d0b 100644 --- a/engines/ep/src/checkpoint_manager.h +++ b/engines/ep/src/checkpoint_manager.h @@ -599,6 +599,14 @@ class CheckpointManager { // (and not yet re-acquired) the CM::lock. Introduced in MB-56644. TestingHook<> expelHook; + /// Testing hook called at the start of CM::takeAndResetCursors. + /// Introduced in MB-59601. + TestingHook<> takeAndResetCursorsHook; + + /// Testing hook called just before iterating CM::cursors in + /// CM::getListOfCursorsToDrop. Introduced in MB-59601. + TestingHook<> getListOfCursorsToDropHook; + protected: /** * @param lh, the queueLock held diff --git a/engines/ep/tests/module_tests/checkpoint_remover_test.cc b/engines/ep/tests/module_tests/checkpoint_remover_test.cc index 8d9438cafc..f03a6aa913 100644 --- a/engines/ep/tests/module_tests/checkpoint_remover_test.cc +++ b/engines/ep/tests/module_tests/checkpoint_remover_test.cc @@ -22,6 +22,7 @@ #include "collections/vbucket_manifest_handles.h" #include "dcp/response.h" #include "test_helpers.h" +#include "thread_gate.h" #include "vbucket.h" #include @@ -454,6 +455,70 @@ TEST_P(CheckpointRemoverTest, MemRecoveryByCheckpointCreation) { EXPECT_EQ(0, store->getRequiredCMMemoryReduction()); } +// Without the fix, there is a data race in +// CheckpointManager::takeAndResetCursors which did not take a queueLock, +// and could mutate the CheckpointManager while it is being accessed, +// e.g. in CheckpointManager::getListOfCursorsToDrop. +TEST_P(CheckpointRemoverTest, MB59601) { + if (!isPersistent()) { + GTEST_SKIP(); + } + + setVBucketStateAndRunPersistTask(vbid, vbucket_state_active); + auto& config = engine->getConfiguration(); + config.setChkExpelEnabled(false); + config.setMaxSize(100UL * 1024 * 1024); + // Disable the mem-based checkpoint creation in this test, we would end up + // doing straight CheckpointRemoval rather than ItemExpel/CursorDrop + config.setCheckpointMaxSize(std::numeric_limits::max()); + const auto chkptMemRecoveryLimit = + config.getMaxSize() * store->getCheckpointMemoryRatio() * + store->getCheckpointMemoryRecoveryUpperMark(); + auto& stats = engine->getEpStats(); + stats.mem_low_wat.store(1); + + int numItems = 0; + const std::string value(1024 * 1024, 'x'); + while (stats.getCheckpointManagerEstimatedMemUsage() < + chkptMemRecoveryLimit) { + auto docKey = "key_" + std::to_string(++numItems); + store_item(vbid, makeStoredDocKey(docKey), value); + } + flushVBucketToDiskIfPersistent(vbid, numItems); + + // VB needs to be replica to rollback + store->setVBucketState(vbid, vbucket_state_replica); + + EXPECT_GT(stats.getNumCheckpoints(), 0); + EXPECT_GT(store->getRequiredCMMemoryReduction(), 0); + + /// Synchronises just before accessing and mutating CM::cursors + ThreadGate tg(2); + std::thread bgThread; + + auto& oldManager = *store->getVBucket(vbid)->checkpointManager; + oldManager.takeAndResetCursorsHook = [this, &tg, &bgThread]() { + // Note: takeAndResetCursorsHook is executed *after* the new VBucket + // has already been created + + auto& newManager = *store->getVBucket(vbid)->checkpointManager; + newManager.getListOfCursorsToDropHook = [&tg]() { tg.threadUp(); }; + bgThread = std::thread([this]() { + auto remover = std::make_shared( + *engine, + engine->getEpStats(), + engine->getConfiguration().getChkRemoverStime(), + 0); + remover->run(); + }); + + tg.threadUp(); + }; + + store->rollback(vbid, 0); + bgThread.join(); +} + // Test written for MB-36366. With the fix removed this test failed because // post expel, we continued onto cursor dropping. // MB-36447 - unreliable test, disabling for now