From 2d6a022bb819aca9fe9689ac23184b01b070e12f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Sat, 27 May 2023 15:55:43 +0300
Subject: [PATCH] Don't allow two timeline_delete operations to run concurrently. (#4313)

If the timeline is already being deleted, return an error. We used to
notice the duplicate request and error out in
persist_index_part_with_deleted_flag(), but it's better to detect it
earlier. Add an explicit lock for the deletion.

Note: This doesn't do anything about the async cancellation problem
(GitHub issue #3478): if the original HTTP request is dropped because
the client disconnected, the timeline deletion stops halfway through
the operation. That needs to be fixed too, but that's a separate story.

(This is a simpler replacement for PR #4194. I'm also working on the
cancellation shielding, see PR #4314.)
---
 pageserver/src/tenant.rs                    | 106 +++++++++++---
 pageserver/src/tenant/timeline.rs           |   5 +
 test_runner/regress/test_timeline_delete.py |   2 +-
 3 files changed, 67 insertions(+), 46 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2827830f02ca..991f5ca1c64e 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1436,7 +1436,11 @@ impl Tenant {
         Ok(())
     }
 
-    /// Removes timeline-related in-memory data
+    /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its
+    /// data from disk.
+    ///
+    /// This doesn't currently delete all data from S3, but sets a flag in its
+    /// index_part.json file to mark it as deleted.
     pub async fn delete_timeline(
         &self,
         timeline_id: TimelineId,
@@ -1446,7 +1450,11 @@ impl Tenant {
 
         // Transition the timeline into TimelineState::Stopping.
         // This should prevent new operations from starting.
-        let timeline = {
+        //
+        // Also grab the Timeline's delete_lock to prevent another deletion from starting.
+        let timeline;
+        let mut delete_lock_guard;
+        {
             let mut timelines = self.timelines.lock().unwrap();
 
             // Ensure that there are no child timelines **attached to that pageserver**,
@@ -1464,20 +1472,36 @@ impl Tenant {
                 Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
             };
 
-            let timeline = Arc::clone(timeline_entry.get());
+            timeline = Arc::clone(timeline_entry.get());
+
+            // Prevent two tasks from trying to delete the timeline at the same time.
+            //
+            // XXX: We should perhaps return an HTTP "202 Accepted" to signal that the caller
+            // needs to poll until the operation has finished. But for now, we return an
+            // error, because the control plane knows to retry errors.
+            delete_lock_guard = timeline.delete_lock.try_lock().map_err(|_| {
+                DeleteTimelineError::Other(anyhow::anyhow!(
+                    "timeline deletion is already in progress"
+                ))
+            })?;
+
+            // If another task finished the deletion just before we acquired the lock,
+            // return success.
+            if *delete_lock_guard {
+                return Ok(());
+            }
+
             timeline.set_state(TimelineState::Stopping);
 
             drop(timelines);
-            timeline
-        };
+        }
 
         // Now that the Timeline is in Stopping state, request all the related tasks to
         // shut down.
         //
-        // NB: If you call delete_timeline multiple times concurrently, they will
-        // all go through the motions here. Make sure the code here is idempotent,
-        // and don't error out if some of the shutdown tasks have already been
-        // completed!
+        // NB: If this fails half-way through, and is retried, the retry will go through
+        // all the same steps again. Make sure the code here is idempotent, and don't
+        // error out if some of the shutdown tasks have already been completed!
         // Stop the walreceiver first.
         debug!("waiting for wal receiver to shutdown");
@@ -1518,6 +1542,10 @@ impl Tenant {
             // If we (now, or already) marked it successfully as deleted, we can proceed
             Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
             // Bail out otherwise
+            //
+            // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
+            // two tasks from performing the deletion at the same time. The first task
+            // that starts deletion should run it to completion.
             Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
            | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
                 return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
             }
         }
@@ -1528,14 +1556,12 @@ impl Tenant {
         {
             // Grab the layer_removal_cs lock, and actually perform the deletion.
             //
-            // This lock prevents multiple concurrent delete_timeline calls from
-            // stepping on each other's toes, while deleting the files. It also
-            // prevents GC or compaction from running at the same time.
+            // This lock prevents GC or compaction from running at the same time.
+            // The GC task doesn't register itself with the timeline it's operating on,
+            // so it might still be running even though we called `shutdown_tasks`.
             //
             // Note that there are still other race conditions between
-            // GC, compaction and timeline deletion. GC task doesn't
-            // register itself properly with the timeline it's
-            // operating on. See
+            // GC, compaction and timeline deletion. See
             // https://github.com/neondatabase/neon/issues/2671
             //
             // No timeout here, GC & Compaction should be responsive to the
@@ -1597,37 +1623,27 @@ impl Tenant {
         });
 
         // Remove the timeline from the map.
-        let mut timelines = self.timelines.lock().unwrap();
-        let children_exist = timelines
-            .iter()
-            .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
-        // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
-        // We already deleted the layer files, so it's probably best to panic.
-        // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
-        if children_exist {
-            panic!("Timeline grew children while we removed layer files");
-        }
-        let removed_timeline = timelines.remove(&timeline_id);
-        if removed_timeline.is_none() {
-            // This can legitimately happen if there's a concurrent call to this function.
-            //          T1                                  T2
-            //                                              lock
-            //                                              unlock
-            //          lock
-            //          unlock
-            //                                              remove files
-            //                                              lock
-            //                                              remove from map
-            //                                              unlock
-            //                                              return
-            //          remove files
-            //          lock
-            //          remove from map observes empty map
-            //          unlock
-            //          return
-            debug!("concurrent call to this function won the race");
+        {
+            let mut timelines = self.timelines.lock().unwrap();
+
+            let children_exist = timelines
+                .iter()
+                .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+            // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
+            // We already deleted the layer files, so it's probably best to panic.
+            // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
+            if children_exist {
+                panic!("Timeline grew children while we removed layer files");
+            }
+
+            timelines.remove(&timeline_id).expect(
+                "timeline that we were deleting was concurrently removed from 'timelines' map",
+            );
         }
-        drop(timelines);
+
+        // All done! Mark the deletion as completed and release the delete_lock.
+        *delete_lock_guard = true;
+        drop(delete_lock_guard);
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 4bfebd93df66..9dd5352a540c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -236,6 +236,10 @@ pub struct Timeline {
 
     state: watch::Sender<TimelineState>,
 
+    /// Prevent two tasks from deleting the timeline at the same time. If held, the
+    /// timeline is being deleted. If 'true', the timeline has already been deleted.
+    pub delete_lock: tokio::sync::Mutex<bool>,
+
     eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
 }
 
@@ -1414,6 +1418,7 @@ impl Timeline {
             eviction_task_timeline_state: tokio::sync::Mutex::new(
                 EvictionTaskTimelineState::default(),
             ),
+            delete_lock: tokio::sync::Mutex::new(false),
         };
         result.repartition_threshold = result.get_checkpoint_distance() / 10;
         result
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 7135b621cbed..99bf4002079f 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -371,7 +371,7 @@ def first_call_hit_failpoint():
 
     # make the second call and assert behavior
     log.info("second call start")
-    error_msg_re = "another task is already setting the deleted_flag, started at"
+    error_msg_re = "timeline deletion is already in progress"
     with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
         ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
     assert second_call_err.value.status_code == 500
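
The locking pattern this patch introduces is compact enough to capture in a
standalone sketch. What follows is a minimal, hypothetical Rust program, not
the pageserver's actual code: FakeTimeline and delete() are illustrative names.
It shows how a tokio::sync::Mutex<bool> serves double duty, with try_lock()
failing fast while a deletion is in progress and the inner bool recording that
a completed deletion need not be repeated.

use std::sync::Arc;

#[derive(Debug)]
enum DeleteError {
    AlreadyInProgress,
}

struct FakeTimeline {
    // If the mutex is held, a deletion is in progress.
    // If the inner bool is true, a deletion has already completed.
    delete_lock: tokio::sync::Mutex<bool>,
}

async fn delete(timeline: &FakeTimeline) -> Result<(), DeleteError> {
    // Fail fast if another task is already deleting this timeline.
    let mut guard = timeline
        .delete_lock
        .try_lock()
        .map_err(|_| DeleteError::AlreadyInProgress)?;

    // A previous call finished the whole deletion: succeed idempotently.
    if *guard {
        return Ok(());
    }

    // ... shut down tasks, persist the deleted flag, remove files ...

    // Mark the deletion as completed before releasing the lock.
    *guard = true;
    Ok(())
}

#[tokio::main]
async fn main() {
    let timeline = Arc::new(FakeTimeline {
        delete_lock: tokio::sync::Mutex::new(false),
    });

    // The first call performs the deletion; a retry observes the completed
    // flag and returns success without repeating the work.
    assert!(delete(&timeline).await.is_ok());
    assert!(delete(&timeline).await.is_ok());
}

Holding the guard for the entire operation is what makes this work: a second
caller can only ever observe "in progress" (lock held) or "done" (flag set),
never a half-finished state.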
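
A second sketch, again hypothetical and using only the bare mutex, shows the
contention path that the updated regression test asserts on: while one task
holds the guard across an await point, a concurrent try_lock() errors out
immediately rather than queuing, which is why the second HTTP call in the test
gets an error instead of blocking until the first deletion finishes.

use std::sync::Arc;
use std::time::Duration;

#[tokio::main]
async fn main() {
    // The patch stores this as Timeline::delete_lock; here it stands alone.
    let delete_lock = Arc::new(tokio::sync::Mutex::new(false));

    // Task 1 acquires the lock and holds it while "deleting".
    let lock = Arc::clone(&delete_lock);
    let deleter = tokio::spawn(async move {
        let mut guard = lock.try_lock().expect("first caller acquires the lock");
        tokio::time::sleep(Duration::from_millis(100)).await; // simulate slow deletion
        *guard = true; // mark the deletion as completed
    });

    // Give task 1 time to grab the lock.
    tokio::time::sleep(Duration::from_millis(10)).await;

    // A concurrent caller must not queue behind task 1; it fails immediately,
    // and the caller (the control plane, in the patch) is expected to retry.
    assert!(delete_lock.try_lock().is_err());

    deleter.await.unwrap();

    // A retry after task 1 finished sees the completed flag and can return success.
    assert!(*delete_lock.try_lock().unwrap());
}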