Skip to content

Commit

Permalink
VSR: Fix liveness for io_depth_write overflow when replicas <= 2
Browse files Browse the repository at this point in the history
When `config.pipelining_max` exceeds `config.io_depth_write`, it's
possible for a client request to be unable to acquire a write IOP if we
have maxed out our IO depth.

This can lead to deadlock for a cluster of one or two replicas, since
there is no other way for the leader to repair the dirty op: no other
replica has it.

The fix is for `on_prepare_timeout()` to retry the prepare.

Reported-by: @ThreeFx

Fixes: #5
  • Loading branch information
jorangreef committed Sep 14, 2021
1 parent f35002d commit eb423c3
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/config.zig
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
pub const deployment_environment = .development;

/// The maximum log level in increasing order of verbosity (emergency=0, debug=7):
pub const log_level = 7;
pub const log_level = 6;

/// The maximum number of replicas allowed in a cluster.
/// This has been limited to 5 just to decrease the amount of memory required by the VOPR simulator.
Expand Down
4 changes: 2 additions & 2 deletions src/vsr/journal.zig
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
}

const read = self.reads.acquire() orelse {
self.read_prepare_log(op, checksum, "no iop available");
self.read_prepare_log(op, checksum, "waiting for IOP");
callback(replica, null, null);
return;
};
Expand Down Expand Up @@ -827,7 +827,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
assert(self.has_dirty(message.header));

const write = self.writes.acquire() orelse {
self.write_prepare_debug(message.header, "no IOP available");
self.write_prepare_debug(message.header, "waiting for IOP");
callback(replica, null, trigger);
return;
};
Expand Down
16 changes: 16 additions & 0 deletions src/vsr/replica.zig
Original file line number Diff line number Diff line change
Expand Up @@ -1056,6 +1056,7 @@ pub fn Replica(
self.send_message_to_replica(message.header.replica, start_view);
}

/// TODO This is a work in progress (out of scope for the bounty)
fn on_recovery(self: *Self, message: *const Message) void {
if (self.status != .normal) {
log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
Expand Down Expand Up @@ -1112,6 +1113,7 @@ pub fn Replica(
self.send_message_to_replica(message.header.replica, response);
}

/// TODO This is a work in progress (out of scope for the bounty)
fn on_recovery_response(self: *Self, message: *Message) void {}

fn on_request_prepare(self: *Self, message: *const Message) void {
Expand Down Expand Up @@ -1436,6 +1438,20 @@ pub fn Replica(

log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
assert(prepare.ok_from_all_replicas[self.replica] == null);

// We may be slow and waiting for the write to complete.
//
// We may even have maxed out our IO depth and been unable to initiate the write,
// which can happen if `config.pipelining_max` exceeds `config.io_depth_write`.
// This can lead to deadlock for a cluster of one or two (if we do not retry here),
// since there is no other way for the leader to repair the dirty op because no
// other replica has it.
//
// Retry the write through `on_repair()` which will work out which is which.
// We do expect that the op would have been run through `on_prepare()` already.
assert(prepare.message.header.op <= self.op);
self.on_repair(prepare.message);

return;
}

Expand Down

0 comments on commit eb423c3

Please sign in to comment.