Skip to content

Commit

Permalink
VSR: Fix liveness for io_depth_write overflow when replicas <= 2
Browse files Browse the repository at this point in the history
When `config.pipelining_max` exceeds `config.io_depth_write`, it's
possible for a client request to be unable to acquire a write IOP if we
have maxed out our IO depth.

This can lead to deadlock for a cluster of one or two replicas, since
there is no other way for the leader to repair the dirty op: no other
replica has it.

The fix is for `on_prepare_timeout()` to retry the prepare.

Reported-by: @ThreeFx

Fixes: #5
  • Loading branch information
jorangreef committed Sep 14, 2021
1 parent f35002d commit eb423c3
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/config.zig
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
pub const deployment_environment = .development;

/// The maximum log level in increasing order of verbosity (emergency=0, debug=7):
pub const log_level = 7;
pub const log_level = 6;

/// The maximum number of replicas allowed in a cluster.
/// This has been limited to 5 just to decrease the amount of memory required by the VOPR simulator.
Expand Down
4 changes: 2 additions & 2 deletions src/vsr/journal.zig
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
}

const read = self.reads.acquire() orelse {
self.read_prepare_log(op, checksum, "no iop available");
self.read_prepare_log(op, checksum, "waiting for IOP");
callback(replica, null, null);
return;
};
Expand Down Expand Up @@ -827,7 +827,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
assert(self.has_dirty(message.header));

const write = self.writes.acquire() orelse {
self.write_prepare_debug(message.header, "no IOP available");
self.write_prepare_debug(message.header, "waiting for IOP");
callback(replica, null, trigger);
return;
};
Expand Down
16 changes: 16 additions & 0 deletions src/vsr/replica.zig
Original file line number Diff line number Diff line change
Expand Up @@ -1056,6 +1056,7 @@ pub fn Replica(
self.send_message_to_replica(message.header.replica, start_view);
}

/// TODO This is a work in progress (out of scope for the bounty)
fn on_recovery(self: *Self, message: *const Message) void {
if (self.status != .normal) {
log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
Expand Down Expand Up @@ -1112,6 +1113,7 @@ pub fn Replica(
self.send_message_to_replica(message.header.replica, response);
}

/// TODO This is a work in progress (out of scope for the bounty)
fn on_recovery_response(self: *Self, message: *Message) void {}

fn on_request_prepare(self: *Self, message: *const Message) void {
Expand Down Expand Up @@ -1436,6 +1438,20 @@ pub fn Replica(

log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
assert(prepare.ok_from_all_replicas[self.replica] == null);

// We may be slow and waiting for the write to complete.
//
// We may even have maxed out our IO depth and been unable to initiate the write,
// which can happen if `config.pipelining_max` exceeds `config.io_depth_write`.
// This can lead to deadlock for a cluster of one or two (if we do not retry here),
// since there is no other way for the leader to repair the dirty op because no
// other replica has it.
//
// Retry the write through `on_repair()` which will work out which is which.
// We do expect that the op would have been run through `on_prepare()` already.
assert(prepare.message.header.op <= self.op);
self.on_repair(prepare.message);

return;
}

Expand Down

0 comments on commit eb423c3

Please sign in to comment.