-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
mvcc3.proto
315 lines (291 loc) · 13.9 KB
/
mvcc3.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
syntax = "proto3";

package cockroach.storage.enginepb;

// gogo.proto supplies the (gogoproto.*) options used throughout this file;
// timestamp.proto supplies util.hlc.Timestamp.
import "gogoproto/gogo.proto";
import "util/hlc/timestamp.proto";

option go_package = "enginepb";
// TxnMeta is the metadata of a Transaction record.
message TxnMeta {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  // id is a unique UUID value which identifies the transaction.
  // This field is always filled in.
  bytes id = 1 [
    (gogoproto.customname) = "ID",
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.nullable) = false
  ];

  // Field number 2 is reserved and must not be reused.
  reserved 2;

  // key is the key which anchors the transaction. This is typically the
  // first key read or written during the transaction and determines which
  // range in the cluster will hold the transaction record.
  //
  // TODO(tschottdorf): [(gogoproto.casttype) = "Key"];
  bytes key = 3;

  // epoch is incremented on txn retry.
  int32 epoch = 4 [(gogoproto.casttype) = "TxnEpoch"];

  // write_timestamp is the proposed timestamp for the transaction. This
  // starts as the current wall time on the txn coordinator, and is forwarded
  // by the timestamp cache if the txn attempts to write "beneath" another
  // txn's writes.
  //
  // Writes within the txn are performed using the most up-to-date value of
  // this timestamp that is available. For example, suppose a txn starts at
  // some timestamp, writes a key/value, and has its timestamp forwarded while
  // doing so because a later version already exists at that key. As soon as
  // the txn coordinator learns of the updated timestamp, it will begin
  // performing writes at the updated timestamp. The coordinator may, however,
  // continue issuing writes at the original timestamp before it learns about
  // the forwarded timestamp. The process of resolving the intents when the
  // txn commits will bump any intents written at an older timestamp to the
  // final commit timestamp.
  //
  // Note that reads do not occur at this timestamp; they instead occur at
  // ReadTimestamp, which is tracked in the containing roachpb.Transaction.
  //
  // Writes used to be performed at the txn's read timestamp, which was
  // necessary to avoid lost update anomalies in snapshot isolation mode. We
  // no longer support snapshot isolation mode, and there are now several
  // important reasons that writes are performed at this timestamp instead of
  // the txn's original timestamp:
  //
  //   1. This timestamp is forwarded by the timestamp cache when this
  //      transaction attempts to write beneath a more recent read. Leaving
  //      the intent at the original timestamp would write beneath that read,
  //      which would violate an invariant that time-bound iterators rely on.
  //
  //      For example, consider a client that uses a time-bound iterator to
  //      poll for changes to a key. The client reads (ts5, ts10], sees no
  //      writes, and reports that no changes have occurred up to ts10. Then
  //      a txn writes an intent at its original timestamp ts7. The txn's
  //      timestamp is forwarded to ts11 by the timestamp cache thanks to the
  //      client's read. Meanwhile, the client reads (ts10, ts15] and, again
  //      seeing no intents, reports that no changes have occurred to the key
  //      up to ts15. Now the txn commits at ts11 and bumps the intent to
  //      ts11. But the client thinks it has seen all changes up to ts15, and
  //      so never sees the intent! We avoid this problem by writing intents
  //      at the provisional commit timestamp instead. In this example, the
  //      intent would instead be written at ts11 and picked up by the
  //      client's next read from (ts10, ts15].
  //
  //   2. Unnecessary PushTxn roundtrips are avoided. If a transaction is
  //      forwarded from ts5 to ts10, the rest of its intents will be written
  //      at ts10. Reads at t < ts10 that encounter these intents can ignore
  //      them; if the intents had instead been left at ts5, these reads
  //      would have needed to send PushTxn requests just to find out that
  //      the txn had, in fact, been forwarded to a non-conflicting time.
  //
  //   3. Unnecessary intent rewriting is avoided. Writing at the original
  //      timestamp when this timestamp has been forwarded guarantees that
  //      the value will need to be rewritten at the forwarded timestamp if
  //      the transaction commits.
  util.hlc.Timestamp write_timestamp = 5 [(gogoproto.nullable) = false];

  // min_timestamp is the timestamp that the transaction was assigned by its
  // gateway when it began its first epoch. This is the earliest timestamp
  // that the transaction could have written any of its intents at.
  //
  // The timestamp is currently used in three places:
  //   1. by the transaction itself and by concurrent transactions when
  //      determining whether this transaction's record can be initially
  //      written. The timestamp is compared against the transaction's
  //      corresponding timestamp cache entry to ensure that a finalized
  //      transaction can never commit, either after a replay or a
  //      transaction abort. See CanCreateTxnRecord.
  //   2. by intent resolution to efficiently scan for intents while using a
  //      time-bound iterator - i.e. there can be intents to resolve up to
  //      the timestamp that the txn started with.
  //   3. by would-be pushers, when they run into an intent but the
  //      corresponding txn record was not yet written. In that case, the
  //      pusher uses this field as an indication of a timestamp when the
  //      pushee's coordinator is known to have been alive.
  //
  // NOTE: this could use a ClockTimestamp type, but doing so results in a
  // large diff that doesn't seem worth it, given that we never feed this
  // timestamp back into a clock.
  util.hlc.Timestamp min_timestamp = 9 [(gogoproto.nullable) = false];

  // priority is the transaction's priority, ratcheted on transaction pushes.
  int32 priority = 6 [(gogoproto.casttype) = "TxnPriority"];

  // sequence is a zero-indexed sequence number which is increased on each
  // request sent as part of the transaction. When set in the header of a
  // batch of requests, the value will correspond to the sequence number of
  // the last request. Used to provide idempotency and to protect against
  // out-of-order application (by means of a transaction retry).
  int32 sequence = 7 [(gogoproto.casttype) = "TxnSeq"];

  // Field number 8 is reserved and must not be reused.
  reserved 8;

  // coordinator_node_id is the ID of the node where this transaction
  // originated. This field represents either a SQLInstanceID of a SQL pod, a
  // SQL gateway NodeID, or a KV node ID (in the case of KV-initiated
  // transactions) and was introduced for the purposes of SQL Observability.
  //
  // TODO(sarkesian): Refactor to use gogoproto.casttype GenericNodeID when
  // #73309 completes.
  int32 coordinator_node_id = 10 [(gogoproto.customname) = "CoordinatorNodeID"];
}
// IgnoredSeqNumRange describes a range of ignored seqnums.
// Both bounds are inclusive: the range covers [start, end].
message IgnoredSeqNumRange {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // start is the first sequence number in the range (inclusive).
  int32 start = 1 [(gogoproto.casttype) = "TxnSeq"];

  // end is the last sequence number in the range (inclusive).
  int32 end = 2 [(gogoproto.casttype) = "TxnSeq"];
}
// MVCCStatsDelta is convertible to MVCCStats, but uses signed variable width
// encodings (sint64) for most fields, which makes it more efficient to store
// negative values. As a consequence its encoding is incompatible with that
// of MVCCStats.
message MVCCStatsDelta {
  option (gogoproto.equal) = true;

  int64 contains_estimates = 14;

  // Fixed-width (sfixed64) fields.
  sfixed64 last_update_nanos = 1;
  sfixed64 intent_age = 2;
  sfixed64 gc_bytes_age = 3 [(gogoproto.customname) = "GCBytesAge"];

  // ZigZag-encoded (sint64) fields. Field numbers are the wire identity and
  // are not in declaration order; do not renumber.
  sint64 live_bytes = 4;
  sint64 live_count = 5;
  sint64 key_bytes = 6;
  sint64 key_count = 7;
  sint64 val_bytes = 8;
  sint64 val_count = 9;
  sint64 intent_bytes = 10;
  sint64 intent_count = 11;
  sint64 separated_intent_count = 16;
  sint64 sys_bytes = 12;
  sint64 sys_count = 13;
  sint64 abort_span_bytes = 15;

  // WARNING: Do not add any PII-holding fields here, as this
  // whole message is marked as safe for log redaction.
}
// MVCCPersistentStats is convertible to MVCCStats, but uses signed variable
// width encodings (int64) for most fields. These are efficient for positive
// values but inefficient for negative ones. As a consequence its encoding is
// incompatible with that of MVCCStats.
message MVCCPersistentStats {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // contains_estimates must never go negative absent a bug.
  int64 contains_estimates = 14;

  // Fixed-width (sfixed64) fields.
  sfixed64 last_update_nanos = 1;
  sfixed64 intent_age = 2;
  sfixed64 gc_bytes_age = 3 [(gogoproto.customname) = "GCBytesAge"];

  // Plain varint (int64) fields. Field numbers are the wire identity and are
  // not in declaration order; do not renumber.
  int64 live_bytes = 4;
  int64 live_count = 5;
  int64 key_bytes = 6;
  int64 key_count = 7;
  int64 val_bytes = 8;
  int64 val_count = 9;
  int64 intent_bytes = 10;
  int64 intent_count = 11;
  int64 separated_intent_count = 16;
  int64 sys_bytes = 12;
  int64 sys_count = 13;
  int64 abort_span_bytes = 15;
}
// RangeAppliedState combines the raft and lease applied indices with
// mvcc stats. These are all persisted on each transition of the Raft
// state machine (i.e. on each Raft application), so they are stored
// in the same RocksDB key for efficiency.
message RangeAppliedState {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // raft_applied_index is the highest (and last) index applied to the Raft
  // state machine.
  uint64 raft_applied_index = 1;

  // lease_applied_index is the highest (and last) lease index applied to the
  // Raft state machine.
  uint64 lease_applied_index = 2;

  // range_stats is the set of mvcc stats that accounts for the current value
  // of the Raft state machine.
  MVCCPersistentStats range_stats = 3 [(gogoproto.nullable) = false];

  // raft_closed_timestamp is the largest timestamp that is known to have
  // been closed through Raft commands as of this lease applied index. This
  // means that the current leaseholder (if any) and any future leaseholder
  // will not evaluate writes at or below this timestamp, and also that any
  // in-flight commands that can still apply are writing at higher
  // timestamps. Non-leaseholder replicas are free to serve "follower reads"
  // at or below this timestamp.
  //
  // TODO(andrei): Make this field not-nullable in 21.2, once all the ranges
  // have a closed timestamp applied to their state (this might need a
  // migration). In 21.1 we cannot write empty timestamp to disk because that
  // looks like an inconsistency to the consistency-checker.
  util.hlc.Timestamp raft_closed_timestamp = 4;

  // raft_applied_index_term is the term corresponding to
  // raft_applied_index. The serialized proto will not contain this field
  // until code starts setting it to a value > 0. This is desirable since we
  // don't want a mixed version cluster to have divergent replica state
  // simply because we have introduced this field. An explicit migration will
  // cause this field to start being populated.
  uint64 raft_applied_index_term = 5;
}
// MVCCWriteValueOp corresponds to a value being written outside of a
// transaction.
message MVCCWriteValueOp {
  // key is the key that was written.
  bytes key = 1;

  // timestamp of the write; generated as a non-nullable field.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];

  // value is the value that was written.
  bytes value = 3;

  // prev_value: presumably the value stored at key before this write.
  // NOTE(review): confirm when producers populate it.
  bytes prev_value = 4;
}
// MVCCWriteIntentOp corresponds to an intent being written for a given
// transaction.
message MVCCWriteIntentOp {
  // txn_id identifies the transaction the intent belongs to. Generated in Go
  // as a non-nullable uuid.UUID named TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];

  // txn_key: presumably the transaction's anchor key (cf. TxnMeta.key).
  // NOTE(review): confirm against producers.
  bytes txn_key = 2;

  // txn_min_timestamp: presumably the transaction's TxnMeta.min_timestamp.
  // NOTE(review): confirm against producers.
  //
  // This field is declared before field 3. Field numbers are the wire
  // identity and must not be changed to match declaration order.
  util.hlc.Timestamp txn_min_timestamp = 4 [(gogoproto.nullable) = false];

  // timestamp of the intent write; generated as a non-nullable field.
  util.hlc.Timestamp timestamp = 3 [(gogoproto.nullable) = false];
}
// MVCCUpdateIntentOp corresponds to an intent being updated at a larger
// timestamp for a given transaction.
message MVCCUpdateIntentOp {
  // txn_id identifies the transaction the intent belongs to. Generated in Go
  // as a non-nullable uuid.UUID named TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];

  // timestamp is the larger timestamp the intent was updated to.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}
// MVCCCommitIntentOp corresponds to an intent being committed for a given
// transaction.
message MVCCCommitIntentOp {
  // txn_id identifies the transaction the intent belongs to. Generated in Go
  // as a non-nullable uuid.UUID named TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];

  // key is the key of the committed intent.
  bytes key = 2;

  // timestamp: presumably the timestamp the intent was committed at.
  // NOTE(review): confirm against producers.
  util.hlc.Timestamp timestamp = 3 [(gogoproto.nullable) = false];

  // value: presumably the committed value for key.
  // NOTE(review): confirm when producers populate it.
  bytes value = 4;

  // prev_value: presumably the value stored at key before this commit.
  // NOTE(review): confirm when producers populate it.
  bytes prev_value = 5;
}
// MVCCAbortIntentOp corresponds to an intent being aborted for a given
// transaction.
//
// This operation only means that an intent was removed without being
// committed; it does not necessarily indicate that the intent's transaction
// was aborted. For instance, a committed transaction will abort any intents
// it decided not to write in its final epoch.
message MVCCAbortIntentOp {
  // txn_id identifies the transaction the intent belongs to. Generated in Go
  // as a non-nullable uuid.UUID named TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];
}
// MVCCAbortTxnOp corresponds to an entire transaction being aborted. The
// operation indicates that none of the transaction's intents will ever be
// committed.
message MVCCAbortTxnOp {
  // txn_id identifies the aborted transaction. Generated in Go as a
  // non-nullable uuid.UUID named TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];
}
// MVCCLogicalOp is a union of all logical MVCC operation types.
message MVCCLogicalOp {
  // gogoproto's onlyone option treats the message as a union: only one of
  // the fields below is expected to be set at a time.
  option (gogoproto.onlyone) = true;

  // Non-transactional value write.
  MVCCWriteValueOp write_value = 1;

  // Intent written for a transaction.
  MVCCWriteIntentOp write_intent = 2;

  // Intent updated at a larger timestamp.
  MVCCUpdateIntentOp update_intent = 3;

  // Intent committed.
  MVCCCommitIntentOp commit_intent = 4;

  // Intent removed without being committed.
  MVCCAbortIntentOp abort_intent = 5;

  // Entire transaction aborted.
  MVCCAbortTxnOp abort_txn = 6;
}