From 32fdf830ec73599e1e9848dab9ec2b6bc20eb2af Mon Sep 17 00:00:00 2001 From: Marek Siarkowicz Date: Wed, 8 May 2024 11:12:18 +0200 Subject: [PATCH] Update the robustness README and fix the #14370 reproduction case Signed-off-by: Marek Siarkowicz --- tests/robustness/README.md | 96 +++++++++++++++++++-------- tests/robustness/failpoint/network.go | 3 - tests/robustness/scenarios.go | 20 +++--- 3 files changed, 81 insertions(+), 38 deletions(-) diff --git a/tests/robustness/README.md b/tests/robustness/README.md index eb5be5bbcec0..b07cef7acc89 100644 --- a/tests/robustness/README.md +++ b/tests/robustness/README.md @@ -71,31 +71,62 @@ Errors in etcd model could be causing false positives, which makes the ability t ## Analysing failure -If robustness tests fails we want to analyse the report to confirm if the issue is on etcd side. Location of this report -is included in test logs. One of log lines should look like: +If robustness tests fails we want to analyse the report to confirm if the issue is on etcd side. Location of the directory with the report +is mentioned `Saving robustness test report` log. Logs from report generation should look like: ``` - history.go:34: Model is not linearizable - logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving member data dir {"member": "TestRobustnessIssue14370-test-0", "path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0"} - logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving watch responses {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/responses.json"} - logger.go:130: 2023-03-18T12:18:03.247+0100 INFO Saving watch events {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/events.json"} - logger.go:130: 2023-05-15T17:42:37.792+0200 INFO Saving operation history {"path": "/tmp/TestRobustness_ClusterOfSize3_Kubernetes/client-1/operations.json"} - logger.go:130: 2023-05-15T17:42:37.793+0200 INFO Saving watch responses {"path": "/tmp/TestRobustness_ClusterOfSize3_Kubernetes/client-2/watch.json"} - logger.go:130: 2023-03-18T12:18:03.256+0100 INFO Saving visualization {"path": "/tmp/TestRobustness_Issue14370/history.html"} + logger.go:146: 2024-05-08T10:42:54.429+0200 INFO Saving robustness test report {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550"} + logger.go:146: 2024-05-08T10:42:54.429+0200 INFO Saving member data dir {"member": "TestRobustnessRegressionIssue14370-test-0", "path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/server-TestRobustnessRegressionIssue14370-test-0"} + logger.go:146: 2024-05-08T10:42:54.430+0200 INFO no watch operations for client, skip persisting {"client-id": 1} + logger.go:146: 2024-05-08T10:42:54.430+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-1/operations.json"} + logger.go:146: 2024-05-08T10:42:54.430+0200 INFO Saving watch operations {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-2/watch.json"} + logger.go:146: 2024-05-08T10:42:54.431+0200 INFO no KV operations for client, skip persisting {"client-id": 2} + logger.go:146: 2024-05-08T10:42:54.431+0200 INFO no watch operations for client, skip persisting {"client-id": 3} + logger.go:146: 2024-05-08T10:42:54.431+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-3/operations.json"} + logger.go:146: 2024-05-08T10:42:54.433+0200 INFO no watch operations for client, skip persisting {"client-id": 4} + logger.go:146: 2024-05-08T10:42:54.433+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-4/operations.json"} + logger.go:146: 2024-05-08T10:42:54.434+0200 INFO no watch operations for client, skip persisting {"client-id": 5} + logger.go:146: 2024-05-08T10:42:54.434+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-5/operations.json"} + logger.go:146: 2024-05-08T10:42:54.435+0200 INFO no watch operations for client, skip persisting {"client-id": 6} + logger.go:146: 2024-05-08T10:42:54.435+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-6/operations.json"} + logger.go:146: 2024-05-08T10:42:54.437+0200 INFO no watch operations for client, skip persisting {"client-id": 7} + logger.go:146: 2024-05-08T10:42:54.437+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-7/operations.json"} + logger.go:146: 2024-05-08T10:42:54.438+0200 INFO no watch operations for client, skip persisting {"client-id": 8} + logger.go:146: 2024-05-08T10:42:54.438+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-8/operations.json"} + logger.go:146: 2024-05-08T10:42:54.439+0200 INFO no watch operations for client, skip persisting {"client-id": 9} + logger.go:146: 2024-05-08T10:42:54.439+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-9/operations.json"} + logger.go:146: 2024-05-08T10:42:54.440+0200 INFO no watch operations for client, skip persisting {"client-id": 10} + logger.go:146: 2024-05-08T10:42:54.440+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-10/operations.json"} + logger.go:146: 2024-05-08T10:42:54.441+0200 INFO Saving visualization {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/history.html"} ``` -Report includes multiple types of files: -* Member db files, can be used to verify disk/memory corruption. -* Watch responses saved as json, can be used to validate [watch API guarantees]. -* Operation history saved as both html visualization and a json, can be used to validate [KV API guarantees]. +Report follows the hierarchy: +* `server-*` - etcd server data directories, can be used to verify disk/memory corruption. + * `member` + * `wal` - Write Ahead Log (WAL) directory, that can be analysed using `etcd-dump-logs` command line tool available in `tools` directory. + * `snap` - Snapshot directory, includes the bbolt database file `db`, that can be analysed using `etcd-dump-db` command line tool available in `tools` directory. +* `client-*` - Client request and response dumps in json format. + * `watch.jon` - Watch requests and responses, can be used to validate [watch API guarantees]. + * `operations.json` - KV operation history +* `history.html` - Visualization of KV operation history, can be used to validate [KV API guarantees]. ### Example analysis of linearization issue Let's reproduce and analyse robustness test report for issue [#14370]. To reproduce the issue by yourself run `make test-robustness-issue14370`. -After a couple of tries robustness tests should fail with a log `Model is not linearizable` and save report locally. +After a couple of tries robustness tests should fail with a log `Linearization failed` and save report locally. -Lineralization issues are easiest to analyse via history visualization. -Open `/tmp/TestRobustness_Issue14370/history.html` file in your browser. +Example: +``` + logger.go:146: 2024-05-08T10:42:53.379+0200 INFO Validating linearizable operations {"timeout": "5m0s"} + logger.go:146: 2024-05-08T10:42:54.429+0200 ERROR Linearization failed {"duration": "1.050105973s"} + validate.go:39: Failed linearization, skipping further validation + logger.go:146: 2024-05-08T10:42:54.429+0200 INFO Saving robustness test report {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550"} + ... + logger.go:146: 2024-05-08T10:42:54.441+0200 INFO Saving visualization {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/history.html"} +``` + +Linearization issues are easiest to analyse via history visualization. +Open `/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/history.html` file in your browser. Jump to the error in linearization by clicking `[ jump to first error ]` on the top of the page. You should see a graph similar to the one on the image below. @@ -114,23 +145,34 @@ Let's reproduce and analyse robustness test report for issue [#15271]. To reproduce the issue by yourself run `make test-robustness-issue15271`. After a couple of tries robustness tests should fail with a logs `Broke watch guarantee` and save report locally. +Example: +``` + logger.go:146: 2024-05-08T10:50:11.301+0200 INFO Validating linearizable operations {"timeout": "5m0s"} + logger.go:146: 2024-05-08T10:50:15.754+0200 INFO Linearization success {"duration": "4.453346487s"} + logger.go:146: 2024-05-08T10:50:15.754+0200 INFO Validating watch + logger.go:146: 2024-05-08T10:50:15.849+0200 ERROR Broke watch guarantee {"guarantee": "ordered", "client": 4, "revision": 3} + validate.go:45: Failed validating watch history, err: broke Ordered - events are ordered by revision; an event will never appear on a watch if it precedes an event in time that has already been posted + logger.go:146: 2024-05-08T10:50:15.849+0200 INFO Validating serializable operations + logger.go:146: 2024-05-08T10:50:15.866+0200 INFO Saving robustness test report {"path": "/tmp/TestRobustnessRegression_Issue15271/1715158215866033806"} +``` + Watch issues are easiest to analyse by reading the recorded watch history. -Watch history is recorded for each client separated in different subdirectory under `/tmp/TestRobustness_Issue15271/` +Watch history is recorded for each client separated in different subdirectory under `/tmp/TestRobustnessRegression_Issue15271/1715158215866033806` Open `watch.json` for client mentioned in log `Broke watch guarantee`. -For example for client `14` open `/tmp/TestRobustness_Issue15271/client-14/watch.json`. +For client `4` that broke the watch guarantee open `/tmp/TestRobustnessRegression_Issue15271/1715158215866033806/client-4/watch.json`. -Each line consists of json blob corresponding to single watch response observed by client. -Look for lines with `mod_revision` equal to revision mentioned in the first log with `Broke watch guarantee` -You should see two lines where the `mod_revision` decreases like ones below: +Each line consists of json blob corresponding to single watch request sent by client. +Look for events with `Revision` equal to revision mentioned in the first log with `Broke watch guarantee`, in this case look for `"Revision":3,`. +You should see watch responses where the `Revision` decreases like ones below: ``` -{"Events":[{"Op":{"Type":"put","Key":"5","WithPrefix":false,"Limit":0,"Value":{"Value":"357","Hash":0},"LeaseID":0},"Revision":335}],"IsProgressNotify":false,"Revision":335,"Time":1050415777} -{"Events":[{"Op":{"Type":"put","Key":"1","WithPrefix":false,"Limit":0,"Value":{"Value":"24","Hash":0},"LeaseID":0},"Revision":24}, ... +{"Events":[{"Type":"put-operation","Key":"key5","Value":{"Value":"793","Hash":0},"Revision":799,"IsCreate":false,"PrevValue":null}],"IsProgressNotify":false,"Revision":799,"Time":3202907249,"Error":""} +{"Events":[{"Type":"put-operation","Key":"key4","Value":{"Value":"1","Hash":0},"Revision":3,"IsCreate":true,"PrevValue":null}, ... ``` -Up to the first line the `revision` of events within responses only increased up to a value of `335`. -However, the following line includes an event with `revision` equal `24`. +Up to the first response the `Revision` of events only increased up to a value of `799`. +However, the following line includes an event with `Revision` equal `3`. If you follow the `revision` throughout the file you should notice that watch replayed revisions second time. -This is incorrect and breaks `Ordered` and `Unique` [watch API guarantees]. -This is consistent with the root cause of [#14370] where member reconnecting to cluster will incorrectly resend revisions. +This is incorrect and breaks `Ordered` [watch API guarantees]. +This is consistent with the root cause of [#14370] where member reconnecting to cluster will resend revisions. [#15271]: https://github.com/etcd-io/etcd/issues/15271 diff --git a/tests/robustness/failpoint/network.go b/tests/robustness/failpoint/network.go index 5d59fba3d99c..fa8cc849c4f9 100644 --- a/tests/robustness/failpoint/network.go +++ b/tests/robustness/failpoint/network.go @@ -55,9 +55,6 @@ func (tb triggerBlackhole) Trigger(ctx context.Context, t *testing.T, member e2e } func (tb triggerBlackhole) Available(config e2e.EtcdProcessClusterConfig, process e2e.EtcdProcess) bool { - if tb.waitTillSnapshot && config.ServerConfig.SnapshotCatchUpEntries > 100 { - return false - } return config.ClusterSize > 1 && process.PeerProxy() != nil } diff --git a/tests/robustness/scenarios.go b/tests/robustness/scenarios.go index 01cd00c8b2b6..a94eccaed6db 100644 --- a/tests/robustness/scenarios.go +++ b/tests/robustness/scenarios.go @@ -19,6 +19,7 @@ import ( "testing" "time" + "github.com/coreos/go-semver/semver" "go.etcd.io/etcd/api/v3/version" "go.etcd.io/etcd/tests/v3/framework/e2e" "go.etcd.io/etcd/tests/v3/robustness/failpoint" @@ -162,19 +163,22 @@ func regressionScenarios(t *testing.T) []testScenario { e2e.WithClusterSize(1), ), }) - // TODO: Deflake waiting for waiting until snapshot for etcd versions that don't support setting snapshot catchup entries. - if v.Compare(version.V3_6) >= 0 { + if v.Compare(version.V3_5) >= 0 { + opts := []e2e.EPClusterOption{ + e2e.WithSnapshotCount(100), + e2e.WithPeerProxy(true), + e2e.WithIsPeerTLS(true), + } + v3_5_13 := semver.Version{Major: 3, Minor: 5, Patch: 13} + if v.Compare(v3_5_13) >= 0 { + opts = append(opts, e2e.WithSnapshotCatchUpEntries(100)) + } scenarios = append(scenarios, testScenario{ name: "Issue15271", failpoint: failpoint.BlackholeUntilSnapshot, profile: traffic.HighTrafficProfile, traffic: traffic.EtcdPut, - cluster: *e2e.NewConfig( - e2e.WithSnapshotCatchUpEntries(100), - e2e.WithSnapshotCount(100), - e2e.WithPeerProxy(true), - e2e.WithIsPeerTLS(true), - ), + cluster: *e2e.NewConfig(opts...), }) } return scenarios