diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 117bba5c614..e59c2865c05 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -38,13 +38,16 @@ jobs: GOARCH=amd64 PASSES='fmt bom dep' ./test ;; linux-amd64-integration-1-cpu) - GOARCH=amd64 CPU=1 RACE='false' make test-integration + make install-gofail + GOARCH=amd64 CPU=1 RACE='false' FAILPOINTS='true' make test-integration ;; linux-amd64-integration-2-cpu) - GOARCH=amd64 CPU=2 RACE='false' make test-integration + make install-gofail + GOARCH=amd64 CPU=2 RACE='false' FAILPOINTS='true' make test-integration ;; linux-amd64-integration-4-cpu) - GOARCH=amd64 CPU=4 RACE='false' make test-integration + make install-gofail + GOARCH=amd64 CPU=4 RACE='false' FAILPOINTS='true' make test-integration ;; linux-amd64-functional) ./build && GOARCH=amd64 PASSES='functional' ./test diff --git a/Makefile b/Makefile index 0af7d1e2dc4..5e06d3308a1 100644 --- a/Makefile +++ b/Makefile @@ -538,3 +538,12 @@ GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/g .PHONY: install-gofail install-gofail: go install go.etcd.io/gofail@${GOFAIL_VERSION} + + +.PHONY: gofail-enable +gofail-enable: install-gofail + PASSES="toggle_failpoints" FAILPOINTS=true ./test + +.PHONY: gofail-disable +gofail-disable: install-gofail + PASSES="toggle_failpoints" ./test diff --git a/build b/build index 3fa8cbeaef7..7cc3d2f95cb 100755 --- a/build +++ b/build @@ -21,7 +21,7 @@ GOFAIL_VERSION=$(cd tools/mod && go list -m -f "{{.Version}}" go.etcd.io/gofail) toggle_failpoints() { mode="$1" if command -v gofail >/dev/null 2>&1; then - gofail "$mode" etcdserver/ mvcc/ mvcc/backend/ wal/ + gofail "$mode" etcdserver/ lease/leasehttp/ mvcc/ mvcc/backend/ wal/ # shellcheck disable=SC2086 if [[ "$mode" == "enable" ]]; then go get go.etcd.io/gofail@${GOFAIL_VERSION} diff --git a/etcdserver/v3_server.go b/etcdserver/v3_server.go index a1040aca63a..698a524a80c 100644 --- a/etcdserver/v3_server.go +++ b/etcdserver/v3_server.go @@ -363,6 +363,9 @@ func (s *EtcdServer) leaseTimeToLive(ctx context.Context, r *pb.LeaseTimeToLiveR if err := s.waitAppliedIndex(); err != nil { return nil, err } + + // gofail: var beforeLookupWhenLeaseTimeToLive struct{} + // primary; timetolive directly from leader le := s.lessor.Lookup(lease.LeaseID(r.ID)) if le == nil { @@ -378,6 +381,15 @@ func (s *EtcdServer) leaseTimeToLive(ctx context.Context, r *pb.LeaseTimeToLiveR } resp.Keys = kbs } + + // The leasor could be demoted if leader changed during lookup. + // We should return error to force retry instead of returning + // incorrect remaining TTL. + if le.Demoted() { + // NOTE: lease.ErrNotPrimary is not retryable error for + // client. Instead, uses ErrLeaderChanged. + return nil, ErrLeaderChanged + } return resp, nil } diff --git a/go.mod b/go.mod index d3e0671770e..46b1e87b0f5 100644 --- a/go.mod +++ b/go.mod @@ -33,6 +33,7 @@ require ( github.com/urfave/cli v1.20.0 github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 go.etcd.io/bbolt v1.3.9 + go.etcd.io/gofail v0.1.0 go.uber.org/zap v1.10.0 golang.org/x/crypto v0.21.0 golang.org/x/net v0.23.0 diff --git a/go.sum b/go.sum index 1cb9420028d..26ae45f8770 100644 --- a/go.sum +++ b/go.sum @@ -177,6 +177,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.etcd.io/bbolt v1.3.9 h1:8x7aARPEXiXbHmtUwAIv7eV2fQFHrLLavdiJ3uzJXoI= go.etcd.io/bbolt v1.3.9/go.mod h1:zaO32+Ti0PK1ivdPtgMESzuzL2VPoIG1PCQNvOdo/dE= +go.etcd.io/gofail v0.1.0 h1:XItAMIhOojXFQMgrxjnd2EIIHun/d5qL0Pf7FzVTkFg= +go.etcd.io/gofail v0.1.0/go.mod h1:VZBCXYGZhHAinaBiiqYvuDynvahNsAyLFwB3kEHKz1M= go.uber.org/atomic v1.3.2 h1:2Oa65PReHzfn29GpvgsYwloV9AVFHPDk8tYxt2c2tr4= go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/multierr v1.1.0 h1:HoEmRHQPVSqub6w2z2d2EOVs2fjyFRGyofhKuyDq0QI= diff --git a/integration/v3_lease_test.go b/integration/v3_lease_test.go index 7f9742ccda6..bd6f01edd27 100644 --- a/integration/v3_lease_test.go +++ b/integration/v3_lease_test.go @@ -21,10 +21,14 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.etcd.io/etcd/clientv3" "go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes" pb "go.etcd.io/etcd/etcdserver/etcdserverpb" "go.etcd.io/etcd/mvcc/mvccpb" "go.etcd.io/etcd/pkg/testutil" + gofail "go.etcd.io/gofail/runtime" "google.golang.org/grpc/codes" "google.golang.org/grpc/metadata" @@ -1021,6 +1025,76 @@ func TestV3LeaseRecoverKeyWithMutipleLease(t *testing.T) { } } +func TestV3LeaseTimeToLiveWithLeaderChanged(t *testing.T) { + t.Run("normal", func(subT *testing.T) { + testV3LeaseTimeToLiveWithLeaderChanged(subT, "beforeLookupWhenLeaseTimeToLive") + }) + + t.Run("forward", func(subT *testing.T) { + testV3LeaseTimeToLiveWithLeaderChanged(subT, "beforeLookupWhenForwardLeaseTimeToLive") + }) +} + +func testV3LeaseTimeToLiveWithLeaderChanged(t *testing.T, fpName string) { + if len(gofail.List()) == 0 { + t.Skip("please run 'make gofail-enable' before running the test") + } + + clus := NewClusterV3(t, &ClusterConfig{Size: 3}) + defer clus.Terminate(t) + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + oldLeadIdx := clus.WaitLeader(t) + followerIdx := (oldLeadIdx + 1) % 3 + + followerMemberID := clus.Members[followerIdx].ID() + + oldLeadC := clus.Client(oldLeadIdx) + + leaseResp, err := oldLeadC.Grant(ctx, 100) + require.NoError(t, err) + + require.NoError(t, gofail.Enable(fpName, `sleep("3s")`)) + t.Cleanup(func() { + terr := gofail.Disable(fpName) + if terr != nil && terr != gofail.ErrDisabled { + t.Fatalf("failed to disable %s: %v", fpName, terr) + } + }) + + readyCh := make(chan struct{}) + errCh := make(chan error, 1) + + var targetC *clientv3.Client + switch fpName { + case "beforeLookupWhenLeaseTimeToLive": + targetC = oldLeadC + case "beforeLookupWhenForwardLeaseTimeToLive": + targetC = clus.Client((oldLeadIdx + 2) % 3) + default: + t.Fatalf("unsupported %s failpoint", fpName) + } + + go func() { + <-readyCh + time.Sleep(1 * time.Second) + + _, merr := oldLeadC.MoveLeader(ctx, uint64(followerMemberID)) + assert.NoError(t, gofail.Disable(fpName)) + errCh <- merr + }() + + close(readyCh) + + ttlResp, err := targetC.TimeToLive(ctx, leaseResp.ID) + require.NoError(t, err) + require.GreaterOrEqual(t, int64(100), ttlResp.TTL) + + require.NoError(t, <-errCh) +} + // acquireLeaseAndKey creates a new lease and creates an attached key. func acquireLeaseAndKey(clus *ClusterV3, key string) (int64, error) { // create lease diff --git a/lease/leasehttp/http.go b/lease/leasehttp/http.go index e2f5b1ce1d6..d59868cde2f 100644 --- a/lease/leasehttp/http.go +++ b/lease/leasehttp/http.go @@ -103,6 +103,9 @@ func (h *leaseHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { http.Error(w, ErrLeaseHTTPTimeout.Error(), http.StatusRequestTimeout) return } + + // gofail: var beforeLookupWhenForwardLeaseTimeToLive struct{} + l := h.l.Lookup(lease.LeaseID(lreq.LeaseTimeToLiveRequest.ID)) if l == nil { http.Error(w, lease.ErrLeaseNotFound.Error(), http.StatusNotFound) @@ -126,6 +129,14 @@ func (h *leaseHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { resp.LeaseTimeToLiveResponse.Keys = kbs } + // The leasor could be demoted if leader changed during lookup. + // We should return error to force retry instead of returning + // incorrect remaining TTL. + if l.Demoted() { + http.Error(w, lease.ErrNotPrimary.Error(), http.StatusInternalServerError) + return + } + v, err = resp.Marshal() if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) diff --git a/lease/lessor.go b/lease/lessor.go index 02718b94e59..2a53be3226f 100644 --- a/lease/lessor.go +++ b/lease/lessor.go @@ -901,6 +901,13 @@ func (l *Lease) forever() { l.expiry = forever } +// Demoted returns true if the lease's expiry has been reset to forever. +func (l *Lease) Demoted() bool { + l.expiryMu.Lock() + defer l.expiryMu.Unlock() + return l.expiry == forever +} + // Keys returns all the keys attached to the lease. func (l *Lease) Keys() []string { l.mu.RLock() diff --git a/test b/test index 6a916687c63..4cb508696c7 100755 --- a/test +++ b/test @@ -690,6 +690,10 @@ function build_cov_pass { go test -tags cov -c -covermode=set -coverpkg="$PKGS_COMMA" -o "${out}/etcdctl_test" "${REPO_PATH}/etcdctl" } +function toggle_failpoints_pass { + toggle_failpoints_default +} + # fail fast on static tests function build_pass { echo "Checking build..."