diff --git a/.changelog/17075.txt b/.changelog/17075.txt new file mode 100644 index 0000000000000..1c882fabbddd3 --- /dev/null +++ b/.changelog/17075.txt @@ -0,0 +1,3 @@ +```release-note:improvement +agent: remove agent cache dependency from service mesh leaf certificate management +``` diff --git a/.changelog/17483.txt b/.changelog/17483.txt new file mode 100644 index 0000000000000..26c81dbe4cdf7 --- /dev/null +++ b/.changelog/17483.txt @@ -0,0 +1,3 @@ +```release-note:bug +peering: Fix a bug that caused server agents to continue cleaning up peering resources even after loss of leadership. +``` diff --git a/.changelog/17582.txt b/.changelog/17582.txt new file mode 100644 index 0000000000000..122b9df98116e --- /dev/null +++ b/.changelog/17582.txt @@ -0,0 +1,3 @@ +```release-note:feature +cli: `consul operator raft list-peers` command shows the number of commits each follower is trailing the leader by to aid in troubleshooting. +``` diff --git a/.changelog/17593.txt b/.changelog/17593.txt deleted file mode 100644 index 1f84e75f57427..0000000000000 --- a/.changelog/17593.txt +++ /dev/null @@ -1,3 +0,0 @@ -```release-note:bug -docs: fix list of telemetry metrics -``` diff --git a/.changelog/17596.txt b/.changelog/17596.txt new file mode 100644 index 0000000000000..1058df1ea3ab2 --- /dev/null +++ b/.changelog/17596.txt @@ -0,0 +1,3 @@ +```release-note:improvement + debug: change default setting of consul debug command. now default duration is 5ms and default log level is 'TRACE' + ``` \ No newline at end of file diff --git a/.changelog/17719.txt b/.changelog/17719.txt new file mode 100644 index 0000000000000..f45370b3f7dc6 --- /dev/null +++ b/.changelog/17719.txt @@ -0,0 +1,3 @@ +```release-note:security +Bump Dockerfile base image to `alpine:3.18`. + ``` diff --git a/.changelog/17739.txt b/.changelog/17739.txt new file mode 100644 index 0000000000000..14bbceeaa0849 --- /dev/null +++ b/.changelog/17739.txt @@ -0,0 +1,3 @@ +```release-note:bug +http: fixed API endpoint `PUT /acl/token/:AccessorID` (update token), no longer requires `AccessorID` in the request body. Web UI can now update tokens. + ``` diff --git a/.changelog/17755.txt b/.changelog/17755.txt new file mode 100644 index 0000000000000..7edf7b26e1591 --- /dev/null +++ b/.changelog/17755.txt @@ -0,0 +1,3 @@ +```release-note:improvement +mesh: Stop jwt providers referenced by intentions from being deleted. +``` \ No newline at end of file diff --git a/.changelog/17757.txt b/.changelog/17757.txt new file mode 100644 index 0000000000000..e207438cf8437 --- /dev/null +++ b/.changelog/17757.txt @@ -0,0 +1,3 @@ +```release-note:improvement +connect: Improve transparent proxy support for virtual services and failovers. +``` diff --git a/.changelog/17759.txt b/.changelog/17759.txt new file mode 100644 index 0000000000000..0836608ae1f24 --- /dev/null +++ b/.changelog/17759.txt @@ -0,0 +1,3 @@ +```release-note:improvement +extensions: Improve validation and error feedback for `property-override` builtin Envoy extension +``` diff --git a/.changelog/17775.txt b/.changelog/17775.txt new file mode 100644 index 0000000000000..8060cfa128ce7 --- /dev/null +++ b/.changelog/17775.txt @@ -0,0 +1,3 @@ +```release-note:bug +connect: Fix issue where changes to service exports were not reflected in proxies. 
+``` diff --git a/.changelog/17888.txt b/.changelog/17888.txt new file mode 100644 index 0000000000000..f50fcae09b074 --- /dev/null +++ b/.changelog/17888.txt @@ -0,0 +1,3 @@ +```release-note:improvement +connect: Add capture group labels from Envoy cluster FQDNs to Envoy exported metric labels +``` \ No newline at end of file diff --git a/.changelog/_5517.txt b/.changelog/_5517.txt new file mode 100644 index 0000000000000..5152a6ff78f73 --- /dev/null +++ b/.changelog/_5517.txt @@ -0,0 +1,3 @@ +```release-note:bug +namespaces: **(Enterprise only)** fixes a bug where agent health checks stop syncing for all services on a node if the namespace of any service has been removed from the server. +``` diff --git a/.changelog/_5614.txt b/.changelog/_5614.txt new file mode 100644 index 0000000000000..9951b9111875c --- /dev/null +++ b/.changelog/_5614.txt @@ -0,0 +1,4 @@ +```release-note:bug +namespaces: **(Enterprise only)** fixes a bug where namespaces are stuck in a deferred deletion state indefinitely under some conditions. +Also fixes the Consul query metadata present in the HTTP headers of the namespace read and list endpoints. +``` diff --git a/.github/workflows/nightly-test-1.16.x.yaml b/.github/workflows/nightly-test-1.12.x.yaml similarity index 98% rename from .github/workflows/nightly-test-1.16.x.yaml rename to .github/workflows/nightly-test-1.12.x.yaml index c30ed6811c2b8..0f016075e261a 100644 --- a/.github/workflows/nightly-test-1.16.x.yaml +++ b/.github/workflows/nightly-test-1.12.x.yaml @@ -1,7 +1,7 @@ # Copyright (c) HashiCorp, Inc. # SPDX-License-Identifier: MPL-2.0 -name: Nightly Test 1.16.x +name: Nightly Test 1.12.x on: schedule: - cron: '0 4 * * *' @@ -9,8 +9,8 @@ on: env: EMBER_PARTITION_TOTAL: 4 # Has to be changed in tandem with the matrix.partition - BRANCH: "release/1.16.x" - BRANCH_NAME: "release-1.16.x" # Used for naming artifacts + BRANCH: "release/1.12.x" + BRANCH_NAME: "release-1.12.x" # Used for naming artifacts jobs: frontend-test-workspace-node: diff --git a/Dockerfile b/Dockerfile index 065156752ea17..7599ec7b35b80 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,7 @@ # Official docker image that includes binaries from releases.hashicorp.com. This # downloads the release from releases.hashicorp.com and therefore requires that # the release is published before building the Docker image. -FROM docker.mirror.hashicorp.services/alpine:3.17 as official +FROM docker.mirror.hashicorp.services/alpine:3.18 as official # This is the release of Consul to pull in. ARG VERSION @@ -112,7 +112,7 @@ CMD ["agent", "-dev", "-client", "0.0.0.0"] # Production docker image that uses CI built binaries. # Remember, this image cannot be built locally. -FROM docker.mirror.hashicorp.services/alpine:3.17 as default +FROM docker.mirror.hashicorp.services/alpine:3.18 as default ARG PRODUCT_VERSION ARG BIN_NAME diff --git a/agent/acl_endpoint.go b/agent/acl_endpoint.go index da838b1a646e1..d6f230a8f64e2 100644 --- a/agent/acl_endpoint.go +++ b/agent/acl_endpoint.go @@ -441,8 +441,16 @@ func (s *HTTPHandlers) aclTokenSetInternal(req *http.Request, tokenAccessorID st return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: fmt.Sprintf("Token decoding failed: %v", err)} } - if !create && args.ACLToken.AccessorID != tokenAccessorID { - return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: "Token Accessor ID in URL and payload do not match"} + if !create { + // NOTE: AccessorID in the request body is optional when not creating a new token. 
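For illustration, the relaxed check means a token update such as the following now succeeds even though the body carries no `AccessorID`; a minimal stdlib sketch, assuming a local agent on 127.0.0.1:8500, a management token in `CONSUL_HTTP_TOKEN`, a pre-existing `node-read` policy, and a hypothetical accessor ID in the URL:

```go
package main

import (
	"fmt"
	"net/http"
	"os"
	"strings"
)

func main() {
	// AccessorID is intentionally absent from the body; the handler now
	// back-fills it from the URL path before validating.
	body := `{"Description":"Updated from the web UI","Policies":[{"Name":"node-read"}]}`
	req, err := http.NewRequest(http.MethodPut,
		"http://127.0.0.1:8500/v1/acl/token/6a1253d2-1785-24fd-91c2-f8e78c745511",
		strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("X-Consul-Token", os.Getenv("CONSUL_HTTP_TOKEN"))
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status) // 200 OK with the updated token in the response body
}
```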
+ // If not present in the body and only in the URL then it will be filled in by Consul. + if args.ACLToken.AccessorID == "" { + args.ACLToken.AccessorID = tokenAccessorID + } + + if args.ACLToken.AccessorID != tokenAccessorID { + return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: "Token Accessor ID in URL and payload do not match"} + } } var out structs.ACLToken diff --git a/agent/acl_endpoint_test.go b/agent/acl_endpoint_test.go index 20c982492a7c3..0c948880e0365 100644 --- a/agent/acl_endpoint_test.go +++ b/agent/acl_endpoint_test.go @@ -907,6 +907,48 @@ func TestACL_HTTP(t *testing.T) { tokenMap[token.AccessorID] = token }) + t.Run("Update without AccessorID in request body", func(t *testing.T) { + originalToken := tokenMap[idMap["token-cloned"]] + + // Secret will be filled in + tokenInput := &structs.ACLToken{ + Description: "Even Better description for this cloned token", + Policies: []structs.ACLTokenPolicyLink{ + { + ID: idMap["policy-read-all-nodes"], + Name: policyMap[idMap["policy-read-all-nodes"]].Name, + }, + }, + NodeIdentities: []*structs.ACLNodeIdentity{ + { + NodeName: "foo", + Datacenter: "bar", + }, + }, + } + + req, _ := http.NewRequest("PUT", "/v1/acl/token/"+originalToken.AccessorID, jsonBody(tokenInput)) + req.Header.Add("X-Consul-Token", "root") + resp := httptest.NewRecorder() + obj, err := a.srv.ACLTokenCRUD(resp, req) + require.NoError(t, err) + token, ok := obj.(*structs.ACLToken) + require.True(t, ok) + + require.Equal(t, originalToken.AccessorID, token.AccessorID) + require.Equal(t, originalToken.SecretID, token.SecretID) + require.Equal(t, tokenInput.Description, token.Description) + require.Equal(t, tokenInput.Policies, token.Policies) + require.Equal(t, tokenInput.NodeIdentities, token.NodeIdentities) + require.True(t, token.CreateIndex > 0) + require.True(t, token.CreateIndex < token.ModifyIndex) + require.NotNil(t, token.Hash) + require.NotEqual(t, token.Hash, []byte{}) + require.NotEqual(t, token.Hash, originalToken.Hash) + + tokenMap[token.AccessorID] = token + }) + t.Run("CRUD Missing Token Accessor ID", func(t *testing.T) { req, _ := http.NewRequest("GET", "/v1/acl/token/", nil) req.Header.Add("X-Consul-Token", "root") diff --git a/agent/agent.go b/agent/agent.go index 0b06688c483ba..90bfffc1afe98 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -49,6 +49,7 @@ import ( grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns" middleware "github.com/hashicorp/consul/agent/grpc-middleware" "github.com/hashicorp/consul/agent/hcp/scada" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/local" "github.com/hashicorp/consul/agent/proxycfg" proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue" @@ -123,6 +124,7 @@ var configSourceToName = map[configSource]string{ ConfigSourceLocal: "local", ConfigSourceRemote: "remote", } + var configSourceFromName = map[string]configSource{ "local": ConfigSourceLocal, "remote": ConfigSourceRemote, @@ -247,6 +249,9 @@ type Agent struct { // cache is the in-memory cache for data the Agent requests. cache *cache.Cache + // leafCertManager issues and caches leaf certs as needed. 
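	// It replaces the former connect-ca-leaf cache type: New returns an error
	// if BaseDeps does not supply a manager, and the test wiring later in this
	// diff builds one via leafcert.NewManager with a NetRPC-backed CertSigner
	// and a cache-backed RootsReader.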
+ leafCertManager *leafcert.Manager + // checkReapAfter maps the check ID to a timeout after which we should // reap its associated service checkReapAfter map[structs.CheckID]time.Duration @@ -428,6 +433,12 @@ type Agent struct { // - create the AutoConfig object for future use in fully // resolving the configuration func New(bd BaseDeps) (*Agent, error) { + if bd.LeafCertManager == nil { + return nil, errors.New("LeafCertManager is required") + } + if bd.NetRPC == nil { + return nil, errors.New("NetRPC is required") + } a := Agent{ checkReapAfter: make(map[structs.CheckID]time.Duration), checkMonitors: make(map[structs.CheckID]*checks.CheckMonitor), @@ -454,6 +465,7 @@ func New(bd BaseDeps) (*Agent, error) { tlsConfigurator: bd.TLSConfigurator, config: bd.RuntimeConfig, cache: bd.Cache, + leafCertManager: bd.LeafCertManager, routineManager: routine.NewManager(bd.Logger), scadaProvider: bd.HCP.Provider, } @@ -497,6 +509,9 @@ func New(bd BaseDeps) (*Agent, error) { }, } + // TODO(rb): remove this once NetRPC is properly available in BaseDeps without an Agent + bd.NetRPC.SetNetRPC(&a) + // We used to do this in the Start method. However it doesn't need to go // there any longer. Originally it did because we passed the agent // delegate to some of the cache registrations. Now we just @@ -674,7 +689,7 @@ func (a *Agent) Start(ctx context.Context) error { Datacenter: a.config.Datacenter, ACLsEnabled: a.config.ACLsEnabled, }, - Cache: a.cache, + LeafCertManager: a.leafCertManager, GetStore: func() servercert.Store { return server.FSM().State() }, TLSConfigurator: a.tlsConfigurator, } @@ -4354,13 +4369,6 @@ func (a *Agent) registerCache() { a.cache.RegisterType(cachetype.ConnectCARootName, &cachetype.ConnectCARoot{RPC: a}) - a.cache.RegisterType(cachetype.ConnectCALeafName, &cachetype.ConnectCALeaf{ - RPC: a, - Cache: a.cache, - Datacenter: a.config.Datacenter, - TestOverrideCAChangeInitialDelay: a.config.ConnectTestCALeafRootChangeSpread, - }) - a.cache.RegisterType(cachetype.IntentionMatchName, &cachetype.IntentionMatch{RPC: a}) a.cache.RegisterType(cachetype.IntentionUpstreamsName, &cachetype.IntentionUpstreams{RPC: a}) @@ -4521,7 +4529,7 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources { IntentionUpstreams: proxycfgglue.CacheIntentionUpstreams(a.cache), IntentionUpstreamsDestination: proxycfgglue.CacheIntentionUpstreamsDestination(a.cache), InternalServiceDump: proxycfgglue.CacheInternalServiceDump(a.cache), - LeafCertificate: proxycfgglue.CacheLeafCertificate(a.cache), + LeafCertificate: proxycfgglue.LocalLeafCerts(a.leafCertManager), PeeredUpstreams: proxycfgglue.CachePeeredUpstreams(a.cache), PeeringList: proxycfgglue.CachePeeringList(a.cache), PreparedQuery: proxycfgglue.CachePrepraredQuery(a.cache), @@ -4547,7 +4555,11 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources { sources.ExportedPeeredServices = proxycfgglue.ServerExportedPeeredServices(deps) sources.FederationStateListMeshGateways = proxycfgglue.ServerFederationStateListMeshGateways(deps) sources.GatewayServices = proxycfgglue.ServerGatewayServices(deps) - sources.Health = proxycfgglue.ServerHealth(deps, proxycfgglue.ClientHealth(a.rpcClientHealth)) + // We do not use this health check currently due to a bug with the way that service exports + // interact with ACLs and the streaming backend. See comments in `proxycfgglue.ServerHealthBlocking` + // for more details. 
+ // sources.Health = proxycfgglue.ServerHealth(deps, proxycfgglue.ClientHealth(a.rpcClientHealth)) + sources.Health = proxycfgglue.ServerHealthBlocking(deps, proxycfgglue.ClientHealth(a.rpcClientHealth), server.FSM().State()) sources.HTTPChecks = proxycfgglue.ServerHTTPChecks(deps, a.config.NodeName, proxycfgglue.CacheHTTPChecks(a.cache), a.State) sources.Intentions = proxycfgglue.ServerIntentions(deps) sources.IntentionUpstreams = proxycfgglue.ServerIntentionUpstreams(deps) diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index d63936e29466d..f9e02f8f11ad2 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -11,12 +11,16 @@ import ( "strings" "time" - "github.com/hashicorp/go-bexpr" "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" + "github.com/mitchellh/hashstructure" + + "github.com/hashicorp/consul/envoyextensions/xdscommon" + "github.com/hashicorp/consul/version" + + "github.com/hashicorp/go-bexpr" "github.com/hashicorp/serf/coordinate" "github.com/hashicorp/serf/serf" - "github.com/mitchellh/hashstructure" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -24,16 +28,15 @@ import ( cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/consul" "github.com/hashicorp/consul/agent/debug" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/structs" token_store "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/api" - "github.com/hashicorp/consul/envoyextensions/xdscommon" "github.com/hashicorp/consul/ipaddr" "github.com/hashicorp/consul/lib" "github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/logging/monitor" "github.com/hashicorp/consul/types" - "github.com/hashicorp/consul/version" ) type Self struct { @@ -1569,7 +1572,7 @@ func (s *HTTPHandlers) AgentConnectCALeafCert(resp http.ResponseWriter, req *htt // TODO(peering): expose way to get kind=mesh-gateway type cert with appropriate ACLs - args := cachetype.ConnectCALeafRequest{ + args := leafcert.ConnectCALeafRequest{ Service: serviceName, // Need name not ID } var qOpts structs.QueryOptions @@ -1598,17 +1601,13 @@ func (s *HTTPHandlers) AgentConnectCALeafCert(resp http.ResponseWriter, req *htt return nil, nil } - raw, m, err := s.agent.cache.Get(req.Context(), cachetype.ConnectCALeafName, &args) + reply, m, err := s.agent.leafCertManager.Get(req.Context(), &args) if err != nil { return nil, err } + defer setCacheMeta(resp, &m) - reply, ok := raw.(*structs.IssuedCert) - if !ok { - // This should never happen, but we want to protect against panics - return nil, fmt.Errorf("internal error: response type not correct") - } setIndex(resp, reply.ModifyIndex) return reply, nil diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index fc0345cee4788..367a998a298ad 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -21,6 +21,10 @@ import ( "time" "github.com/armon/go-metrics" + + "github.com/hashicorp/consul/api" + "github.com/hashicorp/consul/version" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-uuid" "github.com/hashicorp/serf/serf" @@ -40,14 +44,12 @@ import ( "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" tokenStore "github.com/hashicorp/consul/agent/token" - "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/envoyextensions/xdscommon" "github.com/hashicorp/consul/lib" "github.com/hashicorp/consul/sdk/testutil" 
"github.com/hashicorp/consul/sdk/testutil/retry" "github.com/hashicorp/consul/testrpc" "github.com/hashicorp/consul/types" - "github.com/hashicorp/consul/version" ) func createACLTokenWithAgentReadPolicy(t *testing.T, srv *HTTPHandlers) string { @@ -6912,14 +6914,27 @@ func TestAgentConnectCALeafCert_good(t *testing.T) { require.Equal(t, issued, issued2) } + replyCh := make(chan *httptest.ResponseRecorder, 1) + + go func(index string) { + resp := httptest.NewRecorder() + req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test?index="+index, nil) + a.srv.h.ServeHTTP(resp, req) + + replyCh <- resp + }(index) + // Set a new CA ca2 := connect.TestCAConfigSet(t, a, nil) // Issue a blocking query to ensure that the cert gets updated appropriately t.Run("test blocking queries update leaf cert", func(t *testing.T) { - resp := httptest.NewRecorder() - req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test?index="+index, nil) - a.srv.h.ServeHTTP(resp, req) + var resp *httptest.ResponseRecorder + select { + case resp = <-replyCh: + case <-time.After(500 * time.Millisecond): + t.Fatal("blocking query did not wake up during rotation") + } dec := json.NewDecoder(resp.Body) issued2 := &structs.IssuedCert{} require.NoError(t, dec.Decode(issued2)) diff --git a/agent/agent_test.go b/agent/agent_test.go index b2f0ea32e3177..b234573f3e57a 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -52,6 +52,7 @@ import ( "github.com/hashicorp/consul/agent/consul" "github.com/hashicorp/consul/agent/hcp" "github.com/hashicorp/consul/agent/hcp/scada" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/api" @@ -328,9 +329,16 @@ func TestAgent_HTTPMaxHeaderBytes(t *testing.T) { }, HTTPMaxHeaderBytes: tt.maxHeaderBytes, }, - Cache: cache.New(cache.Options{}), + Cache: cache.New(cache.Options{}), + NetRPC: &LazyNetRPC{}, } + bd.LeafCertManager = leafcert.NewManager(leafcert.Deps{ + CertSigner: leafcert.NewNetRPCCertSigner(bd.NetRPC), + RootsReader: leafcert.NewCachedRootsReader(bd.Cache, "dc1"), + Config: leafcert.Config{}, + }) + cfg := config.RuntimeConfig{BuildDate: time.Date(2000, 1, 1, 0, 0, 1, 0, time.UTC)} bd, err = initEnterpriseBaseDeps(bd, &cfg) require.NoError(t, err) @@ -5443,9 +5451,16 @@ func TestAgent_ListenHTTP_MultipleAddresses(t *testing.T) { &net.TCPAddr{IP: net.ParseIP("127.0.0.1"), Port: ports[1]}, }, }, - Cache: cache.New(cache.Options{}), + Cache: cache.New(cache.Options{}), + NetRPC: &LazyNetRPC{}, } + bd.LeafCertManager = leafcert.NewManager(leafcert.Deps{ + CertSigner: leafcert.NewNetRPCCertSigner(bd.NetRPC), + RootsReader: leafcert.NewCachedRootsReader(bd.Cache, "dc1"), + Config: leafcert.Config{}, + }) + cfg := config.RuntimeConfig{BuildDate: time.Date(2000, 1, 1, 0, 0, 1, 0, time.UTC)} bd, err = initEnterpriseBaseDeps(bd, &cfg) require.NoError(t, err) @@ -6029,9 +6044,16 @@ func TestAgent_startListeners(t *testing.T) { RuntimeConfig: &config.RuntimeConfig{ HTTPAddrs: []net.Addr{}, }, - Cache: cache.New(cache.Options{}), + Cache: cache.New(cache.Options{}), + NetRPC: &LazyNetRPC{}, } + bd.LeafCertManager = leafcert.NewManager(leafcert.Deps{ + CertSigner: leafcert.NewNetRPCCertSigner(bd.NetRPC), + RootsReader: leafcert.NewCachedRootsReader(bd.Cache, "dc1"), + Config: leafcert.Config{}, + }) + bd, err := initEnterpriseBaseDeps(bd, &config.RuntimeConfig{}) require.NoError(t, err) @@ -6161,8 +6183,15 @@ func TestAgent_startListeners_scada(t *testing.T) { }, 
RuntimeConfig: &config.RuntimeConfig{}, Cache: cache.New(cache.Options{}), + NetRPC: &LazyNetRPC{}, } + bd.LeafCertManager = leafcert.NewManager(leafcert.Deps{ + CertSigner: leafcert.NewNetRPCCertSigner(bd.NetRPC), + RootsReader: leafcert.NewCachedRootsReader(bd.Cache, "dc1"), + Config: leafcert.Config{}, + }) + cfg := config.RuntimeConfig{BuildDate: time.Date(2000, 1, 1, 0, 0, 1, 0, time.UTC)} bd, err := initEnterpriseBaseDeps(bd, &cfg) require.NoError(t, err) @@ -6214,7 +6243,13 @@ func TestAgent_checkServerLastSeen(t *testing.T) { }, RuntimeConfig: &config.RuntimeConfig{}, Cache: cache.New(cache.Options{}), + NetRPC: &LazyNetRPC{}, } + bd.LeafCertManager = leafcert.NewManager(leafcert.Deps{ + CertSigner: leafcert.NewNetRPCCertSigner(bd.NetRPC), + RootsReader: leafcert.NewCachedRootsReader(bd.Cache, "dc1"), + Config: leafcert.Config{}, + }) agent, err := New(bd) require.NoError(t, err) diff --git a/agent/auto-config/auto_config_test.go b/agent/auto-config/auto_config_test.go index dcb06741f5f84..a5ab97e0f45d7 100644 --- a/agent/auto-config/auto_config_test.go +++ b/agent/auto-config/auto_config_test.go @@ -21,6 +21,7 @@ import ( cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" @@ -566,9 +567,8 @@ func TestGoRoutineManagement(t *testing.T) { }) leafReq := ac.leafCertRequest() - mcfg.cache.On("Notify", + mcfg.leafCerts.On("Notify", mock.Anything, - cachetype.ConnectCALeafName, &leafReq, leafWatchID, mock.Anything, @@ -717,10 +717,9 @@ func startedAutoConfig(t *testing.T, autoEncrypt bool) testAutoConfig { mock.Anything, ).Return(nil).Once() - mcfg.cache.On("Notify", + mcfg.leafCerts.On("Notify", mock.Anything, - cachetype.ConnectCALeafName, - &cachetype.ConnectCALeafRequest{ + &leafcert.ConnectCALeafRequest{ Datacenter: "dc1", Agent: "autoconf", Token: originalToken, @@ -875,10 +874,9 @@ func TestTokenUpdate(t *testing.T) { }) leafCtx, leafCancel := context.WithCancel(context.Background()) - testAC.mcfg.cache.On("Notify", + testAC.mcfg.leafCerts.On("Notify", mock.Anything, - cachetype.ConnectCALeafName, - &cachetype.ConnectCALeafRequest{ + &leafcert.ConnectCALeafRequest{ Datacenter: "dc1", Agent: "autoconf", Token: newToken, @@ -975,14 +973,14 @@ func TestCertUpdate(t *testing.T) { NotAfter: secondCert.ValidBefore, }).Once() - req := cachetype.ConnectCALeafRequest{ + req := leafcert.ConnectCALeafRequest{ Datacenter: "dc1", Agent: "autoconf", Token: testAC.originalToken, DNSSAN: defaultDNSSANs, IPSAN: defaultIPSANs, } - require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + require.True(t, testAC.mcfg.leafCerts.sendNotification(context.Background(), req.Key(), cache.UpdateEvent{ CorrelationID: leafWatchID, Result: secondCert, Meta: cache.ResultMeta{ @@ -1102,14 +1100,14 @@ func TestFallback(t *testing.T) { // now that all the mocks are set up we can trigger the whole thing by sending the second expired cert // as a cache update event. 
- req := cachetype.ConnectCALeafRequest{ + req := leafcert.ConnectCALeafRequest{ Datacenter: "dc1", Agent: "autoconf", Token: testAC.originalToken, DNSSAN: defaultDNSSANs, IPSAN: defaultIPSANs, } - require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + require.True(t, testAC.mcfg.leafCerts.sendNotification(context.Background(), req.Key(), cache.UpdateEvent{ CorrelationID: leafWatchID, Result: secondCert, Meta: cache.ResultMeta{ diff --git a/agent/auto-config/auto_encrypt_test.go b/agent/auto-config/auto_encrypt_test.go index e4a01d0b2177f..2c94b6a5540a0 100644 --- a/agent/auto-config/auto_encrypt_test.go +++ b/agent/auto-config/auto_encrypt_test.go @@ -20,6 +20,7 @@ import ( cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/lib/retry" @@ -347,10 +348,9 @@ func TestAutoEncrypt_TokenUpdate(t *testing.T) { }) leafCtx, leafCancel := context.WithCancel(context.Background()) - testAC.mcfg.cache.On("Notify", + testAC.mcfg.leafCerts.On("Notify", mock.Anything, - cachetype.ConnectCALeafName, - &cachetype.ConnectCALeafRequest{ + &leafcert.ConnectCALeafRequest{ Datacenter: "dc1", Agent: "autoconf", Token: newToken, @@ -430,14 +430,14 @@ func TestAutoEncrypt_CertUpdate(t *testing.T) { NotAfter: secondCert.ValidBefore, }).Once() - req := cachetype.ConnectCALeafRequest{ + req := leafcert.ConnectCALeafRequest{ Datacenter: "dc1", Agent: "autoconf", Token: testAC.originalToken, DNSSAN: defaultDNSSANs, IPSAN: defaultIPSANs, } - require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + require.True(t, testAC.mcfg.leafCerts.sendNotification(context.Background(), req.Key(), cache.UpdateEvent{ CorrelationID: leafWatchID, Result: secondCert, Meta: cache.ResultMeta{ @@ -538,14 +538,14 @@ func TestAutoEncrypt_Fallback(t *testing.T) { // now that all the mocks are set up we can trigger the whole thing by sending the second expired cert // as a cache update event. 
- req := cachetype.ConnectCALeafRequest{ + req := leafcert.ConnectCALeafRequest{ Datacenter: "dc1", Agent: "autoconf", Token: testAC.originalToken, DNSSAN: defaultDNSSANs, IPSAN: defaultIPSANs, } - require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + require.True(t, testAC.mcfg.leafCerts.sendNotification(context.Background(), req.Key(), cache.UpdateEvent{ CorrelationID: leafWatchID, Result: secondCert, Meta: cache.ResultMeta{ diff --git a/agent/auto-config/config.go b/agent/auto-config/config.go index 5845ec70df0a7..d0f1670ab73a7 100644 --- a/agent/auto-config/config.go +++ b/agent/auto-config/config.go @@ -13,7 +13,9 @@ import ( "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/config" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/metadata" + "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/lib/retry" ) @@ -33,6 +35,19 @@ type Cache interface { Prepopulate(t string, result cache.FetchResult, dc string, peerName string, token string, key string) error } +// LeafCertManager is an interface to represent the methods of the +// agent/leafcert.Manager struct that we care about +type LeafCertManager interface { + Prepopulate( + ctx context.Context, + key string, + index uint64, + value *structs.IssuedCert, + authorityKeyID string, + ) error + Notify(ctx context.Context, req *leafcert.ConnectCALeafRequest, correlationID string, ch chan<- cache.UpdateEvent) error +} + // ServerProvider is an interface that can be used to find one server in the local DC known to // the agent via Gossip type ServerProvider interface { @@ -92,9 +107,12 @@ type Config struct { TLSConfigurator TLSConfigurator // Cache is an object implementing our Cache interface. The Cache - // used at runtime must be able to handle Roots and Leaf Cert watches + // used at runtime must be able to handle Roots watches Cache Cache + // LeafCertManager is an object implementing our LeafCertManager interface. + LeafCertManager LeafCertManager + // FallbackLeeway is the amount of time after certificate expiration before // invoking the fallback routine. If not set this will default to 10s. 
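The `LeafCertManager` interface above is the surface auto-config now depends on; a sketch of registering a leaf watch through it (mirroring the `Notify` call added in `auto-config/tls.go`), where the node name and correlation ID are placeholders:

```go
package sketch // illustrative only

import (
	"context"

	"github.com/hashicorp/consul/agent/cache"
	"github.com/hashicorp/consul/agent/leafcert"
)

// watchAgentLeaf subscribes to updates for this agent's leaf certificate.
// Events arrive on the returned channel as cache.UpdateEvent values tagged
// with the given correlation ID.
func watchAgentLeaf(ctx context.Context, mgr *leafcert.Manager, token string) (<-chan cache.UpdateEvent, error) {
	updates := make(chan cache.UpdateEvent, 10)
	req := leafcert.ConnectCALeafRequest{
		Datacenter: "dc1",
		Agent:      "my-agent", // hypothetical node name
		Token:      token,
	}
	if err := mgr.Notify(ctx, &req, "leaf-watch", updates); err != nil {
		return nil, err
	}
	return updates, nil
}
```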
FallbackLeeway time.Duration diff --git a/agent/auto-config/mock_test.go b/agent/auto-config/mock_test.go index 6cefb92245d20..263befae112cb 100644 --- a/agent/auto-config/mock_test.go +++ b/agent/auto-config/mock_test.go @@ -15,6 +15,7 @@ import ( "github.com/hashicorp/consul/agent/cache" cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" @@ -112,6 +113,85 @@ type mockWatcher struct { done <-chan struct{} } +type mockLeafCerts struct { + mock.Mock + + lock sync.Mutex + watchers map[string][]mockWatcher +} + +var _ LeafCertManager = (*mockLeafCerts)(nil) + +func newMockLeafCerts(t *testing.T) *mockLeafCerts { + m := mockLeafCerts{ + watchers: make(map[string][]mockWatcher), + } + m.Test(t) + return &m +} + +func (m *mockLeafCerts) Notify(ctx context.Context, req *leafcert.ConnectCALeafRequest, correlationID string, ch chan<- cache.UpdateEvent) error { + ret := m.Called(ctx, req, correlationID, ch) + + err := ret.Error(0) + if err == nil { + m.lock.Lock() + key := req.Key() + m.watchers[key] = append(m.watchers[key], mockWatcher{ch: ch, done: ctx.Done()}) + m.lock.Unlock() + } + return err +} + +func (m *mockLeafCerts) Prepopulate( + ctx context.Context, + key string, + index uint64, + value *structs.IssuedCert, + authorityKeyID string, +) error { + // we cannot know what the private key is prior to it being injected into the cache. + // therefore redact it here and all mock expectations should take that into account + restore := value.PrivateKeyPEM + value.PrivateKeyPEM = "redacted" + + ret := m.Called(ctx, key, index, value, authorityKeyID) + + if restore != "" { + value.PrivateKeyPEM = restore + } + return ret.Error(0) +} + +func (m *mockLeafCerts) sendNotification(ctx context.Context, key string, u cache.UpdateEvent) bool { + m.lock.Lock() + defer m.lock.Unlock() + + watchers, ok := m.watchers[key] + if !ok || len(m.watchers) < 1 { + return false + } + + var newWatchers []mockWatcher + + for _, watcher := range watchers { + select { + case watcher.ch <- u: + newWatchers = append(newWatchers, watcher) + case <-watcher.done: + // do nothing, this watcher will be removed from the list + case <-ctx.Done(): + // return doesn't matter here really, the test is being cancelled + return true + } + } + + // this removes any already cancelled watches from being sent to + m.watchers[key] = newWatchers + + return true +} + type mockCache struct { mock.Mock @@ -223,6 +303,7 @@ type mockedConfig struct { directRPC *mockDirectRPC serverProvider *mockServerProvider cache *mockCache + leafCerts *mockLeafCerts tokens *mockTokenStore tlsCfg *mockTLSConfigurator enterpriseConfig *mockedEnterpriseConfig @@ -233,6 +314,7 @@ func newMockedConfig(t *testing.T) *mockedConfig { directRPC := newMockDirectRPC(t) serverProvider := newMockServerProvider(t) mcache := newMockCache(t) + mleafs := newMockLeafCerts(t) tokens := newMockTokenStore(t) tlsCfg := newMockTLSConfigurator(t) @@ -246,6 +328,7 @@ func newMockedConfig(t *testing.T) *mockedConfig { if !t.Failed() { directRPC.AssertExpectations(t) serverProvider.AssertExpectations(t) + mleafs.AssertExpectations(t) mcache.AssertExpectations(t) tokens.AssertExpectations(t) tlsCfg.AssertExpectations(t) @@ -258,6 +341,7 @@ func newMockedConfig(t *testing.T) *mockedConfig { DirectRPC: directRPC, ServerProvider: serverProvider, Cache: mcache, + 
LeafCertManager: mleafs, Tokens: tokens, TLSConfigurator: tlsCfg, Logger: testutil.Logger(t), @@ -267,6 +351,7 @@ func newMockedConfig(t *testing.T) *mockedConfig { directRPC: directRPC, serverProvider: serverProvider, cache: mcache, + leafCerts: mleafs, tokens: tokens, tlsCfg: tlsCfg, @@ -311,7 +396,7 @@ func (m *mockedConfig) expectInitialTLS(t *testing.T, agentName, datacenter, tok rootsReq.CacheInfo().Key, ).Return(nil).Once() - leafReq := cachetype.ConnectCALeafRequest{ + leafReq := leafcert.ConnectCALeafRequest{ Token: token, Agent: agentName, Datacenter: datacenter, @@ -323,24 +408,18 @@ func (m *mockedConfig) expectInitialTLS(t *testing.T, agentName, datacenter, tok // on up with the request. copy := *cert copy.PrivateKeyPEM = "redacted" - leafRes := cache.FetchResult{ - Value: ©, - Index: copy.RaftIndex.ModifyIndex, - State: cachetype.ConnectCALeafSuccess(ca.SigningKeyID), - } // we should prepopulate the cache with the agents cert - m.cache.On("Prepopulate", - cachetype.ConnectCALeafName, - leafRes, - datacenter, - "", - token, + m.leafCerts.On("Prepopulate", + mock.Anything, leafReq.Key(), + copy.RaftIndex.ModifyIndex, + ©, + ca.SigningKeyID, ).Return(nil).Once() // when prepopulating the cert in the cache we grab the token so - // we should expec that here + // we should expect that here m.tokens.On("AgentToken").Return(token).Once() } diff --git a/agent/auto-config/tls.go b/agent/auto-config/tls.go index 67bdc0276534d..e39022bc959b7 100644 --- a/agent/auto-config/tls.go +++ b/agent/auto-config/tls.go @@ -11,6 +11,7 @@ import ( "github.com/hashicorp/consul/agent/cache" cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/proto/private/pbautoconf" "github.com/hashicorp/consul/proto/private/pbconnect" @@ -106,12 +107,14 @@ func (ac *AutoConfig) populateCertificateCache(certs *structs.SignedResponse) er leafReq := ac.leafCertRequest() // prepolutate leaf cache - certRes := cache.FetchResult{ - Value: &certs.IssuedCert, - Index: certs.IssuedCert.RaftIndex.ModifyIndex, - State: cachetype.ConnectCALeafSuccess(connect.EncodeSigningKeyID(cert.AuthorityKeyId)), - } - if err := ac.acConfig.Cache.Prepopulate(cachetype.ConnectCALeafName, certRes, leafReq.Datacenter, structs.DefaultPeerKeyword, leafReq.Token, leafReq.Key()); err != nil { + err = ac.acConfig.LeafCertManager.Prepopulate( + context.Background(), + leafReq.Key(), + certs.IssuedCert.RaftIndex.ModifyIndex, + &certs.IssuedCert, + connect.EncodeSigningKeyID(cert.AuthorityKeyId), + ) + if err != nil { return err } @@ -129,7 +132,7 @@ func (ac *AutoConfig) setupCertificateCacheWatches(ctx context.Context) (context } leafReq := ac.leafCertRequest() - err = ac.acConfig.Cache.Notify(notificationCtx, cachetype.ConnectCALeafName, &leafReq, leafWatchID, ac.cacheUpdates) + err = ac.acConfig.LeafCertManager.Notify(notificationCtx, &leafReq, leafWatchID, ac.cacheUpdates) if err != nil { cancel() return nil, err @@ -194,8 +197,8 @@ func (ac *AutoConfig) caRootsRequest() structs.DCSpecificRequest { return structs.DCSpecificRequest{Datacenter: ac.config.Datacenter} } -func (ac *AutoConfig) leafCertRequest() cachetype.ConnectCALeafRequest { - return cachetype.ConnectCALeafRequest{ +func (ac *AutoConfig) leafCertRequest() leafcert.ConnectCALeafRequest { + return leafcert.ConnectCALeafRequest{ Datacenter: ac.config.Datacenter, Agent: ac.config.NodeName, DNSSAN: ac.getDNSSANs(), diff 
--git a/agent/cache-types/connect_ca_leaf.go b/agent/cache-types/connect_ca_leaf.go deleted file mode 100644 index fb5811042b608..0000000000000 --- a/agent/cache-types/connect_ca_leaf.go +++ /dev/null @@ -1,774 +0,0 @@ -// Copyright (c) HashiCorp, Inc. -// SPDX-License-Identifier: MPL-2.0 - -package cachetype - -import ( - "context" - "errors" - "fmt" - "net" - "sync" - "sync/atomic" - "time" - - "github.com/mitchellh/hashstructure" - - "github.com/hashicorp/consul/acl" - "github.com/hashicorp/consul/lib" - - "github.com/hashicorp/consul/agent/cache" - "github.com/hashicorp/consul/agent/connect" - "github.com/hashicorp/consul/agent/consul" - "github.com/hashicorp/consul/agent/structs" -) - -// Recommended name for registration. -const ConnectCALeafName = "connect-ca-leaf" - -// caChangeJitterWindow is the time over which we spread each round of retries -// when attempting to get a new certificate following a root rotation. It's -// selected to be a trade-off between not making rotation unnecessarily slow on -// a tiny cluster while not hammering the servers on a huge cluster -// unnecessarily hard. Servers rate limit to protect themselves from the -// expensive crypto work, but in practice have 10k+ RPCs all in the same second -// will cause a major disruption even on large servers due to downloading the -// payloads, parsing msgpack etc. Instead we pick a window that for now is fixed -// but later might be either user configurable (not nice since it would become -// another hard-to-tune value) or set dynamically by the server based on it's -// knowledge of how many certs need to be rotated. Currently the server doesn't -// know that so we pick something that is reasonable. We err on the side of -// being slower that we need in trivial cases but gentler for large deployments. -// 30s means that even with a cluster of 10k service instances, the server only -// has to cope with ~333 RPCs a second which shouldn't be too bad if it's rate -// limiting the actual expensive crypto work. -// -// The actual backoff strategy when we are rate limited is to have each cert -// only retry once with each window of this size, at a point in the window -// selected at random. This performs much better than exponential backoff in -// terms of getting things rotated quickly with more predictable load and so -// fewer rate limited requests. See the full simulation this is based on at -// https://github.com/banks/sim-rate-limit-backoff/blob/master/README.md for -// more detail. -const caChangeJitterWindow = 30 * time.Second - -// ConnectCALeaf supports fetching and generating Connect leaf -// certificates. -type ConnectCALeaf struct { - RegisterOptionsBlockingNoRefresh - caIndex uint64 // Current index for CA roots - - // rootWatchMu protects access to the rootWatchSubscribers map and - // rootWatchCancel - rootWatchMu sync.Mutex - // rootWatchSubscribers is a set of chans, one for each currently in-flight - // Fetch. These chans have root updates delivered from the root watcher. - rootWatchSubscribers map[chan struct{}]struct{} - // rootWatchCancel is a func to call to stop the background root watch if any. - // You must hold inflightMu to read (e.g. call) or write the value. - rootWatchCancel func() - - // testRootWatchStart/StopCount are testing helpers that allow tests to - // observe the reference counting behavior that governs the shared root watch. 
- // It's not exactly pretty to expose internals like this, but seems cleaner - // than constructing elaborate and brittle test cases that we can infer - // correct behavior from, and simpler than trying to probe runtime goroutine - // traces to infer correct behavior that way. They must be accessed - // atomically. - testRootWatchStartCount uint32 - testRootWatchStopCount uint32 - - RPC RPC // RPC client for remote requests - Cache *cache.Cache // Cache that has CA root certs via ConnectCARoot - Datacenter string // This agent's datacenter - - // TestOverrideCAChangeInitialDelay allows overriding the random jitter after a - // root change with a fixed delay. So far ths is only done in tests. If it's - // zero the caChangeInitialSpreadDefault maximum jitter will be used but if - // set, it overrides and provides a fixed delay. To essentially disable the - // delay in tests they can set it to 1 nanosecond. We may separately allow - // configuring the jitter limit by users later but this is different and for - // tests only since we need to set a deterministic time delay in order to test - // the behavior here fully and determinstically. - TestOverrideCAChangeInitialDelay time.Duration -} - -// fetchState is some additional metadata we store with each cert in the cache -// to track things like expiry and coordinate paces root rotations. It's -// important this doesn't contain any pointer types since we rely on the struct -// being copied to avoid modifying the actual state in the cache entry during -// Fetch. Pointers themselves are OK, but if we point to another struct that we -// call a method or modify in some way that would directly mutate the cache and -// cause problems. We'd need to deep-clone in that case in Fetch below. -// time.Time technically contains a pointer to the Location but we ignore that -// since all times we get from our wall clock should point to the same Location -// anyway. -type fetchState struct { - // authorityKeyId is the ID of the CA key (whether root or intermediate) that signed - // the current cert. This is just to save parsing the whole cert everytime - // we have to check if the root changed. - authorityKeyID string - - // forceExpireAfter is used to coordinate renewing certs after a CA rotation - // in a staggered way so that we don't overwhelm the servers. - forceExpireAfter time.Time - - // activeRootRotationStart is set when the root has changed and we need to get - // a new cert but haven't got one yet. forceExpireAfter will be set to the - // next scheduled time we should try our CSR, but this is needed to calculate - // the retry windows if we are rate limited when we try. See comment on - // caChangeJitterWindow above for more. - activeRootRotationStart time.Time - - // consecutiveRateLimitErrs stores how many rate limit errors we've hit. We - // use this to choose a new window for the next retry. See comment on - // caChangeJitterWindow above for more. - consecutiveRateLimitErrs int -} - -func ConnectCALeafSuccess(authorityKeyID string) interface{} { - return fetchState{ - authorityKeyID: authorityKeyID, - forceExpireAfter: time.Time{}, - consecutiveRateLimitErrs: 0, - activeRootRotationStart: time.Time{}, - } -} - -// fetchStart is called on each fetch that is about to block and wait for -// changes to the leaf. It subscribes a chan to receive updates from the shared -// root watcher and triggers root watcher if it's not already running. 
-func (c *ConnectCALeaf) fetchStart(rootUpdateCh chan struct{}) { - c.rootWatchMu.Lock() - defer c.rootWatchMu.Unlock() - // Lazy allocation - if c.rootWatchSubscribers == nil { - c.rootWatchSubscribers = make(map[chan struct{}]struct{}) - } - // Make sure a root watcher is running. We don't only do this on first request - // to be more tolerant of errors that could cause the root watcher to fail and - // exit. - if c.rootWatchCancel == nil { - ctx, cancel := context.WithCancel(context.Background()) - c.rootWatchCancel = cancel - go c.rootWatcher(ctx) - } - c.rootWatchSubscribers[rootUpdateCh] = struct{}{} -} - -// fetchDone is called when a blocking call exits to unsubscribe from root -// updates and possibly stop the shared root watcher if it's no longer needed. -// Note that typically root CA is still being watched by clients directly and -// probably by the ProxyConfigManager so it will stay hot in cache for a while, -// we are just not monitoring it for updates any more. -func (c *ConnectCALeaf) fetchDone(rootUpdateCh chan struct{}) { - c.rootWatchMu.Lock() - defer c.rootWatchMu.Unlock() - delete(c.rootWatchSubscribers, rootUpdateCh) - if len(c.rootWatchSubscribers) == 0 && c.rootWatchCancel != nil { - // This was the last request. Stop the root watcher. - c.rootWatchCancel() - c.rootWatchCancel = nil - } -} - -// rootWatcher is the shared rootWatcher that runs in a background goroutine -// while needed by one or more inflight Fetch calls. -func (c *ConnectCALeaf) rootWatcher(ctx context.Context) { - atomic.AddUint32(&c.testRootWatchStartCount, 1) - defer atomic.AddUint32(&c.testRootWatchStopCount, 1) - - ch := make(chan cache.UpdateEvent, 1) - err := c.Cache.Notify(ctx, ConnectCARootName, &structs.DCSpecificRequest{ - Datacenter: c.Datacenter, - }, "roots", ch) - - notifyChange := func() { - c.rootWatchMu.Lock() - defer c.rootWatchMu.Unlock() - - for ch := range c.rootWatchSubscribers { - select { - case ch <- struct{}{}: - default: - // Don't block - chans are 1-buffered so act as an edge trigger and - // reload CA state directly from cache so they never "miss" updates. - } - } - } - - if err != nil { - // Trigger all inflight watchers. We don't pass the error, but they will - // reload from cache and observe the same error and return it to the caller, - // or if it's transient, will continue and the next Fetch will get us back - // into the right state. Seems better than busy loop-retrying here given - // that almost any error we would see here would also be returned from the - // cache get this will trigger. - notifyChange() - return - } - - var oldRoots *structs.IndexedCARoots - // Wait for updates to roots or all requests to stop - for { - select { - case <-ctx.Done(): - return - case e := <-ch: - // Root response changed in some way. Note this might be the initial - // fetch. - if e.Err != nil { - // See above rationale about the error propagation - notifyChange() - continue - } - - roots, ok := e.Result.(*structs.IndexedCARoots) - if !ok { - // See above rationale about the error propagation - notifyChange() - continue - } - - // Check that the active root is actually different from the last CA - // config there are many reasons the config might have changed without - // actually updating the CA root that is signing certs in the cluster. - // The Fetch calls will also validate this since the first call here we - // don't know if it changed or not, but there is no point waking up all - // Fetch calls to check this if we know none of them will need to act on - // this update. 
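The non-blocking send in `notifyChange` above relies on each subscriber channel being 1-buffered, so it behaves as an edge trigger: at most one wake-up is ever queued and subscribers reload the latest roots when they drain it. A generic, stdlib-only sketch of that pattern:

```go
package main

import "fmt"

// notifyAll wakes every subscriber without blocking: if a wake-up is already
// queued on a 1-buffered channel, the new signal is simply coalesced into it.
func notifyAll(subs map[chan struct{}]struct{}) {
	for ch := range subs {
		select {
		case ch <- struct{}{}:
		default: // already signalled; subscriber will reload latest state anyway
		}
	}
}

func main() {
	sub := make(chan struct{}, 1)
	subs := map[chan struct{}]struct{}{sub: {}}
	notifyAll(subs)
	notifyAll(subs) // coalesced with the pending signal
	<-sub
	fmt.Println("woke once; reload roots from cache")
}
```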
- if oldRoots != nil && oldRoots.ActiveRootID == roots.ActiveRootID { - continue - } - - // Distribute the update to all inflight requests - they will decide - // whether or not they need to act on it. - notifyChange() - oldRoots = roots - } - } -} - -// calculateSoftExpiry encapsulates our logic for when to renew a cert based on -// it's age. It returns a pair of times min, max which makes it easier to test -// the logic without non-deterministic jitter to account for. The caller should -// choose a time randomly in between these. -// -// We want to balance a few factors here: -// - renew too early and it increases the aggregate CSR rate in the cluster -// - renew too late and it risks disruption to the service if a transient -// error prevents the renewal -// - we want a broad amount of jitter so if there is an outage, we don't end -// up with all services in sync and causing a thundering herd every -// renewal period. Broader is better for smoothing requests but pushes -// both earlier and later tradeoffs above. -// -// Somewhat arbitrarily the current strategy looks like this: -// -// 0 60% 90% -// Issued [------------------------------|===============|!!!!!] Expires -// 72h TTL: 0 ~43h ~65h -// 1h TTL: 0 36m 54m -// -// Where |===| is the soft renewal period where we jitter for the first attempt -// and |!!!| is the danger zone where we just try immediately. -// -// In the happy path (no outages) the average renewal occurs half way through -// the soft renewal region or at 75% of the cert lifetime which is ~54 hours for -// a 72 hour cert, or 45 mins for a 1 hour cert. -// -// If we are already in the softRenewal period, we randomly pick a time between -// now and the start of the danger zone. -// -// We pass in now to make testing easier. -func calculateSoftExpiry(now time.Time, cert *structs.IssuedCert) (min time.Time, max time.Time) { - - certLifetime := cert.ValidBefore.Sub(cert.ValidAfter) - if certLifetime < 10*time.Minute { - // Shouldn't happen as we limit to 1 hour shortest elsewhere but just be - // defensive against strange times or bugs. - return now, now - } - - // Find the 60% mark in diagram above - softRenewTime := cert.ValidAfter.Add(time.Duration(float64(certLifetime) * 0.6)) - hardRenewTime := cert.ValidAfter.Add(time.Duration(float64(certLifetime) * 0.9)) - - if now.After(hardRenewTime) { - // In the hard renew period, or already expired. Renew now! - return now, now - } - - if now.After(softRenewTime) { - // Already in the soft renew period, make now the lower bound for jitter - softRenewTime = now - } - return softRenewTime, hardRenewTime -} - -func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache.FetchResult, error) { - var result cache.FetchResult - - // Get the correct type - reqReal, ok := req.(*ConnectCALeafRequest) - if !ok { - return result, fmt.Errorf( - "Internal cache failure: request wrong type: %T", req) - } - - // Lightweight copy this object so that manipulating QueryOptions doesn't race. - dup := *reqReal - reqReal = &dup - - // Do we already have a cert in the cache? - var existing *structs.IssuedCert - // Really important this is not a pointer type since otherwise we would set it - // to point to the actual fetchState in the cache entry below and then would - // be directly modifying that in the cache entry even when we might later - // return an error and not update index etc. 
By being a value, we force a copy - var state fetchState - if opts.LastResult != nil { - existing, ok = opts.LastResult.Value.(*structs.IssuedCert) - if !ok { - return result, fmt.Errorf( - "Internal cache failure: last value wrong type: %T", opts.LastResult.Value) - } - if opts.LastResult.State != nil { - state, ok = opts.LastResult.State.(fetchState) - if !ok { - return result, fmt.Errorf( - "Internal cache failure: last state wrong type: %T", opts.LastResult.State) - } - } - } - - // Handle brand new request first as it's simplest. - if existing == nil { - return c.generateNewLeaf(reqReal, result) - } - - // Setup result to mirror the current value for if we timeout or hit a rate - // limit. This allows us to update the state (e.g. for backoff or retry - // coordination on root change) even if we don't get a new cert. - result.Value = existing - result.Index = existing.ModifyIndex - result.State = state - - // Since state is not a pointer, we can't just set it once in result and then - // continue to update it later since we will be updating only our copy. - // Instead we have a helper function that is used to make sure the state is - // updated in the result when we return. - lastResultWithNewState := func() cache.FetchResult { - return cache.FetchResult{ - Value: existing, - Index: existing.ModifyIndex, - State: state, - } - } - - // Beyond this point we need to only return lastResultWithNewState() not just - // result since otherwise we might "loose" state updates we expect not to. - - // We have a certificate in cache already. Check it's still valid. - now := time.Now() - minExpire, maxExpire := calculateSoftExpiry(now, existing) - expiresAt := minExpire.Add(lib.RandomStagger(maxExpire.Sub(minExpire))) - - // Check if we have been force-expired by a root update that jittered beyond - // the timeout of the query it was running. - if !state.forceExpireAfter.IsZero() && state.forceExpireAfter.Before(expiresAt) { - expiresAt = state.forceExpireAfter - } - - if expiresAt.Equal(now) || expiresAt.Before(now) { - // Already expired, just make a new one right away - return c.generateNewLeaf(reqReal, lastResultWithNewState()) - } - - // If we called Fetch() with MustRevalidate then this call came from a non-blocking query. - // Any prior CA rotations should've already expired the cert. - // All we need to do is check whether the current CA is the one that signed the leaf. If not, generate a new leaf. - // This is not a perfect solution (as a CA rotation update can be missed) but it should take care of instances like - // see https://github.com/hashicorp/consul/issues/10871, https://github.com/hashicorp/consul/issues/9862 - // This seems to me like a hack, so maybe we can revisit the caching/ fetching logic in this case - if req.CacheInfo().MustRevalidate { - roots, err := c.rootsFromCache() - if err != nil { - return lastResultWithNewState(), err - } - if activeRootHasKey(roots, state.authorityKeyID) { - return lastResultWithNewState(), nil - } - - // if we reach here then the current leaf was not signed by the same CAs, just regen - return c.generateNewLeaf(reqReal, lastResultWithNewState()) - } - - // We are about to block and wait for a change or timeout. - - // Make a chan we can be notified of changes to CA roots on. It must be - // buffered so we don't miss broadcasts from rootsWatch. 
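A worked example of the soft/hard renewal window described in the `calculateSoftExpiry` comment above, for a hypothetical 72-hour certificate; stdlib only:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	validAfter := time.Date(2023, 6, 1, 0, 0, 0, 0, time.UTC)
	lifetime := 72 * time.Hour

	softRenew := validAfter.Add(time.Duration(float64(lifetime) * 0.6)) // +43h12m -> 2023-06-02 19:12 UTC
	hardRenew := validAfter.Add(time.Duration(float64(lifetime) * 0.9)) // +64h48m -> 2023-06-03 16:48 UTC

	// The renewal time is then drawn uniformly from [softRenew, hardRenew),
	// which averages out to 75% of the lifetime (~54h for a 72h cert).
	fmt.Println(softRenew, hardRenew)
}
```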
It is an edge trigger - // so a single buffer element is sufficient regardless of whether we consume - // the updates fast enough since as soon as we see an element in it, we will - // reload latest CA from cache. - rootUpdateCh := make(chan struct{}, 1) - - // The roots may have changed in between blocking calls. We need to verify - // that the existing cert was signed by the current root. If it was we still - // want to do the whole jitter thing. We could code that again here but it's - // identical to the select case below so we just trigger our own update chan - // and let the logic below handle checking if the CA actually changed in the - // common case where it didn't it is a no-op anyway. - rootUpdateCh <- struct{}{} - - // Subscribe our chan to get root update notification. - c.fetchStart(rootUpdateCh) - defer c.fetchDone(rootUpdateCh) - - // Setup the timeout chan outside the loop so we don't keep bumping the timeout - // later if we loop around. - timeoutCh := time.After(opts.Timeout) - - // Setup initial expiry chan. We may change this if root update occurs in the - // loop below. - expiresCh := time.After(expiresAt.Sub(now)) - - // Current cert is valid so just wait until it expires or we time out. - for { - select { - case <-timeoutCh: - // We timed out the request with same cert. - return lastResultWithNewState(), nil - - case <-expiresCh: - // Cert expired or was force-expired by a root change. - return c.generateNewLeaf(reqReal, lastResultWithNewState()) - - case <-rootUpdateCh: - // A root cache change occurred, reload roots from cache. - roots, err := c.rootsFromCache() - if err != nil { - return lastResultWithNewState(), err - } - - // Handle _possibly_ changed roots. We still need to verify the new active - // root is not the same as the one our current cert was signed by since we - // can be notified spuriously if we are the first request since the - // rootsWatcher didn't know about the CA we were signed by. We also rely - // on this on every request to do the initial check that the current roots - // are the same ones the current cert was signed by. - if activeRootHasKey(roots, state.authorityKeyID) { - // Current active CA is the same one that signed our current cert so - // keep waiting for a change. - continue - } - state.activeRootRotationStart = time.Now() - - // CA root changed. We add some jitter here to avoid a thundering herd. - // See docs on caChangeJitterWindow const. - delay := lib.RandomStagger(caChangeJitterWindow) - if c.TestOverrideCAChangeInitialDelay > 0 { - delay = c.TestOverrideCAChangeInitialDelay - } - // Force the cert to be expired after the jitter - the delay above might - // be longer than we have left on our timeout. We set forceExpireAfter in - // the cache state so the next request will notice we still need to renew - // and do it at the right time. This is cleared once a new cert is - // returned by generateNewLeaf. - state.forceExpireAfter = state.activeRootRotationStart.Add(delay) - // If the delay time is within the current timeout, we want to renew the - // as soon as it's up. We change the expire time and chan so that when we - // loop back around, we'll wait at most delay until generating a new cert. 
- if state.forceExpireAfter.Before(expiresAt) { - expiresAt = state.forceExpireAfter - expiresCh = time.After(delay) - } - continue - } - } -} - -func activeRootHasKey(roots *structs.IndexedCARoots, currentSigningKeyID string) bool { - for _, ca := range roots.Roots { - if ca.Active { - return ca.SigningKeyID == currentSigningKeyID - } - } - // Shouldn't be possible since at least one root should be active. - return false -} - -func (c *ConnectCALeaf) rootsFromCache() (*structs.IndexedCARoots, error) { - // Background is fine here because this isn't a blocking query as no index is set. - // Therefore this will just either be a cache hit or return once the non-blocking query returns. - rawRoots, _, err := c.Cache.Get(context.Background(), ConnectCARootName, &structs.DCSpecificRequest{ - Datacenter: c.Datacenter, - }) - if err != nil { - return nil, err - } - roots, ok := rawRoots.(*structs.IndexedCARoots) - if !ok { - return nil, errors.New("invalid RootCA response type") - } - return roots, nil -} - -// generateNewLeaf does the actual work of creating a new private key, -// generating a CSR and getting it signed by the servers. result argument -// represents the last result currently in cache if any along with its state. -func (c *ConnectCALeaf) generateNewLeaf(req *ConnectCALeafRequest, - result cache.FetchResult) (cache.FetchResult, error) { - - var state fetchState - if result.State != nil { - var ok bool - state, ok = result.State.(fetchState) - if !ok { - return result, fmt.Errorf( - "Internal cache failure: result state wrong type: %T", result.State) - } - } - - // Need to lookup RootCAs response to discover trust domain. This should be a - // cache hit. - roots, err := c.rootsFromCache() - if err != nil { - return result, err - } - if roots.TrustDomain == "" { - return result, errors.New("cluster has no CA bootstrapped yet") - } - - // Build the cert uri - var id connect.CertURI - var dnsNames []string - var ipAddresses []net.IP - - switch { - case req.Service != "": - id = &connect.SpiffeIDService{ - Host: roots.TrustDomain, - Datacenter: req.Datacenter, - Partition: req.TargetPartition(), - Namespace: req.TargetNamespace(), - Service: req.Service, - } - dnsNames = append(dnsNames, req.DNSSAN...) - - case req.Agent != "": - id = &connect.SpiffeIDAgent{ - Host: roots.TrustDomain, - Datacenter: req.Datacenter, - Partition: req.TargetPartition(), - Agent: req.Agent, - } - dnsNames = append([]string{"localhost"}, req.DNSSAN...) - ipAddresses = append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::1")}, req.IPSAN...) - - case req.Kind == structs.ServiceKindMeshGateway: - id = &connect.SpiffeIDMeshGateway{ - Host: roots.TrustDomain, - Datacenter: req.Datacenter, - Partition: req.TargetPartition(), - } - dnsNames = append(dnsNames, req.DNSSAN...) - - case req.Kind != "": - return result, fmt.Errorf("unsupported kind: %s", req.Kind) - - case req.Server: - if req.Datacenter == "" { - return result, errors.New("datacenter name must be specified") - } - id = &connect.SpiffeIDServer{ - Host: roots.TrustDomain, - Datacenter: req.Datacenter, - } - dnsNames = append(dnsNames, connect.PeeringServerSAN(req.Datacenter, roots.TrustDomain)) - - default: - return result, errors.New("URI must be either service, agent, server, or kind") - } - - // Create a new private key - - // TODO: for now we always generate EC keys on clients regardless of the key - // type being used by the active CA. 
This is fine and allowed in TLS1.2 and - // signing EC CSRs with an RSA key is supported by all current CA providers so - // it's OK. IFF we ever need to support a CA provider that refuses to sign a - // CSR with a different signature algorithm, or if we have compatibility - // issues with external PKI systems that require EC certs be signed with ECDSA - // from the CA (this was required in TLS1.1 but not in 1.2) then we can - // instead intelligently pick the key type we generate here based on the key - // type of the active signing CA. We already have that loaded since we need - // the trust domain. - pk, pkPEM, err := connect.GeneratePrivateKey() - if err != nil { - return result, err - } - - // Create a CSR. - csr, err := connect.CreateCSR(id, pk, dnsNames, ipAddresses) - if err != nil { - return result, err - } - - // Request signing - var reply structs.IssuedCert - args := structs.CASignRequest{ - WriteRequest: structs.WriteRequest{Token: req.Token}, - Datacenter: req.Datacenter, - CSR: csr, - } - if err := c.RPC.RPC(context.Background(), "ConnectCA.Sign", &args, &reply); err != nil { - if err.Error() == consul.ErrRateLimited.Error() { - if result.Value == nil { - // This was a first fetch - we have no good value in cache. In this case - // we just return the error to the caller rather than rely on surprising - // semi-blocking until the rate limit is appeased or we timeout - // behavior. It's likely the caller isn't expecting this to block since - // it's an initial fetch. This also massively simplifies this edge case. - return result, err - } - - if state.activeRootRotationStart.IsZero() { - // We hit a rate limit error by chance - for example a cert expired - // before the root rotation was observed (not triggered by rotation) but - // while server is working through high load from a recent rotation. - // Just pretend there is a rotation and the retry logic here will start - // jittering and retrying in the same way from now. - state.activeRootRotationStart = time.Now() - } - - // Increment the errors in the state - state.consecutiveRateLimitErrs++ - - delay := lib.RandomStagger(caChangeJitterWindow) - if c.TestOverrideCAChangeInitialDelay > 0 { - delay = c.TestOverrideCAChangeInitialDelay - } - - // Find the start of the next window we can retry in. See comment on - // caChangeJitterWindow for details of why we use this strategy. - windowStart := state.activeRootRotationStart.Add( - time.Duration(state.consecutiveRateLimitErrs) * delay) - - // Pick a random time in that window - state.forceExpireAfter = windowStart.Add(delay) - - // Return a result with the existing cert but the new state - the cache - // will see this as no change. Note that we always have an existing result - // here due to the nil value check above. - result.State = state - return result, nil - } - return result, err - } - reply.PrivateKeyPEM = pkPEM - - // Reset rotation state - state.forceExpireAfter = time.Time{} - state.consecutiveRateLimitErrs = 0 - state.activeRootRotationStart = time.Time{} - - cert, err := connect.ParseCert(reply.CertPEM) - if err != nil { - return result, err - } - // Set the CA key ID so we can easily tell when a active root has changed. - state.authorityKeyID = connect.EncodeSigningKeyID(cert.AuthorityKeyId) - - result.Value = &reply - // Store value not pointer so we don't accidentally mutate the cache entry - // state in Fetch. 
- result.State = state - result.Index = reply.ModifyIndex - return result, nil -} - -// ConnectCALeafRequest is the cache.Request implementation for the -// ConnectCALeaf cache type. This is implemented here and not in structs -// since this is only used for cache-related requests and not forwarded -// directly to any Consul servers. -type ConnectCALeafRequest struct { - Token string - Datacenter string - DNSSAN []string - IPSAN []net.IP - MinQueryIndex uint64 - MaxQueryTime time.Duration - acl.EnterpriseMeta - MustRevalidate bool - - // The following flags indicate the entity we are requesting a cert for. - // Only one of these must be specified. - Service string // Given a Service name, not ID, the request is for a SpiffeIDService. - Agent string // Given an Agent name, not ID, the request is for a SpiffeIDAgent. - Kind structs.ServiceKind // Given "mesh-gateway", the request is for a SpiffeIDMeshGateway. No other kinds supported. - Server bool // If true, the request is for a SpiffeIDServer. -} - -func (r *ConnectCALeafRequest) Key() string { - r.EnterpriseMeta.Normalize() - - switch { - case r.Agent != "": - v, err := hashstructure.Hash([]interface{}{ - r.Agent, - r.PartitionOrDefault(), - }, nil) - if err == nil { - return fmt.Sprintf("agent:%d", v) - } - case r.Kind == structs.ServiceKindMeshGateway: - v, err := hashstructure.Hash([]interface{}{ - r.PartitionOrDefault(), - r.DNSSAN, - r.IPSAN, - }, nil) - if err == nil { - return fmt.Sprintf("kind:%d", v) - } - case r.Kind != "": - // this is not valid - case r.Server: - v, err := hashstructure.Hash([]interface{}{ - "server", - r.Datacenter, - }, nil) - if err == nil { - return fmt.Sprintf("server:%d", v) - } - default: - v, err := hashstructure.Hash([]interface{}{ - r.Service, - r.EnterpriseMeta, - r.DNSSAN, - r.IPSAN, - }, nil) - if err == nil { - return fmt.Sprintf("service:%d", v) - } - } - - // If there is an error, we don't set the key. A blank key forces - // no cache for this request so the request is forwarded directly - // to the server. - return "" -} - -func (req *ConnectCALeafRequest) TargetPartition() string { - return req.PartitionOrDefault() -} - -func (r *ConnectCALeafRequest) CacheInfo() cache.RequestInfo { - return cache.RequestInfo{ - Token: r.Token, - Key: r.Key(), - Datacenter: r.Datacenter, - MinIndex: r.MinQueryIndex, - Timeout: r.MaxQueryTime, - MustRevalidate: r.MustRevalidate, - } -} diff --git a/agent/cache-types/connect_ca_leaf_oss.go b/agent/cache-types/connect_ca_leaf_oss.go deleted file mode 100644 index 949d4a1bc6a2b..0000000000000 --- a/agent/cache-types/connect_ca_leaf_oss.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) HashiCorp, Inc. -// SPDX-License-Identifier: MPL-2.0 - -//go:build !consulent -// +build !consulent - -package cachetype - -func (req *ConnectCALeafRequest) TargetNamespace() string { - return "default" -} diff --git a/agent/cache-types/connect_ca_leaf_test.go b/agent/cache-types/connect_ca_leaf_test.go deleted file mode 100644 index ea1219fcfcccd..0000000000000 --- a/agent/cache-types/connect_ca_leaf_test.go +++ /dev/null @@ -1,1178 +0,0 @@ -// Copyright (c) HashiCorp, Inc. 
-// SPDX-License-Identifier: MPL-2.0 - -package cachetype - -import ( - "context" - "crypto/x509" - "encoding/pem" - "fmt" - "net" - "strings" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/mock" - "github.com/stretchr/testify/require" - - "github.com/hashicorp/consul/agent/cache" - "github.com/hashicorp/consul/agent/connect" - "github.com/hashicorp/consul/agent/consul" - "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/sdk/testutil/retry" -) - -func TestCalculateSoftExpire(t *testing.T) { - tests := []struct { - name string - now string - issued string - lifetime time.Duration - wantMin string - wantMax string - }{ - { - name: "72h just issued", - now: "2018-01-01 00:00:01", - issued: "2018-01-01 00:00:00", - lifetime: 72 * time.Hour, - // Should jitter between 60% and 90% of the lifetime which is 43.2/64.8 - // hours after issued - wantMin: "2018-01-02 19:12:00", - wantMax: "2018-01-03 16:48:00", - }, - { - name: "72h in renew range", - // This time should be inside the renewal range. - now: "2018-01-02 20:00:20", - issued: "2018-01-01 00:00:00", - lifetime: 72 * time.Hour, - // Min should be the "now" time - wantMin: "2018-01-02 20:00:20", - wantMax: "2018-01-03 16:48:00", - }, - { - name: "72h in hard renew", - // This time should be inside the renewal range. - now: "2018-01-03 18:00:00", - issued: "2018-01-01 00:00:00", - lifetime: 72 * time.Hour, - // Min and max should both be the "now" time - wantMin: "2018-01-03 18:00:00", - wantMax: "2018-01-03 18:00:00", - }, - { - name: "72h expired", - // This time is after expiry - now: "2018-01-05 00:00:00", - issued: "2018-01-01 00:00:00", - lifetime: 72 * time.Hour, - // Min and max should both be the "now" time - wantMin: "2018-01-05 00:00:00", - wantMax: "2018-01-05 00:00:00", - }, - { - name: "1h just issued", - now: "2018-01-01 00:00:01", - issued: "2018-01-01 00:00:00", - lifetime: 1 * time.Hour, - // Should jitter between 60% and 90% of the lifetime which is 36/54 mins - // hours after issued - wantMin: "2018-01-01 00:36:00", - wantMax: "2018-01-01 00:54:00", - }, - { - name: "1h in renew range", - // This time should be inside the renewal range. - now: "2018-01-01 00:40:00", - issued: "2018-01-01 00:00:00", - lifetime: 1 * time.Hour, - // Min should be the "now" time - wantMin: "2018-01-01 00:40:00", - wantMax: "2018-01-01 00:54:00", - }, - { - name: "1h in hard renew", - // This time should be inside the renewal range. 
- now: "2018-01-01 00:55:00", - issued: "2018-01-01 00:00:00", - lifetime: 1 * time.Hour, - // Min and max should both be the "now" time - wantMin: "2018-01-01 00:55:00", - wantMax: "2018-01-01 00:55:00", - }, - { - name: "1h expired", - // This time is after expiry - now: "2018-01-01 01:01:01", - issued: "2018-01-01 00:00:00", - lifetime: 1 * time.Hour, - // Min and max should both be the "now" time - wantMin: "2018-01-01 01:01:01", - wantMax: "2018-01-01 01:01:01", - }, - { - name: "too short lifetime", - // This time is after expiry - now: "2018-01-01 01:01:01", - issued: "2018-01-01 00:00:00", - lifetime: 1 * time.Minute, - // Min and max should both be the "now" time - wantMin: "2018-01-01 01:01:01", - wantMax: "2018-01-01 01:01:01", - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - now, err := time.Parse("2006-01-02 15:04:05", tc.now) - require.NoError(t, err) - issued, err := time.Parse("2006-01-02 15:04:05", tc.issued) - require.NoError(t, err) - wantMin, err := time.Parse("2006-01-02 15:04:05", tc.wantMin) - require.NoError(t, err) - wantMax, err := time.Parse("2006-01-02 15:04:05", tc.wantMax) - require.NoError(t, err) - - min, max := calculateSoftExpiry(now, &structs.IssuedCert{ - ValidAfter: issued, - ValidBefore: issued.Add(tc.lifetime), - }) - - require.Equal(t, wantMin, min) - require.Equal(t, wantMax, max) - }) - } -} - -// Test that after an initial signing, new CA roots (new ID) will -// trigger a blocking query to execute. -func TestConnectCALeaf_changingRoots(t *testing.T) { - if testing.Short() { - t.Skip("too slow for testing.Short") - } - - if testingRace { - t.Skip("fails with -race because caRoot.Active is modified concurrently") - } - t.Parallel() - - rpc := TestRPC(t) - defer rpc.AssertExpectations(t) - - typ, rootsCh := testCALeafType(t, rpc) - defer close(rootsCh) - - caRoot := connect.TestCA(t, nil) - caRoot.Active = true - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: 1}, - } - - // We need this later but needs to be defined so we sign second CSR with it - // otherwise we break the cert root checking. - caRoot2 := connect.TestCA(t, nil) - - // Instrument ConnectCA.Sign to return signed cert - var resp *structs.IssuedCert - var idx uint64 - - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil). 
- Run(func(args mock.Arguments) { - ca := caRoot - cIdx := atomic.AddUint64(&idx, 1) - if cIdx > 1 { - // Second time round use the new CA - ca = caRoot2 - } - reply := args.Get(3).(*structs.IssuedCert) - leaf, _ := connect.TestLeaf(t, "web", ca) - reply.CertPEM = leaf - reply.ValidAfter = time.Now().Add(-1 * time.Hour) - reply.ValidBefore = time.Now().Add(11 * time.Hour) - reply.CreateIndex = cIdx - reply.ModifyIndex = reply.CreateIndex - resp = reply - }) - - // We'll reuse the fetch options and request - opts := cache.FetchOptions{MinIndex: 0, Timeout: 10 * time.Second} - req := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web"} - - // First fetch should return immediately - fetchCh := TestFetchCh(t, typ, opts, req) - select { - case <-time.After(100 * time.Millisecond): - t.Fatal("shouldn't block waiting for fetch") - case result := <-fetchCh: - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - require.Equal(t, uint64(1), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } - - // Second fetch should block with set index - opts.MinIndex = 1 - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case result := <-fetchCh: - t.Fatalf("should not return: %#v", result) - case <-time.After(100 * time.Millisecond): - } - - // Let's send in new roots, which should trigger the sign req. We need to take - // care to set the new root as active - caRoot2.Active = true - caRoot.Active = false - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot2.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot2, - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: atomic.AddUint64(&idx, 1)}, - } - select { - case <-time.After(100 * time.Millisecond): - t.Fatal("shouldn't block waiting for fetch") - case result := <-fetchCh: - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - // 3 since the second CA "update" used up 2 - require.Equal(t, uint64(3), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - opts.MinIndex = 3 - } - - // Third fetch should block - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case result := <-fetchCh: - t.Fatalf("should not return: %#v", result) - case <-time.After(100 * time.Millisecond): - } -} - -// Tests that if the root change jitter is longer than the time left on the -// timeout, we return normally but then still renew the cert on a subsequent -// call. -func TestConnectCALeaf_changingRootsJitterBetweenCalls(t *testing.T) { - t.Parallel() - - rpc := TestRPC(t) - defer rpc.AssertExpectations(t) - - typ, rootsCh := testCALeafType(t, rpc) - defer close(rootsCh) - - // Override the root-change delay so we will timeout first. We can't set it to - // a crazy high value otherwise we'll have to wait that long in the test to - // see if it actually happens on subsequent calls. We instead reduce the - // timeout in FetchOptions to be much shorter than this. - typ.TestOverrideCAChangeInitialDelay = 100 * time.Millisecond - - caRoot := connect.TestCA(t, nil) - caRoot.Active = true - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: 1}, - } - - // Instrument ConnectCA.Sign to return signed cert - var resp *structs.IssuedCert - var idx uint64 - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil). 
- Run(func(args mock.Arguments) { - reply := args.Get(3).(*structs.IssuedCert) - leaf, _ := connect.TestLeaf(t, "web", caRoot) - reply.CertPEM = leaf - reply.ValidAfter = time.Now().Add(-1 * time.Hour) - reply.ValidBefore = time.Now().Add(11 * time.Hour) - reply.CreateIndex = atomic.AddUint64(&idx, 1) - reply.ModifyIndex = reply.CreateIndex - resp = reply - }) - - // We'll reuse the fetch options and request. Timeout must be much shorter - // than the initial root delay. 20ms means that if we deliver the root change - // during the first blocking call, we should need to block fully for 5 more - // calls before the cert is renewed. We pick a timeout that is not an exact - // multiple of the 100ms delay above to reduce the chance that timing works - // out in a way that makes it hard to tell a timeout from an early return due - // to a cert renewal. - opts := cache.FetchOptions{MinIndex: 0, Timeout: 35 * time.Millisecond} - req := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web"} - - // First fetch should return immediately - fetchCh := TestFetchCh(t, typ, opts, req) - select { - case <-time.After(100 * time.Millisecond): - t.Fatal("shouldn't block waiting for fetch") - case result := <-fetchCh: - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - require.Equal(t, uint64(1), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } - - // Let's send in new roots, which should eventually trigger the sign req. We - // need to take care to set the new root as active. Note that this is - // implicitly testing that root updates that happen in between leaf blocking - // queries are still noticed too. At this point no leaf blocking query is - // running so the root watch should be stopped. By pushing this update, the - // next blocking query will _immediately_ see the new root which means it - // needs to correctly notice that it is not the same one that generated the - // current cert and start the rotation. This is good, just not obvious that - // the behavior is actually well tested here when it is. - caRoot2 := connect.TestCA(t, nil) - caRoot2.Active = true - caRoot.Active = false - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot2.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot2, - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: atomic.AddUint64(&idx, 1)}, - } - earliestRootDelivery := time.Now() - - // Some number of fetches (2,3,4 likely) should timeout after 20ms and after - // 100ms has elapsed total we should see the new cert. Since this is all very - // timing dependent, we don't hard code exact numbers here and instead loop - // for plenty of time and do as many calls as it takes and just assert on the - // time taken and that the call either blocks and returns the cached cert, or - // returns the new one. - opts.MinIndex = 1 - var shouldExpireAfter time.Time - i := 1 - rootsDelivered := false - for rootsDelivered { - start := time.Now() - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case result := <-fetchCh: - v := mustFetchResult(t, result) - timeTaken := time.Since(start) - - // There are two options, either it blocked waiting for the delay after - // the rotation or it returned the new CA cert before the timeout was - // done. TO be more robust against timing, we take the value as the - // decider for which case it is, and assert timing matches our expected - // bounds rather than vice versa. 
- - if v.Index > uint64(1) { - // Got a new cert - require.Equal(t, resp, v.Value) - require.Equal(t, uint64(3), v.Index) - // Should not have been delivered before the delay - require.True(t, time.Since(earliestRootDelivery) > typ.TestOverrideCAChangeInitialDelay) - // All good. We are done! - rootsDelivered = true - } else { - // Should be the cached cert - require.Equal(t, resp, v.Value) - require.Equal(t, uint64(1), v.Index) - // Sanity check we blocked for the whole timeout - require.Truef(t, timeTaken > opts.Timeout, - "should block for at least %s, returned after %s", - opts.Timeout, timeTaken) - // Sanity check that the forceExpireAfter state was set correctly - shouldExpireAfter = v.State.(*fetchState).forceExpireAfter - require.True(t, shouldExpireAfter.After(time.Now())) - require.True(t, shouldExpireAfter.Before(time.Now().Add(typ.TestOverrideCAChangeInitialDelay))) - } - // Set the LastResult for subsequent fetches - opts.LastResult = &v - case <-time.After(50 * time.Millisecond): - t.Fatalf("request %d blocked too long", i) - } - i++ - - // Sanity check that we've not gone way beyond the deadline without a - // new cert. We give some leeway to make it less brittle. - require.Falsef(t, time.Now().After(shouldExpireAfter.Add(100*time.Millisecond)), - "waited extra 100ms and delayed CA rotate renew didn't happen") - } -} - -// Tests that if the root changes in between blocking calls we still pick it up. -func TestConnectCALeaf_changingRootsBetweenBlockingCalls(t *testing.T) { - t.Parallel() - - rpc := TestRPC(t) - defer rpc.AssertExpectations(t) - - typ, rootsCh := testCALeafType(t, rpc) - defer close(rootsCh) - - caRoot := connect.TestCA(t, nil) - caRoot.Active = true - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: 1}, - } - - // Instrument ConnectCA.Sign to return signed cert - var resp *structs.IssuedCert - var idx uint64 - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil). - Run(func(args mock.Arguments) { - reply := args.Get(3).(*structs.IssuedCert) - leaf, _ := connect.TestLeaf(t, "web", caRoot) - reply.CertPEM = leaf - reply.ValidAfter = time.Now().Add(-1 * time.Hour) - reply.ValidBefore = time.Now().Add(11 * time.Hour) - reply.CreateIndex = atomic.AddUint64(&idx, 1) - reply.ModifyIndex = reply.CreateIndex - resp = reply - }) - - // We'll reuse the fetch options and request. Short timeout important since we - // wait the full timeout before chaning roots. 
- opts := cache.FetchOptions{MinIndex: 0, Timeout: 35 * time.Millisecond} - req := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web"} - - // First fetch should return immediately - fetchCh := TestFetchCh(t, typ, opts, req) - select { - case <-time.After(100 * time.Millisecond): - t.Fatal("shouldn't block waiting for fetch") - case result := <-fetchCh: - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - require.Equal(t, uint64(1), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } - - // Next fetch should block for the full timeout - start := time.Now() - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case <-time.After(100 * time.Millisecond): - t.Fatal("shouldn't block for too long waiting for fetch") - case result := <-fetchCh: - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - // Still the initial cached result - require.Equal(t, uint64(1), v.Index) - // Sanity check that it waited - require.True(t, time.Since(start) > opts.Timeout) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } - - // No active requests, simulate root change now - caRoot2 := connect.TestCA(t, nil) - caRoot2.Active = true - caRoot.Active = false - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot2.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot2, - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: atomic.AddUint64(&idx, 1)}, - } - earliestRootDelivery := time.Now() - - // We should get the new cert immediately on next fetch (since test override - // root change jitter to be 1 nanosecond so no delay expected). - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case <-time.After(100 * time.Millisecond): - t.Fatal("shouldn't block too long waiting for fetch") - case result := <-fetchCh: - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - // Index should be 3 since root change consumed 2 - require.Equal(t, uint64(3), v.Index) - // Sanity check that we didn't wait too long - require.True(t, time.Since(earliestRootDelivery) < opts.Timeout) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } - -} - -func TestConnectCALeaf_CSRRateLimiting(t *testing.T) { - if testing.Short() { - t.Skip("too slow for testing.Short") - } - - t.Parallel() - - rpc := TestRPC(t) - defer rpc.AssertExpectations(t) - - typ, rootsCh := testCALeafType(t, rpc) - defer close(rootsCh) - - // Each jitter window will be only 100 ms long to make testing quick but - // highly likely not to fail based on scheduling issues. 
- typ.TestOverrideCAChangeInitialDelay = 100 * time.Millisecond - - // Setup root that will be returned by the mocked Root cache fetch - caRoot := connect.TestCA(t, nil) - caRoot.Active = true - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: 1}, - } - - // Instrument ConnectCA.Sign - var resp *structs.IssuedCert - var idx, rateLimitedRPCs uint64 - - genCert := func(args mock.Arguments) { - reply := args.Get(3).(*structs.IssuedCert) - leaf, _ := connect.TestLeaf(t, "web", caRoot) - reply.CertPEM = leaf - reply.ValidAfter = time.Now().Add(-1 * time.Hour) - reply.ValidBefore = time.Now().Add(11 * time.Hour) - reply.CreateIndex = atomic.AddUint64(&idx, 1) - reply.ModifyIndex = reply.CreateIndex - resp = reply - } - - incRateLimit := func(args mock.Arguments) { - atomic.AddUint64(&rateLimitedRPCs, 1) - } - - // First call return rate limit error. This is important as it checks - // behavior when cache is empty and we have to return a nil Value but need to - // save state to do the right thing for retry. - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything). - Return(consul.ErrRateLimited).Once().Run(incRateLimit) - // Then succeed on second call - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything). - Return(nil).Run(genCert).Once() - // Then be rate limited again on several further calls - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything). - Return(consul.ErrRateLimited).Twice().Run(incRateLimit) - // Then fine after that - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything). - Return(nil).Run(genCert) - - opts := cache.FetchOptions{MinIndex: 0, Timeout: 10 * time.Minute} - req := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web"} - - // First fetch should return rate limit error directly - client is expected to - // backoff itself. - fetchCh := TestFetchCh(t, typ, opts, req) - select { - case <-time.After(200 * time.Millisecond): - t.Fatal("shouldn't block longer than one jitter window for success") - case result := <-fetchCh: - switch v := result.(type) { - case error: - require.Error(t, v) - require.Equal(t, consul.ErrRateLimited.Error(), v.Error()) - case cache.FetchResult: - t.Fatalf("Expected error") - } - } - - // Second call should return correct cert immediately. - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case <-time.After(100 * time.Millisecond): - t.Fatal("shouldn't block waiting for fetch") - case result := <-fetchCh: - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - require.Equal(t, uint64(1), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - // Set MinIndex - opts.MinIndex = 1 - } - - // Send in new roots, which should trigger the next sign req. 
We need to take - // care to set the new root as active - caRoot2 := connect.TestCA(t, nil) - caRoot2.Active = true - caRoot.Active = false - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot2.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot2, - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: atomic.AddUint64(&idx, 1)}, - } - earliestRootDelivery := time.Now() - - // Sanity check state - require.Equal(t, uint64(1), atomic.LoadUint64(&rateLimitedRPCs)) - - // After root rotation jitter has been waited out, a new CSR will - // be attempted but will fail and return the previous cached result with no - // error since we will try again soon. - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case <-time.After(200 * time.Millisecond): - t.Fatal("shouldn't block too long waiting for fetch") - case result := <-fetchCh: - // We should block for _at least_ one jitter period since we set that to - // 100ms and in test override mode we always pick the max jitter not a - // random amount. - require.True(t, time.Since(earliestRootDelivery) > 100*time.Millisecond) - require.Equal(t, uint64(2), atomic.LoadUint64(&rateLimitedRPCs)) - - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - // 1 since this should still be the original cached result as we failed to - // get a new cert. - require.Equal(t, uint64(1), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } - - // Root rotation state is now only captured in the opts.LastResult.State so a - // subsequent call should also wait for 100ms and then attempt to generate a - // new cert since we failed last time. - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case <-time.After(200 * time.Millisecond): - t.Fatal("shouldn't block too long waiting for fetch") - case result := <-fetchCh: - // We should block for _at least_ two jitter periods now. - require.True(t, time.Since(earliestRootDelivery) > 200*time.Millisecond) - require.Equal(t, uint64(3), atomic.LoadUint64(&rateLimitedRPCs)) - - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - // 1 since this should still be the original cached result as we failed to - // get a new cert. - require.Equal(t, uint64(1), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } - - // Now we've had two rate limit failures and seen root rotation state work - // across both the blocking request that observed the rotation and the - // subsequent one. The next request should wait out the rest of the backoff - // and then actually fetch a new cert at last! - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case <-time.After(200 * time.Millisecond): - t.Fatal("shouldn't block too long waiting for fetch") - case result := <-fetchCh: - // We should block for _at least_ three jitter periods now. - require.True(t, time.Since(earliestRootDelivery) > 300*time.Millisecond) - require.Equal(t, uint64(3), atomic.LoadUint64(&rateLimitedRPCs)) - - v := mustFetchResult(t, result) - require.Equal(t, resp, v.Value) - // 3 since the rootCA change used 2 - require.Equal(t, uint64(3), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } -} - -// This test runs multiple concurrent callers watching different leaf certs and -// tries to ensure that the background root watch activity behaves correctly. 
-func TestConnectCALeaf_watchRootsDedupingMultipleCallers(t *testing.T) { - if testing.Short() { - t.Skip("too slow for testing.Short") - } - - if testingRace { - t.Skip("fails with -race because caRoot.Active is modified concurrently") - } - t.Parallel() - - rpc := TestRPC(t) - defer rpc.AssertExpectations(t) - - typ, rootsCh := testCALeafType(t, rpc) - defer close(rootsCh) - - caRoot := connect.TestCA(t, nil) - caRoot.Active = true - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: 1}, - } - - // Instrument ConnectCA.Sign to return signed cert - var idx uint64 - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil). - Run(func(args mock.Arguments) { - reply := args.Get(3).(*structs.IssuedCert) - // Note we will sign certs for same service name each time because - // otherwise we have to re-invent whole CSR endpoint here to be able to - // control things - parse PEM sign with right key etc. It doesn't matter - - // we use the CreateIndex to differentiate the "right" results. - leaf, _ := connect.TestLeaf(t, "web", caRoot) - reply.CertPEM = leaf - reply.ValidAfter = time.Now().Add(-1 * time.Hour) - reply.ValidBefore = time.Now().Add(11 * time.Hour) - reply.CreateIndex = atomic.AddUint64(&idx, 1) - reply.ModifyIndex = reply.CreateIndex - }) - - // n is the number of clients we'll run - n := 3 - - // setup/testDoneCh are used for coordinating clients such that each has - // initial cert delivered and is blocking before the root changes. It's not a - // wait group since we want to be able to timeout the main test goroutine if - // one of the clients gets stuck. Instead it's a buffered chan. - setupDoneCh := make(chan error, n) - testDoneCh := make(chan error, n) - // rootsUpdate is used to coordinate clients so they know when they should - // expect to see leaf renewed after root change. - rootsUpdatedCh := make(chan struct{}) - - // Create a function that models a single client. It should go through the - // steps of getting an initial cert and then watching for changes until root - // updates. - client := func(i int) { - // We'll reuse the fetch options and request - opts := cache.FetchOptions{MinIndex: 0, Timeout: 10 * time.Second} - req := &ConnectCALeafRequest{Datacenter: "dc1", Service: fmt.Sprintf("web-%d", i)} - - // First fetch should return immediately - fetchCh := TestFetchCh(t, typ, opts, req) - select { - case <-time.After(100 * time.Millisecond): - setupDoneCh <- fmt.Errorf("shouldn't block waiting for fetch") - return - case result := <-fetchCh: - v := mustFetchResult(t, result) - opts.LastResult = &v - } - - // Second fetch should block with set index - opts.MinIndex = 1 - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case result := <-fetchCh: - setupDoneCh <- fmt.Errorf("should not return: %#v", result) - return - case <-time.After(100 * time.Millisecond): - } - - // We're done with setup and the blocking call is still blocking in - // background. - setupDoneCh <- nil - - // Wait until all others are also done and roots change incase there are - // stragglers delaying the root update. 
- select { - case <-rootsUpdatedCh: - case <-time.After(200 * time.Millisecond): - testDoneCh <- fmt.Errorf("waited too long for root update") - return - } - - // Now we should see root update within a short period - select { - case <-time.After(100 * time.Millisecond): - testDoneCh <- fmt.Errorf("shouldn't block waiting for fetch") - return - case result := <-fetchCh: - v := mustFetchResult(t, result) - if opts.MinIndex == v.Value.(*structs.IssuedCert).CreateIndex { - testDoneCh <- fmt.Errorf("index must be different") - return - } - } - - testDoneCh <- nil - } - - // Sanity check the roots watcher is not running yet - assertRootsWatchCounts(t, typ, 0, 0) - - for i := 0; i < n; i++ { - go client(i) - } - - timeoutCh := time.After(200 * time.Millisecond) - - for i := 0; i < n; i++ { - select { - case <-timeoutCh: - t.Fatal("timed out waiting for clients") - case err := <-setupDoneCh: - if err != nil { - t.Fatalf(err.Error()) - } - } - } - - // Should be 3 clients running now, so the roots watcher should have started - // once and not stopped. - assertRootsWatchCounts(t, typ, 1, 0) - - // Now we deliver the root update - caRoot2 := connect.TestCA(t, nil) - caRoot2.Active = true - caRoot.Active = false - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot2.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot2, - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: atomic.AddUint64(&idx, 1)}, - } - // And notify clients - close(rootsUpdatedCh) - - timeoutCh = time.After(200 * time.Millisecond) - for i := 0; i < n; i++ { - select { - case <-timeoutCh: - t.Fatalf("timed out waiting for %d of %d clients to renew after root change", n-i, n) - case err := <-testDoneCh: - if err != nil { - t.Fatalf(err.Error()) - } - } - } - - // All active requests have returned the new cert so the rootsWatcher should - // have stopped. This is timing dependent though so retry a few times - retry.RunWith(retry.ThreeTimes(), t, func(r *retry.R) { - assertRootsWatchCounts(r, typ, 1, 1) - }) -} - -func assertRootsWatchCounts(t require.TestingT, typ *ConnectCALeaf, wantStarts, wantStops int) { - if tt, ok := t.(*testing.T); ok { - tt.Helper() - } - starts := atomic.LoadUint32(&typ.testRootWatchStartCount) - stops := atomic.LoadUint32(&typ.testRootWatchStopCount) - require.Equal(t, wantStarts, int(starts)) - require.Equal(t, wantStops, int(stops)) -} - -func mustFetchResult(t *testing.T, result interface{}) cache.FetchResult { - t.Helper() - switch v := result.(type) { - case error: - require.NoError(t, v) - case cache.FetchResult: - return v - default: - t.Fatalf("unexpected type from fetch %T", v) - } - return cache.FetchResult{} -} - -// Test that after an initial signing, an expiringLeaf will trigger a -// blocking query to resign. -func TestConnectCALeaf_expiringLeaf(t *testing.T) { - if testing.Short() { - t.Skip("too slow for testing.Short") - } - - t.Parallel() - - rpc := TestRPC(t) - defer rpc.AssertExpectations(t) - - typ, rootsCh := testCALeafType(t, rpc) - defer close(rootsCh) - - caRoot := connect.TestCA(t, nil) - caRoot.Active = true - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: 1}, - } - - // Instrument ConnectCA.Sign to - var resp *structs.IssuedCert - var idx uint64 - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil). 
- Run(func(args mock.Arguments) { - reply := args.Get(3).(*structs.IssuedCert) - reply.CreateIndex = atomic.AddUint64(&idx, 1) - reply.ModifyIndex = reply.CreateIndex - - leaf, _ := connect.TestLeaf(t, "web", caRoot) - reply.CertPEM = leaf - - if reply.CreateIndex == 1 { - // First call returns expired cert to prime cache with an expired one. - reply.ValidAfter = time.Now().Add(-13 * time.Hour) - reply.ValidBefore = time.Now().Add(-1 * time.Hour) - } else { - reply.ValidAfter = time.Now().Add(-1 * time.Hour) - reply.ValidBefore = time.Now().Add(11 * time.Hour) - } - - resp = reply - }) - - // We'll reuse the fetch options and request - opts := cache.FetchOptions{MinIndex: 0, Timeout: 10 * time.Second} - req := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web"} - - // First fetch should return immediately - fetchCh := TestFetchCh(t, typ, opts, req) - select { - case <-time.After(100 * time.Millisecond): - t.Fatal("shouldn't block waiting for fetch") - case result := <-fetchCh: - switch v := result.(type) { - case error: - require.NoError(t, v) - case cache.FetchResult: - require.Equal(t, resp, v.Value) - require.Equal(t, uint64(1), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } - } - - // Second fetch should return immediately despite there being - // no updated CA roots, because we issued an expired cert. - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case <-time.After(100 * time.Millisecond): - t.Fatal("shouldn't block waiting for fetch") - case result := <-fetchCh: - switch v := result.(type) { - case error: - require.NoError(t, v) - case cache.FetchResult: - require.Equal(t, resp, v.Value) - require.Equal(t, uint64(2), v.Index) - // Set the LastResult for subsequent fetches - opts.LastResult = &v - } - } - - // Third fetch should block since the cert is not expiring and - // we also didn't update CA certs. - opts.MinIndex = 2 - fetchCh = TestFetchCh(t, typ, opts, req) - select { - case result := <-fetchCh: - t.Fatalf("should not return: %#v", result) - case <-time.After(100 * time.Millisecond): - } -} - -func TestConnectCALeaf_DNSSANForService(t *testing.T) { - t.Parallel() - - rpc := TestRPC(t) - defer rpc.AssertExpectations(t) - - typ, rootsCh := testCALeafType(t, rpc) - defer close(rootsCh) - - caRoot := connect.TestCA(t, nil) - caRoot.Active = true - rootsCh <- structs.IndexedCARoots{ - ActiveRootID: caRoot.ID, - TrustDomain: "fake-trust-domain.consul", - Roots: []*structs.CARoot{ - caRoot, - }, - QueryMeta: structs.QueryMeta{Index: 1}, - } - - // Instrument ConnectCA.Sign to - var caReq *structs.CASignRequest - rpc.On("RPC", mock.Anything, "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil). - Run(func(args mock.Arguments) { - reply := args.Get(3).(*structs.IssuedCert) - leaf, _ := connect.TestLeaf(t, "web", caRoot) - reply.CertPEM = leaf - - caReq = args.Get(2).(*structs.CASignRequest) - }) - - opts := cache.FetchOptions{MinIndex: 0, Timeout: 10 * time.Second} - req := &ConnectCALeafRequest{ - Datacenter: "dc1", - Service: "web", - DNSSAN: []string{"test.example.com"}, - } - _, err := typ.Fetch(opts, req) - require.NoError(t, err) - - pemBlock, _ := pem.Decode([]byte(caReq.CSR)) - csr, err := x509.ParseCertificateRequest(pemBlock.Bytes) - require.NoError(t, err) - require.Equal(t, csr.DNSNames, []string{"test.example.com"}) -} - -// testConnectCaRoot wraps ConnectCARoot to disable refresh so that the gated -// channel controls the request directly. 
Otherwise, we get background refreshes and -// it screws up the ordering of the channel reads of the testGatedRootsRPC -// implementation. -type testConnectCaRoot struct { - ConnectCARoot -} - -func (r testConnectCaRoot) RegisterOptions() cache.RegisterOptions { - return cache.RegisterOptions{ - Refresh: false, - SupportsBlocking: true, - } -} - -// testCALeafType returns a *ConnectCALeaf that is pre-configured to -// use the given RPC implementation for "ConnectCA.Sign" operations. -func testCALeafType(t *testing.T, rpc RPC) (*ConnectCALeaf, chan structs.IndexedCARoots) { - // This creates an RPC implementation that will block until the - // value is sent on the channel. This lets us control when the - // next values show up. - rootsCh := make(chan structs.IndexedCARoots, 10) - rootsRPC := &testGatedRootsRPC{ValueCh: rootsCh} - - // Create a cache - c := cache.New(cache.Options{}) - c.RegisterType(ConnectCARootName, &testConnectCaRoot{ - ConnectCARoot: ConnectCARoot{RPC: rootsRPC}, - }) - // Create the leaf type - return &ConnectCALeaf{ - RPC: rpc, - Cache: c, - Datacenter: "dc1", - // Override the root-change spread so we don't have to wait up to 20 seconds - // to see root changes work. Can be changed back for specific tests that - // need to test this, Note it's not 0 since that used default but is - // effectively the same. - TestOverrideCAChangeInitialDelay: 1 * time.Microsecond, - }, rootsCh -} - -// testGatedRootsRPC will send each subsequent value on the channel as the -// RPC response, blocking if it is waiting for a value on the channel. This -// can be used to control when background fetches are returned and what they -// return. -// -// This should be used with Refresh = false for the registration options so -// automatic refreshes don't mess up the channel read ordering. 
-type testGatedRootsRPC struct { - ValueCh chan structs.IndexedCARoots -} - -func (r *testGatedRootsRPC) RPC(ctx context.Context, method string, args interface{}, reply interface{}) error { - if method != "ConnectCA.Roots" { - return fmt.Errorf("invalid RPC method: %s", method) - } - - replyReal := reply.(*structs.IndexedCARoots) - *replyReal = <-r.ValueCh - return nil -} - -func TestConnectCALeaf_Key(t *testing.T) { - key := func(r ConnectCALeafRequest) string { - return r.Key() - } - t.Run("service", func(t *testing.T) { - t.Run("name", func(t *testing.T) { - r1 := key(ConnectCALeafRequest{Service: "web"}) - r2 := key(ConnectCALeafRequest{Service: "api"}) - require.True(t, strings.HasPrefix(r1, "service:"), "Key %s does not start with service:", r1) - require.True(t, strings.HasPrefix(r2, "service:"), "Key %s does not start with service:", r2) - require.NotEqual(t, r1, r2, "Cache keys for different services should not be equal") - }) - t.Run("dns-san", func(t *testing.T) { - r3 := key(ConnectCALeafRequest{Service: "foo", DNSSAN: []string{"a.com"}}) - r4 := key(ConnectCALeafRequest{Service: "foo", DNSSAN: []string{"b.com"}}) - require.NotEqual(t, r3, r4, "Cache keys for different DNSSAN should not be equal") - }) - t.Run("ip-san", func(t *testing.T) { - r5 := key(ConnectCALeafRequest{Service: "foo", IPSAN: []net.IP{net.ParseIP("192.168.4.139")}}) - r6 := key(ConnectCALeafRequest{Service: "foo", IPSAN: []net.IP{net.ParseIP("192.168.4.140")}}) - require.NotEqual(t, r5, r6, "Cache keys for different IPSAN should not be equal") - }) - }) - t.Run("agent", func(t *testing.T) { - t.Run("name", func(t *testing.T) { - r1 := key(ConnectCALeafRequest{Agent: "abc"}) - require.True(t, strings.HasPrefix(r1, "agent:"), "Key %s does not start with agent:", r1) - }) - t.Run("dns-san ignored", func(t *testing.T) { - r3 := key(ConnectCALeafRequest{Agent: "foo", DNSSAN: []string{"a.com"}}) - r4 := key(ConnectCALeafRequest{Agent: "foo", DNSSAN: []string{"b.com"}}) - require.Equal(t, r3, r4, "DNSSAN is ignored for agent type") - }) - t.Run("ip-san ignored", func(t *testing.T) { - r5 := key(ConnectCALeafRequest{Agent: "foo", IPSAN: []net.IP{net.ParseIP("192.168.4.139")}}) - r6 := key(ConnectCALeafRequest{Agent: "foo", IPSAN: []net.IP{net.ParseIP("192.168.4.140")}}) - require.Equal(t, r5, r6, "IPSAN is ignored for agent type") - }) - }) - t.Run("kind", func(t *testing.T) { - t.Run("invalid", func(t *testing.T) { - r1 := key(ConnectCALeafRequest{Kind: "terminating-gateway"}) - require.Empty(t, r1) - }) - t.Run("mesh-gateway", func(t *testing.T) { - t.Run("normal", func(t *testing.T) { - r1 := key(ConnectCALeafRequest{Kind: "mesh-gateway"}) - require.True(t, strings.HasPrefix(r1, "kind:"), "Key %s does not start with kind:", r1) - }) - t.Run("dns-san", func(t *testing.T) { - r3 := key(ConnectCALeafRequest{Kind: "mesh-gateway", DNSSAN: []string{"a.com"}}) - r4 := key(ConnectCALeafRequest{Kind: "mesh-gateway", DNSSAN: []string{"b.com"}}) - require.NotEqual(t, r3, r4, "Cache keys for different DNSSAN should not be equal") - }) - t.Run("ip-san", func(t *testing.T) { - r5 := key(ConnectCALeafRequest{Kind: "mesh-gateway", IPSAN: []net.IP{net.ParseIP("192.168.4.139")}}) - r6 := key(ConnectCALeafRequest{Kind: "mesh-gateway", IPSAN: []net.IP{net.ParseIP("192.168.4.140")}}) - require.NotEqual(t, r5, r6, "Cache keys for different IPSAN should not be equal") - }) - }) - }) - t.Run("server", func(t *testing.T) { - r1 := key(ConnectCALeafRequest{ - Server: true, - Datacenter: "us-east", - }) - require.True(t, 
strings.HasPrefix(r1, "server:"), "Key %s does not start with server:", r1) - }) -} diff --git a/agent/cache-types/norace_test.go b/agent/cache-types/norace_test.go deleted file mode 100644 index 3f316d5d3ffa4..0000000000000 --- a/agent/cache-types/norace_test.go +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright (c) HashiCorp, Inc. -// SPDX-License-Identifier: MPL-2.0 - -//go:build !race -// +build !race - -package cachetype - -const testingRace = false diff --git a/agent/cache-types/race_test.go b/agent/cache-types/race_test.go deleted file mode 100644 index 88dcf82a4c4b6..0000000000000 --- a/agent/cache-types/race_test.go +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright (c) HashiCorp, Inc. -// SPDX-License-Identifier: MPL-2.0 - -//go:build race -// +build race - -package cachetype - -const testingRace = true diff --git a/agent/config/builder.go b/agent/config/builder.go index 665688b8c8641..5d191ce8b3ac3 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -984,7 +984,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) { AutoEncryptIPSAN: autoEncryptIPSAN, AutoEncryptAllowTLS: autoEncryptAllowTLS, AutoConfig: autoConfig, - Cloud: b.cloudConfigVal(c.Cloud), + Cloud: b.cloudConfigVal(c), ConnectEnabled: connectEnabled, ConnectCAProvider: connectCAProvider, ConnectCAConfig: connectCAConfig, @@ -2541,21 +2541,26 @@ func validateAutoConfigAuthorizer(rt RuntimeConfig) error { return nil } -func (b *builder) cloudConfigVal(v *CloudConfigRaw) hcpconfig.CloudConfig { +func (b *builder) cloudConfigVal(v Config) hcpconfig.CloudConfig { val := hcpconfig.CloudConfig{ ResourceID: os.Getenv("HCP_RESOURCE_ID"), } - if v == nil { + // Node id might get overridden in setup.go:142 + nodeID := stringVal(v.NodeID) + val.NodeID = types.NodeID(nodeID) + val.NodeName = b.nodeName(v.NodeName) + + if v.Cloud == nil { return val } - val.ClientID = stringVal(v.ClientID) - val.ClientSecret = stringVal(v.ClientSecret) - val.AuthURL = stringVal(v.AuthURL) - val.Hostname = stringVal(v.Hostname) - val.ScadaAddress = stringVal(v.ScadaAddress) + val.ClientID = stringVal(v.Cloud.ClientID) + val.ClientSecret = stringVal(v.Cloud.ClientSecret) + val.AuthURL = stringVal(v.Cloud.AuthURL) + val.Hostname = stringVal(v.Cloud.Hostname) + val.ScadaAddress = stringVal(v.Cloud.ScadaAddress) - if resourceID := stringVal(v.ResourceID); resourceID != "" { + if resourceID := stringVal(v.Cloud.ResourceID); resourceID != "" { val.ResourceID = resourceID } return val diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index c1cd85ac502f8..f868ea964b180 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -619,6 +619,7 @@ func TestLoad_IntegrationWithFlags(t *testing.T) { rt.NodeName = "a" rt.TLS.NodeName = "a" rt.DataDir = dataDir + rt.Cloud.NodeName = "a" }, }) run(t, testCase{ @@ -630,6 +631,7 @@ func TestLoad_IntegrationWithFlags(t *testing.T) { expected: func(rt *RuntimeConfig) { rt.NodeID = "a" rt.DataDir = dataDir + rt.Cloud.NodeID = "a" }, }) run(t, testCase{ @@ -2319,6 +2321,8 @@ func TestLoad_IntegrationWithFlags(t *testing.T) { rt.Cloud = hcpconfig.CloudConfig{ // ID is only populated from env if not populated from other sources. ResourceID: "env-id", + NodeName: "thehostname", + NodeID: "", } // server things @@ -2359,6 +2363,7 @@ func TestLoad_IntegrationWithFlags(t *testing.T) { rt.Cloud = hcpconfig.CloudConfig{ // ID is only populated from env if not populated from other sources.
ResourceID: "file-id", + NodeName: "thehostname", } // server things @@ -6317,6 +6322,8 @@ func TestLoad_FullConfig(t *testing.T) { Hostname: "DH4bh7aC", AuthURL: "332nCdR2", ScadaAddress: "aoeusth232", + NodeID: types.NodeID("AsUIlw99"), + NodeName: "otlLxGaI", }, DNSAddrs: []net.Addr{tcpAddr("93.95.95.81:7001"), udpAddr("93.95.95.81:7001")}, DNSARecordLimit: 29907, diff --git a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden index b6ee9a98129ff..6bb08ff95feda 100644 --- a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden +++ b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden @@ -134,7 +134,9 @@ "ManagementToken": "hidden", "ResourceID": "cluster1", "ScadaAddress": "", - "TLSConfig": null + "TLSConfig": null, + "NodeID": "", + "NodeName": "" }, "ConfigEntryBootstrap": [], "ConnectCAConfig": {}, diff --git a/agent/consul/discoverychain/compile.go b/agent/consul/discoverychain/compile.go index 634ab4a6ff769..20227db3ef195 100644 --- a/agent/consul/discoverychain/compile.go +++ b/agent/consul/discoverychain/compile.go @@ -1009,19 +1009,25 @@ RESOLVE_AGAIN: Type: structs.DiscoveryGraphNodeTypeResolver, Name: target.ID, Resolver: &structs.DiscoveryResolver{ - Default: resolver.IsDefault(), - Target: target.ID, - ConnectTimeout: connectTimeout, - RequestTimeout: resolver.RequestTimeout, - PrioritizeByLocality: resolver.PrioritizeByLocality.ToDiscovery(), + Default: resolver.IsDefault(), + Target: target.ID, + ConnectTimeout: connectTimeout, + RequestTimeout: resolver.RequestTimeout, }, LoadBalancer: resolver.LoadBalancer, } - // Merge default values from the proxy defaults proxyDefault := c.entries.GetProxyDefaults(targetID.PartitionOrDefault()) - if proxyDefault != nil && node.Resolver.PrioritizeByLocality == nil { - node.Resolver.PrioritizeByLocality = proxyDefault.PrioritizeByLocality.ToDiscovery() + + // Only set PrioritizeByLocality for targets in the same partition. 
+ if target.Partition == c.evaluateInPartition && target.Peer == "" { + if resolver.PrioritizeByLocality != nil { + target.PrioritizeByLocality = resolver.PrioritizeByLocality.ToDiscovery() + } + + if target.PrioritizeByLocality == nil && proxyDefault != nil { + target.PrioritizeByLocality = proxyDefault.PrioritizeByLocality.ToDiscovery() + } } target.Subset = resolver.Subsets[target.ServiceSubset] diff --git a/agent/consul/discoverychain/compile_test.go b/agent/consul/discoverychain/compile_test.go index 692c0c4fdacac..ca39aa236fc95 100644 --- a/agent/consul/discoverychain/compile_test.go +++ b/agent/consul/discoverychain/compile_test.go @@ -3301,6 +3301,7 @@ func newTarget(opts structs.DiscoveryTargetOpts, modFn func(t *structs.Discovery t.SNI = connect.TargetSNI(t, "trustdomain.consul") t.Name = t.SNI t.ConnectTimeout = 5 * time.Second // default + t.PrioritizeByLocality = opts.PrioritizeByLocality if modFn != nil { modFn(t) } diff --git a/agent/consul/leader.go b/agent/consul/leader.go index f2905e6d8f44e..c91655c5c863e 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -364,6 +364,8 @@ func (s *Server) revokeLeadership() { s.revokeEnterpriseLeadership() + s.stopDeferredDeletion() + s.stopFederationStateAntiEntropy() s.stopFederationStateReplication() diff --git a/agent/consul/operator_raft_endpoint.go b/agent/consul/operator_raft_endpoint.go index f619b611f7da9..7b0bcbc5cc035 100644 --- a/agent/consul/operator_raft_endpoint.go +++ b/agent/consul/operator_raft_endpoint.go @@ -48,6 +48,12 @@ func (op *Operator) RaftGetConfiguration(args *structs.DCSpecificRequest, reply serverMap[raft.ServerAddress(addr)] = member } + serverIDLastIndexMap := make(map[raft.ServerID]uint64) + + for _, serverState := range op.srv.autopilot.GetState().Servers { + serverIDLastIndexMap[serverState.Server.ID] = serverState.Stats.LastIndex + } + // Fill out the reply. 
leader := op.srv.raft.Leader() reply.Index = future.Index() @@ -66,6 +72,7 @@ func (op *Operator) RaftGetConfiguration(args *structs.DCSpecificRequest, reply Leader: server.Address == leader, Voter: server.Suffrage == raft.Voter, ProtocolVersion: raftProtocolVersion, + LastIndex: serverIDLastIndexMap[server.ID], } reply.Servers = append(reply.Servers, entry) } diff --git a/agent/consul/operator_raft_endpoint_test.go b/agent/consul/operator_raft_endpoint_test.go index 7ce5b6e946c86..7242c40e6c45a 100644 --- a/agent/consul/operator_raft_endpoint_test.go +++ b/agent/consul/operator_raft_endpoint_test.go @@ -50,6 +50,13 @@ func TestOperator_RaftGetConfiguration(t *testing.T) { if len(future.Configuration().Servers) != 1 { t.Fatalf("bad: %v", future.Configuration().Servers) } + + serverIDLastIndexMap := make(map[raft.ServerID]uint64) + + for _, serverState := range s1.autopilot.GetState().Servers { + serverIDLastIndexMap[serverState.Server.ID] = serverState.Stats.LastIndex + } + me := future.Configuration().Servers[0] expected := structs.RaftConfigurationResponse{ Servers: []*structs.RaftServer{ @@ -60,6 +67,7 @@ func TestOperator_RaftGetConfiguration(t *testing.T) { Leader: true, Voter: true, ProtocolVersion: "3", + LastIndex: serverIDLastIndexMap[me.ID], }, }, Index: future.Index(), @@ -113,6 +121,10 @@ func TestOperator_RaftGetConfiguration_ACLDeny(t *testing.T) { if len(future.Configuration().Servers) != 1 { t.Fatalf("bad: %v", future.Configuration().Servers) } + serverIDLastIndexMap := make(map[raft.ServerID]uint64) + for _, serverState := range s1.autopilot.GetState().Servers { + serverIDLastIndexMap[serverState.Server.ID] = serverState.Stats.LastIndex + } me := future.Configuration().Servers[0] expected := structs.RaftConfigurationResponse{ Servers: []*structs.RaftServer{ @@ -123,6 +135,7 @@ func TestOperator_RaftGetConfiguration_ACLDeny(t *testing.T) { Leader: true, Voter: true, ProtocolVersion: "3", + LastIndex: serverIDLastIndexMap[me.ID], }, }, Index: future.Index(), diff --git a/agent/consul/servercert/manager.go b/agent/consul/servercert/manager.go index d343db4a08a3a..75c2a4f276082 100644 --- a/agent/consul/servercert/manager.go +++ b/agent/consul/servercert/manager.go @@ -8,22 +8,23 @@ import ( "fmt" "time" + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-memdb" + "github.com/hashicorp/consul/agent/cache" - cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/lib/retry" - "github.com/hashicorp/go-hclog" - "github.com/hashicorp/go-memdb" ) // Correlation ID for leaf cert watches. const leafWatchID = "leaf" -// Cache is an interface to represent the necessary methods of the agent/cache.Cache. +// LeafCertManager is an interface to represent the necessary methods of the agent/leafcert.Manager. // It is used to request and renew the server leaf certificate. -type Cache interface { - Notify(ctx context.Context, t string, r cache.Request, correlationID string, ch chan<- cache.UpdateEvent) error +type LeafCertManager interface { + Notify(ctx context.Context, req *leafcert.ConnectCALeafRequest, correlationID string, ch chan<- cache.UpdateEvent) error } // TLSConfigurator is an interface to represent the necessary methods of the tlsutil.Configurator. 
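The hunk above narrows the server cert manager's dependency from the full agent cache to a small `LeafCertManager` interface. As a rough, illustrative sketch of how a consumer of that interface might subscribe to server leaf certificate updates (the helper name `watchServerLeaf`, the channel buffer size, and the `"leaf"` literal are assumptions for illustration, not code from this change):

```go
package servercert

import (
	"context"

	"github.com/hashicorp/consul/agent/cache"
	"github.com/hashicorp/consul/agent/leafcert"
)

// watchServerLeaf is a hypothetical helper showing how a consumer of the
// LeafCertManager interface defined above might register for server leaf
// certificate updates.
func watchServerLeaf(ctx context.Context, certs LeafCertManager, datacenter string) (<-chan cache.UpdateEvent, error) {
	ch := make(chan cache.UpdateEvent, 1)
	req := &leafcert.ConnectCALeafRequest{
		Datacenter: datacenter,
		Server:     true, // request a server certificate rather than a service leaf
	}
	// "leaf" mirrors the leafWatchID correlation ID used elsewhere in this package.
	if err := certs.Notify(ctx, req, "leaf", ch); err != nil {
		return nil, err
	}
	return ch, nil
}
```

Any implementation of the interface works here, including the test fake further down in this diff; production wiring presumably passes the concrete manager from the new agent/leafcert package.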
@@ -52,7 +53,7 @@ type Config struct { type Deps struct { Config Config Logger hclog.Logger - Cache Cache + LeafCertManager LeafCertManager GetStore func() Store TLSConfigurator TLSConfigurator waiter retry.Waiter @@ -67,9 +68,8 @@ type CertManager struct { // config contains agent configuration necessary for the cert manager to operate. config Config - // cache provides an API to issue internal RPC requests and receive notifications - // when there are changes. - cache Cache + // leafCerts grants access to request and renew the server leaf cert. + leafCerts LeafCertManager // cacheUpdateCh receives notifications of cache update events for resources watched. cacheUpdateCh chan cache.UpdateEvent @@ -85,10 +85,13 @@ type CertManager struct { } func NewCertManager(deps Deps) *CertManager { + if deps.LeafCertManager == nil { + panic("LeafCertManager is required") + } return &CertManager{ config: deps.Config, logger: deps.Logger, - cache: deps.Cache, + leafCerts: deps.LeafCertManager, cacheUpdateCh: make(chan cache.UpdateEvent, 1), getStore: deps.GetStore, tlsConfigurator: deps.TLSConfigurator, @@ -156,12 +159,12 @@ func (m *CertManager) watchServerToken(ctx context.Context) { cancel() notifyCtx, cancel = context.WithCancel(ctx) - req := cachetype.ConnectCALeafRequest{ + req := leafcert.ConnectCALeafRequest{ Datacenter: m.config.Datacenter, Token: token.Value, Server: true, } - if err := m.cache.Notify(notifyCtx, cachetype.ConnectCALeafName, &req, leafWatchID, m.cacheUpdateCh); err != nil { + if err := m.leafCerts.Notify(notifyCtx, &req, leafWatchID, m.cacheUpdateCh); err != nil { return fmt.Errorf("failed to setup leaf cert notifications: %w", err) } @@ -174,11 +177,11 @@ func (m *CertManager) watchServerToken(ctx context.Context) { } func (m *CertManager) watchLeafCert(ctx context.Context) error { - req := cachetype.ConnectCALeafRequest{ + req := leafcert.ConnectCALeafRequest{ Datacenter: m.config.Datacenter, Server: true, } - if err := m.cache.Notify(ctx, cachetype.ConnectCALeafName, &req, leafWatchID, m.cacheUpdateCh); err != nil { + if err := m.leafCerts.Notify(ctx, &req, leafWatchID, m.cacheUpdateCh); err != nil { return fmt.Errorf("failed to setup leaf cert notifications: %w", err) } diff --git a/agent/consul/servercert/manager_test.go b/agent/consul/servercert/manager_test.go index 048e0742294f2..dfadfe4b953fb 100644 --- a/agent/consul/servercert/manager_test.go +++ b/agent/consul/servercert/manager_test.go @@ -8,13 +8,15 @@ import ( "testing" "time" + "github.com/hashicorp/go-memdb" + "github.com/stretchr/testify/require" + "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/lib/retry" "github.com/hashicorp/consul/sdk/testutil" - "github.com/hashicorp/go-memdb" - "github.com/stretchr/testify/require" ) type fakeStore struct { @@ -109,7 +111,7 @@ type watchInfo struct { token string } -type fakeCache struct { +type fakeLeafCertManager struct { updateCh chan<- cache.UpdateEvent // watched is a map of watched correlation IDs to the ACL token of the request. 
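Because `NewCertManager` now panics when `Deps.LeafCertManager` is nil, callers have to wire the dependency explicitly. Below is a minimal sketch of that wiring, under the assumption that `*leafcert.Manager` is the production implementation of the interface (that type is not shown in this diff); the constructor name, argument list, and literal values are placeholders:

```go
package servercert

import (
	"github.com/hashicorp/go-hclog"

	"github.com/hashicorp/consul/agent/leafcert"
)

// Sketch only: a compile-time check that the assumed production type
// satisfies the new LeafCertManager interface.
var _ LeafCertManager = (*leafcert.Manager)(nil)

// newServerCertManager is a hypothetical wrapper that exists only to show the
// explicit wiring NewCertManager now requires.
func newServerCertManager(logger hclog.Logger, certs LeafCertManager, tlsCfg TLSConfigurator, store Store) *CertManager {
	return NewCertManager(Deps{
		Config:          Config{Datacenter: "dc1", ACLsEnabled: true},
		Logger:          logger,
		LeafCertManager: certs, // required: NewCertManager panics when nil
		TLSConfigurator: tlsCfg,
		GetStore:        func() Store { return store },
	})
}
```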
@@ -120,7 +122,7 @@ type fakeCache struct { syncCh chan struct{} } -func (c *fakeCache) triggerLeafUpdate() { +func (c *fakeLeafCertManager) triggerLeafUpdate() { c.updateCh <- cache.UpdateEvent{ CorrelationID: leafWatchID, Result: &structs.IssuedCert{ @@ -131,14 +133,14 @@ func (c *fakeCache) triggerLeafUpdate() { } } -func (c *fakeCache) Notify(ctx context.Context, t string, r cache.Request, correlationID string, ch chan<- cache.UpdateEvent) error { - c.watched[correlationID] = watchInfo{ctx: ctx, token: r.CacheInfo().Token} +func (c *fakeLeafCertManager) Notify(ctx context.Context, r *leafcert.ConnectCALeafRequest, correlationID string, ch chan<- cache.UpdateEvent) error { + c.watched[correlationID] = watchInfo{ctx: ctx, token: r.Token} c.updateCh = ch c.syncCh <- struct{}{} return nil } -func (c *fakeCache) timeoutIfNotUpdated(t *testing.T) error { +func (c *fakeLeafCertManager) timeoutIfNotUpdated(t *testing.T) error { t.Helper() select { @@ -159,7 +161,7 @@ func testWaiter() retry.Waiter { func TestCertManager_ACLsDisabled(t *testing.T) { tlsConfigurator := fakeTLSConfigurator{syncCh: make(chan struct{}, 1)} - cache := fakeCache{watched: make(map[string]watchInfo), syncCh: make(chan struct{}, 1)} + leafCerts := fakeLeafCertManager{watched: make(map[string]watchInfo), syncCh: make(chan struct{}, 1)} store := fakeStore{ conf: make(chan *structs.CAConfiguration, 1), tokenEntry: make(chan *structs.SystemMetadataEntry, 1), @@ -172,7 +174,7 @@ func TestCertManager_ACLsDisabled(t *testing.T) { ACLsEnabled: false, }, TLSConfigurator: &tlsConfigurator, - Cache: &cache, + LeafCertManager: &leafCerts, GetStore: func() Store { return &store }, }) @@ -185,11 +187,11 @@ func TestCertManager_ACLsDisabled(t *testing.T) { require.Empty(t, tlsConfigurator.cert) require.Empty(t, tlsConfigurator.peeringServerName) - require.Contains(t, cache.watched, leafWatchID) + require.Contains(t, leafCerts.watched, leafWatchID) }) testutil.RunStep(t, "leaf cert update", func(t *testing.T) { - cache.triggerLeafUpdate() + leafCerts.triggerLeafUpdate() // Wait for the update to arrive. 
require.NoError(t, tlsConfigurator.timeoutIfNotUpdated(t)) @@ -214,7 +216,7 @@ func TestCertManager_ACLsDisabled(t *testing.T) { func TestCertManager_ACLsEnabled(t *testing.T) { tlsConfigurator := fakeTLSConfigurator{syncCh: make(chan struct{}, 1)} - cache := fakeCache{watched: make(map[string]watchInfo), syncCh: make(chan struct{}, 1)} + leafCerts := fakeLeafCertManager{watched: make(map[string]watchInfo), syncCh: make(chan struct{}, 1)} store := fakeStore{ conf: make(chan *structs.CAConfiguration, 1), tokenEntry: make(chan *structs.SystemMetadataEntry, 1), @@ -227,7 +229,7 @@ func TestCertManager_ACLsEnabled(t *testing.T) { ACLsEnabled: true, }, TLSConfigurator: &tlsConfigurator, - Cache: &cache, + LeafCertManager: &leafCerts, GetStore: func() Store { return &store }, }) @@ -240,7 +242,7 @@ func TestCertManager_ACLsEnabled(t *testing.T) { require.Empty(t, tlsConfigurator.cert) require.Empty(t, tlsConfigurator.peeringServerName) - require.Empty(t, cache.watched) + require.Empty(t, leafCerts.watched) }) var leafCtx context.Context @@ -249,16 +251,16 @@ func TestCertManager_ACLsEnabled(t *testing.T) { testutil.RunStep(t, "server token update", func(t *testing.T) { store.setServerToken("first-secret", tokenCanceler) - require.NoError(t, cache.timeoutIfNotUpdated(t)) + require.NoError(t, leafCerts.timeoutIfNotUpdated(t)) - require.Contains(t, cache.watched, leafWatchID) - require.Equal(t, "first-secret", cache.watched[leafWatchID].token) + require.Contains(t, leafCerts.watched, leafWatchID) + require.Equal(t, "first-secret", leafCerts.watched[leafWatchID].token) - leafCtx = cache.watched[leafWatchID].ctx + leafCtx = leafCerts.watched[leafWatchID].ctx }) testutil.RunStep(t, "leaf cert update", func(t *testing.T) { - cache.triggerLeafUpdate() + leafCerts.triggerLeafUpdate() // Wait for the update to arrive. require.NoError(t, tlsConfigurator.timeoutIfNotUpdated(t)) @@ -276,15 +278,15 @@ func TestCertManager_ACLsEnabled(t *testing.T) { // Fire the existing WatchSet to simulate a state store update. tokenCanceler <- struct{}{} - // The leaf watch in the cache should have been reset. - require.NoError(t, cache.timeoutIfNotUpdated(t)) + // The leaf watch in the leafCerts should have been reset. + require.NoError(t, leafCerts.timeoutIfNotUpdated(t)) // The original leaf watch context should have been canceled. require.Error(t, leafCtx.Err()) // A new leaf watch is expected with the new token. 
- require.Contains(t, cache.watched, leafWatchID) - require.Equal(t, "second-secret", cache.watched[leafWatchID].token) + require.Contains(t, leafCerts.watched, leafWatchID) + require.Equal(t, "second-secret", leafCerts.watched[leafWatchID].token) }) testutil.RunStep(t, "ca config update", func(t *testing.T) { diff --git a/agent/consul/state/config_entry.go b/agent/consul/state/config_entry.go index 340a53f1192cd..9abaafc390d3a 100644 --- a/agent/consul/state/config_entry.go +++ b/agent/consul/state/config_entry.go @@ -634,6 +634,12 @@ func validateProposedConfigEntryInGraph( case structs.TCPRoute: case structs.RateLimitIPConfig: case structs.JWTProvider: + if newEntry == nil && existingEntry != nil { + err := validateJWTProviderIsReferenced(tx, kindName, existingEntry) + if err != nil { + return err + } + } default: return fmt.Errorf("unhandled kind %q during validation of %q", kindName.Kind, kindName.Name) } @@ -704,6 +710,66 @@ func getReferencedProviderNames(j *structs.IntentionJWTRequirement, s []*structs return providerNames } +// validateJWTProviderIsReferenced iterates over intentions to determine if the provider being +// deleted is referenced by any intention. +// +// This could be an expensive operation based on the number of intentions. We purposely set this to only +// run on delete and don't expect this to be called often. +func validateJWTProviderIsReferenced(tx ReadTxn, kn configentry.KindName, ce structs.ConfigEntry) error { + meta := acl.NewEnterpriseMetaWithPartition( + kn.EnterpriseMeta.PartitionOrDefault(), + acl.DefaultNamespaceName, + ) + entry, ok := ce.(*structs.JWTProviderConfigEntry) + if !ok { + return fmt.Errorf("invalid jwt provider config entry: %T", entry) + } + + _, ixnEntries, err := configEntriesByKindTxn(tx, nil, structs.ServiceIntentions, &meta) + if err != nil { + return err + } + + err = findJWTProviderNameReferences(ixnEntries, entry.Name) + if err != nil { + return err + } + + return nil +} + +func findJWTProviderNameReferences(entries []structs.ConfigEntry, pName string) error { + errMsg := "cannot delete jwt provider config entry referenced by an intention. Provider name: %s, intention name: %s" + for _, entry := range entries { + ixn, ok := entry.(*structs.ServiceIntentionsConfigEntry) + if !ok { + return fmt.Errorf("type %T is not a service intentions config entry", entry) + } + + if ixn.JWT != nil { + for _, prov := range ixn.JWT.Providers { + if prov.Name == pName { + return fmt.Errorf(errMsg, pName, ixn.Name) + } + } + } + + for _, s := range ixn.Sources { + for _, perm := range s.Permissions { + if perm.JWT == nil { + continue + } + for _, prov := range perm.JWT.Providers { + if prov.Name == pName { + return fmt.Errorf(errMsg, pName, ixn.Name) + } + } + } + } + } + return nil +} + // This fetches all the jwt-providers config entries and iterates over them // to validate that any provider referenced exists. 
// This is okay because we assume there are very few jwt-providers per partition diff --git a/agent/consul/state/config_entry_test.go b/agent/consul/state/config_entry_test.go index 572719dc4b1f1..d72f12c876890 100644 --- a/agent/consul/state/config_entry_test.go +++ b/agent/consul/state/config_entry_test.go @@ -3714,3 +3714,178 @@ func TestStateStore_DiscoveryChain_AttachVirtualIPs(t *testing.T) { require.Equal(t, []string{"2.2.2.2", "3.3.3.3"}, chain.ManualVirtualIPs) } + +func TestFindJWTProviderNameReferences(t *testing.T) { + oktaProvider := structs.IntentionJWTProvider{Name: "okta"} + auth0Provider := structs.IntentionJWTProvider{Name: "auth0"} + cases := map[string]struct { + entries []structs.ConfigEntry + providerName string + expectedError string + }{ + "no jwt at any level": { + entries: []structs.ConfigEntry{}, + providerName: "okta", + }, + "provider not referenced": { + entries: []structs.ConfigEntry{ + &structs.ServiceIntentionsConfigEntry{ + Kind: "service-intentions", + Name: "api-intention", + JWT: &structs.IntentionJWTRequirement{ + Providers: []*structs.IntentionJWTProvider{&oktaProvider, &auth0Provider}, + }, + }, + }, + providerName: "fake-provider", + }, + "only top level jwt with no permissions": { + entries: []structs.ConfigEntry{ + &structs.ServiceIntentionsConfigEntry{ + Kind: "service-intentions", + Name: "api-intention", + JWT: &structs.IntentionJWTRequirement{ + Providers: []*structs.IntentionJWTProvider{&oktaProvider, &auth0Provider}, + }, + }, + }, + providerName: "okta", + expectedError: "cannot delete jwt provider config entry referenced by an intention. Provider name: okta, intention name: api-intention", + }, + "top level jwt with permissions": { + entries: []structs.ConfigEntry{ + &structs.ServiceIntentionsConfigEntry{ + Kind: "service-intentions", + Name: "api-intention", + JWT: &structs.IntentionJWTRequirement{ + Providers: []*structs.IntentionJWTProvider{&oktaProvider}, + }, + Sources: []*structs.SourceIntention{ + { + Name: "api", + Action: "allow", + Permissions: []*structs.IntentionPermission{ + { + Action: "allow", + JWT: &structs.IntentionJWTRequirement{ + Providers: []*structs.IntentionJWTProvider{&oktaProvider}, + }, + }, + }, + }, + { + Name: "serv", + Action: "allow", + Permissions: []*structs.IntentionPermission{ + { + Action: "allow", + JWT: &structs.IntentionJWTRequirement{ + Providers: []*structs.IntentionJWTProvider{&auth0Provider}, + }, + }, + }, + }, + { + Name: "web", + Action: "allow", + Permissions: []*structs.IntentionPermission{ + {Action: "allow"}, + }, + }, + }, + }, + }, + providerName: "auth0", + expectedError: "cannot delete jwt provider config entry referenced by an intention. 
Provider name: auth0, intention name: api-intention", + }, + "no top level jwt and existing permissions": { + entries: []structs.ConfigEntry{ + &structs.ServiceIntentionsConfigEntry{ + Kind: "service-intentions", + Name: "api-intention", + Sources: []*structs.SourceIntention{ + { + Name: "api", + Action: "allow", + Permissions: []*structs.IntentionPermission{ + { + Action: "allow", + JWT: &structs.IntentionJWTRequirement{ + Providers: []*structs.IntentionJWTProvider{&oktaProvider}, + }, + }, + }, + }, + { + Name: "serv", + Action: "allow", + Permissions: []*structs.IntentionPermission{ + { + Action: "allow", + JWT: &structs.IntentionJWTRequirement{ + Providers: []*structs.IntentionJWTProvider{&auth0Provider}, + }, + }, + }, + }, + { + Name: "web", + Action: "allow", + Permissions: []*structs.IntentionPermission{ + {Action: "allow"}, + }, + }, + }, + }, + }, + providerName: "okta", + expectedError: "cannot delete jwt provider config entry referenced by an intention. Provider name: okta, intention name: api-intention", + }, + } + + for name, tt := range cases { + tt := tt + t.Run(name, func(t *testing.T) { + err := findJWTProviderNameReferences(tt.entries, tt.providerName) + + if tt.expectedError != "" { + require.Error(t, err) + require.Contains(t, err.Error(), tt.expectedError) + } else { + require.NoError(t, err) + } + }) + } +} + +func TestStore_ValidateJWTProviderIsReferenced(t *testing.T) { + s := testStateStore(t) + + // First create a config entry + provider := &structs.JWTProviderConfigEntry{ + Kind: structs.JWTProvider, + Name: "okta", + } + require.NoError(t, s.EnsureConfigEntry(0, provider)) + + // create a service intention referencing the config entry + ixn := &structs.ServiceIntentionsConfigEntry{ + Name: "api", + JWT: &structs.IntentionJWTRequirement{ + Providers: []*structs.IntentionJWTProvider{ + {Name: provider.Name}, + }, + }, + } + require.NoError(t, s.EnsureConfigEntry(1, ixn)) + + // attempt deleting a referenced provider + err := s.DeleteConfigEntry(0, structs.JWTProvider, provider.Name, nil) + require.Error(t, err) + require.Contains(t, err.Error(), `cannot delete jwt provider config entry referenced by an intention. Provider name: okta, intention name: api`) + + // delete the intention + require.NoError(t, s.DeleteConfigEntry(1, structs.ServiceIntentions, ixn.Name, nil)) + // successfully delete the provider after deleting the intention + require.NoError(t, s.DeleteConfigEntry(0, structs.JWTProvider, provider.Name, nil)) +} diff --git a/agent/consul/watch/server_local.go b/agent/consul/watch/server_local.go index f407d2c1648f2..5937ba1c6a10e 100644 --- a/agent/consul/watch/server_local.go +++ b/agent/consul/watch/server_local.go @@ -16,8 +16,9 @@ import ( ) var ( - ErrorNotFound = errors.New("no data found for query") - ErrorNotChanged = errors.New("data did not change for query") + ErrorNotFound = errors.New("no data found for query") + ErrorNotChanged = errors.New("data did not change for query") + ErrorACLResetData = errors.New("an acl update forced a state reset") errNilContext = errors.New("cannot call ServerLocalNotify with a nil context") errNilGetStore = errors.New("cannot call ServerLocalNotify without a callback to get a StateStore") @@ -320,8 +321,15 @@ func serverLocalNotifyRoutine[ResultType any, StoreType StateStore]( return } + // An ACL reset error can be raised so that the index greater-than check is + // bypassed. We should not propagate it to the caller. 
+ forceReset := errors.Is(err, ErrorACLResetData) + if forceReset { + err = nil + } + // Check the index to see if we should call notify - if minIndex == 0 || minIndex < index { + if minIndex == 0 || minIndex < index || forceReset { notify(ctx, correlationID, result, err) minIndex = index } diff --git a/agent/envoyextensions/builtin/ext-authz/ext_authz.go b/agent/envoyextensions/builtin/ext-authz/ext_authz.go index 67d93cd2ba655..7400aef13a04c 100644 --- a/agent/envoyextensions/builtin/ext-authz/ext_authz.go +++ b/agent/envoyextensions/builtin/ext-authz/ext_authz.go @@ -68,6 +68,8 @@ func (a *extAuthz) PatchFilters(cfg *ext_cmn.RuntimeConfig, filters []*envoy_lis return filters, nil } + a.configureInsertOptions(cfg.Protocol) + switch cfg.Protocol { case "grpc", "http2", "http": extAuthzFilter, err := a.Config.toEnvoyHttpFilter(cfg) @@ -107,13 +109,26 @@ func (a *extAuthz) fromArguments(args map[string]any) error { return a.validate() } +func (a *extAuthz) configureInsertOptions(protocol string) { + // If the insert options have been expressly configured, then use them. + if a.InsertOptions.Location != "" { + return + } + + // Configure the default, insert the filter immediately before the terminal filter. + a.InsertOptions.Location = ext_cmn.InsertBeforeFirstMatch + switch protocol { + case "grpc", "http2", "http": + a.InsertOptions.FilterName = "envoy.filters.http.router" + default: + a.InsertOptions.FilterName = "envoy.filters.network.tcp_proxy" + } +} + func (a *extAuthz) normalize() { if a.ProxyType == "" { a.ProxyType = api.ServiceKindConnectProxy } - if a.InsertOptions.Location == "" { - a.InsertOptions.Location = ext_cmn.InsertFirst - } a.Config.normalize() } diff --git a/agent/envoyextensions/builtin/ext-authz/ext_authz_test.go b/agent/envoyextensions/builtin/ext-authz/ext_authz_test.go index e0b4245edda21..88e87d7e9a8f8 100644 --- a/agent/envoyextensions/builtin/ext-authz/ext_authz_test.go +++ b/agent/envoyextensions/builtin/ext-authz/ext_authz_test.go @@ -59,7 +59,7 @@ func TestConstructor(t *testing.T) { }, }, }, - errMsg: `invalid host for Target.URI "foo.bar.com:9191": expected 'localhost' or '127.0.0.1'`, + errMsg: `invalid host for Target.URI "foo.bar.com:9191": expected "localhost", "127.0.0.1", or "::1"`, }, "non-loopback address": { args: map[string]any{ @@ -72,7 +72,34 @@ func TestConstructor(t *testing.T) { }, }, }, - errMsg: `invalid host for Target.URI "10.0.0.1:9191": expected 'localhost' or '127.0.0.1'`, + errMsg: `invalid host for Target.URI "10.0.0.1:9191": expected "localhost", "127.0.0.1", or "::1"`, + }, + "invalid target port": { + args: map[string]any{ + "ProxyType": "connect-proxy", + "Config": map[string]any{ + "GrpcService": map[string]any{ + "Target": map[string]any{ + "URI": "localhost:zero", + }, + }, + }, + }, + errMsg: `invalid format for Target.URI "localhost:zero": expected host:port`, + }, + "invalid target timeout": { + args: map[string]any{ + "ProxyType": "connect-proxy", + "Config": map[string]any{ + "GrpcService": map[string]any{ + "Target": map[string]any{ + "URI": "localhost:9191", + "Timeout": "one", + }, + }, + }, + }, + errMsg: `failed to parse Target.Timeout "one" as a duration`, }, "no uri or service target": { args: map[string]any{ diff --git a/agent/envoyextensions/builtin/ext-authz/structs.go b/agent/envoyextensions/builtin/ext-authz/structs.go index 979bc8a86c7b3..a14cedd63a765 100644 --- a/agent/envoyextensions/builtin/ext-authz/structs.go +++ b/agent/envoyextensions/builtin/ext-authz/structs.go @@ -31,8 +31,12 @@ import ( const ( 
LocalExtAuthzClusterName = "local_ext_authz" + defaultMetadataNS = "consul" defaultStatPrefix = "response" defaultStatusOnError = 403 + localhost = "localhost" + localhostIPv4 = "127.0.0.1" + localhostIPv6 = "::1" ) type extAuthzConfig struct { @@ -44,7 +48,6 @@ type extAuthzConfig struct { MetadataContextNamespaces []string StatusOnError *int StatPrefix string - TransportApiVersion TransportApiVersion WithRequestBody *BufferSettings failureModeAllow bool @@ -185,6 +188,12 @@ func (c *extAuthzConfig) toEnvoyCluster(_ *cmn.RuntimeConfig) (*envoy_cluster_v3 return nil, err } + clusterType := &envoy_cluster_v3.Cluster_Type{Type: envoy_cluster_v3.Cluster_STATIC} + if host == localhost { + // If the host is "localhost" use a STRICT_DNS cluster type to perform DNS lookup. + clusterType = &envoy_cluster_v3.Cluster_Type{Type: envoy_cluster_v3.Cluster_STRICT_DNS} + } + var typedExtProtoOpts map[string]*anypb.Any if c.isGRPC() { // By default HTTP/1.1 is used for the transport protocol. gRPC requires that we explicitly configure HTTP/2 @@ -205,7 +214,7 @@ func (c *extAuthzConfig) toEnvoyCluster(_ *cmn.RuntimeConfig) (*envoy_cluster_v3 return &envoy_cluster_v3.Cluster{ Name: LocalExtAuthzClusterName, - ClusterDiscoveryType: &envoy_cluster_v3.Cluster_Type{Type: envoy_cluster_v3.Cluster_STATIC}, + ClusterDiscoveryType: clusterType, ConnectTimeout: target.timeoutDurationPB(), LoadAssignment: &envoy_endpoint_v3.ClusterLoadAssignment{ ClusterName: LocalExtAuthzClusterName, @@ -238,8 +247,8 @@ func (c extAuthzConfig) toEnvoyHttpFilter(cfg *cmn.RuntimeConfig) (*envoy_http_v extAuthzFilter := &envoy_http_ext_authz_v3.ExtAuthz{ StatPrefix: c.StatPrefix, WithRequestBody: c.WithRequestBody.toEnvoy(), - TransportApiVersion: c.TransportApiVersion.toEnvoy(), - MetadataContextNamespaces: c.MetadataContextNamespaces, + TransportApiVersion: envoy_core_v3.ApiVersion_V3, + MetadataContextNamespaces: append(c.MetadataContextNamespaces, defaultMetadataNS), FailureModeAllow: c.failureModeAllow, BootstrapMetadataLabelsKey: c.BootstrapMetadataLabelsKey, } @@ -281,7 +290,7 @@ func (c extAuthzConfig) toEnvoyNetworkFilter(cfg *cmn.RuntimeConfig) (*envoy_lis extAuthzFilter := &envoy_ext_authz_v3.ExtAuthz{ GrpcService: grpcSvc, StatPrefix: c.StatPrefix, - TransportApiVersion: c.TransportApiVersion.toEnvoy(), + TransportApiVersion: envoy_core_v3.ApiVersion_V3, FailureModeAllow: c.failureModeAllow, } @@ -645,18 +654,13 @@ func (t *Target) validate() error { } if t.isURI() { - // Strip the protocol if one was provided - if _, addr, hasProto := strings.Cut(t.URI, "://"); hasProto { - t.URI = addr - } - addr := strings.Split(t.URI, ":") - if len(addr) == 2 { - t.host = addr[0] - if t.host != "localhost" && t.host != "127.0.0.1" { - resultErr = multierror.Append(resultErr, fmt.Errorf("invalid host for Target.URI %q: expected 'localhost' or '127.0.0.1'", t.URI)) - } - if t.port, err = strconv.Atoi(addr[1]); err != nil { - resultErr = multierror.Append(resultErr, fmt.Errorf("invalid port for Target.URI %q", addr[1])) + t.host, t.port, err = parseAddr(t.URI) + if err == nil { + switch t.host { + case localhost, localhostIPv4, localhostIPv6: + default: + resultErr = multierror.Append(resultErr, + fmt.Errorf("invalid host for Target.URI %q: expected %q, %q, or %q", t.URI, localhost, localhostIPv4, localhostIPv6)) } } else { resultErr = multierror.Append(resultErr, fmt.Errorf("invalid format for Target.URI %q: expected host:port", t.URI)) @@ -673,17 +677,21 @@ func (t *Target) validate() error { return resultErr } -type TransportApiVersion 
string - -func (t TransportApiVersion) toEnvoy() envoy_core_v3.ApiVersion { - switch strings.ToLower(string(t)) { - case "v2": - //nolint:staticcheck - return envoy_core_v3.ApiVersion_V2 - case "auto": - //nolint:staticcheck - return envoy_core_v3.ApiVersion_AUTO +func parseAddr(s string) (host string, port int, err error) { + // Strip the protocol if one was provided + if _, addr, hasProto := strings.Cut(s, "://"); hasProto { + s = addr + } + idx := strings.LastIndex(s, ":") + switch idx { + case -1, len(s) - 1: + err = fmt.Errorf("invalid input format %q: expected host:port", s) + case 0: + host = localhost + port, err = strconv.Atoi(s[idx+1:]) default: - return envoy_core_v3.ApiVersion_V3 + host = s[:idx] + port, err = strconv.Atoi(s[idx+1:]) } + return } diff --git a/agent/envoyextensions/builtin/property-override/property_override.go b/agent/envoyextensions/builtin/property-override/property_override.go index 51d78368523f4..41e98074b7a23 100644 --- a/agent/envoyextensions/builtin/property-override/property_override.go +++ b/agent/envoyextensions/builtin/property-override/property_override.go @@ -191,6 +191,10 @@ func (f *ResourceFilter) validate() error { return err } + if len(f.Services) > 0 && f.TrafficDirection != extensioncommon.TrafficDirectionOutbound { + return fmt.Errorf("patch contains non-empty ResourceFilter.Services but ResourceFilter.TrafficDirection is not %q", + extensioncommon.TrafficDirectionOutbound) + } for i := range f.Services { sn := f.Services[i] sn.normalize() @@ -255,9 +259,9 @@ func (p *propertyOverride) validate() error { } var resultErr error - for _, patch := range p.Patches { + for i, patch := range p.Patches { if err := patch.validate(p.Debug); err != nil { - resultErr = multierror.Append(resultErr, err) + resultErr = multierror.Append(resultErr, fmt.Errorf("invalid Patches[%d]: %w", i, err)) } } diff --git a/agent/envoyextensions/builtin/property-override/property_override_test.go b/agent/envoyextensions/builtin/property-override/property_override_test.go index 21889d840f4a2..0e4317f9ddb72 100644 --- a/agent/envoyextensions/builtin/property-override/property_override_test.go +++ b/agent/envoyextensions/builtin/property-override/property_override_test.go @@ -63,6 +63,7 @@ func TestConstructor(t *testing.T) { expected propertyOverride ok bool errMsg string + errFunc func(*testing.T, error) } validTestCase := func(o Op, d extensioncommon.TrafficDirection, t ResourceType) testCase { @@ -216,6 +217,50 @@ func TestConstructor(t *testing.T) { ok: false, errMsg: fmt.Sprintf("field Value is not supported for %s operation", OpRemove), }, + "multiple patches includes indexed errors": { + arguments: makeArguments(map[string]any{"Patches": []map[string]any{ + makePatch(map[string]any{ + "Op": OpRemove, + "Value": 0, + }), + makePatch(map[string]any{ + "Op": OpAdd, + "Value": nil, + }), + makePatch(map[string]any{ + "Op": OpAdd, + "Path": "/foo", + }), + }}), + ok: false, + errFunc: func(t *testing.T, err error) { + require.ErrorContains(t, err, "invalid Patches[0]: field Value is not supported for remove operation") + require.ErrorContains(t, err, "invalid Patches[1]: non-nil Value is required") + require.ErrorContains(t, err, "invalid Patches[2]: no match for field 'foo'") + }, + }, + "multiple patches single error contains correct index": { + arguments: makeArguments(map[string]any{"Patches": []map[string]any{ + makePatch(map[string]any{ + "Op": OpAdd, + "Value": "foo", + }), + makePatch(map[string]any{ + "Op": OpRemove, + "Value": 1, + }), + 
makePatch(map[string]any{ + "Op": OpAdd, + "Value": "bar", + }), + }}), + ok: false, + errFunc: func(t *testing.T, err error) { + require.ErrorContains(t, err, "invalid Patches[1]: field Value is not supported for remove operation") + require.NotContains(t, err.Error(), "invalid Patches[0]") + require.NotContains(t, err.Error(), "invalid Patches[2]") + }, + }, "empty service name": { arguments: makeArguments(map[string]any{"Patches": []map[string]any{ makePatch(map[string]any{ @@ -229,6 +274,20 @@ func TestConstructor(t *testing.T) { ok: false, errMsg: "service name is required", }, + "non-empty services with invalid traffic direction": { + arguments: makeArguments(map[string]any{"Patches": []map[string]any{ + makePatch(map[string]any{ + "ResourceFilter": makeResourceFilter(map[string]any{ + "TrafficDirection": extensioncommon.TrafficDirectionInbound, + "Services": []map[string]any{ + {"Name:": "foo"}, + }, + }), + }), + }}), + ok: false, + errMsg: "patch contains non-empty ResourceFilter.Services but ResourceFilter.TrafficDirection is not \"outbound\"", + }, // See decode.HookWeakDecodeFromSlice for more details. In practice, we can end up // with a "Patches" field decoded to the single "Patch" value contained in the // serialized slice (raised from the containing slice). Using WeakDecode solves @@ -333,7 +392,13 @@ func TestConstructor(t *testing.T) { require.NoError(t, err) require.Equal(t, &extensioncommon.BasicEnvoyExtender{Extension: &tc.expected}, e) } else { - require.ErrorContains(t, err, tc.errMsg) + require.Error(t, err) + if tc.errMsg != "" { + require.ErrorContains(t, err, tc.errMsg) + } + if tc.errFunc != nil { + tc.errFunc(t, err) + } } }) } diff --git a/agent/envoyextensions/builtin/property-override/structpatcher.go b/agent/envoyextensions/builtin/property-override/structpatcher.go index 3a54ca25e40a2..91de4cf7f86d1 100644 --- a/agent/envoyextensions/builtin/property-override/structpatcher.go +++ b/agent/envoyextensions/builtin/property-override/structpatcher.go @@ -75,7 +75,7 @@ func findTargetMessageAndField(m protoreflect.Message, parsedPath []string, patc } // Check whether we have a non-terminal (parent) field in the path for which we - // don't support child lookup. + // don't support child operations. switch { case fieldDesc.IsList(): return nil, nil, fmt.Errorf("path contains member of repeated field '%s'; repeated field member access is not supported", @@ -83,6 +83,21 @@ func findTargetMessageAndField(m protoreflect.Message, parsedPath []string, patc case fieldDesc.IsMap(): return nil, nil, fmt.Errorf("path contains member of map field '%s'; map field member access is not supported", fieldName) + case fieldDesc.Message() != nil && fieldDesc.Message().FullName() == "google.protobuf.Any": + // Return a more helpful error for Any fields early. + // + // Doing this here prevents confusing two-step errors, e.g. "no match for field @type" + // on Any, when in fact we don't support variant proto message fields like Any in general. + // Because Any is a Message, we'd fail on invalid child fields or unsupported bytes target + // fields first. + // + // In the future, we could support Any by using the type field to initialize a struct for + // the nested message value. + return nil, nil, fmt.Errorf("variant-type message fields (google.protobuf.Any) are not supported") + case !(fieldDesc.Kind() == protoreflect.MessageKind): + // Non-Any fields that could be used to serialize protos as bytes will get a clear error message + // in this scenario. 
This also catches accidental use of non-complex fields as parent fields. + return nil, nil, fmt.Errorf("path contains member of non-message field '%s' (type '%s'); this type does not support child fields", fieldName, fieldDesc.Kind()) } fieldM := m.Get(fieldDesc).Message() @@ -137,6 +152,10 @@ func applyAdd(parentM protoreflect.Message, fieldDesc protoreflect.FieldDescript // similar to a list (repeated field). This map handling is specific to _our_ patch semantics for // updating multiple message fields at once. if isMapValue && !fieldDesc.IsMap() { + if fieldDesc.Kind() != protoreflect.MessageKind { + return fmt.Errorf("non-message field type '%s' cannot be set via a map", fieldDesc.Kind()) + } + // Get a fresh copy of the target field's message, then set the children indicated by the patch. fieldM := parentM.Get(fieldDesc).Message().New() for k, v := range mapValue { @@ -151,6 +170,7 @@ func applyAdd(parentM protoreflect.Message, fieldDesc protoreflect.FieldDescript fieldM.Set(targetFieldDesc, val) } parentM.Set(fieldDesc, protoreflect.ValueOf(fieldM)) + } else { // Just set the field directly, as our patch value is not a map. val, err := toProtoValue(parentM, fieldDesc, patch.Value) @@ -280,6 +300,9 @@ func toProtoValue(parentM protoreflect.Message, fieldDesc protoreflect.FieldDesc case float64: return toProtoNumericValue(fieldDesc, val) } + case protoreflect.BytesKind, + protoreflect.GroupKind: + return unsupportedTargetTypeErr(fieldDesc) } // Fall back to protoreflect.ValueOf, which may panic if an unexpected type is passed. diff --git a/agent/envoyextensions/builtin/property-override/structpatcher_test.go b/agent/envoyextensions/builtin/property-override/structpatcher_test.go index 579f0f71c98a0..ac7379f9f1868 100644 --- a/agent/envoyextensions/builtin/property-override/structpatcher_test.go +++ b/agent/envoyextensions/builtin/property-override/structpatcher_test.go @@ -2,6 +2,7 @@ package propertyoverride import ( "fmt" + "google.golang.org/protobuf/types/known/anypb" "testing" envoy_cluster_v3 "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3" @@ -592,6 +593,31 @@ func TestPatchStruct(t *testing.T) { }, ok: true, }, + "remove single field: Any": { + args: args{ + k: &envoy_cluster_v3.Cluster{ + ClusterDiscoveryType: &envoy_cluster_v3.Cluster_ClusterType{ + ClusterType: &envoy_cluster_v3.Cluster_CustomClusterType{ + TypedConfig: &anypb.Any{ + TypeUrl: "foo", + }, + }, + }, + }, + patches: []Patch{ + makeRemovePatch( + "/cluster_type/typed_config", + ), + }, + }, + // Invalid actual config, but used as an example of removing Any field directly + expected: &envoy_cluster_v3.Cluster{ + ClusterDiscoveryType: &envoy_cluster_v3.Cluster_ClusterType{ + ClusterType: &envoy_cluster_v3.Cluster_CustomClusterType{}, + }, + }, + ok: true, + }, "remove single field deeply nested": { args: args{ k: &envoy_cluster_v3.Cluster{ @@ -858,6 +884,69 @@ func TestPatchStruct(t *testing.T) { ok: false, errMsg: "unsupported target field type 'map'", }, + "add unsupported target: non-message field via map": { + args: args{ + k: &envoy_cluster_v3.Cluster{}, + patches: []Patch{ + makeAddPatch( + "/name", + map[string]any{ + "cluster_refresh_rate": "5s", + "cluster_refresh_timeout": "3s", + "redirect_refresh_interval": "5s", + "redirect_refresh_threshold": 5, + }, + ), + }, + }, + ok: false, + errMsg: "non-message field type 'string' cannot be set via a map", + }, + "add unsupported target: non-message parent field via single value": { + args: args{ + k: &envoy_cluster_v3.Cluster{}, + patches: 
[]Patch{ + makeAddPatch( + "/name/foo", + "bar", + ), + }, + }, + ok: false, + errMsg: "path contains member of non-message field 'name' (type 'string'); this type does not support child fields", + }, + "add unsupported target: non-message parent field via map": { + args: args{ + k: &envoy_cluster_v3.Cluster{}, + patches: []Patch{ + makeAddPatch( + "/name/foo", + map[string]any{ + "cluster_refresh_rate": "5s", + "cluster_refresh_timeout": "3s", + "redirect_refresh_interval": "5s", + "redirect_refresh_threshold": 5, + }, + ), + }, + }, + ok: false, + errMsg: "path contains member of non-message field 'name' (type 'string'); this type does not support child fields", + }, + "add unsupported target: Any field": { + args: args{ + k: &envoy_cluster_v3.Cluster{}, + patches: []Patch{ + makeAddPatch( + // Purposefully use a wrong-but-reasonable field name to ensure special error is returned + "/cluster_type/typed_config/@type", + "foo", + ), + }, + }, + ok: false, + errMsg: "variant-type message fields (google.protobuf.Any) are not supported", + }, "add unsupported target: repeated message": { args: args{ k: &envoy_cluster_v3.Cluster{}, diff --git a/agent/grpc-external/services/resource/list_by_owner_test.go b/agent/grpc-external/services/resource/list_by_owner_test.go index 19fe799caf08c..218971a050daa 100644 --- a/agent/grpc-external/services/resource/list_by_owner_test.go +++ b/agent/grpc-external/services/resource/list_by_owner_test.go @@ -74,7 +74,7 @@ func TestListByOwner_TypeNotRegistered(t *testing.T) { }) require.Error(t, err) require.Equal(t, codes.InvalidArgument.String(), status.Code(err).String()) - require.Contains(t, err.Error(), "resource type demo.v2.artist not registered") + require.Contains(t, err.Error(), "resource type demo.v2.Artist not registered") } func TestListByOwner_Empty(t *testing.T) { @@ -126,7 +126,7 @@ func TestListByOwner_Many(t *testing.T) { } func TestListByOwner_ACL_PerTypeDenied(t *testing.T) { - authz := AuthorizerFrom(t, `key_prefix "resource/demo.v2.album/" { policy = "deny" }`) + authz := AuthorizerFrom(t, `key_prefix "resource/demo.v2.Album/" { policy = "deny" }`) _, rsp, err := roundTripListByOwner(t, authz) // verify resource filtered out, hence no results @@ -135,7 +135,7 @@ func TestListByOwner_ACL_PerTypeDenied(t *testing.T) { } func TestListByOwner_ACL_PerTypeAllowed(t *testing.T) { - authz := AuthorizerFrom(t, `key_prefix "resource/demo.v2.album/" { policy = "read" }`) + authz := AuthorizerFrom(t, `key_prefix "resource/demo.v2.Album/" { policy = "read" }`) album, rsp, err := roundTripListByOwner(t, authz) // verify resource not filtered out diff --git a/agent/grpc-external/services/resource/list_test.go b/agent/grpc-external/services/resource/list_test.go index 7d102b090ce0f..4d6b50951b758 100644 --- a/agent/grpc-external/services/resource/list_test.go +++ b/agent/grpc-external/services/resource/list_test.go @@ -58,7 +58,7 @@ func TestList_TypeNotFound(t *testing.T) { }) require.Error(t, err) require.Equal(t, codes.InvalidArgument.String(), status.Code(err).String()) - require.Contains(t, err.Error(), "resource type demo.v2.artist not registered") + require.Contains(t, err.Error(), "resource type demo.v2.Artist not registered") } func TestList_Empty(t *testing.T) { @@ -178,7 +178,7 @@ func TestList_ACL_ListAllowed_ReadDenied(t *testing.T) { // allow list, deny read authz := AuthorizerFrom(t, demo.ArtistV2ListPolicy, - `key_prefix "resource/demo.v2.artist/" { policy = "deny" }`) + `key_prefix "resource/demo.v2.Artist/" { policy = "deny" }`) _, rsp, 
err := roundTripList(t, authz) // verify resource filtered out by key:read denied hence no results diff --git a/agent/grpc-external/services/resource/read_test.go b/agent/grpc-external/services/resource/read_test.go index 237895eacc551..cca911ec15b5b 100644 --- a/agent/grpc-external/services/resource/read_test.go +++ b/agent/grpc-external/services/resource/read_test.go @@ -71,7 +71,7 @@ func TestRead_TypeNotFound(t *testing.T) { _, err = client.Read(context.Background(), &pbresource.ReadRequest{Id: artist.Id}) require.Error(t, err) require.Equal(t, codes.InvalidArgument.String(), status.Code(err).String()) - require.Contains(t, err.Error(), "resource type demo.v2.artist not registered") + require.Contains(t, err.Error(), "resource type demo.v2.Artist not registered") } func TestRead_ResourceNotFound(t *testing.T) { diff --git a/agent/grpc-external/services/resource/watch_test.go b/agent/grpc-external/services/resource/watch_test.go index 687fe0d0679f0..95695f295ebd4 100644 --- a/agent/grpc-external/services/resource/watch_test.go +++ b/agent/grpc-external/services/resource/watch_test.go @@ -66,7 +66,7 @@ func TestWatchList_TypeNotFound(t *testing.T) { err = mustGetError(t, rspCh) require.Equal(t, codes.InvalidArgument.String(), status.Code(err).String()) - require.Contains(t, err.Error(), "resource type demo.v2.artist not registered") + require.Contains(t, err.Error(), "resource type demo.v2.Artist not registered") } func TestWatchList_GroupVersionMatches(t *testing.T) { @@ -172,7 +172,7 @@ func TestWatchList_ACL_ListAllowed_ReadDenied(t *testing.T) { // allow list, deny read authz := AuthorizerFrom(t, ` key_prefix "resource/" { policy = "list" } - key_prefix "resource/demo.v2.artist/" { policy = "deny" } + key_prefix "resource/demo.v2.Artist/" { policy = "deny" } `) rspCh, _ := roundTripACL(t, authz) @@ -187,7 +187,7 @@ func TestWatchList_ACL_ListAllowed_ReadAllowed(t *testing.T) { // allow list, allow read authz := AuthorizerFrom(t, ` key_prefix "resource/" { policy = "list" } - key_prefix "resource/demo.v2.artist/" { policy = "read" } + key_prefix "resource/demo.v2.Artist/" { policy = "read" } `) rspCh, artist := roundTripACL(t, authz) diff --git a/agent/grpc-external/services/resource/write_status_test.go b/agent/grpc-external/services/resource/write_status_test.go index f65c7918ff796..aa26330176df7 100644 --- a/agent/grpc-external/services/resource/write_status_test.go +++ b/agent/grpc-external/services/resource/write_status_test.go @@ -180,7 +180,7 @@ func TestWriteStatus_TypeNotFound(t *testing.T) { _, err = client.WriteStatus(testContext(t), validWriteStatusRequest(t, res)) require.Error(t, err) require.Equal(t, codes.InvalidArgument.String(), status.Code(err).String()) - require.Contains(t, err.Error(), "resource type demo.v2.artist not registered") + require.Contains(t, err.Error(), "resource type demo.v2.Artist not registered") } func TestWriteStatus_ResourceNotFound(t *testing.T) { diff --git a/agent/grpc-external/services/resource/write_test.go b/agent/grpc-external/services/resource/write_test.go index 3da4ec478a526..4ec25ee26c0c7 100644 --- a/agent/grpc-external/services/resource/write_test.go +++ b/agent/grpc-external/services/resource/write_test.go @@ -151,7 +151,7 @@ func TestWrite_TypeNotFound(t *testing.T) { _, err = client.Write(testContext(t), &pbresource.WriteRequest{Resource: res}) require.Error(t, err) require.Equal(t, codes.InvalidArgument.String(), status.Code(err).String()) - require.Contains(t, err.Error(), "resource type demo.v2.artist not registered") + 
require.Contains(t, err.Error(), "resource type demo.v2.Artist not registered") } func TestWrite_ACLs(t *testing.T) { diff --git a/agent/hcp/client/client.go b/agent/hcp/client/client.go index 212647c51e87b..1c49fd7924716 100644 --- a/agent/hcp/client/client.go +++ b/agent/hcp/client/client.go @@ -313,9 +313,14 @@ func (t *TelemetryConfig) Enabled() (string, bool) { } // DefaultLabels returns a set of string pairs that must be added as attributes to all exported telemetry data. -func (t *TelemetryConfig) DefaultLabels(nodeID string) map[string]string { - labels := map[string]string{ - "node_id": nodeID, // used to delineate Consul nodes in graphs +func (t *TelemetryConfig) DefaultLabels(cfg config.CloudConfig) map[string]string { + labels := make(map[string]string) + nodeID := string(cfg.NodeID) + if nodeID != "" { + labels["node_id"] = nodeID + } + if cfg.NodeName != "" { + labels["node_name"] = cfg.NodeName } for k, v := range t.Labels { diff --git a/agent/hcp/client/client_test.go b/agent/hcp/client/client_test.go index 8c8a6addd70c5..0292fa3fab224 100644 --- a/agent/hcp/client/client_test.go +++ b/agent/hcp/client/client_test.go @@ -4,6 +4,8 @@ import ( "context" "testing" + "github.com/hashicorp/consul/agent/hcp/config" + "github.com/hashicorp/consul/types" "github.com/hashicorp/hcp-sdk-go/clients/cloud-consul-telemetry-gateway/preview/2023-04-14/client/consul_telemetry_service" "github.com/hashicorp/hcp-sdk-go/clients/cloud-consul-telemetry-gateway/preview/2023-04-14/models" "github.com/stretchr/testify/mock" @@ -147,3 +149,53 @@ func TestConvertTelemetryConfig(t *testing.T) { }) } } + +func Test_DefaultLabels(t *testing.T) { + for name, tc := range map[string]struct { + cfg config.CloudConfig + expectedLabels map[string]string + }{ + "Success": { + cfg: config.CloudConfig{ + NodeID: types.NodeID("nodeyid"), + NodeName: "nodey", + }, + expectedLabels: map[string]string{ + "node_id": "nodeyid", + "node_name": "nodey", + }, + }, + + "NoNodeID": { + cfg: config.CloudConfig{ + NodeID: types.NodeID(""), + NodeName: "nodey", + }, + expectedLabels: map[string]string{ + "node_name": "nodey", + }, + }, + "NoNodeName": { + cfg: config.CloudConfig{ + NodeID: types.NodeID("nodeyid"), + NodeName: "", + }, + expectedLabels: map[string]string{ + "node_id": "nodeyid", + }, + }, + "Empty": { + cfg: config.CloudConfig{ + NodeID: "", + NodeName: "", + }, + expectedLabels: map[string]string{}, + }, + } { + t.Run(name, func(t *testing.T) { + tCfg := &TelemetryConfig{} + labels := tCfg.DefaultLabels(tc.cfg) + require.Equal(t, labels, tc.expectedLabels) + }) + } +} diff --git a/agent/hcp/client/metrics_client.go b/agent/hcp/client/metrics_client.go index 7e19c9857a972..0bcb90b81ce23 100644 --- a/agent/hcp/client/metrics_client.go +++ b/agent/hcp/client/metrics_client.go @@ -32,6 +32,10 @@ const ( // defaultRetryMax is set to 0 to turn off retry functionality, until dynamic configuration is possible. // This is to circumvent any spikes in load that may cause or exacerbate server-side issues for now. defaultRetryMax = 0 + + // defaultErrRespBodyLength refers to the max character length of the body on a failure to export metrics. + // anything beyond we will truncate. + defaultErrRespBodyLength = 100 ) // MetricsClient exports Consul metrics in OTLP format to the HCP Telemetry Gateway. @@ -54,7 +58,7 @@ type otlpClient struct { // NewMetricsClient returns a configured MetricsClient. // The current implementation uses otlpClient to provide retry functionality. 
-func NewMetricsClient(cfg CloudConfig, ctx context.Context) (MetricsClient, error) { +func NewMetricsClient(ctx context.Context, cfg CloudConfig) (MetricsClient, error) { if cfg == nil { return nil, fmt.Errorf("failed to init telemetry client: provide valid cloudCfg (Cloud Configuration for TLS)") } @@ -150,8 +154,18 @@ func (o *otlpClient) ExportMetrics(ctx context.Context, protoMetrics *metricpb.R } if resp.StatusCode != http.StatusOK { - return fmt.Errorf("failed to export metrics: code %d: %s", resp.StatusCode, string(body)) + truncatedBody := truncate(respData.String(), defaultErrRespBodyLength) + return fmt.Errorf("failed to export metrics: code %d: %s", resp.StatusCode, truncatedBody) } return nil } + +func truncate(text string, width uint) string { + if len(text) <= int(width) { + return text + } + r := []rune(text) + trunc := r[:width] + return string(trunc) + "..." +} diff --git a/agent/hcp/client/metrics_client_test.go b/agent/hcp/client/metrics_client_test.go index e80996fcf5eb0..4119e326e9dc0 100644 --- a/agent/hcp/client/metrics_client_test.go +++ b/agent/hcp/client/metrics_client_test.go @@ -3,6 +3,7 @@ package client import ( "context" "fmt" + "math/rand" "net/http" "net/http/httptest" "testing" @@ -51,7 +52,7 @@ func TestNewMetricsClient(t *testing.T) { }, } { t.Run(name, func(t *testing.T) { - client, err := NewMetricsClient(test.cfg, test.ctx) + client, err := NewMetricsClient(test.ctx, test.cfg) if test.wantErr != "" { require.Error(t, err) require.Contains(t, err.Error(), test.wantErr) @@ -64,10 +65,21 @@ func TestNewMetricsClient(t *testing.T) { } } +var letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ世界") + +func randStringRunes(n int) string { + b := make([]rune, n) + for i := range b { + b[i] = letterRunes[rand.Intn(len(letterRunes))] + } + return string(b) +} + func TestExportMetrics(t *testing.T) { for name, test := range map[string]struct { - wantErr string - status int + wantErr string + status int + largeBodyError bool }{ "success": { status: http.StatusOK, @@ -76,8 +88,14 @@ func TestExportMetrics(t *testing.T) { status: http.StatusBadRequest, wantErr: "failed to export metrics: code 400", }, + "failsWithNonRetryableErrorWithLongError": { + status: http.StatusBadRequest, + wantErr: "failed to export metrics: code 400", + largeBodyError: true, + }, } { t.Run(name, func(t *testing.T) { + randomBody := randStringRunes(1000) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { require.Equal(t, r.Header.Get("content-type"), "application/x-protobuf") require.Equal(t, r.Header.Get("x-hcp-resource-id"), testResourceID) @@ -91,11 +109,16 @@ func TestExportMetrics(t *testing.T) { w.Header().Set("Content-Type", "application/x-protobuf") w.WriteHeader(test.status) - w.Write(bytes) + if test.largeBodyError { + w.Write([]byte(randomBody)) + } else { + w.Write(bytes) + } + })) defer srv.Close() - client, err := NewMetricsClient(MockCloudCfg{}, context.Background()) + client, err := NewMetricsClient(context.Background(), MockCloudCfg{}) require.NoError(t, err) ctx := context.Background() @@ -105,6 +128,10 @@ func TestExportMetrics(t *testing.T) { if test.wantErr != "" { require.Error(t, err) require.Contains(t, err.Error(), test.wantErr) + if test.largeBodyError { + truncatedBody := truncate(randomBody, defaultErrRespBodyLength) + require.Contains(t, err.Error(), truncatedBody) + } return } @@ -112,3 +139,37 @@ func TestExportMetrics(t *testing.T) { }) } } + +func TestTruncate(t *testing.T) { + for name, 
tc := range map[string]struct { + body string + expectedSize int + }{ + "ZeroSize": { + body: "", + expectedSize: 0, + }, + "LessThanSize": { + body: "foobar", + expectedSize: 6, + }, + "defaultSize": { + body: "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis vel tincidunt nunc, sed tristique risu", + expectedSize: 100, + }, + "greaterThanSize": { + body: "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis vel tincidunt nunc, sed tristique risus", + expectedSize: 103, + }, + "greaterThanSizeWithUnicode": { + body: randStringRunes(1000), + expectedSize: 103, + }, + } { + t.Run(name, func(t *testing.T) { + truncatedBody := truncate(tc.body, defaultErrRespBodyLength) + truncatedRunes := []rune(truncatedBody) + require.Equal(t, len(truncatedRunes), tc.expectedSize) + }) + } +} diff --git a/agent/hcp/client/mock_metrics_client.go b/agent/hcp/client/mock_metrics_client.go new file mode 100644 index 0000000000000..a30b1f1c62c0d --- /dev/null +++ b/agent/hcp/client/mock_metrics_client.go @@ -0,0 +1,5 @@ +package client + +type MockMetricsClient struct { + MetricsClient +} diff --git a/agent/hcp/config/config.go b/agent/hcp/config/config.go index 8d1358fa4adfe..319c39e40e94c 100644 --- a/agent/hcp/config/config.go +++ b/agent/hcp/config/config.go @@ -6,6 +6,7 @@ package config import ( "crypto/tls" + "github.com/hashicorp/consul/types" hcpcfg "github.com/hashicorp/hcp-sdk-go/config" "github.com/hashicorp/hcp-sdk-go/resource" ) @@ -25,6 +26,9 @@ type CloudConfig struct { // TlsConfig for testing. TLSConfig *tls.Config + + NodeID types.NodeID + NodeName string } func (c *CloudConfig) WithTLSConfig(cfg *tls.Config) { diff --git a/agent/hcp/deps.go b/agent/hcp/deps.go index f4ad161daba4a..e3e83dec9657f 100644 --- a/agent/hcp/deps.go +++ b/agent/hcp/deps.go @@ -14,7 +14,6 @@ import ( "github.com/hashicorp/consul/agent/hcp/config" "github.com/hashicorp/consul/agent/hcp/scada" "github.com/hashicorp/consul/agent/hcp/telemetry" - "github.com/hashicorp/consul/types" "github.com/hashicorp/go-hclog" ) @@ -25,10 +24,13 @@ type Deps struct { Sink metrics.MetricSink } -func NewDeps(cfg config.CloudConfig, logger hclog.Logger, nodeID types.NodeID) (Deps, error) { +func NewDeps(cfg config.CloudConfig, logger hclog.Logger) (Deps, error) { + ctx := context.Background() + ctx = hclog.WithContext(ctx, logger) + client, err := hcpclient.NewClient(cfg) if err != nil { - return Deps{}, fmt.Errorf("failed to init client: %w:", err) + return Deps{}, fmt.Errorf("failed to init client: %w", err) } provider, err := scada.New(cfg, logger.Named("scada")) @@ -36,7 +38,13 @@ func NewDeps(cfg config.CloudConfig, logger hclog.Logger, nodeID types.NodeID) ( return Deps{}, fmt.Errorf("failed to init scada: %w", err) } - sink := sink(client, &cfg, logger.Named("sink"), nodeID) + metricsClient, err := hcpclient.NewMetricsClient(ctx, &cfg) + if err != nil { + logger.Error("failed to init metrics client", "error", err) + return Deps{}, fmt.Errorf("failed to init metrics client: %w", err) + } + + sink := sink(ctx, client, metricsClient, cfg) return Deps{ Client: client, @@ -48,10 +56,13 @@ func NewDeps(cfg config.CloudConfig, logger hclog.Logger, nodeID types.NodeID) ( // sink provides initializes an OTELSink which forwards Consul metrics to HCP. // The sink is only initialized if the server is registered with the management plane (CCM). // This step should not block server initialization, so errors are logged, but not returned. 
-func sink(hcpClient hcpclient.Client, cfg hcpclient.CloudConfig, logger hclog.Logger, nodeID types.NodeID) metrics.MetricSink { - ctx := context.Background() - ctx = hclog.WithContext(ctx, logger) - +func sink( + ctx context.Context, + hcpClient hcpclient.Client, + metricsClient hcpclient.MetricsClient, + cfg config.CloudConfig, +) metrics.MetricSink { + logger := hclog.FromContext(ctx).Named("sink") reqCtx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() @@ -72,16 +83,10 @@ func sink(hcpClient hcpclient.Client, cfg hcpclient.CloudConfig, logger hclog.Lo return nil } - metricsClient, err := hcpclient.NewMetricsClient(cfg, ctx) - if err != nil { - logger.Error("failed to init metrics client", "error", err) - return nil - } - sinkOpts := &telemetry.OTELSinkOpts{ Ctx: ctx, Reader: telemetry.NewOTELReader(metricsClient, u, telemetry.DefaultExportInterval), - Labels: telemetryCfg.DefaultLabels(string(nodeID)), + Labels: telemetryCfg.DefaultLabels(cfg), Filters: telemetryCfg.MetricsConfig.Filters, } diff --git a/agent/hcp/deps_test.go b/agent/hcp/deps_test.go index 54ec7b6de478b..9a90c26d50ad1 100644 --- a/agent/hcp/deps_test.go +++ b/agent/hcp/deps_test.go @@ -1,10 +1,11 @@ package hcp import ( + "context" "fmt" "testing" - "github.com/hashicorp/go-hclog" + "github.com/hashicorp/consul/agent/hcp/config" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" @@ -16,7 +17,7 @@ func TestSink(t *testing.T) { t.Parallel() for name, test := range map[string]struct { expect func(*client.MockClient) - mockCloudCfg client.CloudConfig + cloudCfg config.CloudConfig expectedSink bool }{ "success": { @@ -28,7 +29,10 @@ func TestSink(t *testing.T) { }, }, nil) }, - mockCloudCfg: client.MockCloudCfg{}, + cloudCfg: config.CloudConfig{ + NodeID: types.NodeID("nodeyid"), + NodeName: "nodey", + }, expectedSink: true, }, "noSinkWhenServerNotRegisteredWithCCM": { @@ -40,26 +44,13 @@ func TestSink(t *testing.T) { }, }, nil) }, - mockCloudCfg: client.MockCloudCfg{}, + cloudCfg: config.CloudConfig{}, }, "noSinkWhenCCMVerificationFails": { expect: func(mockClient *client.MockClient) { mockClient.EXPECT().FetchTelemetryConfig(mock.Anything).Return(nil, fmt.Errorf("fetch failed")) }, - mockCloudCfg: client.MockCloudCfg{}, - }, - "noSinkWhenMetricsClientInitFails": { - mockCloudCfg: client.MockCloudCfg{ - ConfigErr: fmt.Errorf("test bad hcp config"), - }, - expect: func(mockClient *client.MockClient) { - mockClient.EXPECT().FetchTelemetryConfig(mock.Anything).Return(&client.TelemetryConfig{ - Endpoint: "https://test.com", - MetricsConfig: &client.MetricsConfig{ - Endpoint: "", - }, - }, nil) - }, + cloudCfg: config.CloudConfig{}, }, "failsWithFetchTelemetryFailure": { expect: func(mockClient *client.MockClient) { @@ -93,14 +84,17 @@ func TestSink(t *testing.T) { t.Run(name, func(t *testing.T) { t.Parallel() c := client.NewMockClient(t) - l := hclog.NewNullLogger() + mc := client.MockMetricsClient{} + test.expect(c) - sinkOpts := sink(c, test.mockCloudCfg, l, types.NodeID("server1234")) + ctx := context.Background() + + s := sink(ctx, c, mc, test.cloudCfg) if !test.expectedSink { - require.Nil(t, sinkOpts) + require.Nil(t, s) return } - require.NotNil(t, sinkOpts) + require.NotNil(t, s) }) } } diff --git a/agent/leafcert/cached_roots.go b/agent/leafcert/cached_roots.go new file mode 100644 index 0000000000000..b973b6dc660ca --- /dev/null +++ b/agent/leafcert/cached_roots.go @@ -0,0 +1,47 @@ +package leafcert + +import ( + "context" + "errors" + + "github.com/hashicorp/consul/agent/cache" 
+ cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/structs" +) + +// NewCachedRootsReader returns a RootsReader that sources data from the agent cache. +func NewCachedRootsReader(cache *cache.Cache, dc string) RootsReader { + return &agentCacheRootsReader{ + cache: cache, + datacenter: dc, + } +} + +type agentCacheRootsReader struct { + cache *cache.Cache + datacenter string +} + +var _ RootsReader = (*agentCacheRootsReader)(nil) + +func (r *agentCacheRootsReader) Get() (*structs.IndexedCARoots, error) { + // Background is fine here because this isn't a blocking query as no index is set. + // Therefore this will just either be a cache hit or return once the non-blocking query returns. + rawRoots, _, err := r.cache.Get(context.Background(), cachetype.ConnectCARootName, &structs.DCSpecificRequest{ + Datacenter: r.datacenter, + }) + if err != nil { + return nil, err + } + roots, ok := rawRoots.(*structs.IndexedCARoots) + if !ok { + return nil, errors.New("invalid RootCA response type") + } + return roots, nil +} + +func (r *agentCacheRootsReader) Notify(ctx context.Context, correlationID string, ch chan<- cache.UpdateEvent) error { + return r.cache.Notify(ctx, cachetype.ConnectCARootName, &structs.DCSpecificRequest{ + Datacenter: r.datacenter, + }, correlationID, ch) +} diff --git a/agent/leafcert/cert.go b/agent/leafcert/cert.go new file mode 100644 index 0000000000000..0230685737756 --- /dev/null +++ b/agent/leafcert/cert.go @@ -0,0 +1,133 @@ +package leafcert + +import ( + "sync" + "time" + + "golang.org/x/time/rate" + + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/lib/ttlcache" +) + +// certData tracks all of the metadata about a leaf cert. +type certData struct { + // lock locks access to all fields + lock sync.Mutex + + // index is the last raft index associated with an update of the 'value' field + index uint64 + + // value is the last updated cert contents or nil if not populated initially + value *structs.IssuedCert + + // state is metadata related to cert generation + state fetchState + + // fetchedAt was the time when 'value' was last updated + fetchedAt time.Time + + // refreshing indicates if there is an active request attempting to refresh + // the current leaf cert contents. + refreshing bool + + // lastFetchErr is the last error encountered when attempting to populate + // the 'value' field. + lastFetchErr error + + // expiry contains information about the expiration of this + // cert. This is a pointer as its shared as a value in the + // ExpiryHeap as well. + expiry *ttlcache.Entry + + // refreshRateLimiter limits the rate at which the cert can be regenerated + refreshRateLimiter *rate.Limiter +} + +func (c *certData) MarkRefreshing(v bool) { + c.lock.Lock() + defer c.lock.Unlock() + c.refreshing = v +} + +func (c *certData) GetValueAndState() (*structs.IssuedCert, fetchState) { + c.lock.Lock() + defer c.lock.Unlock() + return c.value, c.state +} + +func (c *certData) GetError() error { + c.lock.Lock() + defer c.lock.Unlock() + return c.lastFetchErr +} + +// NOTE: this function only has one goroutine in it per key at all times +func (c *certData) Update( + newCert *structs.IssuedCert, + newState fetchState, + err error, +) { + c.lock.Lock() + defer c.lock.Unlock() + + // Importantly, always reset the Error. Having both Error and a Value that + // are non-nil is allowed in the cache entry but it indicates that the Error + // is _newer_ than the last good value. 
So if the err is nil then we need to + // reset to replace any _older_ errors and avoid them bubbling up. If the + // error is non-nil then we need to set it anyway and used to do it in the + // code below. See https://github.com/hashicorp/consul/issues/4480. + c.lastFetchErr = err + + c.state = newState + if newCert != nil { + c.index = newCert.ModifyIndex + c.value = newCert + c.fetchedAt = time.Now() + } + + if c.index < 1 { + // Less than one is invalid unless there was an error and in this case + // there wasn't since a value was returned. If a badly behaved RPC + // returns 0 when it has no data, we might get into a busy loop here. We + // set this to minimum of 1 which is safe because no valid user data can + // ever be written at raft index 1 due to the bootstrap process for + // raft. This insure that any subsequent background refresh request will + // always block, but allows the initial request to return immediately + // even if there is no data. + c.index = 1 + } +} + +// fetchState is some additional metadata we store with each cert in the cache +// to track things like expiry and coordinate paces root rotations. It's +// important this doesn't contain any pointer types since we rely on the struct +// being copied to avoid modifying the actual state in the cache entry during +// Fetch. Pointers themselves are OK, but if we point to another struct that we +// call a method or modify in some way that would directly mutate the cache and +// cause problems. We'd need to deep-clone in that case in Fetch below. +// time.Time technically contains a pointer to the Location but we ignore that +// since all times we get from our wall clock should point to the same Location +// anyway. +type fetchState struct { + // authorityKeyId is the ID of the CA key (whether root or intermediate) that signed + // the current cert. This is just to save parsing the whole cert everytime + // we have to check if the root changed. + authorityKeyID string + + // forceExpireAfter is used to coordinate renewing certs after a CA rotation + // in a staggered way so that we don't overwhelm the servers. + forceExpireAfter time.Time + + // activeRootRotationStart is set when the root has changed and we need to get + // a new cert but haven't got one yet. forceExpireAfter will be set to the + // next scheduled time we should try our CSR, but this is needed to calculate + // the retry windows if we are rate limited when we try. See comment on + // const caChangeJitterWindow above for more. + activeRootRotationStart time.Time + + // consecutiveRateLimitErrs stores how many rate limit errors we've hit. We + // use this to choose a new window for the next retry. See comment on + // const caChangeJitterWindow above for more. + consecutiveRateLimitErrs int +} diff --git a/agent/leafcert/generate.go b/agent/leafcert/generate.go new file mode 100644 index 0000000000000..0e397cdc2d52f --- /dev/null +++ b/agent/leafcert/generate.go @@ -0,0 +1,362 @@ +package leafcert + +import ( + "context" + "errors" + "fmt" + "net" + "time" + + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/consul" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/lib" +) + +// caChangeJitterWindow is the time over which we spread each round of retries +// when attempting to get a new certificate following a root rotation. 
It's +// selected to be a trade-off between not making rotation unnecessarily slow on +// a tiny cluster while not hammering the servers on a huge cluster +// unnecessarily hard. Servers rate limit to protect themselves from the +// expensive crypto work, but in practice have 10k+ RPCs all in the same second +// will cause a major disruption even on large servers due to downloading the +// payloads, parsing msgpack etc. Instead we pick a window that for now is fixed +// but later might be either user configurable (not nice since it would become +// another hard-to-tune value) or set dynamically by the server based on it's +// knowledge of how many certs need to be rotated. Currently the server doesn't +// know that so we pick something that is reasonable. We err on the side of +// being slower that we need in trivial cases but gentler for large deployments. +// 30s means that even with a cluster of 10k service instances, the server only +// has to cope with ~333 RPCs a second which shouldn't be too bad if it's rate +// limiting the actual expensive crypto work. +// +// The actual backoff strategy when we are rate limited is to have each cert +// only retry once with each window of this size, at a point in the window +// selected at random. This performs much better than exponential backoff in +// terms of getting things rotated quickly with more predictable load and so +// fewer rate limited requests. See the full simulation this is based on at +// https://github.com/banks/sim-rate-limit-backoff/blob/master/README.md for +// more detail. +const caChangeJitterWindow = 30 * time.Second + +// NOTE: this function only has one goroutine in it per key at all times +func (m *Manager) attemptLeafRefresh( + req *ConnectCALeafRequest, + existing *structs.IssuedCert, + state fetchState, +) (*structs.IssuedCert, fetchState, error) { + if req.MaxQueryTime <= 0 { + req.MaxQueryTime = DefaultQueryTimeout + } + + // Handle brand new request first as it's simplest. + if existing == nil { + return m.generateNewLeaf(req, state, true) + } + + // We have a certificate in cache already. Check it's still valid. + now := time.Now() + minExpire, maxExpire := calculateSoftExpiry(now, existing) + expiresAt := minExpire.Add(lib.RandomStagger(maxExpire.Sub(minExpire))) + + // Check if we have been force-expired by a root update that jittered beyond + // the timeout of the query it was running. + if !state.forceExpireAfter.IsZero() && state.forceExpireAfter.Before(expiresAt) { + expiresAt = state.forceExpireAfter + } + + if expiresAt.Equal(now) || expiresAt.Before(now) { + // Already expired, just make a new one right away + return m.generateNewLeaf(req, state, false) + } + + // If we called Get() with MustRevalidate then this call came from a non-blocking query. + // Any prior CA rotations should've already expired the cert. + // All we need to do is check whether the current CA is the one that signed the leaf. If not, generate a new leaf. 
+ // This is not a perfect solution (as a CA rotation update can be missed) but it should take care of instances like + // see https://github.com/hashicorp/consul/issues/10871, https://github.com/hashicorp/consul/issues/9862 + // This seems to me like a hack, so maybe we can revisit the caching/ fetching logic in this case + if req.MustRevalidate { + roots, err := m.rootsReader.Get() + if err != nil { + return nil, state, err + } else if roots == nil { + return nil, state, errors.New("no CA roots") + } + if activeRootHasKey(roots, state.authorityKeyID) { + return nil, state, nil + } + + // if we reach here then the current leaf was not signed by the same CAs, just regen + return m.generateNewLeaf(req, state, false) + } + + // We are about to block and wait for a change or timeout. + + // Make a chan we can be notified of changes to CA roots on. It must be + // buffered so we don't miss broadcasts from rootsWatch. It is an edge trigger + // so a single buffer element is sufficient regardless of whether we consume + // the updates fast enough since as soon as we see an element in it, we will + // reload latest CA from cache. + rootUpdateCh := make(chan struct{}, 1) + + // The roots may have changed in between blocking calls. We need to verify + // that the existing cert was signed by the current root. If it was we still + // want to do the whole jitter thing. We could code that again here but it's + // identical to the select case below so we just trigger our own update chan + // and let the logic below handle checking if the CA actually changed in the + // common case where it didn't it is a no-op anyway. + rootUpdateCh <- struct{}{} + + // Subscribe our chan to get root update notification. + m.rootWatcher.Subscribe(rootUpdateCh) + defer m.rootWatcher.Unsubscribe(rootUpdateCh) + + // Setup the timeout chan outside the loop so we don't keep bumping the timeout + // later if we loop around. + timeoutTimer := time.NewTimer(req.MaxQueryTime) + defer timeoutTimer.Stop() + + // Setup initial expiry chan. We may change this if root update occurs in the + // loop below. + expiresTimer := time.NewTimer(expiresAt.Sub(now)) + defer func() { + // Resolve the timer reference at defer time, so we use the latest one each time. + expiresTimer.Stop() + }() + + // Current cert is valid so just wait until it expires or we time out. + for { + select { + case <-timeoutTimer.C: + // We timed out the request with same cert. + return nil, state, nil + + case <-expiresTimer.C: + // Cert expired or was force-expired by a root change. + return m.generateNewLeaf(req, state, false) + + case <-rootUpdateCh: + // A root cache change occurred, reload roots from cache. + roots, err := m.rootsReader.Get() + if err != nil { + return nil, state, err + } else if roots == nil { + return nil, state, errors.New("no CA roots") + } + + // Handle _possibly_ changed roots. We still need to verify the new active + // root is not the same as the one our current cert was signed by since we + // can be notified spuriously if we are the first request since the + // rootsWatcher didn't know about the CA we were signed by. We also rely + // on this on every request to do the initial check that the current roots + // are the same ones the current cert was signed by. + if activeRootHasKey(roots, state.authorityKeyID) { + // Current active CA is the same one that signed our current cert so + // keep waiting for a change. + continue + } + state.activeRootRotationStart = time.Now() + + // CA root changed. 
We add some jitter here to avoid a thundering herd. + // See docs on caChangeJitterWindow const. + delay := m.getJitteredCAChangeDelay() + + // Force the cert to be expired after the jitter - the delay above might + // be longer than we have left on our timeout. We set forceExpireAfter in + // the cache state so the next request will notice we still need to renew + // and do it at the right time. This is cleared once a new cert is + // returned by generateNewLeaf. + state.forceExpireAfter = state.activeRootRotationStart.Add(delay) + // If the delay time is within the current timeout, we want to renew the + // as soon as it's up. We change the expire time and chan so that when we + // loop back around, we'll wait at most delay until generating a new cert. + if state.forceExpireAfter.Before(expiresAt) { + expiresAt = state.forceExpireAfter + // Stop the former one and create a new one. + expiresTimer.Stop() + expiresTimer = time.NewTimer(delay) + } + continue + } + } +} + +func (m *Manager) getJitteredCAChangeDelay() time.Duration { + if m.config.TestOverrideCAChangeInitialDelay > 0 { + return m.config.TestOverrideCAChangeInitialDelay + } + // CA root changed. We add some jitter here to avoid a thundering herd. + // See docs on caChangeJitterWindow const. + return lib.RandomStagger(caChangeJitterWindow) +} + +func activeRootHasKey(roots *structs.IndexedCARoots, currentSigningKeyID string) bool { + for _, ca := range roots.Roots { + if ca.Active { + return ca.SigningKeyID == currentSigningKeyID + } + } + // Shouldn't be possible since at least one root should be active. + return false +} + +// generateNewLeaf does the actual work of creating a new private key, +// generating a CSR and getting it signed by the servers. +// +// NOTE: do not hold the lock while doing the RPC/blocking stuff +func (m *Manager) generateNewLeaf( + req *ConnectCALeafRequest, + newState fetchState, + firstTime bool, +) (*structs.IssuedCert, fetchState, error) { + // Need to lookup RootCAs response to discover trust domain. This should be a + // cache hit. + roots, err := m.rootsReader.Get() + if err != nil { + return nil, newState, err + } else if roots == nil { + return nil, newState, errors.New("no CA roots") + } + if roots.TrustDomain == "" { + return nil, newState, errors.New("cluster has no CA bootstrapped yet") + } + + // Build the cert uri + var id connect.CertURI + var dnsNames []string + var ipAddresses []net.IP + + switch { + case req.Service != "": + id = &connect.SpiffeIDService{ + Host: roots.TrustDomain, + Datacenter: req.Datacenter, + Partition: req.TargetPartition(), + Namespace: req.TargetNamespace(), + Service: req.Service, + } + dnsNames = append(dnsNames, req.DNSSAN...) + + case req.Agent != "": + id = &connect.SpiffeIDAgent{ + Host: roots.TrustDomain, + Datacenter: req.Datacenter, + Partition: req.TargetPartition(), + Agent: req.Agent, + } + dnsNames = append([]string{"localhost"}, req.DNSSAN...) + ipAddresses = append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::1")}, req.IPSAN...) + + case req.Kind == structs.ServiceKindMeshGateway: + id = &connect.SpiffeIDMeshGateway{ + Host: roots.TrustDomain, + Datacenter: req.Datacenter, + Partition: req.TargetPartition(), + } + dnsNames = append(dnsNames, req.DNSSAN...) 
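+ // Note: only mesh gateways get kind-based leaf certs here; any other
+ // non-empty Kind is rejected by the case below.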
+ + case req.Kind != "": + return nil, newState, fmt.Errorf("unsupported kind: %s", req.Kind) + + case req.Server: + if req.Datacenter == "" { + return nil, newState, errors.New("datacenter name must be specified") + } + id = &connect.SpiffeIDServer{ + Host: roots.TrustDomain, + Datacenter: req.Datacenter, + } + dnsNames = append(dnsNames, connect.PeeringServerSAN(req.Datacenter, roots.TrustDomain)) + + default: + return nil, newState, errors.New("URI must be either service, agent, server, or kind") + } + + // Create a new private key + + // TODO: for now we always generate EC keys on clients regardless of the key + // type being used by the active CA. This is fine and allowed in TLS1.2 and + // signing EC CSRs with an RSA key is supported by all current CA providers so + // it's OK. IFF we ever need to support a CA provider that refuses to sign a + // CSR with a different signature algorithm, or if we have compatibility + // issues with external PKI systems that require EC certs be signed with ECDSA + // from the CA (this was required in TLS1.1 but not in 1.2) then we can + // instead intelligently pick the key type we generate here based on the key + // type of the active signing CA. We already have that loaded since we need + // the trust domain. + pk, pkPEM, err := connect.GeneratePrivateKey() + if err != nil { + return nil, newState, err + } + + // Create a CSR. + csr, err := connect.CreateCSR(id, pk, dnsNames, ipAddresses) + if err != nil { + return nil, newState, err + } + + // Request signing + args := structs.CASignRequest{ + WriteRequest: structs.WriteRequest{Token: req.Token}, + Datacenter: req.Datacenter, + CSR: csr, + } + + reply, err := m.certSigner.SignCert(context.Background(), &args) + if err != nil { + if err.Error() == consul.ErrRateLimited.Error() { + if firstTime { + // This was a first fetch - we have no good value in cache. In this case + // we just return the error to the caller rather than rely on surprising + // semi-blocking until the rate limit is appeased or we timeout + // behavior. It's likely the caller isn't expecting this to block since + // it's an initial fetch. This also massively simplifies this edge case. + return nil, newState, err + } + + if newState.activeRootRotationStart.IsZero() { + // We hit a rate limit error by chance - for example a cert expired + // before the root rotation was observed (not triggered by rotation) but + // while server is working through high load from a recent rotation. + // Just pretend there is a rotation and the retry logic here will start + // jittering and retrying in the same way from now. + newState.activeRootRotationStart = time.Now() + } + + // Increment the errors in the state + newState.consecutiveRateLimitErrs++ + + delay := m.getJitteredCAChangeDelay() + + // Find the start of the next window we can retry in. See comment on + // caChangeJitterWindow for details of why we use this strategy. + windowStart := newState.activeRootRotationStart.Add( + time.Duration(newState.consecutiveRateLimitErrs) * delay) + + // Pick a random time in that window + newState.forceExpireAfter = windowStart.Add(delay) + + // Return a result with the existing cert but the new state - the cache + // will see this as no change. Note that we always have an existing result + // here due to the nil value check above. 
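+ // Returning a nil cert with a nil error leaves the previously cached cert
+ // in place: certData.Update only overwrites the stored value when the new
+ // cert is non-nil, while still recording the updated rotation/retry state
+ // for the next attempt.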
+ return nil, newState, nil + } + return nil, newState, err + } + reply.PrivateKeyPEM = pkPEM + + // Reset rotation state + newState.forceExpireAfter = time.Time{} + newState.consecutiveRateLimitErrs = 0 + newState.activeRootRotationStart = time.Time{} + + cert, err := connect.ParseCert(reply.CertPEM) + if err != nil { + return nil, newState, err + } + // Set the CA key ID so we can easily tell when a active root has changed. + newState.authorityKeyID = connect.EncodeSigningKeyID(cert.AuthorityKeyId) + + return reply, newState, nil +} diff --git a/agent/leafcert/leafcert.go b/agent/leafcert/leafcert.go new file mode 100644 index 0000000000000..9cd0c08db13d0 --- /dev/null +++ b/agent/leafcert/leafcert.go @@ -0,0 +1,556 @@ +package leafcert + +import ( + "context" + "errors" + "fmt" + "sync" + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/go-hclog" + "golang.org/x/sync/singleflight" + "golang.org/x/time/rate" + + "github.com/hashicorp/consul/agent/cache" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/lib/ttlcache" +) + +const ( + DefaultLastGetTTL = 72 * time.Hour // reasonable default is days + + // DefaultLeafCertRefreshRate is the default rate at which certs can be refreshed. + // This defaults to not being limited + DefaultLeafCertRefreshRate = rate.Inf + + // DefaultLeafCertRefreshMaxBurst is the number of cache entry fetches that can + // occur in a burst. + DefaultLeafCertRefreshMaxBurst = 2 + + DefaultLeafCertRefreshBackoffMin = 3 // 3 attempts before backing off + DefaultLeafCertRefreshMaxWait = 1 * time.Minute // maximum backoff wait time + + DefaultQueryTimeout = 10 * time.Minute +) + +type Config struct { + // LastGetTTL is the time that the certs returned by this type remain in + // the cache after the last get operation. If a cert isn't accessed within + // this duration, the certs is purged and background refreshing will cease. + LastGetTTL time.Duration + + // LeafCertRefreshMaxBurst max burst size of RateLimit for a single cache entry + LeafCertRefreshMaxBurst int + + // LeafCertRefreshRate represents the max calls/sec for a single cache entry + LeafCertRefreshRate rate.Limit + + // LeafCertRefreshBackoffMin is the number of attempts to wait before + // backing off. + // + // Mostly configurable just for testing. + LeafCertRefreshBackoffMin uint + + // LeafCertRefreshMaxWait is the maximum backoff wait time. + // + // Mostly configurable just for testing. + LeafCertRefreshMaxWait time.Duration + + // TestOverrideCAChangeInitialDelay allows overriding the random jitter + // after a root change with a fixed delay. So far ths is only done in + // tests. If it's zero the caChangeInitialSpreadDefault maximum jitter will + // be used but if set, it overrides and provides a fixed delay. To + // essentially disable the delay in tests they can set it to 1 nanosecond. + // We may separately allow configuring the jitter limit by users later but + // this is different and for tests only since we need to set a + // deterministic time delay in order to test the behavior here fully and + // determinstically. 
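+ //
+ // In this package's tests it ranges from 1 nanosecond (effectively no
+ // delay) up to 100 milliseconds (a small but observable jitter window).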
+ TestOverrideCAChangeInitialDelay time.Duration +} + +func (c Config) withDefaults() Config { + if c.LastGetTTL <= 0 { + c.LastGetTTL = DefaultLastGetTTL + } + if c.LeafCertRefreshRate == 0.0 { + c.LeafCertRefreshRate = DefaultLeafCertRefreshRate + } + if c.LeafCertRefreshMaxBurst == 0 { + c.LeafCertRefreshMaxBurst = DefaultLeafCertRefreshMaxBurst + } + if c.LeafCertRefreshBackoffMin == 0 { + c.LeafCertRefreshBackoffMin = DefaultLeafCertRefreshBackoffMin + } + if c.LeafCertRefreshMaxWait == 0 { + c.LeafCertRefreshMaxWait = DefaultLeafCertRefreshMaxWait + } + return c +} + +type Deps struct { + Config Config + Logger hclog.Logger + + // RootsReader is an interface to access connect CA roots. + RootsReader RootsReader + + // CertSigner is an interface to remotely sign certificates. + CertSigner CertSigner +} + +type RootsReader interface { + Get() (*structs.IndexedCARoots, error) + Notify(ctx context.Context, correlationID string, ch chan<- cache.UpdateEvent) error +} + +type CertSigner interface { + SignCert(ctx context.Context, args *structs.CASignRequest) (*structs.IssuedCert, error) +} + +func NewManager(deps Deps) *Manager { + deps.Config = deps.Config.withDefaults() + + if deps.Logger == nil { + deps.Logger = hclog.NewNullLogger() + } + if deps.RootsReader == nil { + panic("RootsReader is required") + } + if deps.CertSigner == nil { + panic("CertSigner is required") + } + + m := &Manager{ + config: deps.Config, + logger: deps.Logger, + certSigner: deps.CertSigner, + rootsReader: deps.RootsReader, + // + certs: make(map[string]*certData), + certsExpiryHeap: ttlcache.NewExpiryHeap(), + } + + m.ctx, m.ctxCancel = context.WithCancel(context.Background()) + + m.rootWatcher = &rootWatcher{ + ctx: m.ctx, + rootsReader: m.rootsReader, + } + + // Start the expiry watcher + go m.runExpiryLoop() + + return m +} + +type Manager struct { + logger hclog.Logger + + // config contains agent configuration necessary for the cert manager to operate. + config Config + + // rootsReader is an interface to access connect CA roots. + rootsReader RootsReader + + // certSigner is an interface to remotely sign certificates. + certSigner CertSigner + + // rootWatcher helps let multiple requests for leaf certs to coordinate + // sharing a single long-lived watch for the root certs. This allows the + // leaf cert requests to notice when the roots rotate and trigger their + // reissuance. + rootWatcher *rootWatcher + + // This is the "top-level" internal context. This is used to cancel + // background operations. + ctx context.Context + ctxCancel context.CancelFunc + + // lock guards access to certs and certsExpiryHeap + lock sync.RWMutex + certs map[string]*certData + certsExpiryHeap *ttlcache.ExpiryHeap + + // certGroup is a singleflight group keyed identically to the certs map. + // When the leaf cert itself needs replacement requests will coalesce + // together through this chokepoint. 
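+ //
+ // Requests are keyed by ConnectCALeafRequest.Key(), so concurrent Get calls
+ // for the same cert share one refresh attempt instead of issuing duplicate
+ // CSR signing RPCs.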
+ certGroup singleflight.Group +} + +func (m *Manager) getCertData(key string) *certData { + m.lock.RLock() + cd, ok := m.certs[key] + m.lock.RUnlock() + + if ok { + return cd + } + + m.lock.Lock() + defer m.lock.Unlock() + + cd, ok = m.certs[key] + if !ok { + cd = &certData{ + expiry: m.certsExpiryHeap.Add(key, m.config.LastGetTTL), + refreshRateLimiter: rate.NewLimiter( + m.config.LeafCertRefreshRate, + m.config.LeafCertRefreshMaxBurst, + ), + } + + m.certs[key] = cd + + metrics.SetGauge([]string{"leaf-certs", "entries_count"}, float32(len(m.certs))) + } + return cd +} + +// Stop stops any background work and frees all resources for the manager. +// Current fetch requests are allowed to continue to completion and callers may +// still access the current leaf cert values so coordination isn't needed with +// callers, however no background activity will continue. It's intended to +// close the manager at agent shutdown so no further requests should be made, +// however concurrent or in-flight ones won't break. +func (m *Manager) Stop() { + if m.ctxCancel != nil { + m.ctxCancel() + m.ctxCancel = nil + } +} + +// Get returns the leaf cert for the request. If data satisfying the +// minimum index is present, it is returned immediately. Otherwise, +// this will block until the cert is refreshed or the request timeout is +// reached. +// +// Multiple Get calls for the same logical request will block on a single +// network request. +// +// The timeout specified by the request will be the timeout on the cache +// Get, and does not correspond to the timeout of any background data +// fetching. If the timeout is reached before data satisfying the minimum +// index is retrieved, the last known value (maybe nil) is returned. No +// error is returned on timeout. This matches the behavior of Consul blocking +// queries. +func (m *Manager) Get(ctx context.Context, req *ConnectCALeafRequest) (*structs.IssuedCert, cache.ResultMeta, error) { + // Lightweight copy this object so that manipulating req doesn't race. + dup := *req + req = &dup + + // We don't want non-blocking queries to return expired leaf certs + // or leaf certs not valid under the current CA. So always revalidate + // the leaf cert on non-blocking queries (ie when MinQueryIndex == 0) + // + // NOTE: This conditional was formerly only in the API endpoint. + if req.MinQueryIndex == 0 { + req.MustRevalidate = true + } + + return m.internalGet(ctx, req) +} + +func (m *Manager) internalGet(ctx context.Context, req *ConnectCALeafRequest) (*structs.IssuedCert, cache.ResultMeta, error) { + key := req.Key() + if key == "" { + return nil, cache.ResultMeta{}, fmt.Errorf("a key is required") + } + + if req.MaxQueryTime <= 0 { + req.MaxQueryTime = DefaultQueryTimeout + } + timeoutTimer := time.NewTimer(req.MaxQueryTime) + defer timeoutTimer.Stop() + + // First time through + first := true + + for { + // Get the current value + cd := m.getCertData(key) + + cd.lock.Lock() + var ( + existing = cd.value + existingIndex = cd.index + refreshing = cd.refreshing + fetchedAt = cd.fetchedAt + lastFetchErr = cd.lastFetchErr + expiry = cd.expiry + ) + cd.lock.Unlock() + + shouldReplaceCert := certNeedsUpdate(req, existingIndex, existing, refreshing) + + if expiry != nil { + // The entry already exists in the TTL heap, touch it to keep it alive since + // this Get is still interested in the value. 
Note that we used to only do + // this in the `entryValid` block below but that means that a cache entry + // will expire after it's TTL regardless of how many callers are waiting for + // updates in this method in a couple of cases: + // + // 1. If the agent is disconnected from servers for the TTL then the client + // will be in backoff getting errors on each call to Get and since an + // errored cache entry has Valid = false it won't be touching the TTL. + // + // 2. If the value is just not changing then the client's current index + // will be equal to the entry index and entryValid will be false. This + // is a common case! + // + // But regardless of the state of the entry, assuming it's already in the + // TTL heap, we should touch it every time around here since this caller at + // least still cares about the value! + m.lock.Lock() + m.certsExpiryHeap.Update(expiry.Index(), m.config.LastGetTTL) + m.lock.Unlock() + } + + if !shouldReplaceCert { + meta := cache.ResultMeta{ + Index: existingIndex, + } + + if first { + meta.Hit = true + } + + // For non-background refresh types, the age is just how long since we + // fetched it last. + if !fetchedAt.IsZero() { + meta.Age = time.Since(fetchedAt) + } + + // We purposely do not return an error here since the cache only works with + // fetching values that either have a value or have an error, but not both. + // The Error may be non-nil in the entry in the case that an error has + // occurred _since_ the last good value, but we still want to return the + // good value to clients that are not requesting a specific version. The + // effect of this is that blocking clients will all see an error immediately + // without waiting a whole timeout to see it, but clients that just look up + // cache with an older index than the last valid result will still see the + // result and not the error here. I.e. the error is not "cached" without a + // new fetch attempt occurring, but the last good value can still be fetched + // from cache. + return existing, meta, nil + } + + // If this isn't our first time through and our last value has an error, then + // we return the error. This has the behavior that we don't sit in a retry + // loop getting the same error for the entire duration of the timeout. + // Instead, we make one effort to fetch a new value, and if there was an + // error, we return. Note that the invariant is that if both entry.Value AND + // entry.Error are non-nil, the error _must_ be more recent than the Value. In + // other words valid fetches should reset the error. See + // https://github.com/hashicorp/consul/issues/4480. + if !first && lastFetchErr != nil { + return existing, cache.ResultMeta{Index: existingIndex}, lastFetchErr + } + + notifyCh := m.triggerCertRefreshInGroup(req, cd) + + // No longer our first time through + first = false + + select { + case <-ctx.Done(): + return nil, cache.ResultMeta{}, ctx.Err() + case <-notifyCh: + // Our fetch returned, retry the get from the cache. + req.MustRevalidate = false + + case <-timeoutTimer.C: + // Timeout on the cache read, just return whatever we have. + return existing, cache.ResultMeta{Index: existingIndex}, nil + } + } +} + +func certNeedsUpdate(req *ConnectCALeafRequest, index uint64, value *structs.IssuedCert, refreshing bool) bool { + if value == nil { + return true + } + + if req.MinQueryIndex > 0 && req.MinQueryIndex >= index { + // MinIndex was given and matches or is higher than current value so we + // ignore the cache and fallthrough to blocking on a new value. 
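+ // (This mirrors normal blocking-query semantics: the caller already holds
+ // data at this index, so only a newer cert is useful to it.)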
+ return true + } + + // Check if re-validate is requested. If so the first time round the + // loop is not a hit but subsequent ones should be treated normally. + if req.MustRevalidate { + // It is important to note that this block ONLY applies when we are not + // in indefinite refresh mode (where the underlying goroutine will + // continue to re-query for data). + // + // In this mode goroutines have a 1:1 relationship to RPCs that get + // executed, and importantly they DO NOT SLEEP after executing. + // + // This means that a running goroutine for this cache entry extremely + // strongly implies that the RPC has not yet completed, which is why + // this check works for the revalidation-avoidance optimization here. + if refreshing { + // There is an active goroutine performing a blocking query for + // this data, which has not returned. + // + // We can logically deduce that the contents of the cache are + // actually current, and we can simply return this while leaving + // the blocking query alone. + return false + } else { + return true + } + } + + return false +} + +func (m *Manager) triggerCertRefreshInGroup(req *ConnectCALeafRequest, cd *certData) <-chan singleflight.Result { + // Lightweight copy this object so that manipulating req doesn't race. + dup := *req + req = &dup + + if req.MaxQueryTime == 0 { + req.MaxQueryTime = DefaultQueryTimeout + } + + // At this point, we know we either don't have a cert at all or the + // cert we have is too old. We need to mint a new one. + // + // We use a singleflight group to coordinate only one request driving + // the async update to the key at once. + // + // NOTE: this anonymous function only has one goroutine in it per key at all times + return m.certGroup.DoChan(req.Key(), func() (any, error) { + cd.lock.Lock() + var ( + shouldReplaceCert = certNeedsUpdate(req, cd.index, cd.value, cd.refreshing) + rateLimiter = cd.refreshRateLimiter + lastIndex = cd.index + ) + cd.lock.Unlock() + + if !shouldReplaceCert { + // This handles the case where a fetch succeeded after checking for + // its existence in Get. This ensures that we don't miss updates + // since we don't hold the lock between the read and then the + // refresh trigger. + return nil, nil + } + + if err := rateLimiter.Wait(m.ctx); err != nil { + // NOTE: this can only happen when the entire cache is being + // shutdown and isn't something that can happen normally. + return nil, nil + } + + cd.MarkRefreshing(true) + defer cd.MarkRefreshing(false) + + req.MinQueryIndex = lastIndex + + // Start building the new entry by blocking on the fetch. + m.refreshLeafAndUpdate(req, cd) + + return nil, nil + }) +} + +// testGet is a way for the test code to do a get but from the middle of the +// logic stack, skipping some of the caching logic. +func (m *Manager) testGet(req *ConnectCALeafRequest) (uint64, *structs.IssuedCert, error) { + cd := m.getCertData(req.Key()) + + m.refreshLeafAndUpdate(req, cd) + + cd.lock.Lock() + var ( + index = cd.index + cert = cd.value + err = cd.lastFetchErr + ) + cd.lock.Unlock() + + if err != nil { + return 0, nil, err + } + + return index, cert, nil +} + +// refreshLeafAndUpdate will try to refresh the leaf and persist the updated +// data back to the in-memory store. 
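+// It is invoked from the singleflight-coordinated path in
+// triggerCertRefreshInGroup and directly from testGet.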
+// +// NOTE: this function only has one goroutine in it per key at all times +func (m *Manager) refreshLeafAndUpdate(req *ConnectCALeafRequest, cd *certData) { + existing, state := cd.GetValueAndState() + newCert, updatedState, err := m.attemptLeafRefresh(req, existing, state) + cd.Update(newCert, updatedState, err) +} + +// Prepopulate puts a cert in manually. This is useful when the correct initial +// value is known and the cache shouldn't refetch the same thing on startup. It +// is used to set AgentLeafCert when AutoEncrypt.TLS is turned on. The manager +// itself cannot fetch that the first time because it requires a special +// RPCType. Subsequent runs are fine though. +func (m *Manager) Prepopulate( + ctx context.Context, + key string, + index uint64, + value *structs.IssuedCert, + authorityKeyID string, +) error { + if value == nil { + return errors.New("value is required") + } + cd := m.getCertData(key) + + cd.lock.Lock() + defer cd.lock.Unlock() + + cd.index = index + cd.value = value + cd.state = fetchState{ + authorityKeyID: authorityKeyID, + forceExpireAfter: time.Time{}, + consecutiveRateLimitErrs: 0, + activeRootRotationStart: time.Time{}, + } + + return nil +} + +// runExpiryLoop is a blocking function that watches the expiration +// heap and invalidates cert entries that have expired. +func (m *Manager) runExpiryLoop() { + for { + m.lock.RLock() + timer := m.certsExpiryHeap.Next() + m.lock.RUnlock() + + select { + case <-m.ctx.Done(): + timer.Stop() + return + case <-m.certsExpiryHeap.NotifyCh: + timer.Stop() + continue + + case <-timer.Wait(): + m.lock.Lock() + + entry := timer.Entry + + // Entry expired! Remove it. + delete(m.certs, entry.Key()) + m.certsExpiryHeap.Remove(entry.Index()) + + // Set some metrics + metrics.IncrCounter([]string{"leaf-certs", "evict_expired"}, 1) + metrics.SetGauge([]string{"leaf-certs", "entries_count"}, float32(len(m.certs))) + + m.lock.Unlock() + } + } +} diff --git a/agent/leafcert/leafcert_test.go b/agent/leafcert/leafcert_test.go new file mode 100644 index 0000000000000..0db683a816e1a --- /dev/null +++ b/agent/leafcert/leafcert_test.go @@ -0,0 +1,1133 @@ +package leafcert + +import ( + "context" + "crypto/tls" + "crypto/x509" + "encoding/pem" + "fmt" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/hashicorp/consul/acl" + "github.com/hashicorp/consul/agent/cache" + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/consul" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/sdk/testutil" + "github.com/hashicorp/consul/sdk/testutil/retry" +) + +// Test that after an initial signing, new CA roots (new ID) will +// trigger a blocking query to execute. 
+func TestManager_changingRoots(t *testing.T) { + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + t.Parallel() + + m, signer := testManager(t, nil) + + caRoot := signer.UpdateCA(t, nil) + + // We'll reuse the fetch options and request + req := &ConnectCALeafRequest{ + Datacenter: "dc1", Service: "web", + MinQueryIndex: 0, MaxQueryTime: 10 * time.Second, + } + + // First fetch should return immediately + getCh := testAsyncGet(t, m, req) + var idx uint64 + select { + case <-time.After(100 * time.Millisecond): + t.Fatal("shouldn't block waiting for fetch") + case result := <-getCh: + require.NoError(t, result.Err) + require.NotNil(t, result.Value) + requireLeafValidUnderCA(t, result.Value, caRoot) + require.True(t, result.Index > 0) + + idx = result.Index + } + + // Second fetch should block with set index + req.MinQueryIndex = idx + getCh = testAsyncGet(t, m, req) + select { + case result := <-getCh: + t.Fatalf("should not return: %#v", result) + case <-time.After(100 * time.Millisecond): + } + + // Let's send in new roots, which should trigger the sign req. We need to take + // care to set the new root as active + caRoot2 := signer.UpdateCA(t, nil) + select { + case <-time.After(100 * time.Millisecond): + t.Fatal("shouldn't block waiting for fetch") + case result := <-getCh: + require.NoError(t, result.Err) + require.NotNil(t, result.Value) + require.True(t, result.Index > idx) + requireLeafValidUnderCA(t, result.Value, caRoot2) + } + + // Third fetch should block + getCh = testAsyncGet(t, m, req) + select { + case result := <-getCh: + t.Fatalf("should not return: %#v", result) + case <-time.After(100 * time.Millisecond): + } +} + +// Tests that if the root change jitter is longer than the time left on the +// timeout, we return normally but then still renew the cert on a subsequent +// call. +func TestManager_changingRootsJitterBetweenCalls(t *testing.T) { + t.Parallel() + + const TestOverrideCAChangeInitialDelay = 100 * time.Millisecond + + m, signer := testManager(t, func(cfg *Config) { + // Override the root-change delay so we will timeout first. We can't set it to + // a crazy high value otherwise we'll have to wait that long in the test to + // see if it actually happens on subsequent calls. We instead reduce the + // timeout in FetchOptions to be much shorter than this. + cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay + }) + + caRoot := signer.UpdateCA(t, nil) + + // We'll reuse the fetch options and request. Timeout must be much shorter + // than the initial root delay. 20ms means that if we deliver the root change + // during the first blocking call, we should need to block fully for 5 more + // calls before the cert is renewed. We pick a timeout that is not an exact + // multiple of the 100ms delay above to reduce the chance that timing works + // out in a way that makes it hard to tell a timeout from an early return due + // to a cert renewal. 
+ req := &ConnectCALeafRequest{
+ Datacenter: "dc1", Service: "web",
+ MinQueryIndex: 0, MaxQueryTime: 35 * time.Millisecond,
+ }
+
+ // First fetch should return immediately
+ getCh := testAsyncGet(t, m, req)
+ var (
+ idx uint64
+ issued *structs.IssuedCert
+ )
+ select {
+ case <-time.After(100 * time.Millisecond):
+ t.Fatal("shouldn't block waiting for fetch")
+ case result := <-getCh:
+ require.NoError(t, result.Err)
+ require.NotNil(t, result.Value)
+ require.True(t, result.Index > 0)
+ requireLeafValidUnderCA(t, result.Value, caRoot)
+ idx = result.Index
+ issued = result.Value
+ }
+
+ // Let's send in new roots, which should eventually trigger the sign req. We
+ // need to take care to set the new root as active. Note that this is
+ // implicitly testing that root updates that happen in between leaf blocking
+ // queries are still noticed too. At this point no leaf blocking query is
+ // running so the root watch should be stopped. By pushing this update, the
+ // next blocking query will _immediately_ see the new root, which means it
+ // needs to correctly notice that it is not the same one that generated the
+ // current cert and start the rotation. This is good; it's just not obvious
+ // from reading the test that this behavior is covered, but it is.
+ caRoot2 := signer.UpdateCA(t, nil)
+ earliestRootDelivery := time.Now()
+
+ // Some number of fetches (two or three, likely) should time out after the
+ // 35ms MaxQueryTime, and once roughly 100ms has elapsed in total we should
+ // see the new cert. Since this is all very timing-dependent, we don't
+ // hard-code exact numbers here and instead loop for plenty of time, doing as
+ // many calls as it takes, and just assert on the time taken and that each
+ // call either blocks and returns the cached cert, or returns the new one.
+ req.MinQueryIndex = idx
+ var shouldExpireAfter time.Time
+ i := 1
+ rootsDelivered := false
+ for !rootsDelivered {
+ start := time.Now()
+ getCh = testAsyncGet(t, m, req)
+ select {
+ case result := <-getCh:
+ require.NoError(t, result.Err)
+ timeTaken := time.Since(start)
+
+ // There are two options: either it blocked waiting for the delay after
+ // the rotation or it returned the new CA cert before the timeout was
+ // done. To be more robust against timing, we take the value as the
+ // decider for which case it is, and assert timing matches our expected
+ // bounds rather than vice versa.
+
+ if result.Index > idx {
+ // Got a new cert
+ require.NotEqual(t, issued, result.Value)
+ require.NotNil(t, result.Value)
+ requireLeafValidUnderCA(t, result.Value, caRoot2)
+ // Should not have been delivered before the delay
+ require.True(t, time.Since(earliestRootDelivery) > TestOverrideCAChangeInitialDelay)
+ // All good. We are done!
+ rootsDelivered = true
+ } else {
+ // Should be the cached cert
+ require.Equal(t, issued, result.Value)
+ require.Equal(t, idx, result.Index)
+ requireLeafValidUnderCA(t, result.Value, caRoot)
+ // Sanity check we blocked for the whole timeout
+ require.Truef(t, timeTaken > req.MaxQueryTime,
+ "should block for at least %s, returned after %s",
+ req.MaxQueryTime, timeTaken)
+ // Sanity check that the forceExpireAfter state was set correctly.
+ // (Assign to the outer variable so the check after the select sees it.)
+ shouldExpireAfter = testObserveLeafCert(m, req, func(cd *certData) time.Time {
+ return cd.state.forceExpireAfter
+ })
+ require.True(t, shouldExpireAfter.After(time.Now()))
+ require.True(t, shouldExpireAfter.Before(time.Now().Add(TestOverrideCAChangeInitialDelay)))
+ }
+ case <-time.After(50 * time.Millisecond):
+ t.Fatalf("request %d blocked too long", i)
+ }
+ i++
+
+ // Sanity check that we've not gone way beyond the deadline without a
+ // new cert. We give some leeway to make it less brittle.
+ require.Falsef(t, time.Now().After(shouldExpireAfter.Add(100*time.Millisecond)),
+ "waited extra 100ms and delayed CA rotate renew didn't happen")
+ }
+}
+
+func testObserveLeafCert[T any](m *Manager, req *ConnectCALeafRequest, cb func(*certData) T) T {
+ key := req.Key()
+
+ cd := m.getCertData(key)
+
+ cd.lock.Lock()
+ defer cd.lock.Unlock()
+
+ return cb(cd)
+}
+
+// Tests that if the root changes in between blocking calls we still pick it up.
+func TestManager_changingRootsBetweenBlockingCalls(t *testing.T) {
+ t.Parallel()
+
+ m, signer := testManager(t, nil)
+
+ caRoot := signer.UpdateCA(t, nil)
+
+ // We'll reuse the fetch options and request. A short timeout is important
+ // since we wait the full timeout before changing roots.
+ req := &ConnectCALeafRequest{
+ Datacenter: "dc1", Service: "web",
+ MinQueryIndex: 0, MaxQueryTime: 35 * time.Millisecond,
+ }
+
+ // First fetch should return immediately
+ getCh := testAsyncGet(t, m, req)
+ var (
+ idx uint64
+ issued *structs.IssuedCert
+ )
+ select {
+ case <-time.After(100 * time.Millisecond):
+ t.Fatal("shouldn't block waiting for fetch")
+ case result := <-getCh:
+ require.NoError(t, result.Err)
+ require.NotNil(t, result.Value)
+ requireLeafValidUnderCA(t, result.Value, caRoot)
+ require.True(t, result.Index > 0)
+ idx = result.Index
+ issued = result.Value
+ }
+
+ // Next fetch should block for the full timeout
+ start := time.Now()
+ getCh = testAsyncGet(t, m, req)
+ select {
+ case <-time.After(100 * time.Millisecond):
+ t.Fatal("shouldn't block for too long waiting for fetch")
+ case result := <-getCh:
+ require.NoError(t, result.Err)
+ require.Equal(t, issued, result.Value)
+ // Still the initial cached result
+ require.Equal(t, idx, result.Index)
+ // Sanity check that it waited
+ require.True(t, time.Since(start) > req.MaxQueryTime)
+ }
+
+ // No active requests, simulate root change now
+ caRoot2 := signer.UpdateCA(t, nil)
+ earliestRootDelivery := time.Now()
+
+ // We should get the new cert immediately on the next fetch (the test
+ // override sets the root change jitter to 1 microsecond, so effectively no
+ // delay is expected).
+ getCh = testAsyncGet(t, m, req) + select { + case <-time.After(100 * time.Millisecond): + t.Fatal("shouldn't block too long waiting for fetch") + case result := <-getCh: + require.NoError(t, result.Err) + require.NotEqual(t, issued, result.Value) + requireLeafValidUnderCA(t, result.Value, caRoot2) + require.True(t, result.Index > idx) + // Sanity check that we didn't wait too long + require.True(t, time.Since(earliestRootDelivery) < req.MaxQueryTime) + } +} + +func TestManager_CSRRateLimiting(t *testing.T) { + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + t.Parallel() + + m, signer := testManager(t, func(cfg *Config) { + // Each jitter window will be only 100 ms long to make testing quick but + // highly likely not to fail based on scheduling issues. + cfg.TestOverrideCAChangeInitialDelay = 100 * time.Millisecond + }) + + signer.UpdateCA(t, nil) + + signer.SetSignCallErrors( + // First call return rate limit error. This is important as it checks + // behavior when cache is empty and we have to return a nil Value but need to + // save state to do the right thing for retry. + consul.ErrRateLimited, // inc + // Then succeed on second call + nil, + // Then be rate limited again on several further calls + consul.ErrRateLimited, // inc + consul.ErrRateLimited, // inc + // Then fine after that + ) + + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "web", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + + // First fetch should return rate limit error directly - client is expected to + // backoff itself. + getCh := testAsyncGet(t, m, req) + select { + case <-time.After(200 * time.Millisecond): + t.Fatal("shouldn't block longer than one jitter window for success") + case result := <-getCh: + require.Error(t, result.Err) + require.Equal(t, consul.ErrRateLimited.Error(), result.Err.Error()) + } + + // Second call should return correct cert immediately. + getCh = testAsyncGet(t, m, req) + var ( + idx uint64 + issued *structs.IssuedCert + ) + select { + case <-time.After(100 * time.Millisecond): + t.Fatal("shouldn't block waiting for fetch") + case result := <-getCh: + require.NoError(t, result.Err) + require.NotNil(t, result.Value) + require.True(t, result.Index > 0) + idx = result.Index + issued = result.Value + } + + // Send in new roots, which should trigger the next sign req. We need to take + // care to set the new root as active + signer.UpdateCA(t, nil) + earliestRootDelivery := time.Now() + + // Sanity check state + require.Equal(t, uint64(1), signer.GetSignCallErrorCount()) + + // After root rotation jitter has been waited out, a new CSR will + // be attempted but will fail and return the previous cached result with no + // error since we will try again soon. + getCh = testAsyncGet(t, m, req) + select { + case <-time.After(200 * time.Millisecond): + t.Fatal("shouldn't block too long waiting for fetch") + case result := <-getCh: + // We should block for _at least_ one jitter period since we set that to + // 100ms and in test override mode we always pick the max jitter not a + // random amount. + require.True(t, time.Since(earliestRootDelivery) > 100*time.Millisecond) + require.Equal(t, uint64(2), signer.GetSignCallErrorCount()) + + require.NoError(t, result.Err) + require.Equal(t, issued, result.Value) + // 1 since this should still be the original cached result as we failed to + // get a new cert. 
+ require.Equal(t, idx, result.Index) + } + + // Root rotation state is now only captured in the opts.LastResult.State so a + // subsequent call should also wait for 100ms and then attempt to generate a + // new cert since we failed last time. + getCh = testAsyncGet(t, m, req) + select { + case <-time.After(200 * time.Millisecond): + t.Fatal("shouldn't block too long waiting for fetch") + case result := <-getCh: + // We should block for _at least_ two jitter periods now. + require.True(t, time.Since(earliestRootDelivery) > 200*time.Millisecond) + require.Equal(t, uint64(3), signer.GetSignCallErrorCount()) + + require.NoError(t, result.Err) + require.Equal(t, issued, result.Value) + // 1 since this should still be the original cached result as we failed to + // get a new cert. + require.Equal(t, idx, result.Index) + } + + // Now we've had two rate limit failures and seen root rotation state work + // across both the blocking request that observed the rotation and the + // subsequent one. The next request should wait out the rest of the backoff + // and then actually fetch a new cert at last! + getCh = testAsyncGet(t, m, req) + select { + case <-time.After(200 * time.Millisecond): + t.Fatal("shouldn't block too long waiting for fetch") + case result := <-getCh: + // We should block for _at least_ three jitter periods now. + require.True(t, time.Since(earliestRootDelivery) > 300*time.Millisecond) + require.Equal(t, uint64(3), signer.GetSignCallErrorCount()) + + require.NoError(t, result.Err) + require.NotEqual(t, issued, result.Value) + // 3 since the rootCA change used 2 + require.True(t, result.Index > idx) + } +} + +// This test runs multiple concurrent callers watching different leaf certs and +// tries to ensure that the background root watch activity behaves correctly. +func TestManager_watchRootsDedupingMultipleCallers(t *testing.T) { + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + t.Parallel() + + m, signer := testManager(t, nil) + + caRoot := signer.UpdateCA(t, nil) + + // n is the number of clients we'll run + n := 3 + + // setup/testDoneCh are used for coordinating clients such that each has + // initial cert delivered and is blocking before the root changes. It's not a + // wait group since we want to be able to timeout the main test goroutine if + // one of the clients gets stuck. Instead it's a buffered chan. + setupDoneCh := make(chan error, n) + testDoneCh := make(chan error, n) + // rootsUpdate is used to coordinate clients so they know when they should + // expect to see leaf renewed after root change. + rootsUpdatedCh := make(chan struct{}) + + // Create a function that models a single client. It should go through the + // steps of getting an initial cert and then watching for changes until root + // updates. 
+ client := func(i int) { + // We'll reuse the fetch options and request + req := &ConnectCALeafRequest{ + Datacenter: "dc1", Service: fmt.Sprintf("web-%d", i), + MinQueryIndex: 0, MaxQueryTime: 10 * time.Second, + } + + // First fetch should return immediately + getCh := testAsyncGet(t, m, req) + var idx uint64 + select { + case <-time.After(100 * time.Millisecond): + setupDoneCh <- fmt.Errorf("shouldn't block waiting for fetch") + return + case result := <-getCh: + require.NoError(t, result.Err) + idx = result.Index + } + + // Second fetch should block with set index + req.MinQueryIndex = idx + getCh = testAsyncGet(t, m, req) + select { + case result := <-getCh: + setupDoneCh <- fmt.Errorf("should not return: %#v", result) + return + case <-time.After(100 * time.Millisecond): + } + + // We're done with setup and the blocking call is still blocking in + // background. + setupDoneCh <- nil + + // Wait until all others are also done and roots change incase there are + // stragglers delaying the root update. + select { + case <-rootsUpdatedCh: + case <-time.After(200 * time.Millisecond): + testDoneCh <- fmt.Errorf("waited too long for root update") + return + } + + // Now we should see root update within a short period + select { + case <-time.After(100 * time.Millisecond): + testDoneCh <- fmt.Errorf("shouldn't block waiting for fetch") + return + case result := <-getCh: + require.NoError(t, result.Err) + if req.MinQueryIndex == result.Value.CreateIndex { + testDoneCh <- fmt.Errorf("index must be different") + return + } + } + + testDoneCh <- nil + } + + // Sanity check the roots watcher is not running yet + assertRootsWatchCounts(t, m, 0, 0) + + for i := 0; i < n; i++ { + go client(i) + } + + timeoutCh := time.After(200 * time.Millisecond) + + for i := 0; i < n; i++ { + select { + case <-timeoutCh: + t.Fatal("timed out waiting for clients") + case err := <-setupDoneCh: + if err != nil { + t.Fatalf(err.Error()) + } + } + } + + // Should be 3 clients running now, so the roots watcher should have started + // once and not stopped. + assertRootsWatchCounts(t, m, 1, 0) + + caRootCopy := caRoot.Clone() + caRootCopy.Active = false + + // Now we deliver the root update + _ = signer.UpdateCA(t, nil) + // And notify clients + close(rootsUpdatedCh) + + timeoutCh = time.After(200 * time.Millisecond) + for i := 0; i < n; i++ { + select { + case <-timeoutCh: + t.Fatalf("timed out waiting for %d of %d clients to renew after root change", n-i, n) + case err := <-testDoneCh: + if err != nil { + t.Fatalf(err.Error()) + } + } + } + + // All active requests have returned the new cert so the rootsWatcher should + // have stopped. This is timing dependent though so retry a few times + retry.RunWith(retry.ThreeTimes(), t, func(r *retry.R) { + assertRootsWatchCounts(r, m, 1, 1) + }) +} + +func assertRootsWatchCounts(t require.TestingT, m *Manager, wantStarts, wantStops int) { + if tt, ok := t.(*testing.T); ok { + tt.Helper() + } + starts := atomic.LoadUint32(&m.rootWatcher.testStartCount) + stops := atomic.LoadUint32(&m.rootWatcher.testStopCount) + require.Equal(t, wantStarts, int(starts)) + require.Equal(t, wantStops, int(stops)) +} + +// Test that after an initial signing, an expiringLeaf will trigger a +// blocking query to resign. 
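+// The expired leaf is produced by priming the test signer with
+// ReplyWithExpiredCert below.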
+func TestManager_expiringLeaf(t *testing.T) { + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + t.Parallel() + + m, signer := testManager(t, nil) + + caRoot := signer.UpdateCA(t, nil) + + signer.SetSignCallErrors( + // First call returns expired cert to prime cache with an expired one. + ReplyWithExpiredCert, + ) + + // We'll reuse the fetch options and request + req := &ConnectCALeafRequest{ + Datacenter: "dc1", Service: "web", + MinQueryIndex: 0, MaxQueryTime: 10 * time.Second, + } + + // First fetch should return immediately + getCh := testAsyncGet(t, m, req) + var ( + idx uint64 + issued *structs.IssuedCert + ) + select { + case <-time.After(100 * time.Millisecond): + t.Fatal("shouldn't block waiting for fetch") + case result := <-getCh: + require.NoError(t, result.Err) + require.NotNil(t, result.Value) + require.True(t, result.Index > 0) + idx = result.Index + issued = result.Value + } + + // Second fetch should return immediately despite there being + // no updated CA roots, because we issued an expired cert. + getCh = testAsyncGet(t, m, req) + select { + case <-time.After(100 * time.Millisecond): + t.Fatal("shouldn't block waiting for fetch") + case result := <-getCh: + require.NoError(t, result.Err) + require.NotEqual(t, issued, result.Value) + require.True(t, result.Index > idx) + requireLeafValidUnderCA(t, result.Value, caRoot) + idx = result.Index + } + + // Third fetch should block since the cert is not expiring and + // we also didn't update CA certs. + req.MinQueryIndex = idx + getCh = testAsyncGet(t, m, req) + select { + case result := <-getCh: + t.Fatalf("should not return: %#v", result) + case <-time.After(100 * time.Millisecond): + } +} + +func TestManager_DNSSANForService(t *testing.T) { + t.Parallel() + + m, signer := testManager(t, nil) + + _ = signer.UpdateCA(t, nil) + + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "web", + DNSSAN: []string{"test.example.com"}, + } + + _, _, err := m.Get(context.Background(), req) + require.NoError(t, err) + + caReq := signer.GetCapture(0) + require.NotNil(t, caReq) + + pemBlock, _ := pem.Decode([]byte(caReq.CSR)) + csr, err := x509.ParseCertificateRequest(pemBlock.Bytes) + require.NoError(t, err) + require.Equal(t, csr.DNSNames, []string{"test.example.com"}) +} + +func TestManager_workflow_good(t *testing.T) { + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + const TestOverrideCAChangeInitialDelay = 1 * time.Nanosecond + + m, signer := testManager(t, func(cfg *Config) { + cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay + }) + + ca1 := signer.UpdateCA(t, nil) + + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + + // List + issued, meta, err := m.Get(ctx, req) + require.NoError(t, err) + require.False(t, meta.Hit) + require.NotNil(t, issued) + + // Verify that the cert is signed by the CA + requireLeafValidUnderCA(t, issued, ca1) + + // Verify blocking index + require.True(t, issued.ModifyIndex > 0) + require.Equal(t, issued.ModifyIndex, meta.Index) + + index := meta.Index + + // Fetch it again + testutil.RunStep(t, "test you get a cache hit on another read", func(t *testing.T) { + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + issued2, _, err := m.Get(ctx, req) + require.NoError(t, err) + require.NotNil(t, issued2) + 
require.Equal(t, issued, issued2) + }) + + type reply struct { + cert *structs.IssuedCert + meta cache.ResultMeta + err error + } + + replyCh := make(chan *reply, 1) + go func() { + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + MinQueryIndex: index, + } + + issued2, meta2, err := m.Get(ctx, req) + + replyCh <- &reply{issued2, meta2, err} + }() + + // Set a new CA + ca2 := signer.UpdateCA(t, nil) + + // Issue a blocking query to ensure that the cert gets updated appropriately + testutil.RunStep(t, "test blocking queries update leaf cert", func(t *testing.T) { + var got *reply + select { + case got = <-replyCh: + case <-time.After(500 * time.Millisecond): + t.Fatal("blocking query did not wake up during rotation") + } + + issued2, meta2, err := got.cert, got.meta, got.err + require.NoError(t, err) + require.NotNil(t, issued2) + + require.NotEqual(t, issued.CertPEM, issued2.CertPEM) + require.NotEqual(t, issued.PrivateKeyPEM, issued2.PrivateKeyPEM) + + // Verify that the cert is signed by the new CA + requireLeafValidUnderCA(t, issued2, ca2) + + // Should not be a cache hit! The data was updated in response to the blocking + // query being made. + require.False(t, meta2.Hit) + }) + + testutil.RunStep(t, "test non-blocking queries update leaf cert", func(t *testing.T) { + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + + issued, _, err := m.Get(ctx, req) + require.NoError(t, err) + require.NotNil(t, issued) + + // Verify that the cert is signed by the CA + requireLeafValidUnderCA(t, issued, ca2) + + // Issue a non blocking query to ensure that the cert gets updated appropriately + { + // Set a new CA + ca3 := signer.UpdateCA(t, nil) + + retry.Run(t, func(r *retry.R) { + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + + issued2, meta2, err := m.Get(ctx, req) + require.NoError(r, err) + require.NotNil(r, issued2) + + requireLeafValidUnderCA(r, issued2, ca3) + + // Should not be a cache hit! + require.False(r, meta2.Hit) + + require.NotEqual(r, issued.CertPEM, issued2.CertPEM) + require.NotEqual(r, issued.PrivateKeyPEM, issued2.PrivateKeyPEM) + + // Verify that the cert is signed by the new CA + requireLeafValidUnderCA(r, issued2, ca3) + }) + } + }) +} + +// Test we can request a leaf cert for a service and witness correct caching, +// blocking, and update semantics. +// +// This test originally was a client agent test in +// agent.TestAgentConnectCALeafCert_goodNotLocal and was cloned here to +// increase complex coverage, but the specific naming of the parent test is +// irrelevant here since there's no notion of the catalog at all at this layer. 
+func TestManager_workflow_goodNotLocal(t *testing.T) { + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + const TestOverrideCAChangeInitialDelay = 1 * time.Nanosecond + + m, signer := testManager(t, func(cfg *Config) { + cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay + }) + + ca1 := signer.UpdateCA(t, nil) + + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + + // List + issued, meta, err := m.Get(ctx, req) + require.NoError(t, err) + require.False(t, meta.Hit) + require.NotNil(t, issued) + + // Verify that the cert is signed by the CA + requireLeafValidUnderCA(t, issued, ca1) + + // Verify blocking index + require.True(t, issued.ModifyIndex > 0) + require.Equal(t, issued.ModifyIndex, meta.Index) + + // Fetch it again + testutil.RunStep(t, "test you get a cache hit on another read", func(t *testing.T) { + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + issued2, _, err := m.Get(ctx, req) + require.NoError(t, err) + require.NotNil(t, issued2) + require.Equal(t, issued, issued2) + }) + + // Test Blocking - see https://github.com/hashicorp/consul/issues/4462 + testutil.RunStep(t, "test blocking issue 4462", func(t *testing.T) { + // Fetch it again + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + MinQueryIndex: issued.ModifyIndex, + MaxQueryTime: 125 * time.Millisecond, + } + var ( + respCh = make(chan *structs.IssuedCert) + errCh = make(chan error, 1) + ) + go func() { + issued2, _, err := m.Get(ctx, req) + if err != nil { + errCh <- err + } else { + respCh <- issued2 + } + }() + + select { + case <-time.After(500 * time.Millisecond): + require.FailNow(t, "Shouldn't block for this long - not respecting wait parameter in the query") + + case err := <-errCh: + require.NoError(t, err) + case <-respCh: + } + }) + + testutil.RunStep(t, "test that caching is updated in the background", func(t *testing.T) { + // Set a new CA + ca := signer.UpdateCA(t, nil) + + retry.Run(t, func(r *retry.R) { + // Try and sign again (note no index/wait arg since cache should update in + // background even if we aren't actively blocking) + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + + issued2, _, err := m.Get(ctx, req) + require.NoError(r, err) + + if issued.CertPEM == issued2.CertPEM { + r.Fatalf("leaf has not updated") + } + + // Got a new leaf. Sanity check it's a whole new key as well as different + // cert. 
+ if issued.PrivateKeyPEM == issued2.PrivateKeyPEM { + r.Fatalf("new leaf has same private key as before") + } + + // Verify that the cert is signed by the new CA + requireLeafValidUnderCA(r, issued2, ca) + + require.NotEqual(r, issued, issued2) + }) + }) +} + +func TestManager_workflow_nonBlockingQuery_after_blockingQuery_shouldNotBlock(t *testing.T) { + // see: https://github.com/hashicorp/consul/issues/12048 + + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + t.Parallel() + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + m, signer := testManager(t, nil) + + _ = signer.UpdateCA(t, nil) + + var ( + serialNumber string + index uint64 + issued *structs.IssuedCert + ) + testutil.RunStep(t, "do initial non-blocking query", func(t *testing.T) { + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + issued1, meta, err := m.Get(ctx, req) + require.NoError(t, err) + + serialNumber = issued1.SerialNumber + + require.False(t, meta.Hit, "for the leaf cert cache type these are always MISS") + index = meta.Index + issued = issued1 + }) + + go func() { + // launch goroutine for blocking query + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + MinQueryIndex: index, + } + _, _, _ = m.Get(ctx, req) + }() + + // We just need to ensure that the above blocking query is in-flight before + // the next step, so do a little sleep. + time.Sleep(50 * time.Millisecond) + + // The initial non-blocking query populated the leaf cert cache entry + // implicitly. The agent cache doesn't prune entries very often at all, so + // in between both of these steps the data should still be there, causing + // this to be a HIT that completes in less than 10m (the default inner leaf + // cert blocking query timeout). + testutil.RunStep(t, "do a non-blocking query that should not block", func(t *testing.T) { + req := &ConnectCALeafRequest{ + Datacenter: "dc1", + Service: "test", + EnterpriseMeta: *acl.DefaultEnterpriseMeta(), + } + issued2, meta2, err := m.Get(ctx, req) + require.NoError(t, err) + + require.True(t, meta2.Hit) + + // If this is actually returning a cached result, the serial number + // should be unchanged. + require.Equal(t, serialNumber, issued2.SerialNumber) + + require.Equal(t, issued, issued2) + }) +} + +func requireLeafValidUnderCA(t require.TestingT, issued *structs.IssuedCert, ca *structs.CARoot) { + require.NotNil(t, issued) + require.NotNil(t, ca) + + leaf, intermediates, err := connect.ParseLeafCerts(issued.CertPEM) + require.NoError(t, err) + + roots := x509.NewCertPool() + require.True(t, roots.AppendCertsFromPEM([]byte(ca.RootCert))) + + _, err = leaf.Verify(x509.VerifyOptions{ + Roots: roots, + Intermediates: intermediates, + }) + require.NoError(t, err) + + // Verify the private key matches. tls.LoadX509Keypair does this for us! + _, err = tls.X509KeyPair([]byte(issued.CertPEM), []byte(issued.PrivateKeyPEM)) + require.NoError(t, err) +} + +// testManager returns a *Manager that is pre-configured to use a mock RPC +// implementation that can sign certs, and an in-memory CA roots reader that +// interacts well with it. 
+func testManager(t *testing.T, mut func(*Config)) (*Manager, *testSigner) { + signer := newTestSigner(t, nil, nil) + + deps := Deps{ + Logger: testutil.Logger(t), + RootsReader: signer.RootsReader, + CertSigner: signer, + Config: Config{ + // Override the root-change spread so we don't have to wait up to 20 seconds + // to see root changes work. It can be changed back for specific tests that + // need to exercise this behavior. Note it's not 0, since that would use the + // default, but it is effectively the same. + TestOverrideCAChangeInitialDelay: 1 * time.Microsecond, + }, + } + if mut != nil { + mut(&deps.Config) + } + + m := NewManager(deps) + t.Cleanup(m.Stop) + + return m, signer +} + +type testRootsReader struct { + mu sync.Mutex + index uint64 + roots *structs.IndexedCARoots + watcher chan struct{} +} + +func newTestRootsReader(t *testing.T) *testRootsReader { + r := &testRootsReader{ + watcher: make(chan struct{}), + } + t.Cleanup(func() { + r.mu.Lock() + watcher := r.watcher + r.mu.Unlock() + close(watcher) + }) + return r +} + +var _ RootsReader = (*testRootsReader)(nil) + +func (r *testRootsReader) Set(roots *structs.IndexedCARoots) { + r.mu.Lock() + oldWatcher := r.watcher + r.watcher = make(chan struct{}) + r.roots = roots + if roots == nil { + r.index = 1 + } else { + r.index = roots.Index + } + r.mu.Unlock() + + close(oldWatcher) +} + +func (r *testRootsReader) Get() (*structs.IndexedCARoots, error) { + r.mu.Lock() + defer r.mu.Unlock() + return r.roots, nil +} + +func (r *testRootsReader) Notify(ctx context.Context, correlationID string, ch chan<- cache.UpdateEvent) error { + r.mu.Lock() + watcher := r.watcher + r.mu.Unlock() + + go func() { + <-watcher + + r.mu.Lock() + defer r.mu.Unlock() + + ch <- cache.UpdateEvent{ + CorrelationID: correlationID, + Result: r.roots, + Meta: cache.ResultMeta{Index: r.index}, + Err: nil, + } + }() + return nil +} + +type testGetResult struct { + Index uint64 + Value *structs.IssuedCert + Err error +} + +// testAsyncGet returns a channel that delivers the result of the testGet call. +// +// This is useful for testing timing and concurrency with testGet calls. +func testAsyncGet(t *testing.T, m *Manager, req *ConnectCALeafRequest) <-chan testGetResult { + ch := make(chan testGetResult) + go func() { + index, cert, err := m.testGet(req) + if err != nil { + ch <- testGetResult{Err: err} + return + } + + ch <- testGetResult{Index: index, Value: cert} + }() + return ch +} diff --git a/agent/leafcert/roots.go b/agent/leafcert/roots.go new file mode 100644 index 0000000000000..7f95e0578dccb --- /dev/null +++ b/agent/leafcert/roots.go @@ -0,0 +1,152 @@ +package leafcert + +import ( + "context" + "sync" + "sync/atomic" + + "github.com/hashicorp/consul/agent/cache" + "github.com/hashicorp/consul/agent/structs" +) + +// rootWatcher lets multiple requests for leaf certs coordinate sharing +// a single long-lived watch for the root certs. This allows the leaf cert +// requests to notice when the roots rotate and trigger their reissuance. +type rootWatcher struct { + // This is the "top-level" internal context. This is used to cancel + // background operations. + ctx context.Context + + // rootsReader is an interface to access connect CA roots. + rootsReader RootsReader + + // lock protects access to the subscribers map and cancel. + lock sync.Mutex + // subscribers is a set of chans, one for each currently in-flight + // Fetch. These chans have root updates delivered from the root watcher.
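+ // These chans are expected to be buffered with size 1, so notifySubscribers
+ // can deliver an update signal without ever blocking (see notifySubscribers).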
+ subscribers map[chan struct{}]struct{} + // cancel is a func to call to stop the background root watch if any. + // You must hold lock to read (e.g. call) or write the value. + cancel func() + + // testStart/StopCount are testing helpers that allow tests to + // observe the reference counting behavior that governs the shared root watch. + // It's not exactly pretty to expose internals like this, but seems cleaner + // than constructing elaborate and brittle test cases that we can infer + // correct behavior from, and simpler than trying to probe runtime goroutine + // traces to infer correct behavior that way. They must be accessed + // atomically. + testStartCount uint32 + testStopCount uint32 +} + +// Subscribe is called on each fetch that is about to block and wait for +// changes to the leaf. It subscribes a chan to receive updates from the shared +// root watcher and triggers root watcher if it's not already running. +func (r *rootWatcher) Subscribe(rootUpdateCh chan struct{}) { + r.lock.Lock() + defer r.lock.Unlock() + // Lazy allocation + if r.subscribers == nil { + r.subscribers = make(map[chan struct{}]struct{}) + } + // Make sure a root watcher is running. We don't only do this on first request + // to be more tolerant of errors that could cause the root watcher to fail and + // exit. + if r.cancel == nil { + ctx, cancel := context.WithCancel(r.ctx) + r.cancel = cancel + go r.rootWatcher(ctx) + } + r.subscribers[rootUpdateCh] = struct{}{} +} + +// Unsubscribe is called when a blocking call exits to unsubscribe from root +// updates and possibly stop the shared root watcher if it's no longer needed. +// Note that typically root CA is still being watched by clients directly and +// probably by the ProxyConfigManager so it will stay hot in cache for a while, +// we are just not monitoring it for updates any more. +func (r *rootWatcher) Unsubscribe(rootUpdateCh chan struct{}) { + r.lock.Lock() + defer r.lock.Unlock() + delete(r.subscribers, rootUpdateCh) + if len(r.subscribers) == 0 && r.cancel != nil { + // This was the last request. Stop the root watcher. + r.cancel() + r.cancel = nil + } +} + +func (r *rootWatcher) notifySubscribers() { + r.lock.Lock() + defer r.lock.Unlock() + + for ch := range r.subscribers { + select { + case ch <- struct{}{}: + default: + // Don't block - chans are 1-buffered so this default case + // means the subscriber already holds an update signal. + } + } +} + +// rootWatcher is the shared rootWatcher that runs in a background goroutine +// while needed by one or more inflight Fetch calls. +func (r *rootWatcher) rootWatcher(ctx context.Context) { + atomic.AddUint32(&r.testStartCount, 1) + defer atomic.AddUint32(&r.testStopCount, 1) + + ch := make(chan cache.UpdateEvent, 1) + + if err := r.rootsReader.Notify(ctx, "roots", ch); err != nil { + // Trigger all inflight watchers. We don't pass the error, but they will + // reload from cache and observe the same error and return it to the caller, + // or if it's transient, will continue and the next Fetch will get us back + // into the right state. Seems better than busy loop-retrying here given + // that almost any error we would see here would also be returned from the + // cache get this will trigger. + r.notifySubscribers() + return + } + + var oldRoots *structs.IndexedCARoots + // Wait for updates to roots or all requests to stop + for { + select { + case <-ctx.Done(): + return + case e := <-ch: + // Root response changed in some way. Note this might be the initial + // fetch. 
+ if e.Err != nil { + // See above rationale about the error propagation + r.notifySubscribers() + continue + } + + roots, ok := e.Result.(*structs.IndexedCARoots) + if !ok { + // See above rationale about the error propagation + r.notifySubscribers() + continue + } + + // Check that the active root is actually different from the last CA + // config; there are many reasons the config might have changed without + // actually updating the CA root that is signing certs in the cluster. + // The Fetch calls will also validate this themselves, since on the first + // update seen here we can't tell whether the root changed, but there is no + // point waking up all Fetch calls to check this if we know none of them + // will need to act on this update. + if oldRoots != nil && oldRoots.ActiveRootID == roots.ActiveRootID { + continue + } + + // Distribute the update to all inflight requests - they will decide + // whether or not they need to act on it. + r.notifySubscribers() + oldRoots = roots + } + } +} diff --git a/agent/leafcert/signer_netrpc.go b/agent/leafcert/signer_netrpc.go new file mode 100644 index 0000000000000..2d6b490a9ea8c --- /dev/null +++ b/agent/leafcert/signer_netrpc.go @@ -0,0 +1,35 @@ +package leafcert + +import ( + "context" + + "github.com/hashicorp/consul/agent/structs" +) + +// NetRPC is an interface that a NetRPC client must implement. This is a helper +// interface that is implemented by the agent delegate so that the cert signer +// in this package can make RPC requests. +type NetRPC interface { + RPC(ctx context.Context, method string, args any, reply any) error +} + +// NewNetRPCCertSigner returns a CertSigner that uses net-rpc to sign certs. +func NewNetRPCCertSigner(netRPC NetRPC) CertSigner { + return &netRPCCertSigner{netRPC: netRPC} +} + +type netRPCCertSigner struct { + // NetRPC is an RPC client for remote cert signing requests.
+ netRPC NetRPC +} + +var _ CertSigner = (*netRPCCertSigner)(nil) + +func (s *netRPCCertSigner) SignCert(ctx context.Context, args *structs.CASignRequest) (*structs.IssuedCert, error) { + var reply structs.IssuedCert + err := s.netRPC.RPC(ctx, "ConnectCA.Sign", args, &reply) + if err != nil { + return nil, err + } + return &reply, nil +} diff --git a/agent/leafcert/signer_test.go b/agent/leafcert/signer_test.go new file mode 100644 index 0000000000000..21e3388f5a1c4 --- /dev/null +++ b/agent/leafcert/signer_test.go @@ -0,0 +1,243 @@ +package leafcert + +import ( + "bytes" + "context" + "crypto/rand" + "crypto/x509" + "encoding/pem" + "errors" + "fmt" + "math/big" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/structs" +) + +// testSigner implements NetRPC and handles leaf signing operations +type testSigner struct { + caLock sync.Mutex + ca *structs.CARoot + prevRoots []*structs.CARoot // remember prior ones + + IDGenerator *atomic.Uint64 + RootsReader *testRootsReader + + signCallLock sync.Mutex + signCallErrors []error + signCallErrorCount uint64 + signCallCapture []*structs.CASignRequest +} + +var _ CertSigner = (*testSigner)(nil) + +var ReplyWithExpiredCert = errors.New("reply with expired cert") + +func newTestSigner(t *testing.T, idGenerator *atomic.Uint64, rootsReader *testRootsReader) *testSigner { + if idGenerator == nil { + idGenerator = &atomic.Uint64{} + } + if rootsReader == nil { + rootsReader = newTestRootsReader(t) + } + s := &testSigner{ + IDGenerator: idGenerator, + RootsReader: rootsReader, + } + return s +} + +func (s *testSigner) SetSignCallErrors(errs ...error) { + s.signCallLock.Lock() + defer s.signCallLock.Unlock() + s.signCallErrors = append(s.signCallErrors, errs...) +} + +func (s *testSigner) GetSignCallErrorCount() uint64 { + s.signCallLock.Lock() + defer s.signCallLock.Unlock() + return s.signCallErrorCount +} + +func (s *testSigner) UpdateCA(t *testing.T, ca *structs.CARoot) *structs.CARoot { + if ca == nil { + ca = connect.TestCA(t, nil) + } + roots := &structs.IndexedCARoots{ + ActiveRootID: ca.ID, + TrustDomain: connect.TestTrustDomain, + Roots: []*structs.CARoot{ca}, + QueryMeta: structs.QueryMeta{Index: s.nextIndex()}, + } + + // Update the signer first. + s.caLock.Lock() + { + s.ca = ca + roots.Roots = append(roots.Roots, s.prevRoots...) + // Remember for the next rotation. + dup := ca.Clone() + dup.Active = false + s.prevRoots = append(s.prevRoots, dup) + } + s.caLock.Unlock() + + // Then trigger an event when updating the roots. 
+ s.RootsReader.Set(roots) + + return ca +} + +func (s *testSigner) nextIndex() uint64 { + return s.IDGenerator.Add(1) +} + +func (s *testSigner) getCA() *structs.CARoot { + s.caLock.Lock() + defer s.caLock.Unlock() + return s.ca +} + +func (s *testSigner) GetCapture(idx int) *structs.CASignRequest { + s.signCallLock.Lock() + defer s.signCallLock.Unlock() + if len(s.signCallCapture) > idx { + return s.signCallCapture[idx] + } + + return nil +} + +func (s *testSigner) SignCert(ctx context.Context, req *structs.CASignRequest) (*structs.IssuedCert, error) { + useExpiredCert := false + s.signCallLock.Lock() + s.signCallCapture = append(s.signCallCapture, req) + if len(s.signCallErrors) > 0 { + err := s.signCallErrors[0] + s.signCallErrors = s.signCallErrors[1:] + if err == ReplyWithExpiredCert { + useExpiredCert = true + } else if err != nil { + s.signCallErrorCount++ + s.signCallLock.Unlock() + return nil, err + } + } + s.signCallLock.Unlock() + + // parts of this were inlined from CAManager and the connect ca provider + ca := s.getCA() + if ca == nil { + return nil, fmt.Errorf("must call UpdateCA at least once") + } + + csr, err := connect.ParseCSR(req.CSR) + if err != nil { + return nil, fmt.Errorf("error parsing CSR: %w", err) + } + + connect.HackSANExtensionForCSR(csr) + + spiffeID, err := connect.ParseCertURI(csr.URIs[0]) + if err != nil { + return nil, fmt.Errorf("error parsing CSR URI: %w", err) + } + + serviceID, isService := spiffeID.(*connect.SpiffeIDService) + if !isService { + return nil, fmt.Errorf("unexpected spiffeID type %T", spiffeID) + } + + signer, err := connect.ParseSigner(ca.SigningKey) + if err != nil { + return nil, fmt.Errorf("error parsing CA signing key: %w", err) + } + + keyId, err := connect.KeyId(signer.Public()) + if err != nil { + return nil, fmt.Errorf("error forming CA key id from public key: %w", err) + } + + subjectKeyID, err := connect.KeyId(csr.PublicKey) + if err != nil { + return nil, fmt.Errorf("error forming subject key id from public key: %w", err) + } + + caCert, err := connect.ParseCert(ca.RootCert) + if err != nil { + return nil, fmt.Errorf("error parsing CA root cert pem: %w", err) + } + + const expiration = 10 * time.Minute + + now := time.Now() + template := x509.Certificate{ + SerialNumber: big.NewInt(int64(s.nextIndex())), + URIs: csr.URIs, + Signature: csr.Signature, + // We use the correct signature algorithm for the CA key we are signing with + // regardless of the algorithm used to sign the CSR signature above since + // the leaf might use a different key type. + SignatureAlgorithm: connect.SigAlgoForKey(signer), + PublicKeyAlgorithm: csr.PublicKeyAlgorithm, + PublicKey: csr.PublicKey, + BasicConstraintsValid: true, + KeyUsage: x509.KeyUsageDataEncipherment | + x509.KeyUsageKeyAgreement | + x509.KeyUsageDigitalSignature | + x509.KeyUsageKeyEncipherment, + ExtKeyUsage: []x509.ExtKeyUsage{ + x509.ExtKeyUsageClientAuth, + x509.ExtKeyUsageServerAuth, + }, + NotAfter: now.Add(expiration), + NotBefore: now, + AuthorityKeyId: keyId, + SubjectKeyId: subjectKeyID, + DNSNames: csr.DNSNames, + IPAddresses: csr.IPAddresses, + } + + if useExpiredCert { + template.NotBefore = time.Now().Add(-13 * time.Hour) + template.NotAfter = time.Now().Add(-1 * time.Hour) + } + + // Create the certificate, PEM encode it and return that value. 
+ var buf bytes.Buffer + bs, err := x509.CreateCertificate( + rand.Reader, &template, caCert, csr.PublicKey, signer) + if err != nil { + return nil, fmt.Errorf("error creating cert pem from CSR: %w", err) + } + + err = pem.Encode(&buf, &pem.Block{Type: "CERTIFICATE", Bytes: bs}) + if err != nil { + return nil, fmt.Errorf("error encoding cert pem into text: %w", err) + } + + leafPEM := buf.String() + + leafCert, err := connect.ParseCert(leafPEM) + if err != nil { + return nil, fmt.Errorf("error parsing cert from generated leaf pem: %w", err) + } + + index := s.nextIndex() + return &structs.IssuedCert{ + SerialNumber: connect.EncodeSerialNumber(leafCert.SerialNumber), + CertPEM: leafPEM, + Service: serviceID.Service, + ServiceURI: leafCert.URIs[0].String(), + ValidAfter: leafCert.NotBefore, + ValidBefore: leafCert.NotAfter, + RaftIndex: structs.RaftIndex{ + CreateIndex: index, + ModifyIndex: index, + }, + }, nil +} diff --git a/agent/leafcert/structs.go b/agent/leafcert/structs.go new file mode 100644 index 0000000000000..531d35c897e68 --- /dev/null +++ b/agent/leafcert/structs.go @@ -0,0 +1,103 @@ +package leafcert + +import ( + "fmt" + "net" + "time" + + "github.com/mitchellh/hashstructure" + + "github.com/hashicorp/consul/acl" + "github.com/hashicorp/consul/agent/cache" + "github.com/hashicorp/consul/agent/structs" +) + +// ConnectCALeafRequest is the cache.Request implementation for the +// ConnectCALeaf cache type. This is implemented here and not in structs +// since this is only used for cache-related requests and not forwarded +// directly to any Consul servers. +type ConnectCALeafRequest struct { + Token string + Datacenter string + DNSSAN []string + IPSAN []net.IP + MinQueryIndex uint64 + MaxQueryTime time.Duration + acl.EnterpriseMeta + MustRevalidate bool + + // The following flags indicate the entity we are requesting a cert for. + // Only one of these must be specified. + Service string // Given a Service name, not ID, the request is for a SpiffeIDService. + Agent string // Given an Agent name, not ID, the request is for a SpiffeIDAgent. + Kind structs.ServiceKind // Given "mesh-gateway", the request is for a SpiffeIDMeshGateway. No other kinds supported. + Server bool // If true, the request is for a SpiffeIDServer. +} + +func (r *ConnectCALeafRequest) Key() string { + r.EnterpriseMeta.Normalize() + + switch { + case r.Agent != "": + v, err := hashstructure.Hash([]any{ + r.Agent, + r.PartitionOrDefault(), + }, nil) + if err == nil { + return fmt.Sprintf("agent:%d", v) + } + case r.Kind == structs.ServiceKindMeshGateway: + v, err := hashstructure.Hash([]any{ + r.PartitionOrDefault(), + r.DNSSAN, + r.IPSAN, + }, nil) + if err == nil { + return fmt.Sprintf("kind:%d", v) + } + case r.Kind != "": + // this is not valid + case r.Server: + v, err := hashstructure.Hash([]any{ + "server", + r.Datacenter, + }, nil) + if err == nil { + return fmt.Sprintf("server:%d", v) + } + default: + v, err := hashstructure.Hash([]any{ + r.Service, + r.EnterpriseMeta, + r.DNSSAN, + r.IPSAN, + }, nil) + if err == nil { + return fmt.Sprintf("service:%d", v) + } + } + + // If there is an error, we don't set the key. A blank key forces + // no cache for this request so the request is forwarded directly + // to the server. 
+ return "" +} + +func (req *ConnectCALeafRequest) TargetNamespace() string { + return req.NamespaceOrDefault() +} + +func (req *ConnectCALeafRequest) TargetPartition() string { + return req.PartitionOrDefault() +} + +func (r *ConnectCALeafRequest) CacheInfo() cache.RequestInfo { + return cache.RequestInfo{ + Token: r.Token, + Key: r.Key(), + Datacenter: r.Datacenter, + MinIndex: r.MinQueryIndex, + Timeout: r.MaxQueryTime, + MustRevalidate: r.MustRevalidate, + } +} diff --git a/agent/leafcert/structs_test.go b/agent/leafcert/structs_test.go new file mode 100644 index 0000000000000..bb131f10ed7c6 --- /dev/null +++ b/agent/leafcert/structs_test.go @@ -0,0 +1,79 @@ +package leafcert + +import ( + "net" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestConnectCALeafRequest_Key(t *testing.T) { + key := func(r ConnectCALeafRequest) string { + return r.Key() + } + t.Run("service", func(t *testing.T) { + t.Run("name", func(t *testing.T) { + r1 := key(ConnectCALeafRequest{Service: "web"}) + r2 := key(ConnectCALeafRequest{Service: "api"}) + require.True(t, strings.HasPrefix(r1, "service:"), "Key %s does not start with service:", r1) + require.True(t, strings.HasPrefix(r2, "service:"), "Key %s does not start with service:", r2) + require.NotEqual(t, r1, r2, "Cache keys for different services should not be equal") + }) + t.Run("dns-san", func(t *testing.T) { + r3 := key(ConnectCALeafRequest{Service: "foo", DNSSAN: []string{"a.com"}}) + r4 := key(ConnectCALeafRequest{Service: "foo", DNSSAN: []string{"b.com"}}) + require.NotEqual(t, r3, r4, "Cache keys for different DNSSAN should not be equal") + }) + t.Run("ip-san", func(t *testing.T) { + r5 := key(ConnectCALeafRequest{Service: "foo", IPSAN: []net.IP{net.ParseIP("192.168.4.139")}}) + r6 := key(ConnectCALeafRequest{Service: "foo", IPSAN: []net.IP{net.ParseIP("192.168.4.140")}}) + require.NotEqual(t, r5, r6, "Cache keys for different IPSAN should not be equal") + }) + }) + t.Run("agent", func(t *testing.T) { + t.Run("name", func(t *testing.T) { + r1 := key(ConnectCALeafRequest{Agent: "abc"}) + require.True(t, strings.HasPrefix(r1, "agent:"), "Key %s does not start with agent:", r1) + }) + t.Run("dns-san ignored", func(t *testing.T) { + r3 := key(ConnectCALeafRequest{Agent: "foo", DNSSAN: []string{"a.com"}}) + r4 := key(ConnectCALeafRequest{Agent: "foo", DNSSAN: []string{"b.com"}}) + require.Equal(t, r3, r4, "DNSSAN is ignored for agent type") + }) + t.Run("ip-san ignored", func(t *testing.T) { + r5 := key(ConnectCALeafRequest{Agent: "foo", IPSAN: []net.IP{net.ParseIP("192.168.4.139")}}) + r6 := key(ConnectCALeafRequest{Agent: "foo", IPSAN: []net.IP{net.ParseIP("192.168.4.140")}}) + require.Equal(t, r5, r6, "IPSAN is ignored for agent type") + }) + }) + t.Run("kind", func(t *testing.T) { + t.Run("invalid", func(t *testing.T) { + r1 := key(ConnectCALeafRequest{Kind: "terminating-gateway"}) + require.Empty(t, r1) + }) + t.Run("mesh-gateway", func(t *testing.T) { + t.Run("normal", func(t *testing.T) { + r1 := key(ConnectCALeafRequest{Kind: "mesh-gateway"}) + require.True(t, strings.HasPrefix(r1, "kind:"), "Key %s does not start with kind:", r1) + }) + t.Run("dns-san", func(t *testing.T) { + r3 := key(ConnectCALeafRequest{Kind: "mesh-gateway", DNSSAN: []string{"a.com"}}) + r4 := key(ConnectCALeafRequest{Kind: "mesh-gateway", DNSSAN: []string{"b.com"}}) + require.NotEqual(t, r3, r4, "Cache keys for different DNSSAN should not be equal") + }) + t.Run("ip-san", func(t *testing.T) { + r5 := key(ConnectCALeafRequest{Kind: 
"mesh-gateway", IPSAN: []net.IP{net.ParseIP("192.168.4.139")}}) + r6 := key(ConnectCALeafRequest{Kind: "mesh-gateway", IPSAN: []net.IP{net.ParseIP("192.168.4.140")}}) + require.NotEqual(t, r5, r6, "Cache keys for different IPSAN should not be equal") + }) + }) + }) + t.Run("server", func(t *testing.T) { + r1 := key(ConnectCALeafRequest{ + Server: true, + Datacenter: "us-east", + }) + require.True(t, strings.HasPrefix(r1, "server:"), "Key %s does not start with server:", r1) + }) +} diff --git a/agent/leafcert/util.go b/agent/leafcert/util.go new file mode 100644 index 0000000000000..a7453df37b4f9 --- /dev/null +++ b/agent/leafcert/util.go @@ -0,0 +1,63 @@ +package leafcert + +import ( + "time" + + "github.com/hashicorp/consul/agent/structs" +) + +// calculateSoftExpiry encapsulates our logic for when to renew a cert based on +// it's age. It returns a pair of times min, max which makes it easier to test +// the logic without non-deterministic jitter to account for. The caller should +// choose a time randomly in between these. +// +// We want to balance a few factors here: +// - renew too early and it increases the aggregate CSR rate in the cluster +// - renew too late and it risks disruption to the service if a transient +// error prevents the renewal +// - we want a broad amount of jitter so if there is an outage, we don't end +// up with all services in sync and causing a thundering herd every +// renewal period. Broader is better for smoothing requests but pushes +// both earlier and later tradeoffs above. +// +// Somewhat arbitrarily the current strategy looks like this: +// +// 0 60% 90% +// Issued [------------------------------|===============|!!!!!] Expires +// 72h TTL: 0 ~43h ~65h +// 1h TTL: 0 36m 54m +// +// Where |===| is the soft renewal period where we jitter for the first attempt +// and |!!!| is the danger zone where we just try immediately. +// +// In the happy path (no outages) the average renewal occurs half way through +// the soft renewal region or at 75% of the cert lifetime which is ~54 hours for +// a 72 hour cert, or 45 mins for a 1 hour cert. +// +// If we are already in the softRenewal period, we randomly pick a time between +// now and the start of the danger zone. +// +// We pass in now to make testing easier. +func calculateSoftExpiry(now time.Time, cert *structs.IssuedCert) (min time.Time, max time.Time) { + certLifetime := cert.ValidBefore.Sub(cert.ValidAfter) + if certLifetime < 10*time.Minute { + // Shouldn't happen as we limit to 1 hour shortest elsewhere but just be + // defensive against strange times or bugs. + return now, now + } + + // Find the 60% mark in diagram above + softRenewTime := cert.ValidAfter.Add(time.Duration(float64(certLifetime) * 0.6)) + hardRenewTime := cert.ValidAfter.Add(time.Duration(float64(certLifetime) * 0.9)) + + if now.After(hardRenewTime) { + // In the hard renew period, or already expired. Renew now! 
+ return now, now + } + + if now.After(softRenewTime) { + // Already in the soft renew period, make now the lower bound for jitter + softRenewTime = now + } + return softRenewTime, hardRenewTime +} diff --git a/agent/leafcert/util_test.go b/agent/leafcert/util_test.go new file mode 100644 index 0000000000000..be89ad5936c19 --- /dev/null +++ b/agent/leafcert/util_test.go @@ -0,0 +1,133 @@ +package leafcert + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/hashicorp/consul/agent/structs" +) + +func TestCalculateSoftExpire(t *testing.T) { + tests := []struct { + name string + now string + issued string + lifetime time.Duration + wantMin string + wantMax string + }{ + { + name: "72h just issued", + now: "2018-01-01 00:00:01", + issued: "2018-01-01 00:00:00", + lifetime: 72 * time.Hour, + // Should jitter between 60% and 90% of the lifetime which is 43.2/64.8 + // hours after issued + wantMin: "2018-01-02 19:12:00", + wantMax: "2018-01-03 16:48:00", + }, + { + name: "72h in renew range", + // This time should be inside the renewal range. + now: "2018-01-02 20:00:20", + issued: "2018-01-01 00:00:00", + lifetime: 72 * time.Hour, + // Min should be the "now" time + wantMin: "2018-01-02 20:00:20", + wantMax: "2018-01-03 16:48:00", + }, + { + name: "72h in hard renew", + // This time should be inside the renewal range. + now: "2018-01-03 18:00:00", + issued: "2018-01-01 00:00:00", + lifetime: 72 * time.Hour, + // Min and max should both be the "now" time + wantMin: "2018-01-03 18:00:00", + wantMax: "2018-01-03 18:00:00", + }, + { + name: "72h expired", + // This time is after expiry + now: "2018-01-05 00:00:00", + issued: "2018-01-01 00:00:00", + lifetime: 72 * time.Hour, + // Min and max should both be the "now" time + wantMin: "2018-01-05 00:00:00", + wantMax: "2018-01-05 00:00:00", + }, + { + name: "1h just issued", + now: "2018-01-01 00:00:01", + issued: "2018-01-01 00:00:00", + lifetime: 1 * time.Hour, + // Should jitter between 60% and 90% of the lifetime which is 36/54 mins + // hours after issued + wantMin: "2018-01-01 00:36:00", + wantMax: "2018-01-01 00:54:00", + }, + { + name: "1h in renew range", + // This time should be inside the renewal range. + now: "2018-01-01 00:40:00", + issued: "2018-01-01 00:00:00", + lifetime: 1 * time.Hour, + // Min should be the "now" time + wantMin: "2018-01-01 00:40:00", + wantMax: "2018-01-01 00:54:00", + }, + { + name: "1h in hard renew", + // This time should be inside the renewal range. 
+ now: "2018-01-01 00:55:00", + issued: "2018-01-01 00:00:00", + lifetime: 1 * time.Hour, + // Min and max should both be the "now" time + wantMin: "2018-01-01 00:55:00", + wantMax: "2018-01-01 00:55:00", + }, + { + name: "1h expired", + // This time is after expiry + now: "2018-01-01 01:01:01", + issued: "2018-01-01 00:00:00", + lifetime: 1 * time.Hour, + // Min and max should both be the "now" time + wantMin: "2018-01-01 01:01:01", + wantMax: "2018-01-01 01:01:01", + }, + { + name: "too short lifetime", + // This time is after expiry + now: "2018-01-01 01:01:01", + issued: "2018-01-01 00:00:00", + lifetime: 1 * time.Minute, + // Min and max should both be the "now" time + wantMin: "2018-01-01 01:01:01", + wantMax: "2018-01-01 01:01:01", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + now, err := time.Parse("2006-01-02 15:04:05", tc.now) + require.NoError(t, err) + issued, err := time.Parse("2006-01-02 15:04:05", tc.issued) + require.NoError(t, err) + wantMin, err := time.Parse("2006-01-02 15:04:05", tc.wantMin) + require.NoError(t, err) + wantMax, err := time.Parse("2006-01-02 15:04:05", tc.wantMax) + require.NoError(t, err) + + min, max := calculateSoftExpiry(now, &structs.IssuedCert{ + ValidAfter: issued, + ValidBefore: issued.Add(tc.lifetime), + }) + + require.Equal(t, wantMin, min) + require.Equal(t, wantMax, max) + }) + } +} diff --git a/agent/leafcert/watch.go b/agent/leafcert/watch.go new file mode 100644 index 0000000000000..62a7260c42875 --- /dev/null +++ b/agent/leafcert/watch.go @@ -0,0 +1,160 @@ +package leafcert + +import ( + "context" + "fmt" + "time" + + "github.com/hashicorp/consul/agent/cache" + "github.com/hashicorp/consul/lib" +) + +// Notify registers a desire to be updated about changes to a cache result. +// +// It is a helper that abstracts code from performing their own "blocking" query +// logic against a cache key to watch for changes and to maintain the key in +// cache actively. It will continue to perform blocking Get requests until the +// context is canceled. +// +// The passed context must be canceled or timeout in order to free resources +// and stop maintaining the value in cache. Typically request-scoped resources +// do this but if a long-lived context like context.Background is used, then the +// caller must arrange for it to be canceled when the watch is no longer +// needed. +// +// The passed chan may be buffered or unbuffered, if the caller doesn't consume +// fast enough it will block the notification loop. When the chan is later +// drained, watching resumes correctly. If the pause is longer than the +// cachetype's TTL, the result might be removed from the local cache. Even in +// this case though when the chan is drained again, the new Get will re-fetch +// the entry from servers and resume notification behavior transparently. +// +// The chan is passed in to allow multiple cached results to be watched by a +// single consumer without juggling extra goroutines per watch. The +// correlationID is opaque and will be returned in all UpdateEvents generated by +// result of watching the specified request so the caller can set this to any +// value that allows them to disambiguate between events in the returned chan +// when sharing a chan between multiple cache entries. If the chan is closed, +// the notify loop will terminate. 
+func (m *Manager) Notify( + ctx context.Context, + req *ConnectCALeafRequest, + correlationID string, + ch chan<- cache.UpdateEvent, +) error { + return m.NotifyCallback(ctx, req, correlationID, func(ctx context.Context, event cache.UpdateEvent) { + select { + case ch <- event: + case <-ctx.Done(): + } + }) +} + +// NotifyCallback allows you to receive notifications about changes to a cache +// result in the same way as Notify, but accepts a callback function instead of +// a channel. +func (m *Manager) NotifyCallback( + ctx context.Context, + req *ConnectCALeafRequest, + correlationID string, + cb cache.Callback, +) error { + if req.Key() == "" { + return fmt.Errorf("a key is required") + } + // Lightweight copy this object so that manipulating req doesn't race. + dup := *req + req = &dup + + if req.MaxQueryTime <= 0 { + req.MaxQueryTime = DefaultQueryTimeout + } + + go m.notifyBlockingQuery(ctx, req, correlationID, cb) + return nil +} + +func (m *Manager) notifyBlockingQuery( + ctx context.Context, + req *ConnectCALeafRequest, + correlationID string, + cb cache.Callback, +) { + // Always start at 0 index to deliver the initial (possibly currently cached + // value). + index := uint64(0) + failures := uint(0) + + for { + // Check context hasn't been canceled + if ctx.Err() != nil { + return + } + + // Blocking request + req.MinQueryIndex = index + newValue, meta, err := m.internalGet(ctx, req) + + // Check context hasn't been canceled + if ctx.Err() != nil { + return + } + + // Check the index of the value returned in the cache entry to be sure it + // changed + if index == 0 || index < meta.Index { + cb(ctx, cache.UpdateEvent{ + CorrelationID: correlationID, + Result: newValue, + Meta: meta, + Err: err, + }) + + // Update index for next request + index = meta.Index + } + + var wait time.Duration + // Handle errors with backoff. Badly behaved blocking calls that returned + // a zero index are considered as failures since we need to not get stuck + // in a busy loop. + if err == nil && meta.Index > 0 { + failures = 0 + } else { + failures++ + wait = backOffWait(m.config, failures) + + m.logger. + With("error", err). + With("index", index). 
+ Warn("handling error in Manager.Notify") + } + + if wait > 0 { + select { + case <-time.After(wait): + case <-ctx.Done(): + return + } + } + // Sanity check we always request blocking on second pass + if err == nil && index < 1 { + index = 1 + } + } +} + +func backOffWait(cfg Config, failures uint) time.Duration { + if failures > cfg.LeafCertRefreshBackoffMin { + shift := failures - cfg.LeafCertRefreshBackoffMin + waitTime := cfg.LeafCertRefreshMaxWait + if shift < 31 { + waitTime = (1 << shift) * time.Second + } + if waitTime > cfg.LeafCertRefreshMaxWait { + waitTime = cfg.LeafCertRefreshMaxWait + } + return waitTime + lib.RandomStagger(waitTime) + } + return 0 +} diff --git a/agent/proxycfg-glue/glue.go b/agent/proxycfg-glue/glue.go index 320d2fc258047..d0363ec3b8e73 100644 --- a/agent/proxycfg-glue/glue.go +++ b/agent/proxycfg-glue/glue.go @@ -10,8 +10,6 @@ import ( "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" - "github.com/hashicorp/consul/proto/private/pbpeering" - "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/cache" cachetype "github.com/hashicorp/consul/agent/cache-types" @@ -23,6 +21,7 @@ import ( "github.com/hashicorp/consul/agent/proxycfg" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/submatview" + "github.com/hashicorp/consul/proto/private/pbpeering" ) // ServerDataSourceDeps contains the dependencies needed for sourcing data from @@ -81,17 +80,6 @@ func CacheServiceGateways(c *cache.Cache) proxycfg.GatewayServices { return &cacheProxyDataSource[*structs.ServiceSpecificRequest]{c, cachetype.ServiceGatewaysName} } -// CacheLeafCertificate satisifies the proxycfg.LeafCertificate interface by -// sourcing data from the agent cache. -// -// Note: there isn't a server-local equivalent of this data source because -// "agentless" proxies obtain certificates via SDS served by consul-dataplane. -// If SDS is not supported on consul-dataplane, data is sourced from the server agent cache -// even for "agentless" proxies. -func CacheLeafCertificate(c *cache.Cache) proxycfg.LeafCertificate { - return &cacheProxyDataSource[*cachetype.ConnectCALeafRequest]{c, cachetype.ConnectCALeafName} -} - // CachePrepraredQuery satisfies the proxycfg.PreparedQuery interface by // sourcing data from the agent cache. // diff --git a/agent/proxycfg-glue/health_blocking.go b/agent/proxycfg-glue/health_blocking.go new file mode 100644 index 0000000000000..0a47a920d157d --- /dev/null +++ b/agent/proxycfg-glue/health_blocking.go @@ -0,0 +1,164 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package proxycfgglue + +import ( + "context" + "fmt" + "time" + + "github.com/hashicorp/go-bexpr" + "github.com/hashicorp/go-memdb" + + "github.com/hashicorp/consul/acl" + "github.com/hashicorp/consul/agent/consul/state" + "github.com/hashicorp/consul/agent/consul/watch" + "github.com/hashicorp/consul/agent/proxycfg" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/agent/structs/aclfilter" +) + +// ServerHealthBlocking exists due to a bug with the streaming backend and its interaction with ACLs. +// Whenever an exported-services config entry is modified, this is effectively an ACL change. 
+// Assume the following situation: +// - no services are exported +// - an upstream watch to service X is spawned +// - the streaming backend filters out data for service X (because it's not exported yet) +// - service X is finally exported +// +// In this situation, the streaming backend does not trigger a refresh of its data. +// This means that any events that were supposed to have been received prior to the export are NOT backfilled, +// and the watches never see service X spawning. +// +// We currently have decided to not trigger a stream refresh in this situation due to the potential for a +// thundering herd effect (touching exports would cause a re-fetch of all watches for that partition, potentially). +// Therefore, this local blocking-query approach exists for agentless. +// +// It's also worth noting that the streaming subscription is currently bypassed most of the time with agentful, +// because proxycfg has a `req.Source.Node != ""` which prevents the `streamingEnabled` check from passing. +// This means that while agents should technically have this same issue, they don't experience it with mesh health +// watches. +func ServerHealthBlocking(deps ServerDataSourceDeps, remoteSource proxycfg.Health, state *state.Store) *serverHealthBlocking { + return &serverHealthBlocking{deps, remoteSource, state, 5 * time.Minute} +} + +type serverHealthBlocking struct { + deps ServerDataSourceDeps + remoteSource proxycfg.Health + state *state.Store + watchTimeout time.Duration +} + +// Notify is mostly a copy of the function in `agent/consul/health_endpoint.go` with a few minor tweaks. +// Most notably, some query features unnecessary for mesh have been stripped out. +func (h *serverHealthBlocking) Notify(ctx context.Context, args *structs.ServiceSpecificRequest, correlationID string, ch chan<- proxycfg.UpdateEvent) error { + if args.Datacenter != h.deps.Datacenter { + return h.remoteSource.Notify(ctx, args, correlationID, ch) + } + + // Verify the arguments + if args.ServiceName == "" { + return fmt.Errorf("Must provide service name") + } + if args.EnterpriseMeta.PartitionOrDefault() == acl.WildcardName { + return fmt.Errorf("Wildcards are not allowed in the partition field") + } + + // Determine the function we'll call + var f func(memdb.WatchSet, *state.Store, *structs.ServiceSpecificRequest) (uint64, structs.CheckServiceNodes, error) + switch { + case args.Connect: + f = serviceNodesConnect + case args.Ingress: + f = serviceNodesIngress + default: + f = serviceNodesDefault + } + + filter, err := bexpr.CreateFilter(args.Filter, nil, structs.CheckServiceNode{}) + if err != nil { + return err + } + + var hadResults bool = false + return watch.ServerLocalNotify(ctx, correlationID, h.deps.GetStore, + func(ws memdb.WatchSet, store Store) (uint64, *structs.IndexedCheckServiceNodes, error) { + // This is necessary so that service export changes are eventually picked up, since + // they won't trigger the watch themselves. + timeoutCh := make(chan struct{}) + time.AfterFunc(h.watchTimeout, func() { + close(timeoutCh) + }) + ws.Add(timeoutCh) + + authzContext := acl.AuthorizerContext{ + Peer: args.PeerName, + } + authz, err := h.deps.ACLResolver.ResolveTokenAndDefaultMeta(args.Token, &args.EnterpriseMeta, &authzContext) + if err != nil { + return 0, nil, err + } + // If we're doing a connect or ingress query, we need read access to the service + // we're trying to find proxies for, so check that. 
+ if args.Connect || args.Ingress { + if authz.ServiceRead(args.ServiceName, &authzContext) != acl.Allow { + // If access was somehow revoked (via token deletion or unexporting), then we clear the + // last-known results before triggering an error. This way, the proxies will actually update + // their data, rather than holding onto the last-known list of healthy nodes indefinitely. + if hadResults { + hadResults = false + return 0, &structs.IndexedCheckServiceNodes{}, watch.ErrorACLResetData + } + return 0, nil, acl.ErrPermissionDenied + } + } + + var thisReply structs.IndexedCheckServiceNodes + thisReply.Index, thisReply.Nodes, err = f(ws, h.state, args) + if err != nil { + return 0, nil, err + } + + raw, err := filter.Execute(thisReply.Nodes) + if err != nil { + return 0, nil, err + } + thisReply.Nodes = raw.(structs.CheckServiceNodes) + + // Note: we filter the results with ACLs *after* applying the user-supplied + // bexpr filter, to ensure QueryMeta.ResultsFilteredByACLs does not include + // results that would be filtered out even if the user did have permission. + if err := h.filterACL(&authzContext, args.Token, &thisReply); err != nil { + return 0, nil, err + } + + hadResults = true + return thisReply.Index, &thisReply, nil + }, + dispatchBlockingQueryUpdate[*structs.IndexedCheckServiceNodes](ch), + ) +} + +func (h *serverHealthBlocking) filterACL(authz *acl.AuthorizerContext, token string, subj *structs.IndexedCheckServiceNodes) error { + // Get the ACL from the token + var entMeta acl.EnterpriseMeta + authorizer, err := h.deps.ACLResolver.ResolveTokenAndDefaultMeta(token, &entMeta, authz) + if err != nil { + return err + } + aclfilter.New(authorizer, h.deps.Logger).Filter(subj) + return nil +} + +func serviceNodesConnect(ws memdb.WatchSet, s *state.Store, args *structs.ServiceSpecificRequest) (uint64, structs.CheckServiceNodes, error) { + return s.CheckConnectServiceNodes(ws, args.ServiceName, &args.EnterpriseMeta, args.PeerName) +} + +func serviceNodesIngress(ws memdb.WatchSet, s *state.Store, args *structs.ServiceSpecificRequest) (uint64, structs.CheckServiceNodes, error) { + return s.CheckIngressServiceNodes(ws, args.ServiceName, &args.EnterpriseMeta) +} + +func serviceNodesDefault(ws memdb.WatchSet, s *state.Store, args *structs.ServiceSpecificRequest) (uint64, structs.CheckServiceNodes, error) { + return s.CheckServiceNodes(ws, args.ServiceName, &args.EnterpriseMeta, args.PeerName) +} diff --git a/agent/proxycfg-glue/health_blocking_test.go b/agent/proxycfg-glue/health_blocking_test.go new file mode 100644 index 0000000000000..3dcdaf17d6148 --- /dev/null +++ b/agent/proxycfg-glue/health_blocking_test.go @@ -0,0 +1,183 @@ +package proxycfgglue + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/hashicorp/consul/acl" + "github.com/hashicorp/consul/agent/consul/state" + "github.com/hashicorp/consul/agent/proxycfg" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/sdk/testutil" + "github.com/stretchr/testify/require" +) + +func TestServerHealthBlocking(t *testing.T) { + t.Run("remote queries are delegated to the remote source", func(t *testing.T) { + var ( + ctx = context.Background() + req = &structs.ServiceSpecificRequest{Datacenter: "dc2"} + correlationID = "correlation-id" + ch = make(chan<- proxycfg.UpdateEvent) + result = errors.New("KABOOM") + ) + + remoteSource := newMockHealth(t) + remoteSource.On("Notify", ctx, req, correlationID, ch).Return(result) + + store := state.NewStateStore(nil) + dataSource := 
ServerHealthBlocking(ServerDataSourceDeps{Datacenter: "dc1"}, remoteSource, store) + err := dataSource.Notify(ctx, req, correlationID, ch) + require.Equal(t, result, err) + }) + + t.Run("services notify correctly", func(t *testing.T) { + const ( + datacenter = "dc1" + serviceName = "web" + ) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + store := state.NewStateStore(nil) + aclResolver := newStaticResolver(acl.ManageAll()) + dataSource := ServerHealthBlocking(ServerDataSourceDeps{ + GetStore: func() Store { return store }, + Datacenter: datacenter, + ACLResolver: aclResolver, + Logger: testutil.Logger(t), + }, nil, store) + dataSource.watchTimeout = 1 * time.Second + + // Watch for all events + eventCh := make(chan proxycfg.UpdateEvent) + require.NoError(t, dataSource.Notify(ctx, &structs.ServiceSpecificRequest{ + Datacenter: datacenter, + ServiceName: serviceName, + }, "", eventCh)) + + // Watch for a subset of events + filteredCh := make(chan proxycfg.UpdateEvent) + require.NoError(t, dataSource.Notify(ctx, &structs.ServiceSpecificRequest{ + Datacenter: datacenter, + ServiceName: serviceName, + QueryOptions: structs.QueryOptions{ + Filter: "Service.ID == \"web1\"", + }, + }, "", filteredCh)) + + testutil.RunStep(t, "initial state", func(t *testing.T) { + result := getEventResult[*structs.IndexedCheckServiceNodes](t, eventCh) + require.Empty(t, result.Nodes) + result = getEventResult[*structs.IndexedCheckServiceNodes](t, filteredCh) + require.Empty(t, result.Nodes) + }) + + testutil.RunStep(t, "register services", func(t *testing.T) { + require.NoError(t, store.EnsureRegistration(10, &structs.RegisterRequest{ + Datacenter: "dc1", + Node: "foo", + Address: "127.0.0.1", + Service: &structs.NodeService{ + ID: serviceName + "1", + Service: serviceName, + Port: 80, + }, + })) + result := getEventResult[*structs.IndexedCheckServiceNodes](t, eventCh) + require.Len(t, result.Nodes, 1) + result = getEventResult[*structs.IndexedCheckServiceNodes](t, filteredCh) + require.Len(t, result.Nodes, 1) + + require.NoError(t, store.EnsureRegistration(11, &structs.RegisterRequest{ + Datacenter: "dc1", + Node: "foo", + Address: "127.0.0.1", + Service: &structs.NodeService{ + ID: serviceName + "2", + Service: serviceName, + Port: 81, + }, + })) + result = getEventResult[*structs.IndexedCheckServiceNodes](t, eventCh) + require.Len(t, result.Nodes, 2) + result = getEventResult[*structs.IndexedCheckServiceNodes](t, filteredCh) + require.Len(t, result.Nodes, 1) + require.Equal(t, "web1", result.Nodes[0].Service.ID) + }) + + testutil.RunStep(t, "deregister service", func(t *testing.T) { + require.NoError(t, store.DeleteService(12, "foo", serviceName+"1", nil, "")) + result := getEventResult[*structs.IndexedCheckServiceNodes](t, eventCh) + require.Len(t, result.Nodes, 1) + result = getEventResult[*structs.IndexedCheckServiceNodes](t, filteredCh) + require.Len(t, result.Nodes, 0) + }) + + testutil.RunStep(t, "acl enforcement", func(t *testing.T) { + require.NoError(t, store.EnsureRegistration(11, &structs.RegisterRequest{ + Datacenter: "dc1", + Node: "foo", + Address: "127.0.0.1", + Service: &structs.NodeService{ + Service: serviceName + "-sidecar-proxy", + Kind: structs.ServiceKindConnectProxy, + Proxy: structs.ConnectProxyConfig{ + DestinationServiceName: serviceName, + }, + }, + })) + + authzDeny := policyAuthorizer(t, ``) + authzAllow := policyAuthorizer(t, ` + node_prefix "" { policy = "read" } + service_prefix "web" { policy = "read" } + `) + + // Start a stream where 
insufficient permissions are denied + aclDenyCh := make(chan proxycfg.UpdateEvent) + aclResolver.SwapAuthorizer(authzDeny) + require.NoError(t, dataSource.Notify(ctx, &structs.ServiceSpecificRequest{ + Connect: true, + Datacenter: datacenter, + ServiceName: serviceName, + }, "", aclDenyCh)) + require.ErrorContains(t, getEventError(t, aclDenyCh), "Permission denied") + + // Adding ACL permissions will send valid data + aclResolver.SwapAuthorizer(authzAllow) + time.Sleep(dataSource.watchTimeout) + result := getEventResult[*structs.IndexedCheckServiceNodes](t, aclDenyCh) + require.Len(t, result.Nodes, 1) + require.Equal(t, "web-sidecar-proxy", result.Nodes[0].Service.Service) + + // Start a stream where sufficient permissions are allowed + aclAllowCh := make(chan proxycfg.UpdateEvent) + aclResolver.SwapAuthorizer(authzAllow) + require.NoError(t, dataSource.Notify(ctx, &structs.ServiceSpecificRequest{ + Connect: true, + Datacenter: datacenter, + ServiceName: serviceName, + }, "", aclAllowCh)) + result = getEventResult[*structs.IndexedCheckServiceNodes](t, aclAllowCh) + require.Len(t, result.Nodes, 1) + require.Equal(t, "web-sidecar-proxy", result.Nodes[0].Service.Service) + + // Removing ACL permissions will send empty data + aclResolver.SwapAuthorizer(authzDeny) + time.Sleep(dataSource.watchTimeout) + result = getEventResult[*structs.IndexedCheckServiceNodes](t, aclAllowCh) + require.Len(t, result.Nodes, 0) + + // Adding ACL permissions will send valid data + aclResolver.SwapAuthorizer(authzAllow) + time.Sleep(dataSource.watchTimeout) + result = getEventResult[*structs.IndexedCheckServiceNodes](t, aclAllowCh) + require.Len(t, result.Nodes, 1) + require.Equal(t, "web-sidecar-proxy", result.Nodes[0].Service.Service) + }) + }) +} diff --git a/agent/proxycfg-glue/leafcerts.go b/agent/proxycfg-glue/leafcerts.go new file mode 100644 index 0000000000000..24631ffc31134 --- /dev/null +++ b/agent/proxycfg-glue/leafcerts.go @@ -0,0 +1,25 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package proxycfgglue + +import ( + "context" + + "github.com/hashicorp/consul/agent/leafcert" + "github.com/hashicorp/consul/agent/proxycfg" +) + +// LocalLeafCerts satisfies the proxycfg.LeafCertificate interface by sourcing data from +// the given leafcert.Manager. 
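+//
+// Illustrative wiring only (the exact call site lives in the agent's proxycfg
+// data-source setup, and the variable names here are assumptions):
+//
+//	sources.LeafCertificate = proxycfgglue.LocalLeafCerts(leafCertManager)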
+func LocalLeafCerts(m *leafcert.Manager) proxycfg.LeafCertificate { + return &localLeafCerts{m} +} + +type localLeafCerts struct { + leafCertManager *leafcert.Manager +} + +func (c *localLeafCerts) Notify(ctx context.Context, req *leafcert.ConnectCALeafRequest, correlationID string, ch chan<- proxycfg.UpdateEvent) error { + return c.leafCertManager.NotifyCallback(ctx, req, correlationID, dispatchCacheUpdate(ch)) +} diff --git a/agent/proxycfg/api_gateway.go b/agent/proxycfg/api_gateway.go index 21631bd5fa47b..7a4a48d0d2510 100644 --- a/agent/proxycfg/api_gateway.go +++ b/agent/proxycfg/api_gateway.go @@ -7,7 +7,7 @@ import ( "context" "fmt" "github.com/hashicorp/consul/acl" - cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/proxycfg/internal/watch" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/proto/private/pbpeering" @@ -489,7 +489,7 @@ func (h *handlerAPIGateway) watchIngressLeafCert(ctx context.Context, snap *Conf snap.APIGateway.LeafCertWatchCancel() } ctx, cancel := context.WithCancel(ctx) - err := h.dataSources.LeafCertificate.Notify(ctx, &cachetype.ConnectCALeafRequest{ + err := h.dataSources.LeafCertificate.Notify(ctx, &leafcert.ConnectCALeafRequest{ Datacenter: h.source.Datacenter, Token: h.token, Service: h.service, diff --git a/agent/proxycfg/connect_proxy.go b/agent/proxycfg/connect_proxy.go index 442223cda0efa..7dcbe18e71957 100644 --- a/agent/proxycfg/connect_proxy.go +++ b/agent/proxycfg/connect_proxy.go @@ -11,13 +11,15 @@ import ( "path" "strings" + "github.com/mitchellh/mapstructure" + "github.com/hashicorp/consul/acl" cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/proxycfg/internal/watch" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/proto/private/pbpeering" - "github.com/mitchellh/mapstructure" ) type handlerConnectProxy struct { @@ -69,7 +71,7 @@ func (s *handlerConnectProxy) initialize(ctx context.Context) (ConfigSnapshot, e } // Watch the leaf cert - err = s.dataSources.LeafCertificate.Notify(ctx, &cachetype.ConnectCALeafRequest{ + err = s.dataSources.LeafCertificate.Notify(ctx, &leafcert.ConnectCALeafRequest{ Datacenter: s.source.Datacenter, Token: s.token, Service: s.proxyCfg.DestinationServiceName, diff --git a/agent/proxycfg/data_sources.go b/agent/proxycfg/data_sources.go index 0018ce7c08ca7..ee779dfb6c884 100644 --- a/agent/proxycfg/data_sources.go +++ b/agent/proxycfg/data_sources.go @@ -8,6 +8,7 @@ import ( "errors" cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/structs" ) @@ -212,7 +213,7 @@ type InternalServiceDump interface { // LeafCertificate is the interface used to consume updates about a service's // leaf certificate. 
type LeafCertificate interface { - Notify(ctx context.Context, req *cachetype.ConnectCALeafRequest, correlationID string, ch chan<- UpdateEvent) error + Notify(ctx context.Context, req *leafcert.ConnectCALeafRequest, correlationID string, ch chan<- UpdateEvent) error } // PeeredUpstreams is the interface used to consume updates about upstreams diff --git a/agent/proxycfg/ingress_gateway.go b/agent/proxycfg/ingress_gateway.go index e76d567a50534..22eb405056776 100644 --- a/agent/proxycfg/ingress_gateway.go +++ b/agent/proxycfg/ingress_gateway.go @@ -7,7 +7,7 @@ import ( "context" "fmt" - cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/proxycfg/internal/watch" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/proto/private/pbpeering" @@ -222,7 +222,7 @@ func (s *handlerIngressGateway) watchIngressLeafCert(ctx context.Context, snap * snap.IngressGateway.LeafCertWatchCancel() } ctx, cancel := context.WithCancel(ctx) - err := s.dataSources.LeafCertificate.Notify(ctx, &cachetype.ConnectCALeafRequest{ + err := s.dataSources.LeafCertificate.Notify(ctx, &leafcert.ConnectCALeafRequest{ Datacenter: s.source.Datacenter, Token: s.token, Service: s.service, diff --git a/agent/proxycfg/manager_test.go b/agent/proxycfg/manager_test.go index 0cbd769df2cb3..13dd0f95420cd 100644 --- a/agent/proxycfg/manager_test.go +++ b/agent/proxycfg/manager_test.go @@ -10,10 +10,10 @@ import ( "github.com/stretchr/testify/require" "github.com/hashicorp/consul/acl" - cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/configentry" "github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/agent/consul/discoverychain" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/proxycfg/internal/watch" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/api" @@ -130,7 +130,7 @@ func TestManager_BasicLifecycle(t *testing.T) { Datacenter: "dc1", QueryOptions: structs.QueryOptions{Token: "my-token"}, } - leafReq := &cachetype.ConnectCALeafRequest{ + leafReq := &leafcert.ConnectCALeafRequest{ Datacenter: "dc1", Token: "my-token", Service: "web", @@ -358,7 +358,7 @@ func testManager_BasicLifecycle( t *testing.T, dataSources *TestDataSources, rootsReq *structs.DCSpecificRequest, - leafReq *cachetype.ConnectCALeafRequest, + leafReq *leafcert.ConnectCALeafRequest, roots *structs.IndexedCARoots, webProxy *structs.NodeService, expectSnap *ConfigSnapshot, diff --git a/agent/proxycfg/mesh_gateway.go b/agent/proxycfg/mesh_gateway.go index cf090b7b04607..f2fee37d46719 100644 --- a/agent/proxycfg/mesh_gateway.go +++ b/agent/proxycfg/mesh_gateway.go @@ -15,7 +15,9 @@ import ( "github.com/hashicorp/go-hclog" "github.com/hashicorp/consul/acl" + cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/proxycfg/internal/watch" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/lib/maps" @@ -392,7 +394,7 @@ func (s *handlerMeshGateway) handleUpdate(ctx context.Context, u UpdateEvent, sn if hasExports && snap.MeshGateway.LeafCertWatchCancel == nil { // no watch and we need one ctx, cancel := context.WithCancel(ctx) - err := s.dataSources.LeafCertificate.Notify(ctx, &cachetype.ConnectCALeafRequest{ + err := s.dataSources.LeafCertificate.Notify(ctx, &leafcert.ConnectCALeafRequest{ Datacenter: s.source.Datacenter, Token: s.token, Kind: 
structs.ServiceKindMeshGateway, diff --git a/agent/proxycfg/state_test.go b/agent/proxycfg/state_test.go index dbe2db11438f5..86957df4bd479 100644 --- a/agent/proxycfg/state_test.go +++ b/agent/proxycfg/state_test.go @@ -10,15 +10,15 @@ import ( "testing" "time" - cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/go-hclog" "github.com/stretchr/testify/require" "golang.org/x/time/rate" "github.com/hashicorp/consul/acl" - + cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/configentry" "github.com/hashicorp/consul/agent/consul/discoverychain" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/structs" apimod "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/proto/private/pbpeering" @@ -139,7 +139,7 @@ func recordWatches(sc *stateConfig) *watchRecorder { IntentionUpstreams: typedWatchRecorder[*structs.ServiceSpecificRequest]{wr}, IntentionUpstreamsDestination: typedWatchRecorder[*structs.ServiceSpecificRequest]{wr}, InternalServiceDump: typedWatchRecorder[*structs.ServiceDumpRequest]{wr}, - LeafCertificate: typedWatchRecorder[*cachetype.ConnectCALeafRequest]{wr}, + LeafCertificate: typedWatchRecorder[*leafcert.ConnectCALeafRequest]{wr}, PeeringList: typedWatchRecorder[*cachetype.PeeringListRequest]{wr}, PeeredUpstreams: typedWatchRecorder[*structs.PartitionSpecificRequest]{wr}, PreparedQuery: typedWatchRecorder[*structs.PreparedQueryExecuteRequest]{wr}, @@ -224,7 +224,7 @@ func genVerifyTrustBundleReadWatch(peer string) verifyWatchRequest { func genVerifyLeafWatchWithDNSSANs(expectedService string, expectedDatacenter string, expectedDNSSANs []string) verifyWatchRequest { return func(t testing.TB, request any) { - reqReal, ok := request.(*cachetype.ConnectCALeafRequest) + reqReal, ok := request.(*leafcert.ConnectCALeafRequest) reqReal.Token = aclToken require.True(t, ok) require.Equal(t, aclToken, reqReal.Token) diff --git a/agent/proxycfg/terminating_gateway.go b/agent/proxycfg/terminating_gateway.go index 7641cfc2c20bf..7d29ee70501bb 100644 --- a/agent/proxycfg/terminating_gateway.go +++ b/agent/proxycfg/terminating_gateway.go @@ -8,7 +8,7 @@ import ( "fmt" "strings" - cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/structs" ) @@ -172,7 +172,7 @@ func (s *handlerTerminatingGateway) handleUpdate(ctx context.Context, u UpdateEv // This cert is used to terminate mTLS connections on the service's behalf if _, ok := snap.TerminatingGateway.WatchedLeaves[svc.Service]; !ok { ctx, cancel := context.WithCancel(ctx) - err := s.dataSources.LeafCertificate.Notify(ctx, &cachetype.ConnectCALeafRequest{ + err := s.dataSources.LeafCertificate.Notify(ctx, &leafcert.ConnectCALeafRequest{ Datacenter: s.source.Datacenter, Token: s.token, Service: svc.Service.Name, diff --git a/agent/proxycfg/testing.go b/agent/proxycfg/testing.go index c8b933527adb7..ac68994cb8f18 100644 --- a/agent/proxycfg/testing.go +++ b/agent/proxycfg/testing.go @@ -21,6 +21,7 @@ import ( "github.com/hashicorp/consul/agent/cache" cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/proto/private/pbpeering" @@ -749,7 +750,7 @@ func testConfigSnapshotFixture( IntentionUpstreams: &noopDataSource[*structs.ServiceSpecificRequest]{}, 
IntentionUpstreamsDestination: &noopDataSource[*structs.ServiceSpecificRequest]{}, InternalServiceDump: &noopDataSource[*structs.ServiceDumpRequest]{}, - LeafCertificate: &noopDataSource[*cachetype.ConnectCALeafRequest]{}, + LeafCertificate: &noopDataSource[*leafcert.ConnectCALeafRequest]{}, PeeringList: &noopDataSource[*cachetype.PeeringListRequest]{}, PeeredUpstreams: &noopDataSource[*structs.PartitionSpecificRequest]{}, PreparedQuery: &noopDataSource[*structs.PreparedQueryExecuteRequest]{}, @@ -954,7 +955,7 @@ func NewTestDataSources() *TestDataSources { IntentionUpstreams: NewTestDataSource[*structs.ServiceSpecificRequest, *structs.IndexedServiceList](), IntentionUpstreamsDestination: NewTestDataSource[*structs.ServiceSpecificRequest, *structs.IndexedServiceList](), InternalServiceDump: NewTestDataSource[*structs.ServiceDumpRequest, *structs.IndexedCheckServiceNodes](), - LeafCertificate: NewTestDataSource[*cachetype.ConnectCALeafRequest, *structs.IssuedCert](), + LeafCertificate: NewTestDataSource[*leafcert.ConnectCALeafRequest, *structs.IssuedCert](), PeeringList: NewTestDataSource[*cachetype.PeeringListRequest, *pbpeering.PeeringListResponse](), PreparedQuery: NewTestDataSource[*structs.PreparedQueryExecuteRequest, *structs.PreparedQueryExecuteResponse](), ResolvedServiceConfig: NewTestDataSource[*structs.ServiceConfigRequest, *structs.ServiceConfigResponse](), @@ -981,7 +982,7 @@ type TestDataSources struct { IntentionUpstreams *TestDataSource[*structs.ServiceSpecificRequest, *structs.IndexedServiceList] IntentionUpstreamsDestination *TestDataSource[*structs.ServiceSpecificRequest, *structs.IndexedServiceList] InternalServiceDump *TestDataSource[*structs.ServiceDumpRequest, *structs.IndexedCheckServiceNodes] - LeafCertificate *TestDataSource[*cachetype.ConnectCALeafRequest, *structs.IssuedCert] + LeafCertificate *TestDataSource[*leafcert.ConnectCALeafRequest, *structs.IssuedCert] PeeringList *TestDataSource[*cachetype.PeeringListRequest, *pbpeering.PeeringListResponse] PeeredUpstreams *TestDataSource[*structs.PartitionSpecificRequest, *structs.IndexedPeeredServiceList] PreparedQuery *TestDataSource[*structs.PreparedQueryExecuteRequest, *structs.PreparedQueryExecuteResponse] diff --git a/agent/setup.go b/agent/setup.go index 6a9efb5f74426..4d5d0feed7a15 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -5,6 +5,7 @@ package agent import ( "context" + "errors" "fmt" "io" "net" @@ -33,6 +34,7 @@ import ( "github.com/hashicorp/consul/agent/grpc-internal/resolver" grpcWare "github.com/hashicorp/consul/agent/grpc-middleware" "github.com/hashicorp/consul/agent/hcp" + "github.com/hashicorp/consul/agent/leafcert" "github.com/hashicorp/consul/agent/local" "github.com/hashicorp/consul/agent/pool" "github.com/hashicorp/consul/agent/router" @@ -53,17 +55,45 @@ import ( type BaseDeps struct { consul.Deps // TODO: un-embed - RuntimeConfig *config.RuntimeConfig - MetricsConfig *lib.MetricsConfig - AutoConfig *autoconf.AutoConfig // TODO: use an interface - Cache *cache.Cache - ViewStore *submatview.Store - WatchedFiles []string + RuntimeConfig *config.RuntimeConfig + MetricsConfig *lib.MetricsConfig + AutoConfig *autoconf.AutoConfig // TODO: use an interface + Cache *cache.Cache + LeafCertManager *leafcert.Manager + ViewStore *submatview.Store + WatchedFiles []string + NetRPC *LazyNetRPC deregisterBalancer, deregisterResolver func() stopHostCollector context.CancelFunc } +type NetRPC interface { + RPC(ctx context.Context, method string, args any, reply any) error +} + +type LazyNetRPC struct { + 
mu sync.RWMutex + rpc NetRPC +} + +func (r *LazyNetRPC) SetNetRPC(rpc NetRPC) { + r.mu.Lock() + defer r.mu.Unlock() + r.rpc = rpc +} + +func (r *LazyNetRPC) RPC(ctx context.Context, method string, args any, reply any) error { + r.mu.RLock() + r2 := r.rpc + r.mu.RUnlock() + + if r2 == nil { + return errors.New("rpc: initialization ordering error; net-rpc not ready yet") + } + return r2.RPC(ctx, method, args, reply) +} + type ConfigLoader func(source config.Source) (config.LoadResult, error) func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hclog.InterceptLogger) (BaseDeps, error) { @@ -108,7 +138,10 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl var extraSinks []metrics.MetricSink if cfg.IsCloudEnabled() { - d.HCP, err = hcp.NewDeps(cfg.Cloud, d.Logger.Named("hcp"), cfg.NodeID) + // This values is set late within newNodeIDFromConfig above + cfg.Cloud.NodeID = cfg.NodeID + + d.HCP, err = hcp.NewDeps(cfg.Cloud, d.Logger.Named("hcp")) if err != nil { return d, err } @@ -141,6 +174,18 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl d.ViewStore = submatview.NewStore(d.Logger.Named("viewstore")) d.ConnPool = newConnPool(cfg, d.Logger, d.TLSConfigurator) + d.NetRPC = &LazyNetRPC{} + + // TODO: create leafCertManager in BaseDeps once NetRPC is available without Agent + d.LeafCertManager = leafcert.NewManager(leafcert.Deps{ + Logger: d.Logger.Named("leaf-certs"), + CertSigner: leafcert.NewNetRPCCertSigner(d.NetRPC), + RootsReader: leafcert.NewCachedRootsReader(d.Cache, cfg.Datacenter), + Config: leafcert.Config{ + TestOverrideCAChangeInitialDelay: cfg.ConnectTestCALeafRootChangeSpread, + }, + }) + agentType := "client" if cfg.ServerMode { agentType = "server" @@ -198,6 +243,7 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl ServerProvider: d.Router, TLSConfigurator: d.TLSConfigurator, Cache: d.Cache, + LeafCertManager: d.LeafCertManager, Tokens: d.Tokens, EnterpriseConfig: initEnterpriseAutoConfig(d.EnterpriseDeps, cfg), } @@ -221,6 +267,7 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl // handled by something else (e.g. the agent stop channel). 
func (bd BaseDeps) Close() { bd.AutoConfig.Stop() + bd.LeafCertManager.Stop() bd.MetricsConfig.Cancel() for _, fn := range []func(){bd.deregisterBalancer, bd.deregisterResolver, bd.stopHostCollector} { diff --git a/agent/structs/config_entry_jwt_provider.go b/agent/structs/config_entry_jwt_provider.go index a1e9120ea08ec..fc0c73950b766 100644 --- a/agent/structs/config_entry_jwt_provider.go +++ b/agent/structs/config_entry_jwt_provider.go @@ -316,6 +316,15 @@ func (e *JWTProviderConfigEntry) GetRaftIndex() *RaftIndex { retur func (e *JWTProviderConfigEntry) CanRead(authz acl.Authorizer) error { var authzContext acl.AuthorizerContext e.FillAuthzContext(&authzContext) + + // allow service-identity tokens the ability to read jwt-providers + // this is a workaround to allow sidecar proxies to read the jwt-providers + // see issue: https://github.com/hashicorp/consul/issues/17886 for more details + err := authz.ToAllowAuthorizer().ServiceWriteAnyAllowed(&authzContext) + if err == nil { + return err + } + return authz.ToAllowAuthorizer().MeshReadAllowed(&authzContext) } diff --git a/agent/structs/config_entry_jwt_provider_test.go b/agent/structs/config_entry_jwt_provider_test.go index 814a15257378b..c02becc2a13de 100644 --- a/agent/structs/config_entry_jwt_provider_test.go +++ b/agent/structs/config_entry_jwt_provider_test.go @@ -338,6 +338,24 @@ func TestJWTProviderConfigEntry_ACLs(t *testing.T) { canRead: false, canWrite: false, }, + { + name: "jwt-provider: any service write", + authorizer: newTestAuthz(t, `service "" { policy = "write" }`), + canRead: true, + canWrite: false, + }, + { + name: "jwt-provider: specific service write", + authorizer: newTestAuthz(t, `service "web" { policy = "write" }`), + canRead: true, + canWrite: false, + }, + { + name: "jwt-provider: any service prefix write", + authorizer: newTestAuthz(t, `service_prefix "" { policy = "write" }`), + canRead: true, + canWrite: false, + }, { name: "jwt-provider: mesh read", authorizer: newTestAuthz(t, `mesh = "read"`), diff --git a/agent/structs/discovery_chain.go b/agent/structs/discovery_chain.go index edbc7384a3832..029fc3b0a8be8 100644 --- a/agent/structs/discovery_chain.go +++ b/agent/structs/discovery_chain.go @@ -121,12 +121,11 @@ func (s *DiscoveryGraphNode) MapKey() string { // compiled form of ServiceResolverConfigEntry type DiscoveryResolver struct { - Default bool `json:",omitempty"` - ConnectTimeout time.Duration `json:",omitempty"` - RequestTimeout time.Duration `json:",omitempty"` - Target string `json:",omitempty"` - Failover *DiscoveryFailover `json:",omitempty"` - PrioritizeByLocality *DiscoveryPrioritizeByLocality `json:",omitempty"` + Default bool `json:",omitempty"` + ConnectTimeout time.Duration `json:",omitempty"` + RequestTimeout time.Duration `json:",omitempty"` + Target string `json:",omitempty"` + Failover *DiscoveryFailover `json:",omitempty"` } func (r *DiscoveryResolver) MarshalJSON() ([]byte, error) { @@ -238,6 +237,8 @@ type DiscoveryTarget struct { // balancer objects. This has a structure similar to SNI, but will not be // affected by SNI customizations. 
Name string `json:",omitempty"` + + PrioritizeByLocality *DiscoveryPrioritizeByLocality `json:",omitempty"` } func (t *DiscoveryTarget) MarshalJSON() ([]byte, error) { @@ -277,12 +278,13 @@ func (t *DiscoveryTarget) UnmarshalJSON(data []byte) error { } type DiscoveryTargetOpts struct { - Service string - ServiceSubset string - Namespace string - Partition string - Datacenter string - Peer string + Service string + ServiceSubset string + Namespace string + Partition string + Datacenter string + Peer string + PrioritizeByLocality *DiscoveryPrioritizeByLocality } func MergeDiscoveryTargetOpts(opts ...DiscoveryTargetOpts) DiscoveryTargetOpts { diff --git a/agent/structs/operator.go b/agent/structs/operator.go index f5c2b8ac86dfe..05862861e3f85 100644 --- a/agent/structs/operator.go +++ b/agent/structs/operator.go @@ -34,6 +34,9 @@ type RaftServer struct { // it's a non-voting server, which will be added in a future release of // Consul. Voter bool + + // LastIndex is the last log index this server has a record of in its Raft log. + LastIndex uint64 } // RaftConfigurationResponse is returned when querying for the current Raft diff --git a/agent/structs/structs.deepcopy.go b/agent/structs/structs.deepcopy.go index 59a5c72249527..cdd007c26cdb5 100644 --- a/agent/structs/structs.deepcopy.go +++ b/agent/structs/structs.deepcopy.go @@ -129,6 +129,10 @@ func (o *CompiledDiscoveryChain) DeepCopy() *CompiledDiscoveryChain { cp_Targets_v2.Locality = new(Locality) *cp_Targets_v2.Locality = *v2.Locality } + if v2.PrioritizeByLocality != nil { + cp_Targets_v2.PrioritizeByLocality = new(DiscoveryPrioritizeByLocality) + *cp_Targets_v2.PrioritizeByLocality = *v2.PrioritizeByLocality + } } cp.Targets[k2] = cp_Targets_v2 } @@ -240,10 +244,6 @@ func (o *DiscoveryResolver) DeepCopy() *DiscoveryResolver { if o.Failover != nil { cp.Failover = o.Failover.DeepCopy() } - if o.PrioritizeByLocality != nil { - cp.PrioritizeByLocality = new(DiscoveryPrioritizeByLocality) - *cp.PrioritizeByLocality = *o.PrioritizeByLocality - } return &cp } diff --git a/agent/xds/delta.go b/agent/xds/delta.go index 5e4cf702090a6..f84b633a852b3 100644 --- a/agent/xds/delta.go +++ b/agent/xds/delta.go @@ -492,6 +492,7 @@ func applyEnvoyExtension(logger hclog.Logger, cfgSnap *proxycfg.ConfigSnapshot, extender, err := envoyextensions.ConstructExtension(ext) metrics.MeasureSinceWithLabels([]string{"envoy_extension", "validate_arguments"}, now, getMetricLabels(err)) if err != nil { + errorParams = append(errorParams, "error", err) logFn("failed to construct extension", errorParams...) if ext.Required { @@ -507,6 +508,7 @@ func applyEnvoyExtension(logger hclog.Logger, cfgSnap *proxycfg.ConfigSnapshot, if err != nil { errorParams = append(errorParams, "error", err) logFn("failed to validate extension arguments", errorParams...) + if ext.Required { return status.Errorf(codes.InvalidArgument, "failed to validate arguments for extension %q for service %q", ext.Name, svc.Name) } @@ -517,9 +519,13 @@ func applyEnvoyExtension(logger hclog.Logger, cfgSnap *proxycfg.ConfigSnapshot, now = time.Now() _, err = extender.Extend(resources, &runtimeConfig) metrics.MeasureSinceWithLabels([]string{"envoy_extension", "extend"}, now, getMetricLabels(err)) - logFn("failed to apply envoy extension", errorParams...) 
- if err != nil && ext.Required { - return status.Errorf(codes.InvalidArgument, "failed to patch xDS resources in the %q extension: %v", ext.Name, err) + if err != nil { + errorParams = append(errorParams, "error", err) + logFn("failed to apply envoy extension", errorParams...) + + if ext.Required { + return status.Errorf(codes.InvalidArgument, "failed to patch xDS resources in the %q extension: %v", ext.Name, err) + } } return nil diff --git a/agent/xds/delta_envoy_extender_oss_test.go b/agent/xds/delta_envoy_extender_oss_test.go index 3d92b6d25de00..2f18809b19367 100644 --- a/agent/xds/delta_envoy_extender_oss_test.go +++ b/agent/xds/delta_envoy_extender_oss_test.go @@ -676,6 +676,7 @@ end`, ns.Proxy.EnvoyExtensions = makeExtAuthzEnvoyExtension( "http", "dest=local", + "target-uri=localhost:9191", "insert=AfterLastMatch:envoy.filters.http.header_to_metadata", ) }, nil) diff --git a/agent/xds/delta_envoy_extender_test.go b/agent/xds/delta_envoy_extender_test.go index 0a76d62219575..6cd57fa53a041 100644 --- a/agent/xds/delta_envoy_extender_test.go +++ b/agent/xds/delta_envoy_extender_test.go @@ -50,6 +50,13 @@ func makeExtAuthzEnvoyExtension(svc string, opts ...string) []structs.EnvoyExten "FilterName": filterName, } } + case "target-uri": + target = map[string]any{"URI": v} + configMap = map[string]any{ + serviceKey: map[string]any{ + "Target": target, + }, + } case "config-type": if v == "full" { target["Timeout"] = "2s" diff --git a/agent/xds/listeners_test.go b/agent/xds/listeners_test.go index 21e3149bf1099..10b358bdad157 100644 --- a/agent/xds/listeners_test.go +++ b/agent/xds/listeners_test.go @@ -1109,6 +1109,15 @@ func TestListenersFromSnapshot(t *testing.T) { nil) }, }, + { + name: "connect-proxy-without-tproxy-and-permissive-mtls", + create: func(t testinf.T) *proxycfg.ConfigSnapshot { + return proxycfg.TestConfigSnapshot(t, func(ns *structs.NodeService) { + ns.Proxy.MutualTLSMode = structs.MutualTLSModePermissive + }, + nil) + }, + }, } tests = append(tests, makeListenerDiscoChainTests(false)...) 
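The `applyEnvoyExtension` change above attaches the underlying error to the log fields at each failure point and only aborts xDS generation when the extension is marked `Required`; failures in optional extensions are logged and skipped. A minimal sketch of that control flow, using simplified stand-in types (`Extension`, `extend`, `applyExtension`) rather than Consul's real `structs.EnvoyExtension` plumbing:

```go
package main

import (
	"errors"
	"fmt"
	"log"
)

// Extension is a simplified stand-in for an Envoy extension configuration.
type Extension struct {
	Name     string
	Required bool
}

// extend simulates an extension that fails while patching xDS resources.
func extend(ext Extension) error {
	return errors.New("patch failed")
}

// applyExtension mirrors the pattern in the diff: always log the failure with
// the error attached, but only return an error when the extension is Required.
func applyExtension(ext Extension) error {
	if err := extend(ext); err != nil {
		log.Printf("failed to apply envoy extension: extension=%q error=%v", ext.Name, err)
		if ext.Required {
			return fmt.Errorf("failed to patch xDS resources in the %q extension: %w", ext.Name, err)
		}
	}
	return nil
}

func main() {
	fmt.Println(applyExtension(Extension{Name: "optional-ext"}))                 // <nil>: logged, not fatal
	fmt.Println(applyExtension(Extension{Name: "required-ext", Required: true})) // error surfaces to the caller
}
```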
diff --git a/agent/xds/testdata/builtin_extension/clusters/ext-authz-http-local-http-service.latest.golden b/agent/xds/testdata/builtin_extension/clusters/ext-authz-http-local-http-service.latest.golden index 3b0f2da69ce4b..992da1ae684be 100644 --- a/agent/xds/testdata/builtin_extension/clusters/ext-authz-http-local-http-service.latest.golden +++ b/agent/xds/testdata/builtin_extension/clusters/ext-authz-http-local-http-service.latest.golden @@ -142,7 +142,7 @@ { "@type": "type.googleapis.com/envoy.config.cluster.v3.Cluster", "name": "local_ext_authz", - "type": "STATIC", + "type": "STRICT_DNS", "loadAssignment": { "clusterName": "local_ext_authz", "endpoints": [ @@ -152,7 +152,7 @@ "endpoint": { "address": { "socketAddress": { - "address": "127.0.0.1", + "address": "localhost", "portValue": 9191 } } diff --git a/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-local-grpc-service.latest.golden b/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-local-grpc-service.latest.golden index 7f0ad9cf1cd8f..0448214fe7817 100644 --- a/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-local-grpc-service.latest.golden +++ b/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-local-grpc-service.latest.golden @@ -90,20 +90,6 @@ ] }, "httpFilters": [ - { - "name": "envoy.filters.http.ext_authz", - "typedConfig": { - "@type": "type.googleapis.com/envoy.extensions.filters.http.ext_authz.v3.ExtAuthz", - "grpcService": { - "envoyGrpc": { - "clusterName": "local_ext_authz" - } - }, - "transportApiVersion": "V3", - "failureModeAllow": true, - "statPrefix": "response" - } - }, { "name": "envoy.filters.http.rbac", "typedConfig": { @@ -189,6 +175,23 @@ ] } }, + { + "name": "envoy.filters.http.ext_authz", + "typedConfig": { + "@type": "type.googleapis.com/envoy.extensions.filters.http.ext_authz.v3.ExtAuthz", + "grpcService": { + "envoyGrpc": { + "clusterName": "local_ext_authz" + } + }, + "transportApiVersion": "V3", + "failureModeAllow": true, + "metadataContextNamespaces": [ + "consul" + ], + "statPrefix": "response" + } + }, { "name": "envoy.filters.http.router", "typedConfig": { diff --git a/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-local-http-service.latest.golden b/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-local-http-service.latest.golden index dc2c1e45cedaa..9e3390355805c 100644 --- a/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-local-http-service.latest.golden +++ b/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-local-http-service.latest.golden @@ -187,6 +187,9 @@ }, "transportApiVersion": "V3", "failureModeAllow": true, + "metadataContextNamespaces": [ + "consul" + ], "statPrefix": "response" } }, diff --git a/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-upstream-grpc-service.latest.golden b/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-upstream-grpc-service.latest.golden index af4675750e182..203929ea66889 100644 --- a/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-upstream-grpc-service.latest.golden +++ b/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-upstream-grpc-service.latest.golden @@ -208,7 +208,8 @@ }, "metadataContextNamespaces": [ "test-ns-1", - "test-ns-2" + "test-ns-2", + "consul" ], "includePeerCertificate": true, "statPrefix": "ext_authz_stats", diff --git a/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-upstream-http-service.latest.golden 
b/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-upstream-http-service.latest.golden index ded85e73e3c14..e2bf641ee52a8 100644 --- a/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-upstream-http-service.latest.golden +++ b/agent/xds/testdata/builtin_extension/listeners/ext-authz-http-upstream-http-service.latest.golden @@ -206,7 +206,8 @@ }, "metadataContextNamespaces": [ "test-ns-1", - "test-ns-2" + "test-ns-2", + "consul" ], "includePeerCertificate": true, "statPrefix": "ext_authz_stats", diff --git a/agent/xds/testdata/builtin_extension/listeners/ext-authz-tcp-local-grpc-service.latest.golden b/agent/xds/testdata/builtin_extension/listeners/ext-authz-tcp-local-grpc-service.latest.golden index 4f75499ff8d83..61e2ff733931c 100644 --- a/agent/xds/testdata/builtin_extension/listeners/ext-authz-tcp-local-grpc-service.latest.golden +++ b/agent/xds/testdata/builtin_extension/listeners/ext-authz-tcp-local-grpc-service.latest.golden @@ -63,6 +63,14 @@ "filterChains": [ { "filters": [ + { + "name": "envoy.filters.network.rbac", + "typedConfig": { + "@type": "type.googleapis.com/envoy.extensions.filters.network.rbac.v3.RBAC", + "rules": {}, + "statPrefix": "connect_authz" + } + }, { "name": "envoy.filters.network.ext_authz", "typedConfig": { @@ -77,14 +85,6 @@ "transportApiVersion": "V3" } }, - { - "name": "envoy.filters.network.rbac", - "typedConfig": { - "@type": "type.googleapis.com/envoy.extensions.filters.network.rbac.v3.RBAC", - "rules": {}, - "statPrefix": "connect_authz" - } - }, { "name": "envoy.filters.network.tcp_proxy", "typedConfig": { diff --git a/agent/xds/testdata/listeners/connect-proxy-without-tproxy-and-permissive-mtls.latest.golden b/agent/xds/testdata/listeners/connect-proxy-without-tproxy-and-permissive-mtls.latest.golden new file mode 100644 index 0000000000000..b15ccf4a148a5 --- /dev/null +++ b/agent/xds/testdata/listeners/connect-proxy-without-tproxy-and-permissive-mtls.latest.golden @@ -0,0 +1,115 @@ +{ + "versionInfo": "00000001", + "resources": [ + { + "@type": "type.googleapis.com/envoy.config.listener.v3.Listener", + "name": "db:127.0.0.1:9191", + "address": { + "socketAddress": { + "address": "127.0.0.1", + "portValue": 9191 + } + }, + "filterChains": [ + { + "filters": [ + { + "name": "envoy.filters.network.tcp_proxy", + "typedConfig": { + "@type": "type.googleapis.com/envoy.extensions.filters.network.tcp_proxy.v3.TcpProxy", + "statPrefix": "upstream.db.default.default.dc1", + "cluster": "db.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul" + } + } + ] + } + ], + "trafficDirection": "OUTBOUND" + }, + { + "@type": "type.googleapis.com/envoy.config.listener.v3.Listener", + "name": "prepared_query:geo-cache:127.10.10.10:8181", + "address": { + "socketAddress": { + "address": "127.10.10.10", + "portValue": 8181 + } + }, + "filterChains": [ + { + "filters": [ + { + "name": "envoy.filters.network.tcp_proxy", + "typedConfig": { + "@type": "type.googleapis.com/envoy.extensions.filters.network.tcp_proxy.v3.TcpProxy", + "statPrefix": "upstream.prepared_query_geo-cache", + "cluster": "geo-cache.default.dc1.query.11111111-2222-3333-4444-555555555555.consul" + } + } + ] + } + ], + "trafficDirection": "OUTBOUND" + }, + { + "@type": "type.googleapis.com/envoy.config.listener.v3.Listener", + "name": "public_listener:0.0.0.0:9999", + "address": { + "socketAddress": { + "address": "0.0.0.0", + "portValue": 9999 + } + }, + "filterChains": [ + { + "filters": [ + { + "name": "envoy.filters.network.rbac", + "typedConfig": { + "@type": 
"type.googleapis.com/envoy.extensions.filters.network.rbac.v3.RBAC", + "rules": {}, + "statPrefix": "connect_authz" + } + }, + { + "name": "envoy.filters.network.tcp_proxy", + "typedConfig": { + "@type": "type.googleapis.com/envoy.extensions.filters.network.tcp_proxy.v3.TcpProxy", + "statPrefix": "public_listener", + "cluster": "local_app" + } + } + ], + "transportSocket": { + "name": "tls", + "typedConfig": { + "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.DownstreamTlsContext", + "commonTlsContext": { + "tlsParams": {}, + "tlsCertificates": [ + { + "certificateChain": { + "inlineString": "-----BEGIN CERTIFICATE-----\nMIICjDCCAjKgAwIBAgIIC5llxGV1gB8wCgYIKoZIzj0EAwIwFDESMBAGA1UEAxMJ\nVGVzdCBDQSAyMB4XDTE5MDMyMjEzNTgyNloXDTI5MDMyMjEzNTgyNlowDjEMMAoG\nA1UEAxMDd2ViMFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEADPv1RHVNRfa2VKR\nAB16b6rZnEt7tuhaxCFpQXPj7M2omb0B9Favq5E0ivpNtv1QnFhxtPd7d5k4e+T7\nSkW1TaOCAXIwggFuMA4GA1UdDwEB/wQEAwIDuDAdBgNVHSUEFjAUBggrBgEFBQcD\nAgYIKwYBBQUHAwEwDAYDVR0TAQH/BAIwADBoBgNVHQ4EYQRfN2Q6MDc6ODc6M2E6\nNDA6MTk6NDc6YzM6NWE6YzA6YmE6NjI6ZGY6YWY6NGI6ZDQ6MDU6MjU6NzY6M2Q6\nNWE6OGQ6MTY6OGQ6Njc6NWU6MmU6YTA6MzQ6N2Q6ZGM6ZmYwagYDVR0jBGMwYYBf\nZDE6MTE6MTE6YWM6MmE6YmE6OTc6YjI6M2Y6YWM6N2I6YmQ6ZGE6YmU6YjE6OGE6\nZmM6OWE6YmE6YjU6YmM6ODM6ZTc6NWU6NDE6NmY6ZjI6NzM6OTU6NTg6MGM6ZGIw\nWQYDVR0RBFIwUIZOc3BpZmZlOi8vMTExMTExMTEtMjIyMi0zMzMzLTQ0NDQtNTU1\nNTU1NTU1NTU1LmNvbnN1bC9ucy9kZWZhdWx0L2RjL2RjMS9zdmMvd2ViMAoGCCqG\nSM49BAMCA0gAMEUCIGC3TTvvjj76KMrguVyFf4tjOqaSCRie3nmHMRNNRav7AiEA\npY0heYeK9A6iOLrzqxSerkXXQyj5e9bE4VgUnxgPU6g=\n-----END CERTIFICATE-----\n" + }, + "privateKey": { + "inlineString": "-----BEGIN EC PRIVATE KEY-----\nMHcCAQEEIMoTkpRggp3fqZzFKh82yS4LjtJI+XY+qX/7DefHFrtdoAoGCCqGSM49\nAwEHoUQDQgAEADPv1RHVNRfa2VKRAB16b6rZnEt7tuhaxCFpQXPj7M2omb0B9Fav\nq5E0ivpNtv1QnFhxtPd7d5k4e+T7SkW1TQ==\n-----END EC PRIVATE KEY-----\n" + } + } + ], + "validationContext": { + "trustedCa": { + "inlineString": "-----BEGIN CERTIFICATE-----\nMIICXDCCAgKgAwIBAgIICpZq70Z9LyUwCgYIKoZIzj0EAwIwFDESMBAGA1UEAxMJ\nVGVzdCBDQSAyMB4XDTE5MDMyMjEzNTgyNloXDTI5MDMyMjEzNTgyNlowFDESMBAG\nA1UEAxMJVGVzdCBDQSAyMFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEIhywH1gx\nAsMwuF3ukAI5YL2jFxH6Usnma1HFSfVyxbXX1/uoZEYrj8yCAtdU2yoHETyd+Zx2\nThhRLP79pYegCaOCATwwggE4MA4GA1UdDwEB/wQEAwIBhjAPBgNVHRMBAf8EBTAD\nAQH/MGgGA1UdDgRhBF9kMToxMToxMTphYzoyYTpiYTo5NzpiMjozZjphYzo3Yjpi\nZDpkYTpiZTpiMTo4YTpmYzo5YTpiYTpiNTpiYzo4MzplNzo1ZTo0MTo2ZjpmMjo3\nMzo5NTo1ODowYzpkYjBqBgNVHSMEYzBhgF9kMToxMToxMTphYzoyYTpiYTo5Nzpi\nMjozZjphYzo3YjpiZDpkYTpiZTpiMTo4YTpmYzo5YTpiYTpiNTpiYzo4MzplNzo1\nZTo0MTo2ZjpmMjo3Mzo5NTo1ODowYzpkYjA/BgNVHREEODA2hjRzcGlmZmU6Ly8x\nMTExMTExMS0yMjIyLTMzMzMtNDQ0NC01NTU1NTU1NTU1NTUuY29uc3VsMAoGCCqG\nSM49BAMCA0gAMEUCICOY0i246rQHJt8o8Oya0D5PLL1FnmsQmQqIGCi31RwnAiEA\noR5f6Ku+cig2Il8T8LJujOp2/2A72QcHZA57B13y+8o=\n-----END CERTIFICATE-----\n" + } + } + }, + "requireClientCertificate": true + } + } + } + ], + "trafficDirection": "INBOUND" + } + ], + "typeUrl": "type.googleapis.com/envoy.config.listener.v3.Listener", + "nonce": "00000001" +} diff --git a/api/go.mod b/api/go.mod index 335a6df7ce13b..ddc961f8bd74c 100644 --- a/api/go.mod +++ b/api/go.mod @@ -6,7 +6,7 @@ replace github.com/hashicorp/consul/sdk => ../sdk require ( github.com/google/go-cmp v0.5.9 - github.com/hashicorp/consul/sdk v0.13.1 + github.com/hashicorp/consul/sdk v0.14.0-rc1 github.com/hashicorp/go-cleanhttp v0.5.2 github.com/hashicorp/go-hclog v1.5.0 github.com/hashicorp/go-rootcerts v1.0.2 diff --git a/api/go.sum b/api/go.sum index fd85203e346fc..b0041f05248ad 100644 --- 
a/api/go.sum +++ b/api/go.sum @@ -43,6 +43,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/hashicorp/consul/sdk v0.14.0-rc1 h1:PuETOfN0uxl28i0Pq6rK7TBCrIl7psMbL0YTSje4KvM= +github.com/hashicorp/consul/sdk v0.14.0-rc1/go.mod h1:gHYeuDa0+0qRAD6Wwr6yznMBvBwHKoxSBoW5l73+saE= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= diff --git a/api/operator_raft.go b/api/operator_raft.go index 393d6fb3c5f77..d72c00c97b93a 100644 --- a/api/operator_raft.go +++ b/api/operator_raft.go @@ -28,6 +28,9 @@ type RaftServer struct { // it's a non-voting server, which will be added in a future release of // Consul. Voter bool + + // LastIndex is the last log index this server has a record of in its Raft log. + LastIndex uint64 } // RaftConfiguration is returned when querying for the current Raft configuration. diff --git a/command/connect/envoy/bootstrap_config.go b/command/connect/envoy/bootstrap_config.go index a50eaf36fe066..2a0e21c4d25dd 100644 --- a/command/connect/envoy/bootstrap_config.go +++ b/command/connect/envoy/bootstrap_config.go @@ -847,7 +847,8 @@ func appendTelemetryCollectorConfig(args *BootstrapTplArgs, telemetryCollectorBi "envoy_grpc": { "cluster_name": "consul_telemetry_collector_loopback" } - } + }, + "emit_tags_as_labels": true } }` diff --git a/command/connect/envoy/bootstrap_config_test.go b/command/connect/envoy/bootstrap_config_test.go index 8de9ae007d6a8..293aee6600919 100644 --- a/command/connect/envoy/bootstrap_config_test.go +++ b/command/connect/envoy/bootstrap_config_test.go @@ -539,7 +539,8 @@ const ( "envoy_grpc": { "cluster_name": "consul_telemetry_collector_loopback" } - } + }, + "emit_tags_as_labels": true } }` @@ -638,7 +639,8 @@ func TestBootstrapConfig_ConfigureArgs(t *testing.T) { "envoy_grpc": { "cluster_name": "consul_telemetry_collector_loopback" } - } + }, + "emit_tags_as_labels": true } }`, StaticClustersJSON: `{ diff --git a/command/connect/envoy/testdata/telemetry-collector.golden b/command/connect/envoy/testdata/telemetry-collector.golden index 7c584864a0be2..3977ce65bba31 100644 --- a/command/connect/envoy/testdata/telemetry-collector.golden +++ b/command/connect/envoy/testdata/telemetry-collector.golden @@ -89,7 +89,8 @@ "envoy_grpc": { "cluster_name": "consul_telemetry_collector_loopback" } - } + }, + "emit_tags_as_labels": true } } ], diff --git a/command/debug/debug.go b/command/debug/debug.go index 38a02c26027b5..0c3fcca57a834 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -35,7 +35,7 @@ const ( // debugDuration is the total duration that debug runs before being // shut down - debugDuration = 2 * time.Minute + debugDuration = 5 * time.Minute // debugDurationGrace is a period of time added to the specified // duration to allow intervals to capture within that time @@ -503,7 +503,7 @@ func (c *cmd) captureHeap(outputDir string) error { } func (c *cmd) captureLogs(ctx context.Context) error { - logCh, err := c.client.Agent().Monitor("DEBUG", ctx.Done(), nil) + logCh, err := c.client.Agent().Monitor("TRACE", ctx.Done(), nil) if 
err != nil { return err } diff --git a/command/operator/raft/listpeers/operator_raft_list.go b/command/operator/raft/listpeers/operator_raft_list.go index 47bd161fed488..29643a87cf33b 100644 --- a/command/operator/raft/listpeers/operator_raft_list.go +++ b/command/operator/raft/listpeers/operator_raft_list.go @@ -70,8 +70,24 @@ func raftListPeers(client *api.Client, stale bool) (string, error) { return "", fmt.Errorf("Failed to retrieve raft configuration: %v", err) } + leaderLastCommitIndex := uint64(0) + serverIdLastIndexMap := make(map[string]uint64) + + for _, raftServer := range reply.Servers { + serverIdLastIndexMap[raftServer.ID] = raftServer.LastIndex + } + + for _, s := range reply.Servers { + if s.Leader { + lastIndex, ok := serverIdLastIndexMap[s.ID] + if ok { + leaderLastCommitIndex = lastIndex + } + } + } + // Format it as a nice table. - result := []string{"Node\x1fID\x1fAddress\x1fState\x1fVoter\x1fRaftProtocol"} + result := []string{"Node\x1fID\x1fAddress\x1fState\x1fVoter\x1fRaftProtocol\x1fCommit Index\x1fTrails Leader By"} for _, s := range reply.Servers { raftProtocol := s.ProtocolVersion @@ -82,8 +98,20 @@ func raftListPeers(client *api.Client, stale bool) (string, error) { if s.Leader { state = "leader" } - result = append(result, fmt.Sprintf("%s\x1f%s\x1f%s\x1f%s\x1f%v\x1f%s", - s.Node, s.ID, s.Address, state, s.Voter, raftProtocol)) + + trailsLeaderByText := "-" + serverLastIndex, ok := serverIdLastIndexMap[s.ID] + if ok { + trailsLeaderBy := leaderLastCommitIndex - serverLastIndex + trailsLeaderByText = fmt.Sprintf("%d commits", trailsLeaderBy) + if s.Leader { + trailsLeaderByText = "-" + } else if trailsLeaderBy == 1 { + trailsLeaderByText = fmt.Sprintf("%d commit", trailsLeaderBy) + } + } + result = append(result, fmt.Sprintf("%s\x1f%s\x1f%s\x1f%s\x1f%v\x1f%s\x1f%v\x1f%s", + s.Node, s.ID, s.Address, state, s.Voter, raftProtocol, serverLastIndex, trailsLeaderByText)) } return columnize.Format(result, &columnize.Config{Delim: string([]byte{0x1f})}), nil diff --git a/command/operator/raft/listpeers/operator_raft_list_test.go b/command/operator/raft/listpeers/operator_raft_list_test.go index 15bd1bfbe34f2..0694e0dd10465 100644 --- a/command/operator/raft/listpeers/operator_raft_list_test.go +++ b/command/operator/raft/listpeers/operator_raft_list_test.go @@ -28,7 +28,7 @@ func TestOperatorRaftListPeersCommand(t *testing.T) { a := agent.NewTestAgent(t, ``) defer a.Shutdown() - expected := fmt.Sprintf("%s %s 127.0.0.1:%d leader true 3", + expected := fmt.Sprintf("%s %s 127.0.0.1:%d leader true 3 1 -", a.Config.NodeName, a.Config.NodeID, a.Config.ServerPort) // Test the list-peers subcommand directly diff --git a/envoyextensions/go.mod b/envoyextensions/go.mod index 6a6128fa6cee8..e426b50365de9 100644 --- a/envoyextensions/go.mod +++ b/envoyextensions/go.mod @@ -6,8 +6,8 @@ replace github.com/hashicorp/consul/api => ../api require ( github.com/envoyproxy/go-control-plane v0.11.0 - github.com/hashicorp/consul/api v1.20.0 - github.com/hashicorp/consul/sdk v0.13.1 + github.com/hashicorp/consul/api v1.22.0-rc1 + github.com/hashicorp/consul/sdk v0.14.0-rc1 github.com/hashicorp/go-hclog v1.5.0 github.com/hashicorp/go-multierror v1.1.1 github.com/hashicorp/go-version v1.2.1 @@ -30,7 +30,6 @@ require ( github.com/hashicorp/go-uuid v1.0.3 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/hashicorp/serf v0.10.1 // indirect - github.com/kr/pretty v0.3.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.17 // 
indirect github.com/mitchellh/go-homedir v1.1.0 // indirect @@ -40,6 +39,5 @@ require ( golang.org/x/exp v0.0.0-20230321023759-10a507213a29 // indirect golang.org/x/sys v0.8.0 // indirect google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect - gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/envoyextensions/go.sum b/envoyextensions/go.sum index 52d5f9ed00c22..929a26218e652 100644 --- a/envoyextensions/go.sum +++ b/envoyextensions/go.sum @@ -24,7 +24,6 @@ github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/xds/go v0.0.0-20230310173818-32f1caf87195 h1:58f1tJ1ra+zFINPlwLWvQsR9CzAKt2e+EWV2yX9oXQ4= github.com/cncf/xds/go v0.0.0-20230310173818-32f1caf87195/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -62,8 +61,10 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/hashicorp/consul/sdk v0.13.1 h1:EygWVWWMczTzXGpO93awkHFzfUka6hLYJ0qhETd+6lY= -github.com/hashicorp/consul/sdk v0.13.1/go.mod h1:SW/mM4LbKfqmMvcFu8v+eiQQ7oitXEFeiBe9StxERb0= +github.com/hashicorp/consul/api v1.22.0-rc1 h1:ePmGqndeMgaI38KUbSA/CqTzeEAIogXyWnfNJzglo70= +github.com/hashicorp/consul/api v1.22.0-rc1/go.mod h1:wtduXtbAqSGtBdi3tyA5SSAYGAG51rBejV9SEUBciMY= +github.com/hashicorp/consul/sdk v0.14.0-rc1 h1:PuETOfN0uxl28i0Pq6rK7TBCrIl7psMbL0YTSje4KvM= +github.com/hashicorp/consul/sdk v0.14.0-rc1/go.mod h1:gHYeuDa0+0qRAD6Wwr6yznMBvBwHKoxSBoW5l73+saE= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -108,13 +109,10 @@ github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7V github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= -github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mattn/go-colorable v0.0.9/go.mod 
h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-colorable v0.1.6/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= @@ -169,8 +167,7 @@ github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8b github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= -github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= -github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= @@ -265,11 +262,8 @@ google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cn google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= -gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/go.mod b/go.mod index 81231ff9117f9..f85e23b0b3c7e 100644 --- a/go.mod +++ b/go.mod @@ -21,7 +21,7 @@ require ( github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e github.com/armon/go-metrics v0.4.1 github.com/armon/go-radix v1.0.0 - github.com/aws/aws-sdk-go v1.42.34 + github.com/aws/aws-sdk-go v1.44.289 github.com/coredns/coredns v1.6.6 github.com/coreos/go-oidc v2.1.0+incompatible github.com/docker/go-connections v0.4.0 diff --git a/go.sum b/go.sum index 3fa9a355c1005..9f38d4eb658e1 100644 --- a/go.sum +++ b/go.sum @@ -161,8 +161,8 @@ github.com/aws/aws-sdk-go v1.23.0/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN github.com/aws/aws-sdk-go v1.25.41/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= github.com/aws/aws-sdk-go v1.25.48/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= github.com/aws/aws-sdk-go v1.30.27/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= -github.com/aws/aws-sdk-go v1.42.34 h1:fqGAiKmCSRY1rEa4G9VqgkKKbNmLKYq5dKmLtQkvYi8= -github.com/aws/aws-sdk-go v1.42.34/go.mod h1:OGr6lGMAKGlG9CVrYnWYDKIyb829c6EVBRjxqjmPepc= +github.com/aws/aws-sdk-go v1.44.289 
h1:5CVEjiHFvdiVlKPBzv0rjG4zH/21W/onT18R5AH/qx0= +github.com/aws/aws-sdk-go v1.44.289/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= github.com/baiyubin/aliyun-sts-go-sdk v0.0.0-20180326062324-cfa1a18b161f/go.mod h1:AuiFmCCPBSrqvVMvuqFuk0qogytodnVFVSN5CeJB8Gc= github.com/benbjohnson/immutable v0.4.0 h1:CTqXbEerYso8YzVPxmWxh2gnoRQbbB9X1quUC8+vGZA= github.com/benbjohnson/immutable v0.4.0/go.mod h1:iAr8OjJGLnLmVUr9MZ/rz4PWUy6Ouc2JLYuMArmvAJM= @@ -1098,6 +1098,7 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yusufpapurcu/wmi v1.2.2 h1:KBNDSne4vP5mbSWnJbO+51IMOXJB67QiYCSBrubbPRg= github.com/yusufpapurcu/wmi v1.2.2/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= @@ -1167,6 +1168,7 @@ golang.org/x/crypto v0.0.0-20200302210943-78000ba7a073/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.1.0 h1:MDRAIl0xIo9Io2xV565hzXHw3zVseKrJKodhohM5CjU= golang.org/x/crypto v0.1.0/go.mod h1:RecgLatLF4+eUMCP1PoPZQb+cVrJcOPbHkTkbkB9sbw= @@ -1208,6 +1210,7 @@ golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk= golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20170114055629-f2499483f923/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1266,9 +1269,10 @@ golang.org/x/net v0.0.0-20210421230115-4e50805a0758/go.mod h1:72T/g9IO56b78aLF+1 golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.0.0-20211216030914-fe4d6282115f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net 
v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= @@ -1303,6 +1307,7 @@ golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.2.0 h1:PUR+T4wwASmuSTYdKjYHI5TD22Wy5ogLU5qZCOLxBrI= golang.org/x/sync v0.2.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20170830134202-bb24a47a89ea/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -1397,13 +1402,16 @@ golang.org/x/sys v0.0.0-20220128215802-99c3d69c2c27/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.8.0 h1:n5xxQn2i3PC0yLAbjTpNT85q/Kgzcr2gIoX9OrJUols= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1417,6 +1425,7 @@ golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/time v0.0.0-20161028155119-f51c12702a4d/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1494,6 +1503,7 @@ golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= 
golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.9.1 h1:8WMNJAz3zrtPmnYC7ISf5dEn3MT0gY7jBJfw27yrrLo= golang.org/x/tools v0.9.1/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/internal/controller/api_test.go b/internal/controller/api_test.go index 2006664b20fb7..e80a2d7d7133e 100644 --- a/internal/controller/api_test.go +++ b/internal/controller/api_test.go @@ -190,7 +190,7 @@ func TestController_String(t *testing.T) { WithPlacement(controller.PlacementEachServer) require.Equal(t, - `, placement="each-server">`, + `, placement="each-server">`, ctrl.String(), ) } @@ -201,7 +201,7 @@ func TestController_NoReconciler(t *testing.T) { ctrl := controller.ForType(demo.TypeV2Artist) require.PanicsWithValue(t, - `cannot register controller without a reconciler , placement="singleton">`, + `cannot register controller without a reconciler , placement="singleton">`, func() { mgr.Register(ctrl) }) } diff --git a/internal/resource/demo/demo.go b/internal/resource/demo/demo.go index 842b75739bc4f..20ad89c962c42 100644 --- a/internal/resource/demo/demo.go +++ b/internal/resource/demo/demo.go @@ -33,36 +33,36 @@ var ( TypeV1Artist = &pbresource.Type{ Group: "demo", GroupVersion: "v1", - Kind: "artist", + Kind: "Artist", } // TypeV1Album represents a collection of an artist's songs. TypeV1Album = &pbresource.Type{ Group: "demo", GroupVersion: "v1", - Kind: "album", + Kind: "Album", } // TypeV2Artist represents a musician or group of musicians. TypeV2Artist = &pbresource.Type{ Group: "demo", GroupVersion: "v2", - Kind: "artist", + Kind: "Artist", } // TypeV2Album represents a collection of an artist's songs. TypeV2Album = &pbresource.Type{ Group: "demo", GroupVersion: "v2", - Kind: "album", + Kind: "Album", } ) const ( - ArtistV1ReadPolicy = `key_prefix "resource/demo.v1.artist/" { policy = "read" }` - ArtistV1WritePolicy = `key_prefix "resource/demo.v1.artist/" { policy = "write" }` - ArtistV2ReadPolicy = `key_prefix "resource/demo.v2.artist/" { policy = "read" }` - ArtistV2WritePolicy = `key_prefix "resource/demo.v2.artist/" { policy = "write" }` + ArtistV1ReadPolicy = `key_prefix "resource/demo.v1.Artist/" { policy = "read" }` + ArtistV1WritePolicy = `key_prefix "resource/demo.v1.Artist/" { policy = "write" }` + ArtistV2ReadPolicy = `key_prefix "resource/demo.v2.Artist/" { policy = "read" }` + ArtistV2WritePolicy = `key_prefix "resource/demo.v2.Artist/" { policy = "write" }` ArtistV2ListPolicy = `key_prefix "resource/" { policy = "list" }` ) diff --git a/internal/resource/registry.go b/internal/resource/registry.go index 232e3998d4e2f..0004acfff4c6a 100644 --- a/internal/resource/registry.go +++ b/internal/resource/registry.go @@ -5,6 +5,7 @@ package resource import ( "fmt" + "regexp" "sync" "google.golang.org/protobuf/proto" @@ -13,6 +14,12 @@ import ( "github.com/hashicorp/consul/proto-public/pbresource" ) +var ( + groupRegexp = regexp.MustCompile(`^[a-z][a-z\d_]+$`) + groupVersionRegexp = regexp.MustCompile(`^v([a-z\d]+)?\d$`) + kindRegexp = regexp.MustCompile(`^[A-Z][A-Za-z\d]+$`) +) + type Registry interface { // Register the given resource type and its hooks. 
Register(reg Registration) @@ -82,14 +89,23 @@ func NewRegistry() Registry { } func (r *TypeRegistry) Register(registration Registration) { - r.lock.Lock() - defer r.lock.Unlock() - typ := registration.Type if typ.Group == "" || typ.GroupVersion == "" || typ.Kind == "" { panic("type field(s) cannot be empty") } + switch { + case !groupRegexp.MatchString(typ.Group): + panic(fmt.Sprintf("Type.Group must be in snake_case. Got: %q", typ.Group)) + case !groupVersionRegexp.MatchString(typ.GroupVersion): + panic(fmt.Sprintf("Type.GroupVersion must be lowercase, start with `v`, and end with a number (e.g. `v2` or `v1alpha1`). Got: %q", typ.Group)) + case !kindRegexp.MatchString(typ.Kind): + panic(fmt.Sprintf("Type.Kind must be in PascalCase. Got: %q", typ.Kind)) + } + + r.lock.Lock() + defer r.lock.Unlock() + key := ToGVK(registration.Type) if _, ok := r.registrations[key]; ok { panic(fmt.Sprintf("resource type %s already registered", key)) diff --git a/internal/resource/registry_test.go b/internal/resource/registry_test.go index 7979d618c470f..c9d1777159f8c 100644 --- a/internal/resource/registry_test.go +++ b/internal/resource/registry_test.go @@ -28,36 +28,9 @@ func TestRegister(t *testing.T) { require.True(t, proto.Equal(demo.TypeV2Artist, actual.Type)) // register existing should panic - require.PanicsWithValue(t, "resource type demo.v2.artist already registered", func() { + require.PanicsWithValue(t, "resource type demo.v2.Artist already registered", func() { r.Register(reg) }) - - // type missing required fields should panic - testcases := map[string]*pbresource.Type{ - "empty group": { - Group: "", - GroupVersion: "v2", - Kind: "artist", - }, - "empty group version": { - Group: "", - GroupVersion: "v2", - Kind: "artist", - }, - "empty kind": { - Group: "demo", - GroupVersion: "v2", - Kind: "", - }, - } - - for desc, typ := range testcases { - t.Run(desc, func(t *testing.T) { - require.PanicsWithValue(t, "type field(s) cannot be empty", func() { - r.Register(resource.Registration{Type: typ}) - }) - }) - } } func TestRegister_Defaults(t *testing.T) { @@ -102,7 +75,7 @@ func TestResolve(t *testing.T) { serviceType := &pbresource.Type{ Group: "mesh", GroupVersion: "v1", - Kind: "service", + Kind: "Service", } // not found @@ -115,3 +88,89 @@ func TestResolve(t *testing.T) { assert.True(t, ok) assert.Equal(t, registration.Type, serviceType) } + +func TestRegister_TypeValidation(t *testing.T) { + registry := resource.NewRegistry() + + testCases := map[string]struct { + fn func(*pbresource.Type) + valid bool + }{ + "Valid": {valid: true}, + "Group empty": { + fn: func(t *pbresource.Type) { t.Group = "" }, + valid: false, + }, + "Group PascalCase": { + fn: func(t *pbresource.Type) { t.Group = "Foo" }, + valid: false, + }, + "Group kebab-case": { + fn: func(t *pbresource.Type) { t.Group = "foo-bar" }, + valid: false, + }, + "Group snake_case": { + fn: func(t *pbresource.Type) { t.Group = "foo_bar" }, + valid: true, + }, + "GroupVersion empty": { + fn: func(t *pbresource.Type) { t.GroupVersion = "" }, + valid: false, + }, + "GroupVersion snake_case": { + fn: func(t *pbresource.Type) { t.GroupVersion = "v_1" }, + valid: false, + }, + "GroupVersion kebab-case": { + fn: func(t *pbresource.Type) { t.GroupVersion = "v-1" }, + valid: false, + }, + "GroupVersion no leading v": { + fn: func(t *pbresource.Type) { t.GroupVersion = "1" }, + valid: false, + }, + "GroupVersion no trailing number": { + fn: func(t *pbresource.Type) { t.GroupVersion = "OnePointOh" }, + valid: false, + }, + "Kind PascalCase with 
numbers": { + fn: func(t *pbresource.Type) { t.Kind = "Number1" }, + valid: true, + }, + "Kind camelCase": { + fn: func(t *pbresource.Type) { t.Kind = "barBaz" }, + valid: false, + }, + "Kind snake_case": { + fn: func(t *pbresource.Type) { t.Kind = "bar_baz" }, + valid: false, + }, + "Kind empty": { + fn: func(t *pbresource.Type) { t.Kind = "" }, + valid: false, + }, + } + for desc, tc := range testCases { + t.Run(desc, func(t *testing.T) { + reg := func() { + typ := &pbresource.Type{ + Group: "foo", + GroupVersion: "v1", + Kind: "Bar", + } + if tc.fn != nil { + tc.fn(typ) + } + registry.Register(resource.Registration{ + Type: typ, + }) + } + + if tc.valid { + require.NotPanics(t, reg) + } else { + require.Panics(t, reg) + } + }) + } +} diff --git a/internal/resource/tombstone.go b/internal/resource/tombstone.go index 289aec2d5161b..6d0285c602de9 100644 --- a/internal/resource/tombstone.go +++ b/internal/resource/tombstone.go @@ -6,6 +6,6 @@ var ( TypeV1Tombstone = &pbresource.Type{ Group: "internal", GroupVersion: "v1", - Kind: "tombstone", + Kind: "Tombstone", } ) diff --git a/test/integration/connect/envoy/case-property-override/setup.sh b/test/integration/connect/envoy/case-property-override/setup.sh index 1bf2021c0c9ac..744055f94966f 100644 --- a/test/integration/connect/envoy/case-property-override/setup.sh +++ b/test/integration/connect/envoy/case-property-override/setup.sh @@ -53,12 +53,30 @@ EnvoyExtensions = [ Path = "/upstream_connection_options/tcp_keepalive/keepalive_probes" Value = 1234 }, + { + ResourceFilter = { + ResourceType = "cluster" + TrafficDirection = "outbound" + } + Op = "add" + Path = "/outlier_detection/max_ejection_time/seconds" + Value = 120 + }, + { + ResourceFilter = { + ResourceType = "cluster" + TrafficDirection = "outbound" + } + Op = "add" + Path = "/outlier_detection/max_ejection_time_jitter/seconds" + Value = 1 + }, { ResourceFilter = { ResourceType = "cluster" TrafficDirection = "outbound" Services = [{ - Name = "s2" + Name = "s3" }] } Op = "remove" diff --git a/test/integration/connect/envoy/case-property-override/verify.bats b/test/integration/connect/envoy/case-property-override/verify.bats index 4453409eede53..446ef061da418 100644 --- a/test/integration/connect/envoy/case-property-override/verify.bats +++ b/test/integration/connect/envoy/case-property-override/verify.bats @@ -19,13 +19,14 @@ load helpers [ "$status" == 0 ] [ "$(echo "$output" | jq -r '.upstream_connection_options.tcp_keepalive.keepalive_probes')" == "1234" ] - [ "$(echo "$output" | jq -r '.outlier_detection')" == "null" ] + [ "$(echo "$output" | jq -r '.outlier_detection.max_ejection_time')" == "120s" ] + [ "$(echo "$output" | jq -r '.outlier_detection.max_ejection_time_jitter')" == "1s" ] run get_envoy_cluster_config localhost:19000 s3 [ "$status" == 0 ] [ "$(echo "$output" | jq -r '.upstream_connection_options.tcp_keepalive.keepalive_probes')" == "1234" ] - [ "$(echo "$output" | jq -r '.outlier_detection')" == "{}" ] + [ "$(echo "$output" | jq -r '.outlier_detection')" == "null" ] } @test "s2 proxy is configured with the expected envoy patches" { diff --git a/test/integration/consul-container/go.mod b/test/integration/consul-container/go.mod index 27d1357bd1df2..7a14573fa8197 100644 --- a/test/integration/consul-container/go.mod +++ b/test/integration/consul-container/go.mod @@ -7,9 +7,9 @@ require ( github.com/avast/retry-go v3.0.0+incompatible github.com/docker/docker v23.0.6+incompatible github.com/docker/go-connections v0.4.0 - github.com/hashicorp/consul/api v1.20.0 - 
github.com/hashicorp/consul/envoyextensions v0.1.2 - github.com/hashicorp/consul/sdk v0.13.1 + github.com/hashicorp/consul/api v1.22.0-rc1 + github.com/hashicorp/consul/envoyextensions v0.3.0-rc1 + github.com/hashicorp/consul/sdk v0.14.0-rc1 github.com/hashicorp/go-cleanhttp v0.5.2 github.com/hashicorp/go-multierror v1.1.1 github.com/hashicorp/go-uuid v1.0.3 diff --git a/troubleshoot/go.mod b/troubleshoot/go.mod index 1a6ca3559a639..1b9c0e274b57d 100644 --- a/troubleshoot/go.mod +++ b/troubleshoot/go.mod @@ -14,8 +14,8 @@ exclude ( require ( github.com/envoyproxy/go-control-plane v0.11.0 github.com/envoyproxy/go-control-plane/xdsmatcher v0.0.0-20230524161521-aaaacbfbe53e - github.com/hashicorp/consul/api v1.20.0 - github.com/hashicorp/consul/envoyextensions v0.1.2 + github.com/hashicorp/consul/api v1.22.0-rc1 + github.com/hashicorp/consul/envoyextensions v0.3.0-rc1 github.com/stretchr/testify v1.8.3 google.golang.org/protobuf v1.30.0 ) @@ -43,7 +43,6 @@ require ( github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.3.0 // indirect - github.com/rogpeppe/go-internal v1.10.0 // indirect go.opentelemetry.io/proto/otlp v0.19.0 // indirect golang.org/x/exp v0.0.0-20230321023759-10a507213a29 // indirect golang.org/x/net v0.10.0 // indirect diff --git a/troubleshoot/go.sum b/troubleshoot/go.sum index dc482f3d5ecc4..a76178464c684 100644 --- a/troubleshoot/go.sum +++ b/troubleshoot/go.sum @@ -161,7 +161,11 @@ github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+ github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4ZsPv9hVvWI6+ch50m39Pf2Ks= -github.com/hashicorp/consul/sdk v0.13.1 h1:EygWVWWMczTzXGpO93awkHFzfUka6hLYJ0qhETd+6lY= +github.com/hashicorp/consul/api v1.22.0-rc1 h1:ePmGqndeMgaI38KUbSA/CqTzeEAIogXyWnfNJzglo70= +github.com/hashicorp/consul/api v1.22.0-rc1/go.mod h1:wtduXtbAqSGtBdi3tyA5SSAYGAG51rBejV9SEUBciMY= +github.com/hashicorp/consul/envoyextensions v0.3.0-rc1 h1:weclrwjvLeX+vxPOyo4b4dCDxSpnDl60Z9K16nnCVnI= +github.com/hashicorp/consul/envoyextensions v0.3.0-rc1/go.mod h1:ckxoPHMiWXAe6dhyxmKsX1XqO4KTV64KWIyTu44z8UI= +github.com/hashicorp/consul/sdk v0.14.0-rc1 h1:PuETOfN0uxl28i0Pq6rK7TBCrIl7psMbL0YTSje4KvM= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -209,8 +213,8 @@ github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7V github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod 
h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -271,8 +275,6 @@ github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsT github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= -github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= -github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= @@ -584,8 +586,8 @@ google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqw gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/version/VERSION b/version/VERSION index ee8855caa4a79..1f0d2f335194a 100644 --- a/version/VERSION +++ b/version/VERSION @@ -1 +1 @@ -1.17.0-dev +1.16.0-dev diff --git a/website/content/commands/debug.mdx b/website/content/commands/debug.mdx index 3fea57f43e381..bebbe955a294b 100644 --- a/website/content/commands/debug.mdx +++ b/website/content/commands/debug.mdx @@ -45,12 +45,12 @@ or otherwise. `Usage: consul debug [options]` By default, the debug command will capture an archive at the current path for -all targets for 2 minutes. +all targets for 5 minutes. #### Command Options - `-duration` - Optional, the total time to capture data for from the target agent. Must - be greater than the interval and longer than 10 seconds. Defaults to 2 minutes. + be greater than the interval and longer than 10 seconds. Defaults to 5 minutes. - `-interval` - Optional, the interval at which to capture dynamic data, such as heap and metrics. Must be longer than 5 seconds. Defaults to 30 seconds. @@ -79,7 +79,7 @@ information when `debug` is running. By default, it captures all information. | `host` | Information about resources on the host running the target agent such as CPU, memory, and disk. | | `members` | A list of all the WAN and LAN members in the cluster. | | `metrics` | Metrics from the in-memory metrics endpoint in the target, captured at the interval. | -| `logs` | `DEBUG` level logs for the target agent, captured for the duration. | +| `logs` | `TRACE` level logs for the target agent, captured for the duration. 
| | `pprof` | Golang heap, CPU, goroutine, and trace profiling. CPU and traces are captured for `duration` in a single file while heap and goroutine are separate snapshots for each `interval`. This information is not retrieved unless [`enable_debug`](/consul/docs/agent/config/config-files#enable_debug) is set to `true` on the target agent or ACLs are enable and an ACL token with `operator:read` is provided. | ## Examples diff --git a/website/content/commands/license.mdx b/website/content/commands/license.mdx index 50cee37544ad5..762e66df43d81 100644 --- a/website/content/commands/license.mdx +++ b/website/content/commands/license.mdx @@ -167,7 +167,8 @@ Licensed Features: Corresponding HTTP API Endpoint: [\[GET\] /v1/operator/license](/consul/api-docs/operator/license#getting-the-consul-license) -This command gets the Consul Enterprise license. +This command gets the Consul Enterprise license. If the leader hasn't been updated with the newer license, the followers +will display the outdated license in their GET output. The table below shows this command's [required ACLs](/consul/api-docs/api-structure#authentication). Configuration of [blocking queries](/consul/api-docs/features/blocking) and [agent caching](/consul/api-docs/features/caching) diff --git a/website/content/commands/operator/raft.mdx b/website/content/commands/operator/raft.mdx index 857a9ee1ac12d..1b2464cb8b294 100644 --- a/website/content/commands/operator/raft.mdx +++ b/website/content/commands/operator/raft.mdx @@ -46,10 +46,10 @@ Usage: `consul operator raft list-peers -stale=[true|false]` The output looks like this: ```text -Node ID Address State Voter RaftProtocol -alice 127.0.0.1:8300 127.0.0.1:8300 follower true 2 -bob 127.0.0.2:8300 127.0.0.2:8300 leader true 3 -carol 127.0.0.3:8300 127.0.0.3:8300 follower true 2 +Node ID Address State Voter RaftProtocol Commit Index Trails Leader By +alice 127.0.0.1:8300 127.0.0.1:8300 follower true 2 1167 0 commits +bob 127.0.0.2:8300 127.0.0.2:8300 leader true 3 1167 - +carol 127.0.0.3:8300 127.0.0.3:8300 follower true 2 1159 8 commits ``` `Node` is the node name of the server, as known to Consul, or "(unknown)" if @@ -66,11 +66,15 @@ Raft configuration. `Voter` is "true" or "false", indicating if the server has a vote in the Raft configuration. +`Commit Index` is the last log index the server has a record of in its Raft log. + +`Trails Leader By` is the number of commits a follower trails the leader by. + #### Command Options - `-stale` - Enables non-leader servers to provide cluster state information. If the cluster is in an outage state without a leader, - we recommend setting this option to `true. + we recommend setting this option to `true`. Default is `false`. ## remove-peer @@ -109,7 +113,7 @@ The return code will indicate success or failure. Corresponding HTTP API Endpoint: [\[POST\] /v1/operator/raft/transfer-leader](/consul/api-docs/operator/raft#transfer-raft-leadership) -This command transfers Raft leadership to another server agent. If an `id` is provided, Consul transfers leadership to the server with that id. +This command transfers Raft leadership to another server agent. If an `id` is provided, Consul transfers leadership to the server with that id. Use this command to change leadership without restarting the leader node, which maintains quorum and workload capacity. 
diff --git a/website/content/docs/agent/limits/index.mdx b/website/content/docs/agent/limits/index.mdx index ada5018bfff61..55fabc3de4bc9 100644 --- a/website/content/docs/agent/limits/index.mdx +++ b/website/content/docs/agent/limits/index.mdx @@ -21,7 +21,7 @@ You can set global limits on the rate of read and write requests that affect ind 1. Monitor the metrics and logs and readjust the initial configurations as necessary. Refer to [Monitor rate limit data](/consul/docs/agent/limits/usage/monitor-rate-limit-data) -1. Define your final operational limits based on your observations. If you are defining global rate limits, refer to [Set global traffic rate limits](/consul/docs/agent/limits/usage/set-global-rate-limits) for additional information. For information about setting limits based on source IP, refer to [Limit traffic rates for a source IP](/consul/docs/agent/limits/usage/set-source-ip-rate-limits). +1. Define your final operational limits based on your observations. If you are defining global rate limits, refer to [Set global traffic rate limits](/consul/docs/agent/limits/usage/set-global-rate-limits) for additional information. For information about setting limits per source IP address, refer to [Limit traffic rates for a source IP](/consul/docs/agent/limits/usage/set-source-ip-rate-limits). Note that setting limits per source IP requires Consul Enterprise. ### Order of operations diff --git a/website/content/docs/agent/limits/usage/limit-request-rates-from-ips.mdx b/website/content/docs/agent/limits/usage/limit-request-rates-from-ips.mdx index c074d3007af7a..58e7479012869 100644 --- a/website/content/docs/agent/limits/usage/limit-request-rates-from-ips.mdx +++ b/website/content/docs/agent/limits/usage/limit-request-rates-from-ips.mdx @@ -8,6 +8,12 @@ description: Learn how to set read and request rate limits on RPC and gRPC traff This topic describes how to configure RPC and gRPC traffic rate limits for source IP addresses. This enables you to specify a budget for read and write requests to prevent any single source IP from overwhelming the Consul server and negatively affecting the network. For information about setting global traffic rate limits, refer to [Set a global limit on traffic rates](/consul/docs/agent/limits/usage/set-glogal-traffic-rate-limits). For an overview of Consul's server rate limiting capabilities, refer to [Limit traffic rates overview](/consul/docs/agent/limits/overview). + + +This feature requires Consul Enterprise. Refer to the [feature compatibility matrix](/consul/docs/v1.16.x/enterprise#consul-enterprise-feature-availability) for additional information. + + + ## Overview You can set limits on the rate of read and write requests from source IP addresses to specific resources, which mitigates the risks to Consul servers when consul clients send excessive requests to a specific resource type. Before configuring traffic rate limits, you should complete the initialization process to understand normal traffic loads in your network. Refer to [Initialize rate limit settings](/consul/docs/agent/limits/init-rate-limits) for additional information. @@ -22,7 +28,7 @@ You should also monitor read and write rate activity and make any necessary adju ## Define rate limits -Create a control plane request limit configuration entry in the `default` partition. The configuration entry applies to all client requests targeting any partition. 
Refer to the [control plane request limit configuration entry](/consul/docs/connect/config-entries/control-plan-request-limit) reference documentation for details about the available configuration parameters. +Create a control plane request limit configuration entry in the `default` partition. The configuration entry applies to all client requests targeting any partition. Refer to the [control plane request limit configuration entry](/consul/docs/connect/config-entries/control-plane-request-limit) reference documentation for details about the available configuration parameters. Specify the following parameters: diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index 326f5b42dbf53..59e78e27e0730 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -480,8 +480,10 @@ These metrics are used to monitor the health of the Consul servers. | `consul.raft.leader.dispatchNumLogs` | Measures the number of logs committed to disk in a batch. | logs | gauge | | `consul.raft.logstore.verifier.checkpoints_written` | Counts the number of checkpoint entries written to the LogStore. | checkpoints | counter | | `consul.raft.logstore.verifier.dropped_reports` | Counts how many times the verifier routine was still busy when the next checksum came in and so verification for a range was skipped. If you see this happen, consider increasing the interval between checkpoints with [`raft_logstore.verification.interval`](/consul/docs/agent/config/config-files#raft_logstore_verification) | reports dropped | counter | -| `consul.raft.logstore.verifier.ranges_verified` | Counts the number of log ranges for which a verification report has been completed. Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for more information. | log ranges verifications | counter | -| `consul.raft.logstore.verifier.read_checksum_failures` | Counts the number of times a range of logs between two check points contained at least one disk corruption. Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for more information. | disk corruptions | counter | +| `consul.raft.logstore.verifier.ranges_verified` | Counts the number of log ranges for which a verification report has been completed. Refer to [Monitor Raft metrics and logs for WAL +](/consul/docs/agent/wal-logstore/monitoring) for more information. | log ranges verifications | counter | +| `consul.raft.logstore.verifier.read_checksum_failures` | Counts the number of times a range of logs between two check points contained at least one disk corruption. Refer to [Monitor Raft metrics and logs for WAL +](/consul/docs/agent/wal-logstore/monitoring) for more information. | disk corruptions | counter | | `consul.raft.logstore.verifier.write_checksum_failures` | Counts the number of times a follower has a different checksum to the leader at the point where it writes to the log. This could be caused by either a disk-corruption on the leader (unlikely) or some other corruption of the log entries in-flight. | in-flight corruptions | counter | | `consul.raft.leader.lastContact` | Measures the time since the leader was last able to contact the follower nodes when checking its leader lease. 
It can be used as a measure for how stable the Raft timing is and how close the leader is to timing out its lease.The lease timeout is 500 ms times the [`raft_multiplier` configuration](/consul/docs/agent/config/config-files#raft_multiplier), so this telemetry value should not be getting close to that configured value, otherwise the Raft timing is marginal and might need to be tuned, or more powerful servers might be needed. See the [Server Performance](/consul/docs/install/performance) guide for more details. | ms | timer | | `consul.raft.leader.oldestLogAge` | The number of milliseconds since the _oldest_ log in the leader's log store was written. This can be important for replication health where write rate is high and the snapshot is large as followers may be unable to recover from a restart if restoring takes longer than the minimum value for the current leader. Compare this with `consul.raft.fsm.lastRestoreDuration` and `consul.raft.rpc.installSnapshot` to monitor. In normal usage this gauge value will grow linearly over time until a snapshot completes on the leader and the log is truncated. Note: this metric won't be emitted until the leader writes a snapshot. After an upgrade to Consul 1.10.0 it won't be emitted until the oldest log was written after the upgrade. | ms | gauge | diff --git a/website/content/docs/api-gateway/upgrades.mdx b/website/content/docs/api-gateway/upgrades.mdx index 31bc1ec82374f..fc00b99bb55f8 100644 --- a/website/content/docs/api-gateway/upgrades.mdx +++ b/website/content/docs/api-gateway/upgrades.mdx @@ -65,6 +65,8 @@ If you are able to tolerate downtime for your applications, you should delete pr $ kubectl apply -f apigw-installation.yaml ``` +1. Create `ServiceIntentions` allowing `Gateways` to communicate with any backend services that they route to. Refer to [Service intentions configuration entry reference](/consul/docs/connect/config-entries/service-intentions) for additional information. + 1. Change any existing `Gateways` to reference the new `GatewayClass` `consul`. Refer to [gatewayClass](/consul/docs/api-gateway/configuration/gateway#gatewayclassname) for additional information. 1. After updating all of your `gateway` configurations to use the new controller, you can complete the upgrade again and completely remove the `apiGateway` block to remove the old controller. @@ -99,9 +101,9 @@ If you are unable to tolerate any downtime, you can complete the following steps $ kubectl apply -f apigw-installation.yaml ``` -1. Change any existing `Gateways` to reference the new `GatewayClass` `consul`. Refer to [gatewayClass](/consul/docs/api-gateway/configuration/gateway#gatewayclassname) for additional information. +1. Create `ServiceIntentions` allowing `Gateways` to communicate with any backend services that they route to. Refer to [Service intentions configuration entry reference](/consul/docs/connect/config-entries/service-intentions) for additional information. -1. After updating all of your `gateway` configurations to use the new controller, you can remove the `apiGateway` block from the Helm chart and rerun it. This completely removes the old gateway controller. +1. Change any existing `Gateways` to reference the new `GatewayClass` `consul`. Refer to [gatewayClass](/consul/docs/api-gateway/configuration/gateway#gatewayclassname) for additional information. 
## Upgrade to v0.4.0 diff --git a/website/content/docs/api-gateway/usage/errors.mdx b/website/content/docs/api-gateway/usage/errors.mdx index c873c55812db1..ba2c40f6f240b 100644 --- a/website/content/docs/api-gateway/usage/errors.mdx +++ b/website/content/docs/api-gateway/usage/errors.mdx @@ -58,3 +58,18 @@ The installation process typically fails after this error message is generated. **Resolution:** Install the required CRDs. Refer to the [Consul API Gateway installation instructions](/consul/docs/api-gateway/install#installation) for instructions. + +## Operation cannot be fulfilled, the object has been modified + +``` +{"error": "Operation cannot be fulfilled on gatewayclassconfigs.consul.hashicorp.com \"consul-api-gateway\": the object has been modified; please apply your changes to the latest version and try again"} + +``` +**Conditions:** +This error occurs when the gateway controller attempts to update an object that has been modified previously. It is a normal part of running the controller and will resolve itself by automatically retrying. + +**Impact:** +Excessive error logs are produced, but there is no impact to the functionality of the controller. + +**Resolution:** +No action needs to be taken to resolve this issue. diff --git a/website/content/docs/concepts/service-mesh.mdx b/website/content/docs/concepts/service-mesh.mdx index 334a6639f1ca7..2e793f2441c86 100644 --- a/website/content/docs/concepts/service-mesh.mdx +++ b/website/content/docs/concepts/service-mesh.mdx @@ -21,8 +21,8 @@ Some of the benefits of a service mesh include; - automatic failover - traffic management - encryption -- observability and traceability, -- authentication and authorization, +- observability and traceability +- authentication and authorization - network automation A common use case for leveraging a service mesh is to achieve a [_zero trust_ model](https://www.consul.io/use-cases/zero-trust-networking). diff --git a/website/content/docs/connect/config-entries/control-plane-request-limit.mdx b/website/content/docs/connect/config-entries/control-plane-request-limit.mdx index c6b44436ac564..1823857e3c552 100644 --- a/website/content/docs/connect/config-entries/control-plane-request-limit.mdx +++ b/website/content/docs/connect/config-entries/control-plane-request-limit.mdx @@ -8,6 +8,12 @@ description: Learn how to configure the control-plane-request-limit configurati This topic describes the configuration options for the `control-plane-request-limit` configuration entry. You can only write the `control-plane-request-limit` configuration entry to the `default` partition, but the configuration entry applies to all client requests that target any partition. + + +This feature requires Consul Enterprise. Refer to the [feature compatibility matrix](/consul/docs/v1.16.x/enterprise#consul-enterprise-feature-availability) for additional information. + + + ## Configuration model The following list outlines field hierarchy, language-specific data types, and requirements in a control plane request limit configuration entry. Click on a property name to view additional details, including default values. 
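For reference, a minimal `control-plane-request-limit` entry might look like the sketch below. The `ReadRate` and `WriteRate` fields and the per-resource `KV` block are assumptions about the entry's schema and are not confirmed by this change; treat names and values as placeholders.

```hcl
# Hedged sketch of a control-plane-request-limit configuration entry.
# Field names (ReadRate, WriteRate, KV) are assumed, not taken from this diff.
Kind = "control-plane-request-limit"
Name = "default-request-limits"

# Assumed global per-source-IP budgets, in requests per second.
ReadRate  = 100
WriteRate = 50

# Assumed per-resource override for the KV store.
KV {
  ReadRate  = 50
  WriteRate = 25
}
```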
diff --git a/website/content/docs/connect/config-entries/exported-services.mdx b/website/content/docs/connect/config-entries/exported-services.mdx index 089eeb5723144..c7811d8d9b489 100644 --- a/website/content/docs/connect/config-entries/exported-services.mdx +++ b/website/content/docs/connect/config-entries/exported-services.mdx @@ -9,8 +9,6 @@ description: >- This topic describes the `exported-services` configuration entry type. The `exported-services` configuration entry enables Consul to export service instances to other clusters from a single file and connect services across clusters. For additional information, refer to [Cluster Peering](/consul/docs/connect/cluster-peering) and [Admin Partitions](/consul/docs/enterprise/admin-partitions). --> **v1.11.0+:** This config entry is supported in Consul versions 1.11.0+. - ## Introduction To configure Consul to export services contained in a Consul Enterprise admin partition or Consul OSS datacenter to one or more additional clusters, create a new configuration entry and declare `exported-services` in the `kind` field. This configuration entry enables you to route traffic between services in different clusters. diff --git a/website/content/docs/connect/config-entries/jwt-provider.mdx b/website/content/docs/connect/config-entries/jwt-provider.mdx index 5fc75da4d3006..8867a3e4f9722 100644 --- a/website/content/docs/connect/config-entries/jwt-provider.mdx +++ b/website/content/docs/connect/config-entries/jwt-provider.mdx @@ -108,53 +108,53 @@ Kind = "jwt-provider" # required Name = "" # required Issuer = "" # required JSONWebKeySet = { # required - Local = { # cannot specify with JWKS{}.Remote - JWKS = "" # cannot specify with JWKS{}.Local{}.Filename - Filename = "" # cannot specify with JWKS{}.Local{}.String - } + Local = { # cannot specify with JWKS{}.Remote + JWKS = "" # cannot specify with JWKS{}.Local{}.Filename + Filename = "" # cannot specify with JWKS{}.Local{}.String + } } JSONWebKeySet = { - Remote = { # cannot specify with JWKS{}.Local - URI = "" - RequestTimeoutMs = 1500 - CacheDuration = "5m" - FetchAsynchronously = false - RetryPolicy = { - NumRetries = 0 - RetryPolicyBackoff = { - BaseInterval = "1s" - MaxInterval = "10s" + Remote = { # cannot specify with JWKS{}.Local + URI = "" + RequestTimeoutMs = 1500 + CacheDuration = "5m" + FetchAsynchronously = false + RetryPolicy = { + NumRetries = 0 + RetryPolicyBackoff = { + BaseInterval = "1s" + MaxInterval = "10s" + } + } + } } - } - } -} Audiences = [""] Locations = [ - { - Header = { - Name = "" - ValuePrefix = "" - Forward = false - } - }, - { - QueryParam = { - Name = "" - } - }, - { - Cookie = { - Name = "" + { + Header = { + Name = "" + ValuePrefix = "" + Forward = false + } + }, + { + QueryParam = { + Name = "" + } + }, + { + Cookie = { + Name = "" + } } - } ] Forwarding = { - HeaderName = "" - PadForwardPayloadHeader = false + HeaderName = "" + PadForwardPayloadHeader = false } ClockSkewSeconds = 30 CacheConfig = { - Size = 0 + Size = 0 } ``` @@ -164,58 +164,58 @@ CacheConfig = { ```json { - "Kind": "jwt-provider", // required - "Name": "", // required - "Issuer": "", // required - "JSONWebKeySet": { // required - "Local": { // cannot specify with JWKS.Remote - "JWKS": "", // cannot specify with JWKS.Local.Filename - "Filename": "" // cannot specify with JWKS.Local.String +"Kind": "jwt-provider", // required +"Name": "", // required +"Issuer": "", // required +"JSONWebKeySet": { // required + "Local": { // cannot specify with JWKS.Remote + "JWKS": "", // cannot specify with 
JWKS.Local.Filename + "Filename": "" // cannot specify with JWKS.Local.String } - }, - "JSONWebKeySet": { - "Remote": { // cannot specify with JWKS.Local - "URI": "", - "RequestTimeoutMs": "1500", - "CacheDuration": "5m", - "FetchAsynchronously": "false", - "RetryPolicy": { - "NumRetries": "0", - "RetryPolicyBackOff": { - "BaseInterval": "1s", - "MaxInterval": "10s" +}, +"JSONWebKeySet": { + "Remote": { // cannot specify with JWKS.Local + "URI": "", + "RequestTimeoutMs": "1500", + "CacheDuration": "5m", + "FetchAsynchronously": "false", + "RetryPolicy": { + "NumRetries": "0", + "RetryPolicyBackOff": { + "BaseInterval": "1s", + "MaxInterval": "10s" + } + } } - } - } - }, - "Audiences": [""], - "Locations": [ +}, +"Audiences": [""], +"Locations": [ { - "Header": { - "Name": "", - "ValuePrefix": "", - "Forward": "false" - } + "Header": { + "Name": "", + "ValuePrefix": "", + "Forward": "false" + } }, { - "QueryParam": { - "Name":"", - } + "QueryParam": { + "Name":"", + } }, { - "Cookie": { - "Name": "" - } + "Cookie": { + "Name": "" + } } - ], - "Forwarding": { - "HeaderName": "", - "PadForwardPayloadHeader": "false" - }, - "ClockSkewSeconds": "30", - "CacheConfig": { +], +"Forwarding": { + "HeaderName": "", + "PadForwardPayloadHeader": "false" +}, +"ClockSkewSeconds": "30", +"CacheConfig": { "Size": "0" - } +} } ``` @@ -952,6 +952,22 @@ Defines behavior for caching the validation result of previously encountered JWT +## Metrics + +Envoy proxies expose metrics that can track JWT authentication details. Use the following Envoy metrics: + +```yaml +http.public_listener.jwt_authn.allowed +http.public_listener.jwt_authn.cors_preflight_bypassed +http.public_listener.jwt_authn.denied +http.public_listener.jwt_authn.jwks_fetch_failed +http.public_listener.jwt_authn.jwks_fetch_success +http.public_listener.jwt_authn.jwt_cache_hit +http.public_listener.jwt_authn.jwt_cache_miss +``` + +~> **Note:** Currently, Envoy does not reference these metrics in their documentation. Refer to [Envoy documentation](https://www.envoyproxy.io/docs/envoy/latest/) for more information about exposed metrics. + ## Examples The following examples demonstrate common JWT provider configuration patterns for specific use cases. @@ -1014,7 +1030,7 @@ metadata: name: okta spec: issuer: okta - jsonWebKeySet: + jsonwebkeyset: remote: uri: https://.okta.com/oauth2/default/v1/keys cacheDuration: 30m @@ -1023,4 +1039,4 @@ spec: ``` - \ No newline at end of file + diff --git a/website/content/docs/connect/config-entries/mesh.mdx b/website/content/docs/connect/config-entries/mesh.mdx index a323a6d90f179..98760c068c045 100644 --- a/website/content/docs/connect/config-entries/mesh.mdx +++ b/website/content/docs/connect/config-entries/mesh.mdx @@ -7,8 +7,6 @@ description: >- # Mesh Configuration Entry --> **v1.10.0+:** This configuration entry is supported in Consul versions 1.10.0+. - The `mesh` configuration entry allows you to define a global default configuration that applies to all service mesh proxies. Settings in this config entry apply across all namespaces and federated datacenters. 
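For orientation, the `mesh` entry described above is written once as a global default. A minimal sketch follows; the `TransparentProxy` block is included only as an illustrative setting and is not part of this change.

```hcl
# Hedged sketch of a global mesh configuration entry.
Kind = "mesh"

# Illustrative setting: only allow transparent proxies to reach mesh destinations.
TransparentProxy {
  MeshDestinationsOnly = true
}
```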
diff --git a/website/content/docs/connect/config-entries/service-intentions.mdx b/website/content/docs/connect/config-entries/service-intentions.mdx index f8afda6e41da9..15e41314ba98b 100644 --- a/website/content/docs/connect/config-entries/service-intentions.mdx +++ b/website/content/docs/connect/config-entries/service-intentions.mdx @@ -1506,65 +1506,65 @@ Sources = [ ``` ```yaml -apiVersion: consul.hashicorp.com/v1alpha1 -kind: ServiceIntentions -metadata: - name: backend -spec: - sources: - name: frontend - permissions: + apiVersion: consul.hashicorp.com/v1alpha1 + kind: ServiceIntentions + metadata: + name: backend + spec: + sources: + name: frontend + permissions: + http: + pathExact: /admin + jwt: + providers: + name: okta + verifyClaims: + path: + - perms + - role + value: admin + action: allow http: - pathExact: /admin - jwt: - providers: - name: okta - verifyClaims: - path: - - perms - - role - value: admin - action: allow - http: - pathPrefix: / + pathPrefix: / ``` ```json { - "Kind": "service-intentions", - "Name": "backend", - "Sources": [ - { - "Name": "frontend", - "Permissions": [ - { - "HTTP": { - "PathExact": "/admin" - }, - "JWT": { - "Providers": [ - { - "Name": "okta", - "VerifyClaims": [ - { - "Path": ["perms", "role"], - "Value": "admin" - } - ] - } - ] - } +"Kind": "service-intentions", +"Name": "backend", +"Sources": [ + { + "Name": "frontend", + "Permissions": [ + { + "HTTP": { + "PathExact": "/admin" }, - { - "Action": "allow", - "HTTP": { - "PathPrefix": "/" - } + "JWT": { + "Providers": [ + { + "Name": "okta", + "VerifyClaims": [ + { + "Path": ["perms", "role"], + "Value": "admin" + } + ] + } + ] } - ] - } - ] + }, + { + "Action": "allow", + "HTTP": { + "PathPrefix": "/" + } + } + ] + } +] } ``` - \ No newline at end of file + diff --git a/website/content/docs/connect/config-entries/service-resolver.mdx b/website/content/docs/connect/config-entries/service-resolver.mdx index dcea59805499c..b4218d6d06149 100644 --- a/website/content/docs/connect/config-entries/service-resolver.mdx +++ b/website/content/docs/connect/config-entries/service-resolver.mdx @@ -35,12 +35,14 @@ The following list outlines field hierarchy, language-specific data types, and r - [`ServiceSubset`](#redirect-servicesubset): string - [`Namespace`](#redirect-namespace): string - [`Partition`](#redirect-partition): string | `default` + - [`SamenessGroup`](#redirect-samenessgroup): string - [`Datacenter`](#redirect-datacenter): list - [`Peer`](#redirect-peer): string - [`Failover`](#failover): map - [`Service`](#failover-service): string - [`ServiceSubset`](#failover-servicesubset): string - [`Namespace`](#failover-namespace): string + - [`SamenessGroup`](#failover-samenessgroup): string - [`Datacenters`](#failover-datacenters): list - [`Targets`](#failover-targets): list - [`Service`](#failover-targets-service): string @@ -87,12 +89,14 @@ The following list outlines field hierarchy, language-specific data types, and r - [`serviceSubset`](#spec-redirect-servicesubset): string - [`namespace`](#spec-redirect-namespace): string - [`partition`](#spec-redirect-partition): string + - [`samenessGroup`](#spec-redirect-samenessgroup): string - [`datacenter`](#spec-redirect-datacenter): string - [`peer`](#spec-redirect-peer): string - [`failover`](#spec-failover): map - [`service`](#spec-failover-service): string - [`serviceSubset`](#spec-failover-servicesubset): string - [`namespace`](#spec-failover-namespace): string + - [`samenessGroup`](#spec-failover-samenessgroup): string - 
[`datacenters`](#spec-failover-datacenters): string - [`targets`](#spec-failover-targets): list - [`service`](#spec-failover-targets-service): string @@ -157,11 +161,12 @@ Redirect = { ServiceSubset = "" Namespace = "" Partition = "" + SamenessGroup = "" Datacenter = "" Peer = "" } -Failover = { ## requires at least one of the following: Service, ServiceSubset, Namespace, Targets, Datacenters +Failover = { ## requires at least one of the following: Service, ServiceSubset, Namespace, Targets, Datacenters, SamenessGroup = { Targets = [ { Service = "" }, @@ -239,11 +244,12 @@ LoadBalancer = { "ServiceSubset":"", "Namespace":"", "Partition":"", + "SamenessGroup":"", "Datacenter":"", "Peer":"" }, - "Failover":{ // requires at least one of the following": Service, ServiceSubset, Namespace, Targets, Datacenters + "Failover":{ // requires at least one of the following": Service, ServiceSubset, Namespace, Targets, Datacenters, SamenessGroup "":{ "Targets":[ {"Service":""}, @@ -314,8 +320,9 @@ spec: servicesubset: namespace: partition: + samenessGroup: peer: - failover: # requires at least one of the following: service, serviceSubset, namespace, targets, datacenters + failover: # requires at least one of the following: service, serviceSubset, namespace, targets, datacenters, samenessGroup : targets: - service: @@ -465,6 +472,7 @@ Specifies redirect instructions for local service traffic so that services deplo - [`ServiceSubset`](#redirect-servicesubset) - [`Namespace`](#redirect-namespace) - [`Partition`](#redirect-partition) + - [`SamenessGroup`](#redirect-samenessgroup) - [`Datacenter`](#redirect-datacenter) - [`Peer`](#redirect-peer) @@ -504,6 +512,14 @@ Specifies the admin partition at the redirect’s destination that resolves loca - Default: None - Data type: String +### `Redirect{}.SamenessGroup` + +Specifies the sameness group at the redirect’s destination that resolves local upstream requests. + +#### Values + +- Default: None +- Data type: String ### `Redirect{}.Datacenter` @@ -529,7 +545,7 @@ Specifies controls for rerouting traffic to an alternate pool of service instanc This parameter is a map, and its key is the name of the local service subset that resolves to another location when it fails. You can specify a `"*"` wildcard to apply failovers to any subset. -`Service`, `ServiceSubset`, `Namespace`, `Targets`, and `Datacenters` cannot all be empty at the same time. +`Service`, `ServiceSubset`, `Namespace`, `Targets`, `SamenessGroup`, and `Datacenters` cannot all be empty at the same time. #### Values @@ -538,6 +554,7 @@ This parameter is a map, and its key is the name of the local service subset tha - [`Service`](#failover-service) - [`ServiceSubset`](#failover-servicesubset) - [`Namespace`](#failover-namespace) + - [`SamenessGroup`](#failover-samenessgroup) - [`Datacenters`](#failover-datacenters) - [`Targets`](#failover-targets) @@ -568,6 +585,15 @@ Specifies the namespace at the failover location where the failover services are - Default: None - Data type: String +### `Failover{}.SamenessGroup` + +Specifies the sameness group at the failover location where the failover services are deployed. + +#### Values + +- Default: None +- Data type: String + ### `Failover{}.Datacenters` Specifies an ordered list of datacenters at the failover location to attempt connections to during a failover scenario. When Consul cannot establish a connection with the first datacenter in the list, it proceeds sequentially until establishing a connection with another datacenter. 
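Taken together, the new `SamenessGroup` fields let a resolver redirect or fail over to a sameness group instead of enumerating peers or datacenters. A hedged sketch is shown below; the service and group names are placeholders.

```hcl
# Hedged sketch: fail over every subset of "api" to the "platform" sameness group.
Kind = "service-resolver"
Name = "api"

Failover = {
  "*" = {
    SamenessGroup = "platform"
  }
}
```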
@@ -907,6 +933,7 @@ Specifies redirect instructions for local service traffic so that services deplo - [`serviceSubset`](#spec-redirect-servicesubset) - [`namespace`](#spec-redirect-namespace) - [`partition`](#spec-redirect-partition) + - [`samenessGroup`](#spec-redirect-samenessgroup) - [`datacenter`](#spec-redirect-datacenter) - [`peer`](#spec-redirect-peer) @@ -946,6 +973,15 @@ Specifies the admin partition at the redirect’s destination that resolves loca - Default: None - Data type: String +### `spec.redirect.samenessGroup` + +Specifies the sameness group at the redirect’s destination that resolves local upstream requests. + +#### Values + +- Default: None +- Data type: String + ### `spec.redirect.datacenter` @@ -971,7 +1007,7 @@ Specifies controls for rerouting traffic to an alternate pool of service instanc This parameter is a map, and its key is the name of the local service subset that resolves to another location when it fails. You can specify a `"*"` wildcard to apply failovers to any subset. -`service`, `serviceSubset`, `namespace`, `targets`, and `datacenters` cannot all be empty at the same time. +`service`, `serviceSubset`, `namespace`, `targets`, `samenessGroup`, and `datacenters` cannot all be empty at the same time. #### Values @@ -980,6 +1016,7 @@ This parameter is a map, and its key is the name of the local service subset tha - [`service`](#spec-failover-service) - [`serviceSubset`](#spec-failover-servicesubset) - [`namespace`](#spec-failover-namespace) + - [`samenessGroup`](#spec-failover-samenessgroup) - [`datacenters`](#spec-failover-datacenters) - [`targets`](#spec-failover-targets) @@ -1010,6 +1047,15 @@ Specifies the namespace at the failover location where the failover services are - Default: None - Data type: String +### `spec.failover.samenessGroup` + +Specifies the sameness group at the failover location where the failover services are deployed. + +#### Values + +- Default: None +- Data type: String + ### `spec.failover.datacenters` Specifies an ordered list of datacenters at the failover location to attempt connections to during a failover scenario. When Consul cannot establish a connection with the first datacenter in the list, it proceeds sequentially until establishing a connection with another datacenter. diff --git a/website/content/docs/connect/config-entries/terminating-gateway.mdx b/website/content/docs/connect/config-entries/terminating-gateway.mdx index e0c1ff6032cfa..9c0804f9c39d9 100644 --- a/website/content/docs/connect/config-entries/terminating-gateway.mdx +++ b/website/content/docs/connect/config-entries/terminating-gateway.mdx @@ -7,9 +7,6 @@ description: >- # Terminating Gateway Configuration Entry --> **v1.8.4+:** On Kubernetes, the `TerminatingGateway` custom resource is supported in Consul versions 1.8.4+.
-**v1.8.0+:** On other platforms, this config entry is supported in Consul versions 1.8.0+. - The `terminating-gateway` config entry kind (`TerminatingGateway` on Kubernetes) allows you to configure terminating gateways to proxy traffic from services in the Consul service mesh to services registered with Consul that do not have a [service mesh sidecar proxy](/consul/docs/connect/proxies). The configuration is associated with the name of a gateway service diff --git a/website/content/docs/connect/failover/index.mdx b/website/content/docs/connect/failover/index.mdx index 2d96906440989..dd1591d469f6a 100644 --- a/website/content/docs/connect/failover/index.mdx +++ b/website/content/docs/connect/failover/index.mdx @@ -21,7 +21,7 @@ The following table compares these strategies in deployments with multiple datac | Failover Strategy | Supports WAN Federation | Supports Cluster Peering | Multi-Datacenter Failover Strength | Multi-Datacenter Usage Scenario | | :---------------: | :---------------------: | :----------------------: | :--------------------------------- | :------------------------------ | | `Failover` stanza | ✅ | ✅ | Enables more granular logic for failover targeting | Configuring failover for a single service or service subset, especially for testing or debugging purposes | -| Prepared query | ✅ | ✅ | Central policies that can automatically target the nearest datacenter | WAN-federated deployments where a primary datacenter is configured. Prepared queries are not replicated over peer connections. | +| Prepared query | ✅ | ❌ | Central policies that can automatically target the nearest datacenter | WAN-federated deployments where a primary datacenter is configured. Prepared queries are not replicated over peer connections. | | Sameness groups | ❌ | ✅ | Group size changes without edits to existing member configurations | Cluster peering deployments with consistently named services and namespaces | ### Failover configurations for a service mesh with a single datacenter diff --git a/website/content/docs/connect/gateways/api-gateway/configuration/http-route.mdx b/website/content/docs/connect/gateways/api-gateway/configuration/http-route.mdx index 997e2bbf692e4..02d2725ad6981 100644 --- a/website/content/docs/connect/gateways/api-gateway/configuration/http-route.mdx +++ b/website/content/docs/connect/gateways/api-gateway/configuration/http-route.mdx @@ -533,6 +533,11 @@ Specifies the HTTP method to match. Specifies type of match for the path: `"exact"`, `"prefix"`, or `"regex"`. +If set to `prefix`, Consul uses simple string matching to identify incoming request prefixes. For example, if the route is configured to match incoming requests to services prefixed with `/dev`, then the gateway would match requests to `/dev-` and `/deviate` and route to the upstream. + +This deviates from the +[Kubernetes Gateway API specification](https://gateway-api.sigs.k8s.io/references/spec/#gateway.networking.k8s.io%2fv1beta1.PathMatchType), which matches on full path elements. In the previous example, _only_ requests to `/dev` or `/dev/` would match. + #### Values - Default: none diff --git a/website/content/docs/connect/gateways/index.mdx b/website/content/docs/connect/gateways/index.mdx index 344b63dd0ad2b..b333615c4ed07 100644 --- a/website/content/docs/connect/gateways/index.mdx +++ b/website/content/docs/connect/gateways/index.mdx @@ -31,7 +31,7 @@ Mesh gateways enable the following scenarios: - **Service-to-service communication across WAN-federated datacenters**. 
Refer to [Enabling Service-to-service Traffic Across Datacenters](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-wan-datacenters) for additional information. - **Service-to-service communication across admin partitions**. Since Consul 1.11.0, you can create administrative boundaries for single Consul deployments called "admin partitions". You can use mesh gateways to facilitate cross-partition communication. Refer to [Enabling Service-to-service Traffic Across Admin Partitions](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-partitions) for additional information. - **Bridge multiple datacenters using Cluster Peering**. Since Consul 1.14.0, mesh gateways can be used to route peering control-plane traffic between peered Consul Servers. See [Mesh Gateways for Peering Control Plane Traffic](/consul/docs/connect/gateways/mesh-gateway/peering-via-mesh-gateways) for more information. -- **Service-to-service communication across peered datacenters**. Refer to [Mesh Gateways between Peered Clusters](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-peers) for more information. +- **Service-to-service communication across peered datacenters**. Refer to [Establish cluster peering connections](/consul/docs/connect/cluster-peering/usage/establish-cluster-peering) for more information. -> **Mesh gateway tutorial**: Follow the [mesh gateway tutorial](/consul/tutorials/developer-mesh/service-mesh-gateways) to learn concepts associated with mesh gateways. diff --git a/website/content/docs/connect/gateways/mesh-gateway/index.mdx b/website/content/docs/connect/gateways/mesh-gateway/index.mdx index 89d64b8d1ebe4..bcac5555278bb 100644 --- a/website/content/docs/connect/gateways/mesh-gateway/index.mdx +++ b/website/content/docs/connect/gateways/mesh-gateway/index.mdx @@ -18,7 +18,7 @@ Mesh gateways can be used with any of the following Consul configrations for man * [Mesh gateways can be used to route service-to-service traffic between datacenters](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-wan-datacenters) * [Mesh gateways can be used to route all WAN traffic, including from Consul servers](/consul/docs/connect/gateways/mesh-gateway/wan-federation-via-mesh-gateways) 2. Cluster Peering - * [Mesh gateways can be used to route service-to-service traffic between datacenters](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-peers) + * [Mesh gateways can be used to route service-to-service traffic between datacenters](/consul/docs/connect/cluster-peering/usage/establish-cluster-peering) * [Mesh gateways can be used to route control-plane traffic from Consul servers](/consul/docs/connect/gateways/mesh-gateway/peering-via-mesh-gateways) 3. 
Admin Partitions * [Mesh gateways can be used to route service-to-service traffic between admin partitions in the same Consul datacenter](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-partitions) diff --git a/website/content/docs/connect/gateways/mesh-gateway/peering-via-mesh-gateways.mdx b/website/content/docs/connect/gateways/mesh-gateway/peering-via-mesh-gateways.mdx index b46a18bef2e46..97045649b2ff9 100644 --- a/website/content/docs/connect/gateways/mesh-gateway/peering-via-mesh-gateways.mdx +++ b/website/content/docs/connect/gateways/mesh-gateway/peering-via-mesh-gateways.mdx @@ -7,7 +7,7 @@ description: >- # Enabling Peering Control Plane Traffic -In addition to [service-to-service traffic routing](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-peers), +In addition to [service-to-service traffic routing](/consul/docs/connect/cluster-peering/usage/establish-cluster-peering), we recommend routing control plane traffic between cluster peers through mesh gateways to simplfy networking requirements. @@ -59,7 +59,7 @@ For Consul Enterprise clusters, mesh gateways must be registered in the "default -In addition to the [ACL Configuration](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-peers#acl-configuration) necessary for service-to-service traffic, mesh gateways that route peering control plane traffic must be granted `peering:read` access to all peerings. +In addition to the [ACL Configuration](/consul/docs/connect/cluster-peering/tech-specs#acl-specifications) necessary for service-to-service traffic, mesh gateways that route peering control plane traffic must be granted `peering:read` access to all peerings. This access allows the mesh gateway to list all peerings in a Consul cluster and generate unique routing per peered datacenter. @@ -80,7 +80,7 @@ peering = "read" -In addition to the [ACL Configuration](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-peers#acl-configuration) necessary for service-to-service traffic, mesh gateways that route peering control plane traffic must be granted `peering:read` access to all peerings in all partitions. +In addition to the [ACL Configuration](/consul/docs/connect/cluster-peering/tech-specs#acl-specifications) necessary for service-to-service traffic, mesh gateways that route peering control plane traffic must be granted `peering:read` access to all peerings in all partitions. This access allows the mesh gateway to list all peerings in a Consul cluster and generate unique routing per peered partition. 
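For the Enterprise case described above, the mesh gateway's token needs `peering:read` in every partition. A hedged sketch of such a policy follows; the `partition_prefix` wildcard rule is assumed from Consul Enterprise ACL syntax rather than shown in this change.

```hcl
# Hedged sketch: grant peering:read across all partitions so the mesh gateway
# can enumerate peerings and build per-partition routes.
partition_prefix "" {
  peering = "read"
}
```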
diff --git a/website/content/docs/connect/intentions/jwt-authorization.mdx b/website/content/docs/connect/intentions/jwt-authorization.mdx index a58bd3af3e19e..9a8458054ea27 100644 --- a/website/content/docs/connect/intentions/jwt-authorization.mdx +++ b/website/content/docs/connect/intentions/jwt-authorization.mdx @@ -98,4 +98,4 @@ After you update the service intention, write the configuration to Consul so tha ```shell-session $ consul config write web-intention.hcl -``` \ No newline at end of file +``` diff --git a/website/content/docs/connect/proxies/envoy-extensions/configuration/ext-authz.mdx b/website/content/docs/connect/proxies/envoy-extensions/configuration/ext-authz.mdx index 6b5d8cc272aab..2d3c48789a901 100644 --- a/website/content/docs/connect/proxies/envoy-extensions/configuration/ext-authz.mdx +++ b/website/content/docs/connect/proxies/envoy-extensions/configuration/ext-authz.mdx @@ -348,7 +348,7 @@ The following table describes how to configure parameters for the `Service` fiel ### `Arguments.Config.GrpcService.Target.Uri` -Specifies the URI of the external authorization service. Configure this field when you must provide an explicit URI to the external authorization service, such as cases in which the authorization service is running on the same host or pod. If set, the value of this field must be either `localhost:` or `127.0.0.1:` +Specifies the URI of the external authorization service. Configure this field when you must provide an explicit URI to the external authorization service, such as cases in which the authorization service is running on the same host or pod. If set, the value of this field must be one of `localhost:`, `127.0.0.1:`, or `::1:`. Configure either the `Uri` field or the [`Service`](#arguments-config-grpcservice-target-service) field, but not both. @@ -434,7 +434,7 @@ The following table describes how to configure parameters for the `Service` fiel ### `Arguments{}.Config{}.HttpService{}.Target{}.Uri` -Specifies the URI of the external authorization service. Configure this field when you must provide an explicit URI to the external authorization service, such as cases in which the authorization service is running on the same host or pod. +Specifies the URI of the external authorization service. Configure this field when you must provide an explicit URI to the external authorization service, such as cases in which the authorization service is running on the same host or pod. If set, the value of this field must be one of `localhost:`, `127.0.0.1:`, or `::1:`. Configure either the `Uri` field or the [`Service`](#arguments-config-httpservice-target-service) field, but not both. diff --git a/website/content/docs/connect/proxies/envoy-extensions/configuration/property-override.mdx b/website/content/docs/connect/proxies/envoy-extensions/configuration/property-override.mdx index 9b13cb940b235..8ccb49a391fb6 100644 --- a/website/content/docs/connect/proxies/envoy-extensions/configuration/property-override.mdx +++ b/website/content/docs/connect/proxies/envoy-extensions/configuration/property-override.mdx @@ -17,7 +17,7 @@ The following list outlines the field hierarchy, data types, and requirements fo Click on a property name to view additional details, including default values. 
-- [`ProxyType`](#proxytype): string | required +- [`ProxyType`](#proxytype): string | `connect-proxy` - [`Debug`](#debug): bool | `false` - [`Patches`](#patches): list | required - [`ResourceFilter`](#patches-resourcefilter): map @@ -46,13 +46,14 @@ Patches = [ TrafficDirection = "" Services = [ { - Name = "", + Name = "" Namespace = "" Partition = "" } ] - Op = "", - Path = "", + } + Op = "" + Path = "" Value = "" } ] @@ -69,7 +70,6 @@ Specifies the type of Envoy proxy that the extension applies to. The only suppor #### Values - Default: `connect-proxy` -- This field is required. - Data type: String ### `Debug` @@ -118,7 +118,7 @@ The following table describes how to configure a `ResourceFilter`: Specifies the JSON Patch operation to perform when the `ResourceFilter` matches a local Envoy proxy configuration. You can specify one of the following values for each patch: -- `add`: Replaces a property or message specified by [`Path`](#patches-path) with the given value. The JSON patch format does not merge objects. To emulate merges, you must configure discrete `add` operations for each changed field. Consul returns an error if the target field does not exist in the corresponding schema. +- `add`: Replaces a property or message specified by [`Path`](#patches-path) with the given value. The JSON Patch `add` operation does not merge objects. To emulate merges, you must configure discrete `add` operations for each changed field. Consul returns an error if the target field does not exist in the corresponding schema. - `remove`: Unsets the value of the field specified by [`Path`](#patches-path). If the field is not set, no changes are made. Consul returns an error if the target field does not exist in the corresponding schema. #### Values @@ -135,7 +135,7 @@ Specifies where the extension performs the associated operation on the specified The `Path` field does not support addressing array elements or protobuf map field entries. Refer to [Constructing paths](/consul/docs/connect/proxies/envoy-extensions/usage/property-override#constructing-paths) for information about how to construct paths. -When setting fields, the extension sets any unset intermediate fields to their default values. A a single operation on a nested field can set multiple intermediate fields. Because Consul sets the intermediate fields to their default values, you may need to configure subsequent patches to satisfy Envoy or Consul validation. +When setting fields, the extension sets any unset intermediate fields to their default values. A single operation on a nested field can set multiple intermediate fields. Because Consul sets the intermediate fields to their default values, you may need to configure subsequent patches to satisfy Envoy or Consul validation. #### Values @@ -145,9 +145,10 @@ When setting fields, the extension sets any unset intermediate fields to their d ### `Patches[].Value{}` -Defines a value to set at the specified [path](#patches-path) if the [operation](#patches-op) is set to `add`. You can specify either a scalar or enum value or define a map that contains string keys and values corresponding to scalar or enum child fields. Refer to the [example configurations](#examples) for additional guidance and to the [Envoy API documentation](https://www.envoyproxy.io/docs/envoy/latest/api-v3/api) for additional information about Envoy proxy interfaces. +Defines a value to set at the specified [path](#patches-path) if the [operation](#patches-op) is set to `add`. 
You can specify either a scalar or enum value, an array of scalar or enum values (for repeated fields), or define a map that contains string keys and values corresponding to scalar or enum child fields. Single and repeated scalar and enum values are supported. Refer to the [example configurations](#examples) for additional guidance and to the [Envoy API documentation](https://www.envoyproxy.io/docs/envoy/latest/api-v3/api) for additional information about Envoy proxy interfaces. If Envoy specifies a wrapper as the target field type, the extension automatically coerces simple values to the wrapped type when patching. For example, the value `32768` is allowed when targeting a cluster's `per_connection_buffer_limit_bytes`, which is a `UInt32Value` field. Refer to the [protobuf documentation](https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/wrappers.proto) for additional information about wrappers. + #### Values - Default: None @@ -161,9 +162,9 @@ If Envoy specifies a wrapper as the target field type, the extension automatical The following examples demonstrate patterns that you may be able to model your configurations on. -### Enable `enforcing_consecutive_5xx` outlier detection +### Enable `respect_dns_ttl` in a cluster -In the following example, the `add` operation patches an outlier detection property into outbound cluster traffic. The `Path` specifies the `enforcing_consecutive_5xx` interface and sets a value of `1234`: +In the following example, the `add` operation patches the outbound cluster corresponding to the `other-svc` upstream service to enable `respect_dns_ttl`. The `Path` specifies the [Cluster `/respect_dns_ttl`](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto#envoy-v3-api-field-config-cluster-v3-cluster-respect-dns-ttl) top-level field and `Value` specifies a value of `true`: ```hcl Kind = "service-defaults" @@ -184,8 +185,8 @@ EnvoyExtensions = [ }, }, "Op" = "add", - "Path" = "/outlier_detection/enforcing_consecutive_5xx", - "Value" = 1234, + "Path" = "/respect_dns_ttl", + "Value" = true, } ] } @@ -193,9 +194,9 @@ EnvoyExtensions = [ ] ``` -### Update multiple values in the default map +### Update multiple values in a message field -In the following example, two `ResourceFilter` blocks target outbound traffic to the `db` service and add `/outlier_detection/enforcing_consecutive_5xx` and `/outlier_detection/failure_percentage_request_volume` properties: +In the following example, both `ResourceFilter` blocks target the cluster corresponding to the `other-svc` upstream service and modify [Cluster `/outlier_detection`](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/outlier_detection.proto) properties: ```hcl Kind = "service-defaults" @@ -209,27 +210,27 @@ EnvoyExtensions = [ Patches = [ { ResourceFilter = { - ResourceType = "cluster", - TrafficDirection = "outbound", + ResourceType = "cluster" + TrafficDirection = "outbound" Services = [{ Name = "other-svc" - }], - }, - Op = "add", - Path = "/outlier_detection/enforcing_consecutive_5xx", - Value = 1234, + }] + } + Op = "add" + Path = "/outlier_detection/max_ejection_time/seconds" + Value = 120 }, { ResourceFilter = { - ResourceType = "cluster", - TrafficDirection = "outbound", + ResourceType = "cluster" + TrafficDirection = "outbound" Services = [{ Name = "other-svc" - }], - }, - Op = "add", - Path = "/outlier_detection/failure_percentage_request_volume", - Value = 2345, + }] + } + Op = "add" + Path = 
"/outlier_detection/max_ejection_time_jitter/seconds" + Value = 1 } ] } @@ -237,9 +238,13 @@ EnvoyExtensions = [ ] ``` -### Set multiple values that replace the map +The use of `/seconds` in these examples corresponds to the same field in the [google.protobuf.Duration](https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/duration.proto) proto definition, since the extension does not support JSON serialized string forms of common protobuf types (e.g. `120s`). + +-> **Note:** Using separate patches per field preserves any existing configuration of other fields in `outlier_detection` that may be directly set by Consul, such as [`enforcing_consecutive_5xx`](https://developer.hashicorp.com/consul/docs/connect/proxies/envoy#enforcing_consecutive_5xx). -In the following example, a `ResourceFilter` targets outbound traffic to the `db` service and replaces the map of properties located at `/outlier_detection` with `enforcing_consecutive_5xx` and `failure_percentage_request_volume` and properties: +### Replace a message field + +In the following example, a `ResourceFilter` targets the cluster corresponding to the `other-svc` upstream service and _replaces_ the entire map of properties located at `/outlier_detection`, including explicitly set `enforcing_success_rate` and `success_rate_minimum_hosts` properties: ```hcl Kind = "service-defaults" @@ -247,27 +252,29 @@ Name = "my-svc" Protocol = "http" EnvoyExtensions = [ { - Name = "builtin/property-override", + Name = "builtin/property-override" Arguments = { - ProxyType = "connect-proxy", + ProxyType = "connect-proxy" Patches = [ { ResourceFilter = { - ResourceType = "cluster", - TrafficDirection = "outbound", + ResourceType = "cluster" + TrafficDirection = "outbound" Services = [{ Name = "other-svc" - }], - }, - Op = "add", - Path = "/outlier_detection", + }] + } + Op = "add" + Path = "/outlier_detection" Value = { - "enforcing_consecutive_5xx" = 1234, - "failure_percentage_request_volume" = 2345, - }, + "enforcing_success_rate" = 80 + "success_rate_minimum_hosts" = 2 + } } ] } } ] ``` + +Unlike the previous example, other `/outlier_detection` values set by Consul will _not_ be retained unless they match Envoy's defaults, because the entire value of `/outlier_detection` will be replaced. diff --git a/website/content/docs/connect/proxies/envoy-extensions/usage/ext-authz.mdx b/website/content/docs/connect/proxies/envoy-extensions/usage/ext-authz.mdx index f3cc5432846b7..3062879d472db 100644 --- a/website/content/docs/connect/proxies/envoy-extensions/usage/ext-authz.mdx +++ b/website/content/docs/connect/proxies/envoy-extensions/usage/ext-authz.mdx @@ -115,7 +115,7 @@ The following Envoy configurations are not supported: | `failure_mode_allow` | Set the `EnvoyExtension.Required` field to `true` in the [service defaults configuration entry](/consul/docs/connect/config-entries/service-defaults#envoyextensions) or [proxy defaults configuration entry](/consul/docs/connect/config-entries/proxy-defaults#envoyextensions). | | `filter_enabled` | Set the `EnvoyExtension.Required` field to `true` in the [service defaults configuration entry](/consul/docs/connect/config-entries/service-defaults#envoyextensions) or [proxy defaults configuration entry](/consul/docs/connect/config-entries/proxy-defaults#envoyextensions). 
| | `filter_enabled_metadata` | Set the `EnvoyExtension.Required` field to `true` in the [service defaults configuration entry](/consul/docs/connect/config-entries/service-defaults#envoyextensions) or [proxy defaults configuration entry](/consul/docs/connect/config-entries/proxy-defaults#envoyextensions). | -| `transport_api_version` | Consul only supports v3 of the transport API. As a result, there is no workaround for implement the behavior of this field. | +| `transport_api_version` | Consul only supports v3 of the transport API. As a result, there is no workaround for implementing the behavior of this field. | ## Apply the configuration entry diff --git a/website/content/docs/connect/proxies/envoy-extensions/usage/lua.mdx b/website/content/docs/connect/proxies/envoy-extensions/usage/lua.mdx index 496b7d5fa58f3..da9e4c9b0f102 100644 --- a/website/content/docs/connect/proxies/envoy-extensions/usage/lua.mdx +++ b/website/content/docs/connect/proxies/envoy-extensions/usage/lua.mdx @@ -15,11 +15,9 @@ Envoy filters support setting and getting dynamic metadata, allowing a filter to To use the Lua Envoy extension, configure the following arguments in the `EnvoyExtensions` block: -| Arguments | Description | -| -------------- | ------------------------------------------------------------------------------------------------ | -| `ProxyType` | Determines the proxy type the extension applies to. The only supported value is `connect-proxy`. | -| `ListenerType` | Specifies if the extension is applied to the `inbound` or `outbound` listener. | -| `Script` | The Lua script that is configured to run by the HTTP Lua filter. | +- `ProxyType`: string | `connect-proxy` - Determines the proxy type the extension applies to. The only supported value is `connect-proxy`. +- `ListenerType`: string | required - Specifies if the extension is applied to the `inbound` or `outbound` listener. +- `Script`: string | required - The Lua script that is configured to run by the HTTP Lua filter. ## Workflow @@ -44,16 +42,15 @@ The following example configures the Lua Envoy extension on every service by usi ```hcl -Kind = "proxy-defaults" -Name = "global" -Protocol = "http" +Kind = "proxy-defaults" +Name = "global" +Protocol = "http" EnvoyExtensions { Name = "builtin/lua" - Arguments = { ProxyType = "connect-proxy" - Listener = "inbound" - Script = <<-EOS + Listener = "inbound" + Script = <<-EOS function envoy_on_request(request_handle) meta = request_handle:streamInfo():dynamicMetadata() m = meta:get("consul") @@ -159,7 +156,7 @@ $ kubectl apply lua-envoy-extension-proxy-defaults.yaml In the following example, the `service-defaults` configure the Lua Envoy extension to insert the HTTP Lua filter for service `myservice` and add the Consul service name to the`x-consul-service` header for all inbound requests. The `ListenerType` makes it so that the extension applies only on the inbound listener of the service's connect proxy. 
- + ```hcl Kind = "service-defaults" @@ -170,8 +167,8 @@ EnvoyExtensions = [ Arguments = { ProxyType = "connect-proxy" - Listener = "inbound" - Script = < + ```hcl Kind = "service-defaults" @@ -198,8 +195,8 @@ EnvoyExtensions = [ Name = "builtin/lua", Arguments = { ProxyType = "connect-proxy" - Listener = "inbound" - Script = <<-EOF + Listener = "inbound" + Script = <<-EOF function envoy_on_request(request_handle) meta = request_handle:streamInfo():dynamicMetadata() m = meta:get("consul") @@ -212,8 +209,8 @@ end Name = "builtin/lua", Arguments = { ProxyType = "connect-proxy" - Listener = "inbound" - Script = <<-EOF + Listener = "inbound" + Script = <<-EOF function envoy_on_request(request_handle) meta = request_handle:streamInfo():dynamicMetadata() m = meta:get("consul") diff --git a/website/content/docs/connect/proxies/envoy-extensions/usage/property-override.mdx b/website/content/docs/connect/proxies/envoy-extensions/usage/property-override.mdx index 0166f84ec193d..84cf621930b00 100644 --- a/website/content/docs/connect/proxies/envoy-extensions/usage/property-override.mdx +++ b/website/content/docs/connect/proxies/envoy-extensions/usage/property-override.mdx @@ -8,6 +8,13 @@ description: Learn how to use the property-override extension for Envoy proxies This topic describes how to use the `property-override` extension to set and remove individual properties for the Envoy resources Consul generates. The extension uses the [protoreflect](https://pkg.go.dev/google.golang.org/protobuf/reflect/protoreflect), which enables Consul to dynamically manipulate messages. +The extension currently supports setting scalar and enum fields, removing individual fields addressable by `Path`, and initializing unset intermediate message fields indicated in `Path`. + +It currently does _not_ support the following use cases: +- Adding, updating, or removing repeated field members +- Adding or updating [protobuf `map`](https://protobuf.dev/programming-guides/proto3/#maps) fields +- Adding or updating [protobuf `Any`](https://protobuf.dev/programming-guides/proto3/#any) fields + ## Workflow - Complete the following steps to use the `property-override` extension: @@ -23,37 +30,39 @@ Add Envoy extension configurations to a proxy defaults or service defaults confi - When you configure Envoy extensions on proxy defaults, they apply to every service. - When you configure Envoy extensions on service defaults, they apply to a specific service. -Consul applies Envoy extensions configured in proxy defaults before it applies extensions in service defaults. As a result, the Envoy extension configuration in service defaults may override configurations in proxy defaults. +Consul applies Envoy extensions configured in proxy defaults before it applies extensions in service defaults. As a result, the Envoy extension configuration in service defaults may override configurations in proxy defaults. -In the following service defaults configuration entry example, Consul adds a new `/upstream_connection_options/tcp_keepalive/keepalive_probes-5` field to each of the proxy's cluster configuration for the outbound `db`service upstream. 
The configuration applies to all `connect-proxy` proxies with services configured to communicate over HTTP: +In the following proxy defaults configuration entry example, Consul sets the `/respect_dns_ttl` field on the `api` service proxy's cluster configuration for the `other-svc` upstream service: ```hcl -Kind = "service-defaults" -Name = "global" -Protocol = "http" +Kind = "service-defaults" +Name = "api" +Protocol = "http" EnvoyExtensions = [ { Name = "builtin/property-override" Arguments = { - ProxyType = "connect-proxy", + ProxyType = "connect-proxy" Patches = [ { ResourceFilter = { - ResourceType = "cluster", - TrafficDirection = "outbound" + ResourceType = "cluster" + TrafficDirection = "outbound" Services = [{ Name = "other-svc" - }], - Op = "add" - Path = "/upstream_connection_options/tcp_keepalive/keepalive_probes", - Value = 5, + }] + } + Op = "add" + Path = "/respect_dns_ttl" + Value = true } - ] - } + ] + } + } ] ``` @@ -64,9 +73,9 @@ EnvoyExtensions = [ ```json "kind": "service-defaults", -"name": "global", +"name": "api", "protocol": "http", -"envoy_extensions": [{ +"envoyExtensions": [{ "name": "builtin/property-override", "arguments": { "proxyType": "connect-proxy", @@ -74,11 +83,11 @@ EnvoyExtensions = [ "resourceFilter": { "resourceType": "cluster", "trafficDirection": "outbound", - "services": [{ "name": "other-svc" }], - "op": "add", - "path": "/upstream_connection_options/tcp_keepalive/keepalive_probes", - "value": 5 - } + "services": [{ "name": "other-svc" }] + }, + "op": "add", + "path": "/respect_dns_ttl", + "value": true }] } }] @@ -86,13 +95,13 @@ EnvoyExtensions = [ - + ```yaml apiversion: consul.hashicorp.com/v1alpha1 kind: ServiceDefaults metadata: - name: global + name: api spec: protocol: http envoyExtensions: @@ -106,8 +115,8 @@ spec: services: - name: "other-svc" op: "add" - path: "/upstream_connection_options/tcp_keepalive/keepalive_probes", - value: 5 + path: "/respect_dns_ttl", + value: true ``` @@ -134,6 +143,7 @@ EnvoyExtensions = [ { Name = "builtin/property-override" Arguments = { + Debug = true ProxyType = "connect-proxy" Patches = [ { @@ -144,7 +154,7 @@ EnvoyExtensions = [ Op = "add" Path = "" Value = 5 - } + } ] } } @@ -155,19 +165,23 @@ After applying the configuration entry, Consul prints a message that includes th ```shell-session $ consul config write api.hcl -non-empty, non-root Path is required. available cluster fields: -/outlier_detection -/outlier_detection/enforcing_consecutive_5xx -/outlier_detection/failure_percentage_request_volume -/round_robin_lb_config -/round_robin_lb_config/slow_start_config +non-empty, non-root Path is required; +available envoy.config.cluster.v3.Cluster fields: +transport_socket_matches +name +alt_stat_name +type +cluster_type +eds_cluster_config +connect_timeout +... ``` You can use the output to help you construct the appropriate value for the `Path` field. 
For example: ```shell-session -$ consul config write api.hcl | grep round_robin -/round_robin_lb_config +$ consul config write api.hcl 2>&1 | grep round_robin +round_robin_lb_config ``` diff --git a/website/content/docs/connect/proxies/envoy.mdx b/website/content/docs/connect/proxies/envoy.mdx index e6759113c2c78..bbdda8b82fcd7 100644 --- a/website/content/docs/connect/proxies/envoy.mdx +++ b/website/content/docs/connect/proxies/envoy.mdx @@ -39,18 +39,19 @@ Consul supports **four major Envoy releases** at the beginning of each major Con | Consul Version | Compatible Envoy Versions | | ------------------- | -----------------------------------------------------------------------------------| +| 1.16.x | 1.26.2, 1.25.7, 1.24.8, 1.23.10 | | 1.15.x | 1.25.6, 1.24.7, 1.23.9, 1.22.11 | | 1.14.x | 1.24.0, 1.23.1, 1.22.5, 1.21.5 | -| 1.13.x | 1.23.1, 1.22.5, 1.21.5, 1.20.7 | ### Envoy and Consul Dataplane The Consul dataplane component was introduced in Consul v1.14 as a way to manage Envoy proxies without the use of Consul clients. Each new minor version of Consul is released with a new minor version of Consul dataplane, which packages both Envoy and the `consul-dataplane` binary in a single container image. For backwards compatability reasons, each new minor version of Consul will also support the previous minor version of Consul dataplane to allow for seamless upgrades. In addition, each minor version of Consul will support the next minor version of Consul dataplane to allow for extended dataplane support via newer versions of Envoy. -| Consul Version | Consul Dataplane Version (Bundled Envoy Version) | -| ------------------- | ------------------------------------------------- | -| 1.15.x | 1.1.x (Envoy 1.25.x), 1.0.x (Envoy 1.24.x) | -| 1.14.x | 1.1.x (Envoy 1.25.x), 1.0.x (Envoy 1.24.x) | +| Consul Version | Default `consul-dataplane` Version | Other compatible `consul-dataplane` Versions | +| ------------------- | ------------------------------------------------------------|----------------------------------------------| +| 1.16.x | 1.2.x (Envoy 1.26.x) | 1.1.x (Envoy 1.25.x) | +| 1.15.x | 1.1.x (Envoy 1.25.x) | 1.2.x (Envoy 1.26.x), 1.0.x (Envoy 1.24.x) | +| 1.14.x | 1.0.x (Envoy 1.24.x) | 1.1.x (Envoy 1.25.x) | ## Getting Started diff --git a/website/content/docs/enterprise/admin-partitions.mdx b/website/content/docs/enterprise/admin-partitions.mdx index 9be5a70d2e7e8..023e38ee31e7f 100644 --- a/website/content/docs/enterprise/admin-partitions.mdx +++ b/website/content/docs/enterprise/admin-partitions.mdx @@ -67,8 +67,6 @@ You can configure services to be discoverable by downstream services in any part You can use [cluster peering](/consul/docs/connect/cluster-peering/) between two admin partitions to connect clusters owned by different operators. Without Consul Enterprise, cluster peering is limited to the `default` partitions in each datacenter. Enterprise users can [establish cluster peering connections](/consul/docs/connect/cluster-peering/usage/establish-cluster-peering) between any two admin partitions as long as the partitions are in separate datacenters. It is not possible to establish cluster peering connections between two partitions in a single datacenter. -To use mesh gateways with admin partitions and cluster peering, refer to [Mesh Gateways between Peered Clusters](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-peers). - ## Requirements Your Consul configuration must meet the following requirements to use admin partitions. 
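To make the cluster peering guidance above more concrete, the following sketch outlines the general CLI flow for peering two admin partitions that live in different datacenters. The partition names (`finance`, `ops`) and peer names are placeholders, and the `-partition` flag on the `consul peering` subcommands assumes a Consul Enterprise build; confirm the exact flags against the CLI version you run before relying on this sketch.

```shell-session
# In the first datacenter, generate a peering token scoped to an example partition named "finance".
$ consul peering generate-token -name dc2-ops -partition finance

# In the second datacenter, establish the peering from an example partition named "ops",
# using the token produced by the previous command.
$ consul peering establish -name dc1-finance -partition ops -peering-token "<token-from-dc1>"
```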
diff --git a/website/content/docs/enterprise/fips.mdx b/website/content/docs/enterprise/fips.mdx index bde0a154b15ac..6ad145886be29 100644 --- a/website/content/docs/enterprise/fips.mdx +++ b/website/content/docs/enterprise/fips.mdx @@ -8,9 +8,7 @@ description: >- # FIPS 140-2 - -This feature requires requires Consul Enterprise. - + This feature requires Consul Enterprise. Builds of Consul Enterprise marked with a `fips1402` feature name include built-in support for FIPS 140-2 compliance. @@ -22,9 +20,10 @@ To use this feature, you must have an [active or trial license for Consul Enterp FIPS 140-2 builds of Consul Enterprise behave in the same way as non-FIPS builds. There are no restrictions on Consul algorithms and ensuring that Consul remains in a FIPS-compliant mode of operation is your responsibility. To maintain FIPS-compliant operation, you must [ensure that TLS is enabled](/consul/tutorials/security/tls-encryption-secure) so that communication is encrypted. Consul products surface some helpful warnings where settings are insecure. Encryption is disabled in Consul Enterprise by default. As a result, Consul may transmit sensitive control plane information. You must ensure that gossip encryption and mTLS is enabled for all agents when running Consul with FIPS-compliant settings. In addition, be aware that TLSv1.3 does not work with FIPS 140-2, as HKDF is not a certified primitive. + HashiCorp is not a NIST-certified testing laboratory and can only provide general guidance about using Consul Enterprise in a FIPS-compliant manner. We recommend consulting an approved auditor for further information. -The FIPS 140-2 variant of Consul uses separate binaries that are available available from the following sources: +The FIPS 140-2 variant of Consul uses separate binaries that are available from the following sources: - From the [HashiCorp Releases page](https://releases.hashicorp.com/consul), releases ending with the `+ent.fips1402` suffix. - From the [Docker Hub `hashicorp/consul-enterprise-fips`](https://hub.docker.com/r/hashicorp/consul-enterprise-fips) container repository. @@ -86,15 +85,15 @@ Consul's FIPS 140-2 Linux products use the BoringCrypto integration in the offic Consul's FIPS 140-2 products on Windows use the CNGCrypto integration in Microsoft's Go toolchain, which include a FIPS-validated crypto module. -To ensure your build of Consul Enterprise includes FIPS support, confirm that a line with `FIPS: Enabled` appears when you run a `version` command. For example, the following message appears for Linux users +To ensure your build of Consul Enterprise includes FIPS support, confirm that a line with `FIPS: Enabled` appears when you run a `version` command. 
For example, the following message appears for Linux users: -```shell-session +```shell-session hideClipboard FIPS: FIPS 140-2 Enabled, crypto module boringcrypto ``` The following message appears for Windows users: -```shell-session +```shell-session hideClipboard FIPS: FIPS 140-2 Enabled, crypto module cngcrypto ``` @@ -111,11 +110,11 @@ To validate that a FIPS 140-2 Linux binary correctly includes BoringCrypto, run ```shell-session $ go tool nm consul | grep -i goboringcrypto - 4014d0 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_cbc_encrypt - 4014f0 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_ctr128_encrypt - 401520 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_decrypt - 401540 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_encrypt - 401560 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_set_decrypt_key +4014d0 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_cbc_encrypt +4014f0 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_ctr128_encrypt +401520 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_decrypt +401540 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_encrypt +401560 T _cgo_6880f0fbb71e_Cfunc__goboringcrypto_AES_set_decrypt_key ``` Similarly, on a FIPS Windows binary, run `go tool nm` on the binary to get a symbol dump, and then search for `go-crypto-winnative`. diff --git a/website/content/docs/enterprise/index.mdx b/website/content/docs/enterprise/index.mdx index 60207b136bb32..3295ccc9504ad 100644 --- a/website/content/docs/enterprise/index.mdx +++ b/website/content/docs/enterprise/index.mdx @@ -86,6 +86,8 @@ Available Enterprise features per Consul form and license include: | [Redundancy Zones](/consul/docs/enterprise/redundancy) | Not applicable | Yes | With Global Visibility, Routing, and Scale module | | [Sameness Groups](/consul/docs/connect/config-entries/samenes-group) | No | Yes | N/A | | [Sentinel for KV](/consul/docs/enterprise/sentinel) | All tiers | Yes | With Governance and Policy module | +| [Server request rate limits per source IP](/consul/docs/v1.16.x/agent/limits/usage/limit-request-rates-from-ips) | All tiers | Yes | With Governance and Policy module | + [HashiCorp Cloud Platform (HCP) Consul]: https://cloud.hashicorp.com/products/consul [Consul Enterprise]: https://www.hashicorp.com/products/consul/ @@ -112,6 +114,7 @@ Consul Enterprise feature availability can change depending on your server and c | [Redundancy Zones](/consul/docs/enterprise/redundancy) | ✅ | ✅ | ✅ | | [Sameness Groups](/consul/docs/connect/config-entries/samenes-group) | ✅ | ✅ | ✅ | | [Sentinel ](/consul/docs/enterprise/sentinel) | ✅ | ✅ | ✅ | +| [Server request rate limits per source IP](/consul/docs/v1.16.x/agent/limits/usage/limit-request-rates-from-ips) | ✅ | ✅ | ✅ | @@ -131,6 +134,7 @@ Consul Enterprise feature availability can change depending on your server and c | [Redundancy Zones](/consul/docs/enterprise/redundancy) | ❌ | ❌ | ❌ | | [Sameness Groups](/consul/docs/connect/config-entries/samenes-group) | ✅ | ✅ | ✅ | | [Sentinel ](/consul/docs/enterprise/sentinel) | ✅ | ✅ | ✅ | +| [Server request rate limits per source IP](/consul/docs/v1.16.x/agent/limits/usage/limit-request-rates-from-ips) | ✅ | ✅ | ✅ | @@ -150,6 +154,7 @@ Consul Enterprise feature availability can change depending on your server and c | [Redundancy Zones](/consul/docs/enterprise/redundancy) | n/a | n/a | n/a | | [Sameness Groups](/consul/docs/connect/config-entries/samenes-group) | ✅ | ✅ | ✅ | | [Sentinel ](/consul/docs/enterprise/sentinel) | ✅ | ✅ | ✅ | +| [Server request rate limits per source 
IP](/consul/docs/v1.16.x/agent/limits/usage/limit-request-rates-from-ips) | ✅ | ✅ | ✅ | \ No newline at end of file diff --git a/website/content/docs/k8s/compatibility.mdx b/website/content/docs/k8s/compatibility.mdx index 343387156df05..ad9f2cdf1261f 100644 --- a/website/content/docs/k8s/compatibility.mdx +++ b/website/content/docs/k8s/compatibility.mdx @@ -15,9 +15,9 @@ Consul Kubernetes versions all of its components (`consul-k8s` CLI, `consul-k8s- | Consul Version | Compatible consul-k8s Versions | Compatible Kubernetes Versions | | -------------- | -------------------------------- | -------------------------------| +| 1.16.x | 1.2.x | 1.24.x - 1.27.x | | 1.15.x | 1.1.x | 1.23.x - 1.26.x | | 1.14.x | 1.0.x | 1.22.x - 1.25.x | -| 1.13.x | 0.49.x | 1.21.x - 1.24.x | ## Supported Envoy versions diff --git a/website/content/docs/k8s/deployment-configurations/vault/data-integration/webhook-certs.mdx b/website/content/docs/k8s/deployment-configurations/vault/data-integration/webhook-certs.mdx index aec4fb1d69d34..b615d31fabd73 100644 --- a/website/content/docs/k8s/deployment-configurations/vault/data-integration/webhook-certs.mdx +++ b/website/content/docs/k8s/deployment-configurations/vault/data-integration/webhook-certs.mdx @@ -14,16 +14,16 @@ In a Consul Helm chart configuration that does not use Vault, `webhook-cert-mana When Vault is configured as the controller and connect inject Webhook Certificate Provider on Kubernetes: - `webhook-cert-manager` is no longer deployed to the cluster. - - controller and connect inject each get their webhook certificates from its own Vault PKI mount via the injected Vault Agent. - - controller and connect inject each need to be configured with its own Vault Role that has necessary permissions to receive certificates from its respective PKI mount. - - controller and connect inject each locally update its own `mutatingwebhookconfiguration` so that Kubernetes can relay events. + - Controller and connect inject each get their webhook certificates from its own Vault PKI mount via the injected Vault Agent. + - Controller and connect inject each need to be configured with its own Vault Role that has necessary permissions to receive certificates from its respective PKI mount. + - Controller and connect inject each locally update its own `mutatingwebhookconfiguration` so that Kubernetes can relay events. - Vault manages certificate rotation and rotates certificates to each webhook. To use Vault as the controller and connect inject Webhook Certificate Provider, we will need to modify the steps outlined in the [Data Integration](/consul/docs/k8s/deployment-configurations/vault/data-integration) section: These following steps will be repeated for each datacenter: 1. Create a Vault policy that authorizes the desired level of access to the secret. - 1. (Added) Create Vault PKI roles for controller and connect inject each that establish the domains that each is allowed to issue certificates for. + 1. (Added) Create Vault PKI roles for controller and connect inject that each establish the domains that each is allowed to issue certificates for. 1. Create Vault Kubernetes auth roles that link the policy to each Consul on Kubernetes service account that requires access. 1. Configure the Vault Kubernetes auth roles in the Consul on Kubernetes helm chart. @@ -74,44 +74,45 @@ Issue the following commands to enable and configure the PKI Secrets Engine to s 1. 
Create a policy that allows `["create", "update"]` access to the [certificate issuing URL](/vault/api-docs/secret/pki) so Consul controller and connect inject can fetch a new certificate/key pair and provide it to the Kubernetes `mutatingwebhookconfiguration`. - The path to the secret referenced in the `path` resource is the same value that you will configure in the `global.secretsBackend.vault.controller.tlsCert.secretName` and `global.secretsBackend.vault.connectInject.tlsCert.secretName` Helm configuration (refer to [Update Consul on Kubernetes Helm chart](#update-consul-on-kubernetes-helm-chart)). + The path to the secret referenced in the `path` resource is the same value that you will configure in the `global.secretsBackend.vault.controller.tlsCert.secretName` and `global.secretsBackend.vault.connectInject.tlsCert.secretName` Helm configuration (refer to [Update Consul on Kubernetes Helm chart](#update-consul-on-kubernetes-helm-chart)). - ```shell-session - $ vault policy write controller-tls-policy - < \ diff --git a/website/content/docs/k8s/upgrade/index.mdx b/website/content/docs/k8s/upgrade/index.mdx index 3533f10755f4f..666b75bb36137 100644 --- a/website/content/docs/k8s/upgrade/index.mdx +++ b/website/content/docs/k8s/upgrade/index.mdx @@ -219,7 +219,7 @@ In earlier versions, Consul on Kubernetes used client agents in its deployments. If you upgrade Consul from a version that uses client agents to a version the uses dataplanes, complete the following steps to upgrade your deployment safely and without downtime. -1. Before you upgrade, edit your Helm chart to enable Consul client agents by setting `client.enabled` and `client.updateStrategy`: +1. Before you upgrade, edit your Helm chart configuration to enable Consul client agents by setting `client.enabled` and `client.updateStrategy`: ```yaml filename="values.yaml" client: @@ -228,15 +228,25 @@ If you upgrade Consul from a version that uses client agents to a version the us type: OnDelete ``` -1. Add `consul.hashicorp.com/consul-k8s-version: 1.0.0` to the annotations for each pod you upgrade. +1. Update the `connect-injector` to not log out on restart +to make sure that the ACL tokens used by existing services are still valid during the migration to `consul-dataplane`. +Note that you must remove the token manually after completing the migration. -1. Follow our [recommended procedures to upgrade servers](#upgrading-consul-servers) on Kubernetes deployments to upgrade Helm values for the new version of Consul. + The following command triggers the deployment rollout. Wait for the rollout to complete before proceeding to next step. + + ```bash + kubectl config set-context --current --namespace= + INJECTOR_DEPLOYMENT=$(kubectl get deploy -l "component=connect-injector" -o=jsonpath='{.items[0].metadata.name}') + kubectl patch deploy $INJECTOR_DEPLOYMENT --type='json' -p='[{"op": "remove", "path": "/spec/template/spec/containers/0/lifecycle"}]' + ``` + +1. Follow our [recommended procedures to upgrade servers](#upgrade-consul-servers) on Kubernetes deployments to upgrade Helm values for the new version of Consul. 1. Run `kubectl rollout restart` to restart your service mesh applications. Restarting service mesh application causes Kubernetes to re-inject them with the webhook for dataplanes. 1. Restart all gateways in your service mesh. -1. Disable client agents in your Helm chart by deleting the `client` stanza or setting `client.enabled` to `false`. +1. 
Disable client agents in your Helm chart by deleting the `client` stanza or setting `client.enabled` to `false` and running a `consul-k8s` or Helm upgrade. ## Configuring TLS on an existing cluster diff --git a/website/content/docs/lambda/invoke-from-lambda.mdx b/website/content/docs/lambda/invoke-from-lambda.mdx index 4c29e4b8d9cd2..fd0da60776d5d 100644 --- a/website/content/docs/lambda/invoke-from-lambda.mdx +++ b/website/content/docs/lambda/invoke-from-lambda.mdx @@ -88,7 +88,7 @@ The mesh gateway must be running and registered to the Lambda function’s Consu - [Mesh Gateways between WAN-Federated Datacenters](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-wan-datacenters) - [Mesh Gateways between Admin Partitions](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-partitions) -- [Mesh Gateways between Peered Clusters](/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-peers) +- [Establish cluster peering connections](/consul/docs/connect/cluster-peering/usage/establish-cluster-peering) - [Connect Services Across Datacenters with Mesh Gateways](/consul/tutorials/developer-mesh/service-mesh-gateways) ## Deploy the Lambda extension layer diff --git a/website/content/docs/release-notes/consul-k8s/v1_2_x.mdx b/website/content/docs/release-notes/consul-k8s/v1_2_x.mdx new file mode 100644 index 0000000000000..bd8d65b4803a6 --- /dev/null +++ b/website/content/docs/release-notes/consul-k8s/v1_2_x.mdx @@ -0,0 +1,84 @@ +--- +layout: docs +page_title: 1.2.x +description: >- + Consul on Kubernetes release notes for version 1.2.x +--- + +# Consul on Kubernetes 1.2.0 + +We are pleased to announce the following Consul updates. + +## Release highlights + +- **Sameness groups (Enterprise):** Sameness groups are a user-defined set of partitions that Consul uses to identify services in different administrative partitions with the same name as being the same services. You can use sameness groups to create a blanket failover policy for deployments with cluster peering connections. Refer to the [Sameness groups overview](/consul/docs/connect/cluster-peering/usage/create-sameness-groups) for more information. + + Sameness groups is currently a beta feature in Consul Enterprise v1.16.0. + +- **Permissive mTLS:** You can enable the `permissive` mTLS mode to enable sidecar proxies to accept both mTLS and non-mTLS traffic. Using this mode enables you to onboard services without downtime and without reconfiguring or redeploying your application. Refer to the [Onboard services while in transparent proxy mode](/consul/docs/k8s/connect/onboarding-tproxy-mode) for more information on how to use permissive mTLS to onboard services to Consul. + +- **Transparent proxy enhancements for failover and virtual services:** We have made several internal improvements, such as ensuring that virtual IPs are always available, to reduce the friction associated with operating Consul in transparent proxy mode. Onboarding services, configuring failover redirects, and other operations require less administrative effort and ensure a smoother experience. Refer to the following documentation for additional information: + + - [Onboard services while in transparent proxy mode](/consul/docs/k8s/connect/onboarding-tproxy-mode) + - [Route traffic to virtual services](/consul/docs/k8s/l7-traffic/route-to-virtual-services) + - [Configure failover services](/consul/docs/k8s/l7-traffic/failover-tproxy). 
+ +- **Granular server-side rate limits (Enterprise):** You can now set limits per source IP address. The following steps describe the general process for setting global read and write rate limits: + + 1. Set arbitrary limits to begin understanding the upper boundary of RPC and gRPC loads in your network. Refer to [Initialize rate limit settings](/consul/docs/agent/limits/usage/init-rate-limits) for additional information. + 1. Monitor the metrics and logs and readjust the initial configurations as necessary. Refer to [Monitor rate limit data](/consul/docs/agent/limits/usage/monitor-rate-limits) + 1. Define your final operational limits based on your observations. If you are defining global rate limits, refer to [Set global traffic rate limits](/consul/docs/agent/limits/usage/set-global-traffic-rate-limits) for additional information. For information about setting limits based on source IP, refer to [Limit traffic rates for a source IP](/consul/docs/agent/limits/usage/limit-request-rates-from-ips). + +- **Consul Envoy Extensions:** Consul Envoy extension system enables you to modify Consul-generated Envoy resources. Refer to [Envoy extension overview](/consul/docs/connect/proxies/envoy-extensions) for more information on how to use these extensions for Consul service mesh. + + - **Property Override:** The property override Envoy extension lets you set, remove, or modify individual properties for the Envoy resources Consul generates. Refer to the [Configure Envoy proxy properties](/consul/docs/connect/proxies/envoy-extensions/usage/property-override) for more information on how to use this extension. + + - **Wasm:** The Wasm Envoy extension lets you configure Wasm programs to be used as filters in the service's sidecar proxy. Refer to the [Run WebAssembly plug-ins in Envoy proxy](/consul/docs/connect/proxies/envoy-extensions/usage/wasm) for more information on how to use this extension. + + - **External Authorization:** The external authorization Envoy extension lets you delegate data plane authorization requests to external systems. Refer to the [Delegate authorization to an external service](/consul/docs/connect/proxies/envoy-extensions/usage/ext-authz) for more information on how to use this extension. + +- **Simplified API Gateway installation for Consul on Kubernetes:** API Gateway is now built into Consul. This enables a simplified installation and configuration process for Consul on Kubernetes. Refer to the [API Gateway installation](/consul/docs/api-gateway/install) for more information on the simplified native installation method. + +- **FIPS compliance (Enterprise):** HashiCorp now offers FIPS 140-2 compliant builds of Consul Enterprise that meet the security needs of federal agencies protecting sensitive, unclassified information with approved cryptographic measures. These builds use certified cryptographic modules and restrict configuration settings to comply with FIPS 140-2 Level 1 requirements, enabling compliant Consul deployments. Refer to the [Consul Enterprise FIPS](/consul/docs/enterprise/fips) for more information on FIPS compliance. + +- **JWT Authorization with service intentions:** Consul can now authorize connections based on claims present in JSON Web Token (JWT). You can configure Consul to use one or more JWT providers, which lets you control access to services and specific HTTP paths based on the validity of JWT claims embedded in the service traffic. 
This ensures a uniform and low latency mechanism to validate and authorize communication based on JWT claims across all services in a diverse service-oriented architecture. Refer to the [Use JWT authorization with service intentions](/consul/docs/connect/intentions/jwt-authorization) for more information. + +- **Automated license utilization reporting (Enterprise):** Consul Enterprise now provides automated license utilization reporting, which sends minimal product-license metering data to HashiCorp. You can use these reports to understand how much more you can deploy under your current contract, which can help you protect against overutilization and budget for predicted consumption. Refer to the [Automated license utilization reporting documentation](/consul/docs/enterprise/license/utilization-reporting) for more information. + +## What's deprecated + +- **Ingress gateway:** Starting with this release, ingress gateway is deprecated and will not be enhanced beyond its current capabilities. Ingress gateway is fully supported in this version but may be removed in a future release of Consul. + + Consul's API gateway is the recommended alternative to ingress gateway. For ingress gateway features not currently supported by API gateway, equivalent functionality will be added to API gateway over the next several releases of Consul. + +- **Legacy API Gateway:** The Consul AP Gateway that was previously packaged (`consul-api-gateway`) and released separately from Consul K8s is now deprecated. This is referred to as the “legacy” API Gateway. + + The legacy API Gateway (v0.5.4) is supported with this version of Consul on Kubernetes in order to simplify the process of migrating from legacy to native API gateways. + +## What's changed + +- The native API Gateway creates "API-gateway" configuration objects in Consul. This is a change from the legacy API Gateway, which creates "ingress-gateway" objects in Consul. + +- The native API Gateway in Consul on Kubernetes v1.2 does not create service intentions automatically. + +## Supported software + + Consul 1.15.x and 1.14.x are not supported. Please refer to Supported Consul and Kubernetes versions for more detail on choosing the correct consul-k8s version. + +- Consul 1.16.x. +- Consul Dataplane v1.2.x. Refer to Envoy and Consul Dataplane for details about Consul Dataplane versions and the available packaged Envoy version. +- Kubernetes 1.24.x - 1.27.x +- kubectl 1.24.x - 1.27.x +- Helm 3.6+ + +## Upgrading + +For more detailed information, please refer to the [upgrade details page](/consul/docs/upgrading/upgrade-specific) and the changelogs. + +## Changelogs + +The changelogs for this major release version and any maintenance versions are listed below. + + These links take you to the changelogs on the GitHub website. + +- [1.2.0-rc1](https://github.com/hashicorp/consul-k8s/releases/tag/v1.2.0-rc1) \ No newline at end of file diff --git a/website/content/docs/release-notes/consul/v1_16_x.mdx b/website/content/docs/release-notes/consul/v1_16_x.mdx index 96e0909f41221..0e39c024ceafc 100644 --- a/website/content/docs/release-notes/consul/v1_16_x.mdx +++ b/website/content/docs/release-notes/consul/v1_16_x.mdx @@ -9,11 +9,11 @@ description: >- We are pleased to announce the following Consul updates. -## Release Highlights +## Release highlights -- **Sameness groups:** Sameness groups are a user-defined set of partitions that Consul uses to identify services in different administrative partitions with the same name as being the same services. 
You can use sameness groups to create a blanket failover policy for deployments with cluster peering connections. Refer to the [Sameness groups overview](/consul/docs/connect/cluster-peering/usage/create-sameness-groups) for more information. +- **Sameness groups (Enterprise):** Sameness groups are a user-defined set of partitions that Consul uses to identify services in different administrative partitions with the same name as being the same services. You can use sameness groups to create a blanket failover policy for deployments with cluster peering connections. Refer to the [Sameness groups overview](/consul/docs/connect/cluster-peering/usage/create-sameness-groups) for more information. - Sameness groups is currently a "Beta" feature in Consul v1.16.0 and is an Enterprise feature. + Sameness groups is currently a beta feature in Consul Enterprise v1.16.0. - **Permissive mTLS:** You can enable the `permissive` mTLS mode to enable sidecar proxies to accept both mTLS and non-mTLS traffic. Using this mode enables you to onboard services without downtime and without reconfiguring or redeploying your application. Refer to the [Onboard services while in transparent proxy mode](/consul/docs/k8s/connect/onboarding-tproxy-mode) for more information on how to use permissive mTLS to onboard services to Consul. @@ -23,7 +23,7 @@ We are pleased to announce the following Consul updates. - [Route traffic to virtual services](/consul/docs/k8s/l7-traffic/route-to-virtual-services) - [Configure failover services](/consul/docs/k8s/l7-traffic/failover-tproxy). -- **Granular server-side rate limits:** You can now set limits per source IP address. The following steps describe the general process for setting global read and write rate limits: +- **Granular server-side rate limits:** You can now set limits per source IP address in Consul Enterprise. The following steps describe the general process for setting global read and write rate limits: 1. Set arbitrary limits to begin understanding the upper boundary of RPC and gRPC loads in your network. Refer to [Initialize rate limit settings](/consul/docs/agent/limits/usage/init-rate-limits) for additional information. 1. Monitor the metrics and logs and readjust the initial configurations as necessary. Refer to [Monitor rate limit data](/consul/docs/agent/limits/usage/monitor-rate-limits) @@ -39,11 +39,17 @@ We are pleased to announce the following Consul updates. - **Simplified API Gateway installation for Consul on Kubernetes:** API Gateway is now built into Consul. This enables a simplified installation and configuration process for Consul on Kubernetes. Refer to the [API Gateway installation](/consul/docs/api-gateway/install) for more information on the simplified native installation method. -- **FIPS compliance:** Consul Enterprise now offers FIPS 140-2 compliant builds that meet the security needs of federal agencies protecting sensitive, unclassified information with approved cryptographic measures. These builds use certified cryptographic modules and restrict configuration settings to comply with FIPS 140-2 Level 1 requirements, enabling compliant Consul deployments. Refer to the [Consul Enterprise FIPS](/consul/docs/enterprise/fips) for more information on FIPS compliance. +- **FIPS compliance (Enterprise):** HashiCorp now offers FIPS 140-2 compliant builds of Consul Enterprise that meet the security needs of federal agencies protecting sensitive, unclassified information with approved cryptographic measures. 
These builds use certified cryptographic modules and restrict configuration settings to comply with FIPS 140-2 Level 1 requirements, enabling compliant Consul deployments. Refer to the [Consul Enterprise FIPS](/consul/docs/enterprise/fips) for more information on FIPS compliance. - **JWT Authorization with service intentions:** Consul can now authorize connections based on claims present in JSON Web Token (JWT). You can configure Consul to use one or more JWT providers, which lets you control access to services and specific HTTP paths based on the validity of JWT claims embedded in the service traffic. This ensures a uniform and low latency mechanism to validate and authorize communication based on JWT claims across all services in a diverse service-oriented architecture. Refer to the [Use JWT authorization with service intentions](/consul/docs/connect/intentions/jwt-authorization) for more information. -- **Automated license utilization reporting:** Consul Enteprise now provides automated license utilization reporting, which sends minimal product-license metering data to HashiCorp. You can use these reports to understand how much more you can deploy under your current contract, which can help you protect against overutilization and budget for predicted consumption. Refer to the [Automated license utilization reporting documentation](/consul/docs/enterprise/license/utilization-reporting) for more information. +- **Automated license utilization reporting (Enterprise):** Consul Enterprise now provides automated license utilization reporting, which sends minimal product-license metering data to HashiCorp. You can use these reports to understand how much more you can deploy under your current contract, which can help you protect against overutilization and budget for predicted consumption. Refer to the [Automated license utilization reporting documentation](/consul/docs/enterprise/license/utilization-reporting) for more information. + +## What's deprecated + +- **Ingress gateway:** Starting with this release, ingress gateway is deprecated and will not be enhanced beyond its current capabilities. Ingress gateway is fully supported in this version but may be removed in a future release of Consul. + + Consul's API gateway is the recommended alternative to ingress gateway. For ingress gateway features not currently supported by API gateway, equivalent functionality will be added to API gateway over the next several releases of Consul. ## Upgrading @@ -55,4 +61,4 @@ The changelogs for this major release version and any maintenance versions are l These links take you to the changelogs on the GitHub website. -- [1.16.0-rc1](https://github.com/hashicorp/consul/releases/tag/v1.16.0-rc1) \ No newline at end of file +- [1.16.0](https://github.com/hashicorp/consul/releases/tag/v1.16.0) \ No newline at end of file diff --git a/website/content/docs/security/acl/tokens/create/create-a-mesh-gateway-token.mdx b/website/content/docs/security/acl/tokens/create/create-a-mesh-gateway-token.mdx new file mode 100644 index 0000000000000..9736beb50d9e8 --- /dev/null +++ b/website/content/docs/security/acl/tokens/create/create-a-mesh-gateway-token.mdx @@ -0,0 +1,508 @@ +--- +layout: docs +page_title: Create a token for mesh gateway registration +description: >- + Learn how to create ACL tokens that your mesh gateway can present to Consul servers so that they + can register with the Consul catalog. +--- + +# Create a mesh gateway token + +This topic describes how to create a token for a mesh gateway. 
+ +## Introduction + +Mesh gateways enable service-to-service traffic between Consul datacenters or between Consul admin +partitions. They also enable datacenters to be federated across wide area networks. Refer to [Mesh +Gateways](/consul/docs/connect/gateways#mesh-gateways) for additional information. + +Gateways must present a token linked to policies that grant the appropriate set of permissions in +order to be discoverable and to route to other services in a mesh. + +## Requiremements + +Core ACL functionality is available in all versions of Consul. + +The mesh gateway must present a token linked to a policy that grants the following permissions: + +* `mesh:write` to obtain leaf certificates for terminating TLS connections +* `peering:read` for Consul cluster peering through mesh gateways. If you are not using cluster + peering or if the mesh gateway is not in the `default` partition, then you can omit the + `peering:read` permission. +* `service:write` to allow the mesh gateway to register into the catalog +* `service:read` for all services and `node:read` for all nodes in order to discover and route to services +* `agent:read` to enable the `consul connect envoy` CLI command to automatically discover gRPC + settings from the Consul agent. If this command is not used to start the gateway or if the Consul + agent uses the default gRPC settings, then you can omit the `agent:read` permission. + +@include 'create-token-requirements.mdx' + +## Consul OSS + +To create a token for the mesh gateway, you must define a policy, register the policy with Consul, and link the policy to a token. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +The following example policy is defined in a file. The policy grants the appropriate permissions to register as a service named `mesh-gateway` and to operate as a mesh gateway. + + + +```hcl +mesh = "write" +peering = "read" +service "mesh-gateway" { + policy = "write" +} +service_prefix "" { + policy = "read" +} +node_prefix "" { + policy = "read" +} +agent_prefix "" { + policy = "read" +} +``` + +```json +{ + "mesh": "write", + "peering": "read", + "service": { + "mesh-gateway": [{ + "policy": "write" + }] + }, + "service_prefix": { + "": [{ + "policy": "read" + }] + }, + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "agent_prefix": { + "": [{ + "policy": "read" + }] + } +} +``` + + + + +### Register the policy with Consul + +After defining the policy, you can register the policy with Consul using the command line or API endpoint. + +The following commands create the ACL policy and token. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `mgw-register.hcl`: + +```shell-session +$ consul acl policy create \ + -name "mgw-register" -rules @mgw-register.hcl \ + -description "Mesh gateway policy" +``` + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `mgw-register.hcl`. You must embed policy rules in the `Rules` field of the request body. 
+ +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + –-header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "mgw-register", + "Description": "Mesh gateway policy", + "Rules": "mesh = \"write\"\npeering = \"read\"\nservice \"mesh-gateway\" {\n policy = \"write\"\n}\nservice_prefix \"\" {\n policy = \"read\"\n}\nnode_prefix \"\" {\n policy = \"read\"\n}\nagent_prefix \"\" {\n policy = \"read\"\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policy into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +The following command creates the ACL token linked to the policy `mgw-register`. + +```shell-session +$ consul acl token create \ + -description "Mesh gateway token" \ + -policy-name "mgw-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + –-header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "mgw-register" + } + ] +}' +``` + + + + + +## Consul Enterprise in default partition + +To create a token for the mesh gateway, you must define a policy, register the policy with Consul, and link the policy to a token. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +You can specify an admin partition and namespace when using Consul Enterprise. Mesh gateways must register into the `default` namespace. + +The following example policy is defined in a file. The policy grants the appropriate permissions to register as a service named `mesh-gateway` and to operate as a mesh gateway in the default partition. 
+ + + +```hcl +mesh = "write" +partition_prefix "" { + peering = "read" +} +partition "default" { + namespace "default" { + service "mesh-gateway" { + policy = "write" + } + agent_prefix "" { + policy = "read" + } + } + namespace_prefix "" { + node_prefix "" { + policy = "read" + } + service_prefix "" { + policy = "read" + } + } +} +``` + +```json +{ + "mesh": "write", + "partition": { + "default": [{ + "namespace": { + "default": [{ + "service": { + "mesh-gateway": [{ + "policy": "write" + }] + }, + "agent_prefix": { + "": [{ + "policy": "read" + }] + } + }] + }, + "namespace_prefix": { + "": [{ + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "service_prefix": { + "": [{ + "policy": "read" + }] + } + }] + } + }] + }, + "partition_prefix": { + "": [{ + "peering": "read" + }] + } +} +``` + + + + +### Register the policy with Consul + +After defining the policy, you can register the policy with Consul using the command line or API endpoint. + +The following commands create the ACL policy and token. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `mgw-register.hcl`: + +You can specify an admin partition when creating policies in Consul Enterprise. The policy is only valid in the specified admin partition. You must create the policy in the partition where the mesh gateway registers. The following example creates the policy in the `default` partition. + +```shell-session +$ consul acl policy create -partition "default" \ + -name mgw-register -rules @mgw-register.hcl \ + -description "Mesh gateway policy" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `mgw-register.hcl`. You must embed policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + –-header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "mgw-register", + "Description": "Mesh gateway policy", + "Partition": "default", + "Rules": "mesh = \"write\"\npeering = \"read\"\npartition_prefix \"\" {\n peering = \"read\"\n}\nnamespace \"default\" {\n service \"mesh-gateway\" {\n policy = \"write\"\n }\n agent_prefix \"\" {\n policy = \"read\"\n }\n}\nnamespace_prefix \"\" {\n node_prefix \"\" {\n \tpolicy = \"read\"\n }\n service_prefix \"\" {\n policy = \"read\"\n }\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policy into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +You can specify an admin partition when creating tokens in Consul Enterprise. The token is only valid in the specified admin partition. 
The token must be created in the partition where the mesh gateway registers. The following example creates the token in the `default` partition.
+
+```shell-session
+$ consul acl token create -partition "default" \
+  -description "Mesh gateway token" \
+  -policy-name "mgw-register"
+```
+
+
+
+
+Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint.
+
+You can specify an admin partition when creating tokens in Consul Enterprise. The token is only valid in the specified admin partition. The token must be created in the partition where the mesh gateway registers. The following example creates the token in the `default` partition.
+
+```shell-session
+$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \
+  --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \
+  --data '{
+  "Policies": [
+    {
+      "Name": "mgw-register"
+    }
+  ],
+  "Partition": "default"
+}'
+```
+
+
+
+
+
+## Consul Enterprise in non-default partition
+
+To create a token for the mesh gateway, you must define a policy, register the policy with Consul, and link the policy to a token.
+
+### Define a policy
+
+You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies.
+
+You can specify an admin partition and namespace when using Consul Enterprise. Mesh gateways must register into the `default` namespace. To register a mesh gateway in a non-default partition, create the ACL policy and token in the partition where the mesh gateway registers.
+
+The following example policy is defined in a file. The policy grants the appropriate permissions to register as a service named `mesh-gateway` and to operate as a mesh gateway in a non-default partition.
+
+
+
+```hcl
+mesh = "write"
+namespace "default" {
+  service "mesh-gateway" {
+    policy = "write"
+  }
+  agent_prefix "" {
+    policy = "read"
+  }
+}
+namespace_prefix "" {
+  node_prefix "" {
+    policy = "read"
+  }
+  service_prefix "" {
+    policy = "read"
+  }
+}
+```
+
+```json
+{
+  "mesh": "write",
+  "namespace": {
+    "default": [{
+      "service": {
+        "mesh-gateway": [{
+          "policy": "write"
+        }]
+      },
+      "agent_prefix": {
+        "": [{
+          "policy": "read"
+        }]
+      }
+    }]
+  },
+  "namespace_prefix": {
+    "": [{
+      "node_prefix": {
+        "": [{
+          "policy": "read"
+        }]
+      },
+      "service_prefix": {
+        "": [{
+          "policy": "read"
+        }]
+      }
+    }]
+  }
+}
+```
+
+
+### Register the policy with Consul
+
+After defining the policy, you can register the policy with Consul using the command line or API endpoint.
+
+The following commands create the ACL policy and token.
+
+
+
+
+Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `mgw-register.hcl`:
+
+You can specify an admin partition when creating policies in Consul Enterprise. The policy is only valid in the specified admin partition. You must create the policy in the partition where the mesh gateway registers. The following example creates the policy in the partition `ptn1`.
+ +```shell-session +$ consul acl policy create -partition "ptn1" \ + -name mgw-register -rules @mgw-register.hcl \ + -description "Mesh gateway policy" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `mgw-register.hcl`. You must embed policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + –-header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "mgw-register", + "Description": "Mesh gateway policy", + "Partition": "ptn1", + "Rules": "mesh = \"write\"\npeering = \"read\"\nnamespace \"default\" {\n service \"mesh-gateway\" {\n policy = \"write\"\n }\n agent_prefix \"\" {\n policy = \"read\"\n }\n}\nnamespace_prefix \"\" {\n node_prefix \"\" {\n \tpolicy = \"read\"\n }\n service_prefix \"\" {\n policy = \"read\"\n }\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policy into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +You can specify an admin partition when creating tokens in Consul Enterprise. The token is only valid in the specified admin partition. The token must be created in the partition where the mesh gateway registers. The following example creates the token in the partition `ptn1`. + +```shell-session +$ consul acl token create -partition "ptn1" \ + -description "Mesh gateway token" \ + -policy-name "mgw-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +You can specify an admin partition when creating tokens in Consul Enterprise. The token is only valid in the specified admin partition. The token must be created in the partition where the mesh gateway registers. The following example creates the token in the partition `ptn1`. 
+ +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + –-header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "mgw-register" + } + ], + "Partition": "ptn1" +}' +``` + + + + diff --git a/website/content/docs/security/acl/tokens/create/create-a-service-token.mdx b/website/content/docs/security/acl/tokens/create/create-a-service-token.mdx new file mode 100644 index 0000000000000..125390b398305 --- /dev/null +++ b/website/content/docs/security/acl/tokens/create/create-a-service-token.mdx @@ -0,0 +1,416 @@ +--- +layout: docs +page_title: Create tokens for service registration +description: >- + Learn how to create ACL tokens that your services can present to Consul servers so that they can register with the Consul catalog. +--- + +# Create a service token + +This topic describes how to create a token that you can use to register a service and discover services in the Consul catalog. If you are using Consul service mesh, a sidecar proxy can use the token to discover and route traffic to other services. + +## Introduction + +Services must present a token linked to policies that grant the appropriate set of permissions in order to be discoverable or to interact with other services in a mesh. + +### Service identities versus custom policies + +You can create tokens linked to custom policies or to service identities. [Service identities](/consul/docs/security/acl#service-identities) are constructs in Consul that enable you to quickly grant permissions for a group of services, rather than creating similar policies for each service. + +We recommend using a service identity to grant permissions for service discovery and service mesh use cases rather than creating a custom policy. This is because service identities automatically grant the service and its sidecar proxy `service:write`, `service:read`, and `node:read`. + +Your organization may have requirements or processes for deploying services in a way that is inconsistent with service and node identities. In these cases, you can create custom policies and link them to tokens. + +## Requirements + +Core ACL functionality is available in all versions of Consul. + +The service token must be linked to policies that grant the following permissions: + +* `service:write`: Enables the service to update the catalog. If service mesh is enabled, the service's sidecar proxy can also update the catalog. Note that this permission implicitly grants `intention:read` permission to sidecar proxies so that they can read and enforce intentions. Refer to [Intention Management Permissions](/consul/docs/connect/intentions#intention-management-permissions) for details. +* `service:read`: Enables the service to learn about other services in the network. If service mesh is enabled, the service's sidecar proxy can also learn about other services in the network. +* `node:read`: Enables the sidecar proxy to discover and route traffic to other services in the catalog if service mesh is enabled. + +@include 'create-token-requirements.mdx' + +## Service identity in Consul OSS + +Refer to [Service identities](/consul/docs/security/acl#service-identities) for information about creating service identities that you can link to tokens. + +You can manually create tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). 
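+
+The CLI reads the authentication token from the `CONSUL_HTTP_TOKEN` environment variable, and the API examples pass the same variable in the `X-Consul-Token` header. The token must be linked to a policy that grants `acl:write`, such as the bootstrap management token. As a minimal sketch, assuming the management token secret is saved in a local file named `mgmt.token` (a hypothetical path), export the agent address and token before running the commands that follow:
+
+```shell-session
+$ export CONSUL_HTTP_ADDR="http://127.0.0.1:8500"
+$ export CONSUL_HTTP_TOKEN="$(cat ./mgmt.token)"
+```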
+ + + + + +Run the `consul acl token create` command and specify the policy or service identity to link to create a token. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +The following example creates an ACL token linked to a service identity for a service named `svc1`. + +```shell-session +$ consul acl token create \ + -description "Service token for svc1" \ + -service-identity "svc1" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify a service identity in the request body to create a token linked to the service identity. An ACL token linked to a policy with permissions to use the API endpoint is required. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +The following example creates a token linked to a service identity named `svc1`: + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "ServiceIdentities": [ + { + "ServiceName": "svc1" + } + ] +}' +``` + + + + + +## Service identity in Consul Enterprise + +Refer to [Service identities](/consul/docs/security/acl#service-identities) for information about creating service identities that you can link to tokens. + +You can manually create tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy or service identity to link to create a token. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +You can specify an admin partition, namespace, or both when creating tokens in Consul Enterprise. The token can only include permissions in the specified scope, if any. The following example creates an ACL token that the service can use to register in the `ns1` namespace of partition `ptn1`: + +```shell-session +$ consul acl token create -partition "ptn1" -namespace "ns1" \ + -description "Service token for svc1" \ + -service-identity "svc1" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify a service identity in the request body to create a token linked to the service identity. An ACL token linked to a policy with permissions to use the API endpoint is required. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +You can specify an admin partition and namespace when creating tokens in Consul Enterprise. The token is only valid in the specified scopes. The following example creates an ACL token that the service can use to register in the `ns1` namespace of partition `ptn1`: + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "ServiceIdentities": [ + { + "ServiceName": "svc1" + } + ], + "Namespace": "ns1", + "Partition": "ptn1" +}' +``` + + + + + +## Custom policy in Consul OSS + +When you are unable to link tokens to a service identity, you can define policies, register them with Consul, and link the policies to tokens that enable services to register into the Consul catalog. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. 
Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +The following example policy is defined in a file. The policy grants the `svc1` service `write` permissions so that it can register into the catalog. For service mesh, the policy grants the `svc1-sidecar-proxy` service `write` permissions so that the sidecar proxy can register into the catalog. It grants service and node `read` permissions to discover and route to other services. + + + +```hcl +service "svc1" { + policy = "write" +} +service "svc1-sidecar-proxy" { + policy = "write" +} +service_prefix "" { + policy = "read" +} +node_prefix "" { + policy = "read" +} +``` + +```json +{ + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "service": { + "svc1": [{ + "policy": "write" + }], + "svc1-sidecar-proxy": [{ + "policy": "write" + }] + }, + "service_prefix": { + "": [{ + "policy": "read" + }] + } +} +``` + + + + +### Register the policy with Consul + +After defining the policies, you can register them with Consul using the command line or API endpoint. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `svc1-register.hcl`: + + +```shell-session +$ consul acl policy create \ + -name "svc1-register" -rules @svc1-register.hcl \ + -description "Allow svc1 to register into the catalog" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl token create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `svc1-register.hcl`. You must embed policy rules in the `Rules` field of the request body + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "svc1-register", + "Description": "Allow svc1 to register into the catalog", + "Rules": "service \"svc1\" {\n policy = \"write\"\n}\nservice \"svc1-sidecar-proxy\" {\n policy = \"write\"\n}\nservice_prefix \"\" {\n policy = \"read\"\n}\nnode_prefix \"\" {\n policy = \"read\"\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policies into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +The following commands create the ACL token linked to the policy `svc1-register`. + +```shell-session +$ consul acl token create \ + -description "Service token for svc1" \ + -policy-name "svc1-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. 
+ +The following example creates an ACL token that the `svc1` service can use to register in the `ns1` namespaces of partition `ptn1`: + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "svc1-register" + } + ] +}' +``` + + + + + +## Custom policy in Consul Enterprise + +When you are unable to link tokens to a service identity, you can define policies, register them with Consul, and link the policies to tokens that enable services to register into the Consul catalog. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +You can specify an admin partition and namespace when creating policies in Consul Enterprise. The policy is only valid in the specified scopes. + +The following example policy is defined in a file. The policy allows the `svc1` service to register in the `ns1` namespace of partition `ptn1`. For service mesh, the policy grants the `svc1-sidecar-proxy` service `write` permissions so that the sidecar proxy can register into the catalog. It grants service and node `read` permissions to discover and route to other services. + + + +```hcl +partition "ptn1" { + namespace "ns1" { + service "svc1" { + policy = "write" + } + service "svc1-sidecar-proxy" { + policy = "write" + } + service_prefix "" { + policy = "read" + } + node_prefix "" { + policy = "read" + } + } +} +``` + +```json +{ + "partition": { + "ptn1": [{ + "namespace": { + "ns1": [{ + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "service": { + "svc1": [{ + "policy": "write" + }], + "svc1-sidecar-proxy": [{ + "policy": "write" + }] + }, + "service_prefix": { + "": [{ + "policy": "read" + }] + } + }] + } + }] + } +} +``` + + + + +### Register the policy with Consul + +After defining the policies, you can register them with Consul using the command line or API endpoint. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `svc1-register.hcl`: + + +```shell-session +$ consul acl policy create -partition "ptn1" -namespace "ns1" \ + -name "svc1-register" -rules @svc1-register.hcl \ + -description "Custom policy for service svc1" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl token create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `svc1-register.hcl`. You must embed policy rules in the `Rules` field of the request body + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "svc1-register", + "Description": "Allow svc1 to register into the catalog", + "Namespace": "ns1", + "Partition": "ptn1", + "Rules": "partition \"ptn1\" {\n namespace \"ns1\" {\n service \"svc1\" {\n policy = \"write\"\n }\n service \"svc1-sidecar-proxy\" {\n policy = \"write\"\n }\n service_prefix \"\" {\n policy = \"read\"\n }\n node_prefix \"\" {\n policy = \"read\"\n }\n }\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. 
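+
+To confirm that the policy was stored in the intended scope, you can optionally read it back by name. The following sketch assumes the policy was registered as shown above; the `partition` and `ns` query parameters apply to Consul Enterprise:
+
+```shell-session
+$ curl --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \
+    "http://127.0.0.1:8500/v1/acl/policy/name/svc1-register?partition=ptn1&ns=ns1"
+```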
+ + + + + +### Link the policy to a token + +After registering the policies into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +You can specify an admin partition and namespace when creating tokens in Consul Enterprise. The token is only valid in the specified scopes. The following example creates an ACL token that the service can use to register in the `ns1` namespace of partition `ptn1`: + +The following commands create the ACL token linked to the policy `svc1-register`. + +```shell-session +$ consul acl token create -partition "ptn1" -namespace "ns1" \ + -description "Service token for svc1" \ + -policy-name "svc1-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +You can specify an admin partition and namespace when creating tokens in Consul Enterprise. The token is only valid in the specified scopes. The following example creates an ACL token that the service can use to register in the `ns1` namespace of partition `ptn1`: + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "svc1-register" + } + ], + "Namespace": "ns1", + "Partition": "ptn1" +}' +``` + + + + diff --git a/website/content/docs/security/acl/tokens/create/create-a-terminating-gateway-token.mdx b/website/content/docs/security/acl/tokens/create/create-a-terminating-gateway-token.mdx new file mode 100644 index 0000000000000..5ba304a9f84bf --- /dev/null +++ b/website/content/docs/security/acl/tokens/create/create-a-terminating-gateway-token.mdx @@ -0,0 +1,352 @@ +--- +layout: docs +page_title: Create a token for terminating gateway registration +description: >- + Learn how to create ACL tokens that your terminating gateway can present to Consul servers so that they can register with the Consul catalog. +--- + +# Create a terminating gateway token + +This topic describes how to create an ACL token that enables a terminating gateway to register with Consul. + +## Introduction + +Terminating gateways enable connectivity within your organizational network from services in the Consul service mesh to services and destinations outside the mesh. + +To learn how to configure terminating gateways, refer to the [Terminating Gateways](/consul/docs/connect/gateways/terminating-gateway#terminating-gateway-configuration) documentation and the [Understand Terminating Gateways](/consul/tutorials/developer-mesh/service-mesh-terminating-gateways) tutorial. + +## Requirements + +Core ACL functionality is available in all versions of Consul. + +The terminating gateway token must be linked to policies that grant the appropriate set of permissions in order to be discoverable and to forward traffic out of the mesh. 
The following permissions are required: + +* `service:write` to allow the terminating gateway to register into the catalog +* `service:write` for each service that it forwards traffic for +* `node:read` for the nodes of each service that it forwards traffic for +* `service:read` for all services and `node:read` for all nodes in order to discover and route to services +* `agent:read` to enable the `consul connect envoy` CLI command to automatically discover gRPC settings from the Consul agent. If this command is not used to start the gateway or if the Consul agent uses the default gRPC settings, then you can omit the `agent:read` permission. + +@include 'create-token-requirements.mdx' + +## Consul OSS + +To create a token for the terminating gateway, you must define a policy, register the policy with Consul, and link the policy to a token. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +The following example policy is defined in a file. The policy grants the appropriate permissions to register as a service named `terminating-gateway` and to operate as a terminating gateway. For this example, the terminating gateway forwards traffic for two services named `external-service-1` and `external-service-2`. The policy examples include `service:write` permissions for these services. If you have additional services, your policy must include `service:write` permissions for the additional services to be included in the policy rules. + + + +```hcl +service "terminating-gateway" { + policy = "write" +} +service "external-service-1" { + policy = "write" +} +service "external-service-2" { + policy = "write" +} +node_prefix "" { + policy = "read" +} +agent_prefix "" { + policy = "read" +} +``` + +```json +{ + "service": { + "terminating-gateway": [{ + "policy": "write" + }], + "external-service-1": [{ + "policy": "write" + }], + "external-service-2": [{ + "policy": "write" + }] + }, + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "agent_prefix": { + "": [{ + "policy": "read" + }] + } +} +``` + + + + +### Register the policy with Consul + +After defining the policy, you can register the policy with Consul using the command line or API endpoint. + +The following commands create the ACL policy and token. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `tgw-register.hcl`: + +```shell-session +$ consul acl policy create \ + -name "tgw-register" -rules @tgw-register.hcl \ + -description "Terminating gateway policy" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `tgw-register.hcl`. You must embed policy rules in the `Rules` field of the request body. 
+ +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "tgw-register", + "Description": "Terminating gateway policy", + "Rules": "service \"terminating-gateway\" {\n policy = \"write\"\n}\nservice \"external-service-1\" {\n policy = \"write\"\n}\nservice \"external-service-2\" {\n policy = \"write\"\n}\nnode_prefix \"\" {\n policy = \"read\"\n}\nagent_prefix \"\" {\n policy = \"read\"\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policy into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +The following command creates the ACL token linked to the policy `tgw-register`. + +```shell-session +$ consul acl token create \ + -description "Terminating gateway token" \ + -policy-name "tgw-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "tgw-register" + } + ] +}' +``` + + + + + +## Consul Enterprise + +To create a token for the terminating gateway, you must define a policy, register the policy with Consul, and link the policy to a token. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +You can specify an admin partition and namespace when creating policies in Consul Enterprise. The policy is only valid in the specified scopes. + +The following example policy is defined in a file. The policy grants the appropriate permissions for a terminating gateway to register as a service named `terminating-gateway` in namespace `ns1` in partition `ptn1`. + +For this example, the terminating gateway forwards traffic for the following two services: + +* `external-service-1` in the `default` namespace +* `external-service-2` in the `ns1` namespace + +The policy examples include `service:write` permissions for these services. If you have additional services, your policy must include `service:write` permissions for the additional services to be included in the policy rules. + +The policy contains permissions for resources in multiple namespaces. You must create ACL policies that grant permissions for multiple namespaces in the `default` namespace. 
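+
+Because the rules span the `default` and `ns1` namespaces, you may want to confirm that both namespaces exist in the `ptn1` partition before writing the policy. One optional check, assuming namespaces and admin partitions are enabled in your Consul Enterprise cluster, is to list the namespaces through the HTTP API:
+
+```shell-session
+$ curl --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \
+    "http://127.0.0.1:8500/v1/namespaces?partition=ptn1"
+```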
+ + + +```hcl +partition "ptn1" { + namespace "ns1" { + service "terminating-gateway" { + policy = "write" + } + node_prefix "" { + policy = "read" + } + service "external-service-2" { + policy = "write" + } + } + namespace "default" { + service "external-service-1" { + policy = "write" + } + node_prefix "" { + policy = "read" + } + agent_prefix "" { + policy = "read" + } + } +} +``` + +```json +{ + "partition": { + "ptn1": [{ + "namespace": { + "default": [{ + "agent_prefix": { + "": [{ + "policy": "read" + }] + }, + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "service": { + "external-service-1": [{ + "policy": "write" + }] + } + }], + "ns1": [{ + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "service": { + "external-service-2": [{ + "policy": "write" + }], + "terminating-gateway": [{ + "policy": "write" + }] + } + }] + } + }] + } +} +``` + + + +### Register the policy with Consul + +After defining the policy, you can register the policy with Consul using the command line or API endpoint. + +You can specify an admin partition and namespace when creating policies in Consul Enterprise. The policy is only valid in the specified admin partition and namespace. You must create the policy in the same partition where the terminating gateway is registered. If the terminating gateway requires permissions for multiple namespaces, then the policy must be created in the `default` namespace. The following example creates the policy in the partition `ptn1` and `default` namespace because the example policy contains permissions for multiple namespaces. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `tgw-register.hcl`: + +```shell-session +$ consul acl policy create -partition "ptn1" -namespace "default" \ + -name "tgw-register" -rules @tgw-register.hcl \ + -description "Terminating gateway policy" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `tgw-register.hcl`. You must embed policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "tgw-register", + "Description": "Terminating gateway policy", + "Partition": "ptn1", + "Namespace": "default", + "Rules": "partition \"ptn1\" {\n namespace \"ns1\" {\n service \"terminating-gateway\" {\n policy = \"write\"\n }\n node_prefix \"\" {\n policy = \"read\"\n }\n service \"external-service-2\" {\n policy = \"write\"\n }\n }\n namespace \"default\" {\n service \"external-service-1\" {\n policy = \"write\"\n }\n node_prefix \"\" {\n policy = \"read\"\n }\n agent_prefix \"\" {\n policy = \"read\"\n }\n }\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policy into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). 
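+
+Before linking the policy to a token, you can optionally verify that `tgw-register` exists in the expected scope. The following sketch assumes the policy was registered in partition `ptn1` and namespace `default` as shown in the previous step:
+
+```shell-session
+$ consul acl policy read -name "tgw-register" \
+    -partition "ptn1" -namespace "default"
+```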
+ +You can specify an admin partition when creating tokens in Consul Enterprise. The token is only valid in the specified admin partition. You must create the token in the partition where the terminating gateway is registered. If the terminating gateway requires permissions for multiple namespaces, then the token must be created in the `default` namespace. The following example creates the token in the `default` namespace in the `ptn1` partition because the example policy contains permissions for multiple namespaces. + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +```shell-session +$ consul acl token create -partition "ptn1" -namespace "default" \ + -description "Terminating gateway token" \ + -policy-name "tgw-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "tgw-register" + } + ], + "Partition": "ptn1", + "Namespace": "default" +}' +``` + + + + diff --git a/website/content/docs/security/acl/tokens/create/create-a-ui-token.mdx b/website/content/docs/security/acl/tokens/create/create-a-ui-token.mdx new file mode 100644 index 0000000000000..9c1e9019b5e78 --- /dev/null +++ b/website/content/docs/security/acl/tokens/create/create-a-ui-token.mdx @@ -0,0 +1,557 @@ +--- +layout: docs +page_title: Create tokens for agent registration +description: >- + Learn how to create ACL tokens that your Consul agents can present to Consul servers so that they can join the Consul cluster. +--- + +# Create a UI token + +This topic describes how to create a token that you can use to view resources in the Consul UI. + +## Introduction + +To navigate the Consul UI when ACLs are enabled, log into the UI with a token linked to policies that grant an appropriate set of permissions. The UI is unable to display resources that the token does not have permission to access. + +## Requirements + +Core ACL functionality is available in all versions of Consul. + +@include 'create-token-requirements.mdx' + +## View catalog in Consul OSS + +This section describes how to create a token that grants read-only access to the catalog. This token allows users to view the catalog without the ability to make changes. To create the ACL token, define a policy, create the policy, and then link the policy to a token. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +The following example policy is defined in a file. The policy allows users that login with the token to view all services and nodes in the catalog. 
+ + + +```hcl +service_prefix "" { + policy = "read" +} +node_prefix "" { + policy = "read" +} +``` + +```json +{ + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "service_prefix": { + "": [{ + "policy": "read" + }] + } +} +``` + + + +### Register the policy with Consul + +After defining the policies, you can register them with Consul using the command line or API endpoint. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `ui-view-catalog.hcl`. + +```shell-session +$ consul acl policy create \ + -name "ui-view-catalog" -rules @ui-view-catalog.hcl \ + -description "Allow viewing the catalog" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `view-catalog.hcl`. You must embed the policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "ui-view-catalog", + "Description": "Allow viewing the catalog", + "Rules": "service_prefix \"\" {\n policy = \"read\"\n}\nnode_prefix \"\" {\n policy = \"read\"\n}\n" +}' +``` + + + + + +### Link the policy to a token + +After registering the policies into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +The following command creates the ACL token linked to the policy `ui-view-catalog`. + +```shell-session +$ consul acl token create \ + -description "UI token to view the catalog" \ + -policy-name "ui-view-catalog" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +The following example creates an ACL token that you can use to login to the UI and view the catalog. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "ui-view-catalog" + } + ] +}' +``` + + + + + +## View catalog in Consul Enterprise + +This section describes how to create a token that grants read-only access to the catalog. This token allows users to view the catalog without the ability to make changes. To create the ACL token, define a policy, create the policy, and then link the policy to a token. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +The following example policy is defined in a file. 
The following policy allows users that log in with the token to view services and nodes in the catalog in any partition and in any namespace. The `operator:read` permission is needed to list partitions. Without this permission, you can still view resources within a partition but cannot easily navigate to other partitions in the Consul UI. + + + +```hcl +operator = "read" +partition_prefix "" { + namespace_prefix "" { + service_prefix "" { + policy = "read" + } + node_prefix "" { + policy = "read" + } + } +} +``` + +```json +{ + "partition_prefix": { + "": [{ + "namespace_prefix": { + "": [{ + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "service_prefix": { + "": [{ + "policy": "read" + }] + } + }] + } + }] + } +} +``` + + + +### Register the policy with Consul + +After defining the policies, you can register them with Consul using the command line or API endpoint. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `ui-view-catalog.hcl`. + +You can specify an admin partition and namespace when registering policies in Consul Enterprise. Policies are only valid in the scopes specified during registration, but you can grant tokens registered in the `default` partition permission to access resources in a different partition than where the token was registered. Refer to the [admin partition documentation](/consul/docs/enterprise/admin-partitions#default-admin-partition) for additional information. + +The following example registers the policy in the `default` partition and the `default` namespace because the policy grants cross-partition and cross-namespace access. + +```shell-session +$ consul acl policy create -partition "default" -namespace "default" \ + -name "ui-view-catalog" -rules @ui-view-catalog.hcl \ + -description "Allow viewing the catalog" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `view-catalog.hcl`. You must embed the policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "ui-view-catalog", + "Description": "Allow viewing the catalog", + "Partition": "default", + "Namespace": "default", + "Rules": "partition_prefix \"\" {\n namespace_prefix \"\" {\n service_prefix \"\" {\n policy = \"read\"\n }\n node_prefix \"\" {\n policy = \"read\"\n }\n }\n}\n" +}' +``` + + + + + +### Link the policy to a token + +After registering the policies into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. 
+ +```shell-session +$ consul acl token create -partition "default" -namespace "default" \ + -description "UI token to view the catalog" \ + -policy-name "ui-view-catalog" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +You can specify an admin partition and namespace when registering policies in Consul Enterprise. Policies are only valid in the scopes specified during registration, but you can grant tokens registered in the `default` partition permission to access resources in a different partition than where the token was registered. Refer to the [admin partition documentation](/consul/docs/enterprise/admin-partitions#default-admin-partition) for additional information. + +The following example registers the policy in the `default` partition and the `default` namespace because the policy grants cross-partition and cross-namespace access. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "ui-view-catalog" + } + ], + "Partition": "default", + "Namespace": "default" +}' +``` + + + + + +## View all resources in Consul OSS + +This section describes how to create a token with read-only access to all resources in the Consul UI. This token allows users to view any resources without the ability to make changes. To create the ACL token, define a policy, create the policy, and then link the policy to a token. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +The following example policy is defined in a file. The policy allows users that log in with the token to view all services and nodes in the catalog, all objects in the key/value store, all intentions, and all ACL resources. The `acl:read` permission does not allow viewing the token secret ids. + + + +```hcl +acl = "read" +key_prefix "" { + policy = "read" +} +node_prefix "" { + policy = "read" +} +operator = "read" +service_prefix "" { + policy = "read" + intentions = "read" +} +``` + +```json +{ + "acl": "read", + "key_prefix": { + "": [{ + "policy": "read" + }] + }, + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "operator": "read", + "service_prefix": { + "": [{ + "intentions": "read", + "policy": "read" + }] + } +} +``` + + + +### Register the policy with Consul + +After defining the policies, you can register them with Consul using the command line or API endpoint. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `ui-view-all.hcl`. + +```shell-session +$ consul acl policy create \ + -name "ui-view-all" -rules @ui-view-all.hcl \ + -description "Allow viewing all resources" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `ui-view-all.hcl`. 
You must embed the policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "ui-view-all", + "Description": "Allow viewing all resources", + "Rules": "acl = \"read\"\nkey_prefix \"\" {\n policy = \"read\"\n}\nnode_prefix \"\" {\n policy = \"read\"\n}\noperator = \"read\"\nservice_prefix \"\" {\n policy = \"read\"\n intentions = \"read\"\n}\n" +}' +``` + + + + + +### Link the policy to a token + +After registering the policies into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +The following command creates the ACL token linked to the policy `ui-view-all`. + +```shell-session +$ consul acl token create \ + -description "UI token to view all resources" \ + -policy-name "ui-view-all" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +The following example creates an ACL token that you can use to login to the UI and view the catalog. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "ui-view-all" + } + ] +}' +``` + + + + + +## View all resources in Consul Enterprise + +This section describes how to create a token with read-only access to all resources in the Consul UI. This token allows users to view any resources without the ability to make changes. To create the ACL token, define a policy, create the policy, and then link the policy to a token. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +The following example policy is defined in a file. The policy allows users that log in with the token to view all services and nodes in the catalog, all objects in the key-value store, all intentions, and all ACL resources in any namespace and any partition. The `acl:read` permission does not allow viewing the token secret ids. + + + +```hcl +operator = "read" +partition_prefix "" { + namespace_prefix "" { + acl = "read" + key_prefix "" { + policy = "read" + } + node_prefix "" { + policy = "read" + } + service_prefix "" { + policy = "read" + intentions = "read" + } + } +} +``` + +```json +{ + "operator": "read", + "partition_prefix": { + "": [{ + "namespace_prefix": { + "": [{ + "acl": "read", + "key_prefix": { + "": [{ + "policy": "read" + }] + }, + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "service_prefix": { + "": [{ + "intentions": "read", + "policy": "read" + }] + } + }] + } + }] + } +} +``` + + + +### Register the policy with Consul + +After defining the policies, you can register them with Consul using the command line or API endpoint. 
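+
+Registering a policy requires a token that grants `acl:write`. If you are unsure which token the CLI is currently using, you can optionally inspect it before proceeding:
+
+```shell-session
+$ consul acl token read -self
+```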
+ + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `ui-view-all.hcl`. + +You can specify an admin partition and namespace when creating policies in Consul Enterprise. The policy is only valid in the specified scopes. Because the policy grants cross-partition and cross-namespace access, the policy must be created in the `default` partition and the `default` namespace. + +```shell-session +$ consul acl policy create -partition "default" -namespace "default" \ + -name "ui-view-all" -rules @ui-view-all.hcl \ + -description "Allow viewing all resources" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `ui-view-all.hcl`. You must embed the policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "ui-view-all", + "Description": "Allow viewing all resources", + "Partition": "default", + "Namespace": "default", + "Rules": "operator = \"read\"\npartition_prefix \"\" {\n namespace_prefix \"\" {\n acl = \"read\"\n key_prefix \"\" {\n policy = \"read\"\n }\n node_prefix \"\" {\n policy = \"read\"\n }\n service_prefix \"\" {\n policy = \"read\"\n intentions = \"read\"\n }\n }\n}\n" +}' +``` + + + + + +### Link the policy to a token + +After registering the policies into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +```shell-session +$ consul acl token create -partition "default" -namespace "default" \ + -description "UI token to view all resources" \ + -policy-name "ui-view-all" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +You can specify an admin partition and namespace when creating tokens in Consul Enterprise. The token is only valid in the specified scopes. Because the policy was created in the `default` partition and `default` namespace, the token must also be created in the `default` partition and `default` namespace. 
+ +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "ui-view-all" + } + ], + "Partition": "default", + "Namespace": "default" +}' +``` + + + + diff --git a/website/content/docs/security/acl/tokens/create/create-an-agent-token.mdx b/website/content/docs/security/acl/tokens/create/create-an-agent-token.mdx new file mode 100644 index 0000000000000..598db91125f20 --- /dev/null +++ b/website/content/docs/security/acl/tokens/create/create-an-agent-token.mdx @@ -0,0 +1,407 @@ +--- +layout: docs +page_title: Create tokens for agent registration +description: >- + Learn how to create ACL tokens that your Consul agents can present to Consul servers so that they can join the Consul cluster. +--- + +# Create an agent token + +This topic describes how to create a token that you can use to register an agent into the catalog. + +## Introduction + +Consul agents must present a token linked to policies that grant the appropriate set of permissions in order to register into the catalog and to discover services and nodes in the catalog. + +Specify the [`agent`](/consul/docs/agent/config/config-files#acl_tokens_agent) token to the Consul agent so that it can present the token when it registers into the catalog. + +### Node identities versus custom policies + +You can create tokens linked to custom policies or to node identities. [Node identities](/consul/docs/security/acl#node-identities) are constructs in Consul that enable you to quickly grant permissions for a group of agents, rather than create similar policies for each agent. + +We recommend using a node identity to grant permissions to the agent rather than creating a custom policy. This is because node identities automatically grant the node `node:write` and `service:read` permission. + +Your organization may have requirements or processes for deploying services in a way that is inconsistent with service and node identities. In these cases, you can create custom policies and link them to tokens. + +## Requirements + +Core ACL functionality is available in all versions of Consul. + +The agent token must be linked to policies that grant the following permissions: + +* `node:write`: Enables the agent to update the catalog. +* `service:read`: Enables the agent to discover other services in the catalog + +@include 'create-token-requirements.mdx' + +## Node identity in Consul OSS + +Refer to [Node identities](/consul/docs/security/acl#node-identities) for information about node identities that you can link to tokens. + +You can manually create tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy or node identity to link to create a token. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +The following command creates an ACL token linked to a node identity for a node named `node1` in the datacenter `dc1`. + +```shell-session +$ consul acl token create \ + -description "Agent token for node1" \ + -node-identity "node1:dc1" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify a node identity in the request body to create a token linked to the node identity. 
An ACL token linked to a policy with permissions to use the API endpoint is required. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +The following example creates a token linked to a node identity named `node1`: + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "NodeIdentities": [ + { + "NodeName": "node1", + "Datacenter": "dc1" + } + ] +}' +``` + + + + + +## Node identity in Consul Enterprise + +Refer to [Node identities](/consul/docs/security/acl#node-identities) for information about node identities that you can link to tokens. + +You can manually create tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy or node identity to link to create a token. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +You can specify an admin partition when creating tokens in Consul Enterprise. The token is only valid in the specified admin partition. The following example creates an ACL token that the agent can use to register in partition `ptn1` in datacenter `dc1`: + +```shell-session +$ consul acl token create -partition "ptn1" \ + -description "Agent token for node1" \ + -node-identity "node1:dc1" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify a node identity in the request body to create a token linked to the node identity. An ACL token linked to a policy with permissions to use the API endpoint is required. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +You can specify an admin partition when creating a token in Consul Enterprise. The token is only valid in the specified admin partition. The following example creates an ACL token that the agent can use to register in the partition `ptn1` in datacenter `dc1`: + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "NodeIdentities": [ + { + "NodeName": "node1", + "Datacenter": "dc1" + } + ], + "Partition": "ptn1" +}' +``` + + + + + +## Custom policy in Consul OSS + +When you are unable to link tokens to a node identity, you can define policies, register them with Consul, and link the policies to tokens that enable nodes to register into the Consul catalog. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +The following example policy is defined in a file. The policy grants `write` permission for node `node1` so that the Consul agent can register into the catalog. It grants `read` permissions to discover services in the catalog. 
+ + + +```hcl +node "node1" { + policy = "write" +} +service_prefix "" { + policy = "read" +} +``` + +```json +{ + "node": { + "node1": [{ + "policy": "write" + }] + }, + "service_prefix": { + "": [{ + "policy": "read" + }] + } +} +``` + + + +### Register the policy with Consul + +After defining the policies, you can register them with Consul using the command line or API endpoint. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `node1-register.hcl`: + +```shell-session +$ consul acl policy create \ + -name "node1-register" -rules @node1-register.hcl \ + -description "Custom policy for node1" \ +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `node1-register.hcl`. You must embed policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "node1-register", + "Description": "Allow node1 to register into the catalog", + "Rules": "node \"node1\" {\n policy = \"write\"\n}\nservice_prefix \"\" {\n policy = \"read\"\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policies into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +The following command creates the ACL token linked to the policy `node1-register`. + +```shell-session +$ consul acl token create \ + -description "Agent token for node1" \ + -policy-name "node1-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +The following example creates an ACL token that the agent can use to register as node `node1` in the catalog: + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "node1-register" + } + ] +}' +``` + + + + + + + +## Custom policy in Consul Enterprise + +When you are unable to link tokens to a node identity, you can define policies, register them with Consul, and link the policies to tokens that enable nodes to register into the Consul catalog. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies. + +The following example policy is defined in a file. 
The policy grants the `write` permission for node `node1` in partition `ptn1` so that the Consul agent can register into the catalog. It grants `read` permissions to discover services in any namespace in the `ptn1` partition. + + + +```hcl +partition "ptn1" { + node "node1" { + policy = "write" + } + namespace_prefix "" { + service_prefix "" { + policy = "read" + } + } +} +``` + +```json +{ + "partition": { + "ptn1": [{ + "namespace_prefix": { + "": [{ + "service_prefix": { + "": [{ + "policy": "read" + }] + } + }] + }, + "node": { + "node1": [{ + "policy": "write" + }] + } + }] + } +} +``` + + + +### Register the policy with Consul + +After defining the policies, you can register them with Consul using the command line or API endpoint. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `node1-register.hcl`: + +```shell-session +$ consul acl policy create -partition "ptn1" \ + -name "node1-register" -rules @node1-register.hcl \ + -description "Custom policy for node1" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `node1-register.hcl`. You must embed policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "node1-register", + "Description": "Allow node1 to register into the catalog", + "Partition": "ptn1", + "Rules": "partition \"ptn1\" {\n node \"node1\" {\n policy = \"write\"\n }\n namespace_prefix \"\" {\n service_prefix \"\" {\n policy = \"read\"\n }\n }\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policies into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +```shell-session +$ consul acl token create -partition "ptn1" \ + -description "Agent token for node1" \ + -policy-name "node1-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +You can specify an admin partition when creating tokens in Consul Enterprise. The token is only valid in the specified admin partition. 
The following example creates an ACL token that the agent can use to register as the node `node1` in the partition `ptn1`:
+
+```shell-session
+$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \
+    --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \
+    --data '{
+  "Policies": [
+    {
+      "Name": "node1-register"
+    }
+  ],
+  "Partition": "ptn1"
+}'
+```
+
+
+
+
+
+## Apply the token
+
+Configure the Consul agent to present the token by either specifying the token in the agent configuration file or by using the `consul acl set-agent-token` command.
+
+### Apply the token in a file
+
+Specify the token in the [`acl.tokens.agent`](/consul/docs/agent/config/config-files#acl_tokens_agent) field of the agent configuration file so that the agent can present it and register into the catalog on startup.
+
+```hcl
+acl = {
+  enabled = true
+  tokens = {
+    agent = ""
+    ...
+  }
+  ...
+}
+```
+
+### Apply the token with a command
+
+Set the `agent` token using the [`consul acl set-agent-token`](/consul/commands/acl/set-agent-token) command. The following command configures the running Consul agent to use the specified token.
+
+```shell-session
+$ consul acl set-agent-token agent "<agent-token>"
+```
diff --git a/website/content/docs/security/acl/tokens/create/create-an-ingress-gateway-token.mdx b/website/content/docs/security/acl/tokens/create/create-an-ingress-gateway-token.mdx
new file mode 100644
index 0000000000000..65e01369966ad
--- /dev/null
+++ b/website/content/docs/security/acl/tokens/create/create-an-ingress-gateway-token.mdx
@@ -0,0 +1,326 @@
+---
+layout: docs
+page_title: Create a token for ingress gateway registration
+description: >-
+  Learn how to create ACL tokens that your ingress gateway can present to Consul servers so that the gateway can register with the Consul catalog.
+---
+
+# Create an ingress gateway token
+
+This topic describes how to create a token to enable an ingress gateway to register.
+
+## Introduction
+
+Gateways must present a token linked to policies that grant the appropriate set of permissions in order to register into the catalog and to route to other services in a mesh.
+
+## Requirements
+
+Core ACL functionality is available in all versions of Consul.
+
+The ingress gateway token must be linked to policies that grant the following permissions:
+
+* `service:write` to allow the ingress gateway to register into the catalog
+* `service:read` for all services and `node:read` for all nodes in order to discover and route to services
+* `agent:read` to enable the `consul connect envoy` CLI command to automatically discover gRPC settings from the Consul agent. If this command is not used to start the gateway or if the Consul agent uses the default gRPC settings, then you can omit the `agent:read` permission.
+
+@include 'create-token-requirements.mdx'
+
+## Consul OSS
+
+To create a token for the ingress gateway, you must define a policy, register the policy with Consul, and link the policy to a token.
+
+### Define a policy
+
+You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies.
+
+The following example policy is defined in a file. The policy grants the ingress gateway the appropriate permissions to register as a service named `ingress-gateway` and to operate as an ingress gateway.
+ + + +```hcl +service "ingress-gateway" { + policy = "write" +} +node_prefix "" { + policy = "read" +} +service_prefix "" { + policy = "read" +} +agent_prefix "" { + policy = "read" +} +``` + +```json +{ + "agent_prefix": { + "": [{ + "policy": "read" + }] + }, + "node_prefix": { + "": [{ + "policy": "read" + }] + }, + "service": { + "ingress-gateway": [{ + "policy": "write" + }] + }, + "service_prefix": { + "": [{ + "policy": "read" + }] + } +} +``` + + + +### Register the policy with Consul + +After defining the policy, you can register the policy with Consul using the command line or API endpoint. + +The following commands create the ACL policy and token. + + + + + +Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `igw-register.hcl`: + +```shell-session +$ consul acl policy create \ + -name "igw-register" -rules @igw-register.hcl \ + -description "Ingress gateway policy" +``` + +Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command. + + + + + +Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `igw-register.hcl`. You must embed policy rules in the `Rules` field of the request body. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "igw-register", + "Description": "Ingress gateway policy", + "Rules": "service \"ingress-gateway\" {\n policy = \"write\"\n}\nnode_prefix \"\" {\n policy = \"read\"\n}\nservice_prefix \"\" {\n policy = \"read\"\n}\nagent_prefix \"\" {\n policy = \"read\"\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policy into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +The following command creates the ACL token linked to the policy `igw-register`. + +```shell-session +$ consul acl token create \ + -description "Ingress gateway token" \ + -policy-name "igw-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "igw-register" + } + ] +}' +``` + + + + + +## Consul Enterprise + +To create a token for the ingress gateway, you must define a policy, register the policy with Consul, and link the policy to a token. + +### Define a policy + +You can send policy definitions as command line or API arguments or define them in an external HCL or JSON file. 
Refer to [ACL Rules](/consul/docs/security/acl/acl-rules) for details about all of the rules you can use in your policies.
+
+You can specify an admin partition and namespace when creating policies in Consul Enterprise. The policy is only valid in the specified scopes.
+
+The following example policy is defined in a file. The policy allows an ingress gateway to register as a service named `ingress-gateway` in the `ptn1` partition and `ns1` namespace. The policy contains permissions for resources in multiple namespaces. You must create ACL policies that grant permissions for multiple namespaces in the `default` namespace.
+
+
+```hcl
+partition "ptn1" {
+  namespace "ns1" {
+    service "ingress-gateway" {
+      policy = "write"
+    }
+    node_prefix "" {
+      policy = "read"
+    }
+    service_prefix "" {
+      policy = "read"
+    }
+  }
+  namespace "default" {
+    agent_prefix "" {
+      policy = "read"
+    }
+  }
+}
+```
+
+```json
+{
+  "partition": {
+    "ptn1": [{
+      "namespace": {
+        "default": [{
+          "agent_prefix": {
+            "": [{
+              "policy": "read"
+            }]
+          }
+        }],
+        "ns1": [{
+          "node_prefix": {
+            "": [{
+              "policy": "read"
+            }]
+          },
+          "service": {
+            "ingress-gateway": [{
+              "policy": "write"
+            }]
+          },
+          "service_prefix": {
+            "": [{
+              "policy": "read"
+            }]
+          }
+        }]
+      }
+    }]
+  }
+}
+```
+
+
+### Register the policy with Consul
+
+After defining the policy, you can register the policy with Consul using the command line or API endpoint.
+
+The following commands create the ACL policy and token.
+
+
+
+
+Run the `consul acl policy create` command and specify the policy rules to create a policy. The following example registers a policy defined in `igw-register.hcl`.
+
+You can specify an admin partition and namespace when creating policies in Consul Enterprise. The policy is only valid in the specified admin partition and namespace. The following example creates the policy in the `default` namespace in the `ptn1` partition. The example policy contains permissions for resources in multiple namespaces. You must create ACL policies that grant permissions for multiple namespaces in the `default` namespace.
+
+```shell-session
+$ consul acl policy create -partition "ptn1" -namespace "default" \
+    -name "igw-register" -rules @igw-register.hcl \
+    -description "Ingress gateway policy"
+```
+
+Refer to [Consul ACL Policy Create](/consul/commands/acl/policy/create) for details about the `consul acl policy create` command.
+
+
+
+
+Send a PUT request to the `/acl/policy` endpoint and specify the policy rules in the request body to create a policy. The following example registers the policy defined in `igw-register.hcl`. You must embed policy rules in the `Rules` field of the request body.
+
+You can specify an admin partition and namespace when creating policies in Consul Enterprise. The policy is only valid in the specified admin partition and namespace. The following example creates the policy in the partition `ptn1` and namespace `default`. The example policy contains permissions for resources in multiple namespaces. You must create ACL policies that grant permissions for multiple namespaces in the `default` namespace.
+ +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/policy \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Name": "igw-register", + "Description": "Ingress gateway policy", + "Partition": "ptn1", + "Namespace": "default", + "Rules": "partition \"ptn1\" {\n namespace \"ns1\" {\n service \"ingress-gateway\" {\n policy = \"write\"\n }\n node_prefix \"\" {\n policy = \"read\"\n }\n service_prefix \"\" {\n policy = \"read\"\n }\n }\n namespace \"default\" {\n agent_prefix \"\" {\n policy = \"read\"\n }\n }\n}\n" +}' +``` + +Refer to [ACL Policy HTTP API](/consul/api-docs/acl/policies) for additional information about using the API endpoint. + + + + + +### Link the policy to a token + +After registering the policy into Consul, you can create and link tokens using the Consul command line or API endpoint. You can also enable Consul to dynamically create tokens from trusted external systems using an [auth method](/consul/docs/security/acl/auth-methods). + + + + + +Run the `consul acl token create` command and specify the policy name or ID to create a token linked to the policy. Refer to [Consul ACL Token Create](/consul/commands/acl/token/create) for details about the `consul acl token create` command. + +You can specify an admin partition and namespace when creating tokens in Consul Enterprise. The token is only valid in the specified admin partition and namespace. The following example creates the token in the partition `ptn1` and namespace `default`. The example policy contains permissions for resources in multiple namespaces. You must create ACL tokens linked to policies that grant permissions for multiple namespaces in the `default` namespace. + +```shell-session +$ consul acl token create -partition "ptn1" -namespace "default" \ + -description "Ingress gateway token" \ + -policy-name "igw-register" +``` + + + + + +Send a PUT request to the `/acl/token` endpoint and specify the policy name or ID in the request to create an ACL token linked to the policy. Refer to [ACL Token HTTP API](/consul/api-docs/acl/tokens) for additional information about using the API endpoint. + +You can specify an admin partition when creating tokens in Consul Enterprise. The token is only valid in the specified admin partition. You must create the token in the partition where the ingress gateway is registered. The following example creates the token in the partition `ptn1` and namespace `default`. + +```shell-session +$ curl --request PUT http://127.0.0.1:8500/v1/acl/token \ + --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \ + --data '{ + "Policies": [ + { + "Name": "igw-register" + } + ], + "Partition": "ptn1", + "Namespace": "default" +}' +``` + + + + diff --git a/website/content/docs/security/acl/acl-tokens.mdx b/website/content/docs/security/acl/tokens/index.mdx similarity index 100% rename from website/content/docs/security/acl/acl-tokens.mdx rename to website/content/docs/security/acl/tokens/index.mdx diff --git a/website/content/docs/upgrading/instructions/general-process.mdx b/website/content/docs/upgrading/instructions/general-process.mdx index 84da7327dc03f..0c560f71357a0 100644 --- a/website/content/docs/upgrading/instructions/general-process.mdx +++ b/website/content/docs/upgrading/instructions/general-process.mdx @@ -107,13 +107,7 @@ Take note of which agent is the leader. binary with the new one. **3.** The following steps must be done in order on the server agents, leaving the leader -agent for last. 
First force the server agent to leave the cluster with the following command: - -``` -consul leave -``` - -Then, use a service management system (e.g., systemd, upstart, etc.) to restart the Consul service. If +agent for last. First, use a service management system (e.g., systemd, upstart, etc.) to restart the Consul service. If you are not using a service management system, you must restart the agent manually. To validate that the agent has rejoined the cluster and is in sync with the leader, issue the diff --git a/website/content/partials/create-token-auth-methods.mdx b/website/content/partials/create-token-auth-methods.mdx new file mode 100644 index 0000000000000..9128707285031 --- /dev/null +++ b/website/content/partials/create-token-auth-methods.mdx @@ -0,0 +1,3 @@ +### Auth methods + +Auth methods are components that perform authentication against a trusted external party to authorize the creation of ACL tokens for use within the local datacenter. Refer to the [auth methods documentation](/consul/docs/security/acl/auth-methods) for details about how to leverage auth methods in your network. diff --git a/website/content/partials/create-token-requirements.mdx b/website/content/partials/create-token-requirements.mdx new file mode 100644 index 0000000000000..bf4742719e8dd --- /dev/null +++ b/website/content/partials/create-token-requirements.mdx @@ -0,0 +1,22 @@ +### Authentication + +You must provide an ACL token linked to a policy with `acl:write` permissions to create and modify ACL tokens and policies using the CLI or API. + +You can provide the token manually using the `-token` option on the command line, but we recommend setting the `CONSUL_HTTP_TOKEN` environment variable to simplify your workflow: + +```shell-session +$ export CONSUL_HTTP_TOKEN= +``` + +The Consul CLI automatically reads the `CONSUL_HTTP_TOKEN` environment variable so that you do not have to pass the token to every Consul CLI command. + +To authenticate calls to the Consul HTTP API, you must provide the token in the `X-Consul-Token` header for each call: + +```shell-session +$ curl --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" ... 
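+
+# For example, the same header can authenticate a call to the ACL token list
+# endpoint. The request below is illustrative only and assumes the token you
+# supply has `acl:read` permissions:
+$ curl --header "X-Consul-Token: $CONSUL_HTTP_TOKEN" \
+    http://127.0.0.1:8500/v1/acl/tokens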
+``` + +To learn about alternative ways to authenticate, refer to the following documentation: + +* [CLI Authentication](/consul/commands#authentication) +* [API Authentication](/consul/api-docs/api-structure#authentication) diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index 0ba3a63149e04..8881e05747676 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -182,6 +182,10 @@ { "title": "Consul K8s", "routes": [ + { + "title": "v1.2.x", + "path": "release-notes/consul-k8s/v1_2_x" + }, { "title": "v1.1.x", "path": "release-notes/consul-k8s/v1_1_x" @@ -406,19 +410,39 @@ }, { "title": "API gateway", - "href": "/consul/docs/connect/gateways/api-gateway/configuration/api-gateway" + "href": "/consul/docs/connect/gateways/api-gateway/configuration/api-gateway", + "badge": { + "text": "BETA", + "type": "outlined", + "color": "neutral" + } }, { "title": "HTTP route", - "href": "/consul/docs/connect/gateways/api-gateway/configuration/http-route" + "href": "/consul/docs/connect/gateways/api-gateway/configuration/http-route", + "badge": { + "text": "BETA", + "type": "outlined", + "color": "neutral" + } }, { "title": "TCP route", - "href": "/consul/docs/connect/gateways/api-gateway/configuration/tcp-route" + "href": "/consul/docs/connect/gateways/api-gateway/configuration/tcp-route", + "badge": { + "text": "BETA", + "type": "outlined", + "color": "neutral" + } }, { "title": "Inline certificate", - "href": "/consul/docs/connect/gateways/api-gateway/configuration/inline-certificate" + "href": "/consul/docs/connect/gateways/api-gateway/configuration/inline-certificate", + "badge": { + "text": "BETA", + "type": "outlined", + "color": "neutral" + } }, { "title": "Ingress gateway", @@ -498,7 +522,7 @@ { "title": "Delegate authorization to external services", "path": "connect/proxies/envoy-extensions/usage/ext-authz" - }, + }, { "title": "Run Lua scripts in Envoy proxies", "path": "connect/proxies/envoy-extensions/usage/lua" @@ -514,8 +538,7 @@ { "title": "Run WebAssembly plug-ins in Envoy proxies", "path": "connect/proxies/envoy-extensions/usage/wasm" - } - ] + } ] }, { "title": "Configuration", @@ -523,16 +546,16 @@ { "title": "External authorization", "path": "connect/proxies/envoy-extensions/configuration/ext-authz" - }, + }, { "title": "Property override", "path": "connect/proxies/envoy-extensions/configuration/property-override" - }, + }, { "title": "WebAssembly", "path": "connect/proxies/envoy-extensions/configuration/wasm" } - ] + ] } ] }, @@ -635,6 +658,11 @@ }, { "title": "API Gateways", + "badge": { + "text": "BETA", + "type": "outlined", + "color": "neutral" + }, "routes": [ { "title": "Overview", @@ -895,7 +923,41 @@ }, { "title": "Tokens", - "path": "security/acl/acl-tokens" + "routes": [ + { + "title": "Overview", + "path": "security/acl/tokens" + }, + { + "title": "Create ACL Tokens", + "routes": [ + { + "title": "Create a service token", + "path": "security/acl/tokens/create/create-a-service-token" + }, + { + "title": "Create an agent token", + "path": "security/acl/tokens/create/create-an-agent-token" + }, + { + "title": "Create a UI token", + "path": "security/acl/tokens/create/create-a-ui-token" + }, + { + "title": "Create a mesh gateway token", + "path": "security/acl/tokens/create/create-a-mesh-gateway-token" + }, + { + "title": "Create an ingress gateway token", + "path": "security/acl/tokens/create/create-an-ingress-gateway-token" + }, + { + "title": "Create a terminating gateway token", + "path": 
"security/acl/tokens/create/create-a-terminating-gateway-token" + } + ] + } + ] }, { "title": "Policies", @@ -1012,7 +1074,7 @@ { "title": "Limit traffic rates from source IP addresses", "path": "agent/limits/usage/limit-request-rates-from-ips" - } + } ] }, { diff --git a/website/redirects.js b/website/redirects.js index 0468a19c60caa..517c73bbfa5a5 100644 --- a/website/redirects.js +++ b/website/redirects.js @@ -38,19 +38,26 @@ module.exports = [ permanent: true, }, { - source: '/consul/docs/v1.16.x/connect/transparent-proxy', - destination: '/consul/docs/v1.16.x/k8s/connect/transparent-proxy', + source: '/consul/docs/connect/transparent-proxy', + destination: '/consul/docs/k8s/connect/transparent-proxy', permanent: true, }, { - source: '/consul/docs/1.16.x/agent/limits/init-rate-limits', - destination: '/consul/docs/1.16.x/agent/limits/usage/init-rate-limits', + source: '/consul/docs/agent/limits/init-rate-limits', + destination: '/consul/docs/agent/limits/usage/init-rate-limits', permanent: true, }, { - source: '/consul/docs/1.16.x/agent/limits/set-global-traffic-rate-limits', + source: '/consul/docs/agent/limits/set-global-traffic-rate-limits', destination: - '/consul/docs/1.16.x/agent/limits/usage/set-global-traffic-rate-limits', + '/consul/docs/agent/limits/usage/set-global-traffic-rate-limits', + permanent: true, + }, + { + source: + '/consul/docs/connect/gateways/mesh-gateway/service-to-service-traffic-peers', + destination: + '/consul/docs/connect/cluster-peering/usage/establish-cluster-peering', permanent: true, }, ]