Skip to content

Commit

Permalink
connect: connect CA Roots in the primary datacenter should use a Sign…
Browse files Browse the repository at this point in the history
…ingKeyID derived from their local intermediate

This fixes an issue where leaf certificates issued in primary
datacenters using Vault as a Connect CA would be reissued very
frequently (every ~20 seconds) because the logic meant to detect root
rotation was errantly triggering.

The hash of the rootCA was being compared against a hash of the
intermediateCA and always failing. This doesn't apply to the Consul
built-in CA provider because there is no intermediate in use in the
primary DC.

This is reminiscent of #6513.
  • Loading branch information
rboyer committed Jan 25, 2021
1 parent 9b62182 commit 307035c
Show file tree
Hide file tree
Showing 5 changed files with 257 additions and 2 deletions.
4 changes: 4 additions & 0 deletions GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -359,10 +359,14 @@ ifeq ("$(CIRCLECI)","true")
gotestsum --format=short-verbose --junitfile "$(TEST_RESULTS_DIR)/gotestsum-report.xml" -- -cover -coverprofile=coverage.txt ./agent/connect/ca
# Run leader tests that require Vault
gotestsum --format=short-verbose --junitfile "$(TEST_RESULTS_DIR)/gotestsum-report-leader.xml" -- -cover -coverprofile=coverage-leader.txt -run TestLeader_Vault_ ./agent/consul
# Run agent tests that require Vault
gotestsum --format=short-verbose --junitfile "$(TEST_RESULTS_DIR)/gotestsum-report-agent.xml" -- -cover -coverprofile=coverage-agent.txt -run '.*_Vault_' ./agent
else
# Run locally
@echo "Running /agent/connect/ca tests in verbose mode"
@go test -v ./agent/connect/ca
@go test -v ./agent/consul -run 'TestLeader_Vault_'
@go test -v ./agent -run '.*_Vault_'
endif

proto: $(PROTOGOFILES) $(PROTOGOBINFILES)
Expand Down
126 changes: 126 additions & 0 deletions agent/agent_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/config"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/agent/debug"
"github.com/hashicorp/consul/agent/local"
Expand Down Expand Up @@ -5787,6 +5788,131 @@ func TestAgentConnectCALeafCert_goodNotLocal(t *testing.T) {
}
}

func TestAgentConnectCALeafCert_Vault_doesNotChurnLeafCertsAtIdle(t *testing.T) {
ca.SkipIfVaultNotPresent(t)

if testing.Short() {
t.Skip("too slow for testing.Short")
}

t.Parallel()

testVault := ca.NewTestVaultServer(t)
defer testVault.Stop()

assert := assert.New(t)
require := require.New(t)
a := StartTestAgent(t, TestAgent{Overrides: fmt.Sprintf(`
connect {
test_ca_leaf_root_change_spread = "1ns"
ca_provider = "vault"
ca_config {
address = %[1]q
token = %[2]q
root_pki_path = "pki-root/"
intermediate_pki_path = "pki-intermediate/"
}
}
`, testVault.Addr, testVault.RootToken)})
defer a.Shutdown()
testrpc.WaitForTestAgent(t, a.RPC, "dc1")
testrpc.WaitForActiveCARoot(t, a.RPC, "dc1", nil)

var ca1 *structs.CARoot
{
args := &structs.DCSpecificRequest{Datacenter: "dc1"}
var reply structs.IndexedCARoots
require.NoError(a.RPC("ConnectCA.Roots", args, &reply))
for _, r := range reply.Roots {
if r.ID == reply.ActiveRootID {
ca1 = r
break
}
}
require.NotNil(ca1)
}

{
// Register a local service
args := &structs.ServiceDefinition{
ID: "foo",
Name: "test",
Address: "127.0.0.1",
Port: 8000,
Check: structs.CheckType{
TTL: 15 * time.Second,
},
}
req, _ := http.NewRequest("PUT", "/v1/agent/service/register", jsonReader(args))
resp := httptest.NewRecorder()
_, err := a.srv.AgentRegisterService(resp, req)
require.NoError(err)
if !assert.Equal(200, resp.Code) {
t.Log("Body: ", resp.Body.String())
}
}

// List
req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test", nil)
resp := httptest.NewRecorder()
obj, err := a.srv.AgentConnectCALeafCert(resp, req)
require.NoError(err)
require.Equal("MISS", resp.Header().Get("X-Cache"))

// Get the issued cert
issued, ok := obj.(*structs.IssuedCert)
assert.True(ok)

// Verify that the cert is signed by the CA
requireLeafValidUnderCA(t, issued, ca1)

// Verify blocking index
assert.True(issued.ModifyIndex > 0)
assert.Equal(fmt.Sprintf("%d", issued.ModifyIndex),
resp.Header().Get("X-Consul-Index"))

// Test caching
{
// Fetch it again
resp := httptest.NewRecorder()
obj2, err := a.srv.AgentConnectCALeafCert(resp, req)
require.NoError(err)
require.Equal(obj, obj2)

// Should cache hit this time and not make request
require.Equal("HIT", resp.Header().Get("X-Cache"))
}

// Test that we aren't churning leaves for no reason at idle.
{
ch := make(chan error, 1)
go func() {
req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test?index="+strconv.Itoa(int(issued.ModifyIndex)), nil)
resp := httptest.NewRecorder()
obj, err := a.srv.AgentConnectCALeafCert(resp, req)
if err != nil {
ch <- err
} else {
issued2 := obj.(*structs.IssuedCert)
if issued.CertPEM == issued2.CertPEM {
ch <- fmt.Errorf("leaf woke up unexpectedly with same cert")
} else {
ch <- fmt.Errorf("leaf woke up unexpectedly with new cert")
}
}
}()

start := time.Now()

select {
case <-time.After(5 * time.Second):
case err := <-ch:
dur := time.Since(start)
t.Fatalf("unexpected return from blocking query; leaf churned during idle period, took %s: %v", dur, err)
}
}
}

func TestAgentConnectCALeafCert_secondaryDC_good(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
Expand Down
18 changes: 16 additions & 2 deletions agent/consul/leader_connect_ca.go
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi
if err != nil {
return fmt.Errorf("error generating intermediate cert: %v", err)
}
_, err = connect.ParseCert(interPEM)
intermediateCert, err := connect.ParseCert(interPEM)
if err != nil {
return fmt.Errorf("error getting intermediate cert: %v", err)
}
Expand All @@ -439,6 +439,13 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi
}
}

// Versions prior to 1.9.2, 1.8.8, and 1.7.12 incorrectly used the primary
// rootCA's subjectKeyID here instead of the intermediate. For
// provider=consul this didn't matter since there are no intermediates in
// the primaryDC, but for vault it does matter.
expectedSigningKeyID := connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId)
needsSigningKeyUpdate := (rootCA.SigningKeyID != expectedSigningKeyID)

// Check if the CA root is already initialized and exit if it is,
// adding on any existing intermediate certs since they aren't directly
// tied to the provider.
Expand All @@ -449,7 +456,10 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi
if err != nil {
return err
}
if activeRoot != nil {
if activeRoot != nil && needsSigningKeyUpdate {
c.logger.Info("Correcting stored SigningKeyID value", "previous", rootCA.SigningKeyID, "updated", expectedSigningKeyID)

} else if activeRoot != nil && !needsSigningKeyUpdate {
// This state shouldn't be possible to get into because we update the root and
// CA config in the same FSM operation.
if activeRoot.ID != rootCA.ID {
Expand All @@ -462,6 +472,10 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi
return nil
}

if needsSigningKeyUpdate {
rootCA.SigningKeyID = expectedSigningKeyID
}

// Get the highest index
idx, _, err := state.CARoots(nil)
if err != nil {
Expand Down
105 changes: 105 additions & 0 deletions agent/consul/leader_connect_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,111 @@ func TestLeader_SecondaryCA_IntermediateRefresh(t *testing.T) {
require.NoError(err)
}

func TestLeader_Vault_PrimaryCA_FixSigningKeyID_OnRestart(t *testing.T) {
ca.SkipIfVaultNotPresent(t)

if testing.Short() {
t.Skip("too slow for testing.Short")
}

t.Parallel()

testVault := ca.NewTestVaultServer(t)
defer testVault.Stop()

dir1pre, s1pre := testServerWithConfig(t, func(c *Config) {
c.Build = "1.6.0"
c.PrimaryDatacenter = "dc1"
c.CAConfig = &structs.CAConfiguration{
Provider: "vault",
Config: map[string]interface{}{
"Address": testVault.Addr,
"Token": testVault.RootToken,
"RootPKIPath": "pki-root/",
"IntermediatePKIPath": "pki-intermediate/",
},
}
})
defer os.RemoveAll(dir1pre)
defer s1pre.Shutdown()

testrpc.WaitForLeader(t, s1pre.RPC, "dc1")

// Restore the pre-1.9.2/1.8.8/1.7.12 behavior of the SigningKeyID not being derived
// from the intermediates in the primary (which only matters for provider=vault).
var primaryRootSigningKeyID string
{
state := s1pre.fsm.State()

// Get the highest index
idx, activePrimaryRoot, err := state.CARootActive(nil)
require.NoError(t, err)
require.NotNil(t, activePrimaryRoot)

rootCert, err := connect.ParseCert(activePrimaryRoot.RootCert)
require.NoError(t, err)

// Force this to be derived just from the root, not the intermediate.
primaryRootSigningKeyID = connect.EncodeSigningKeyID(rootCert.SubjectKeyId)
activePrimaryRoot.SigningKeyID = primaryRootSigningKeyID

// Store the root cert in raft
resp, err := s1pre.raftApply(structs.ConnectCARequestType, &structs.CARequest{
Op: structs.CAOpSetRoots,
Index: idx,
Roots: []*structs.CARoot{activePrimaryRoot},
})
require.NoError(t, err)
if respErr, ok := resp.(error); ok {
t.Fatalf("respErr: %v", respErr)
}
}

// Shutdown s2pre and restart it to trigger the secondary CA init to correct
// the SigningKeyID.
s1pre.Shutdown()

dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.DataDir = s1pre.config.DataDir
c.Datacenter = "dc1"
c.PrimaryDatacenter = "dc1"
c.NodeName = s1pre.config.NodeName
c.NodeID = s1pre.config.NodeID
})
defer os.RemoveAll(dir1)
defer s1.Shutdown()

testrpc.WaitForLeader(t, s1.RPC, "dc1")

// Retry since it will take some time to init the primary CA fully and there
// isn't a super clean way to watch specifically until it's done than polling
// the CA provider anyway.
retry.Run(t, func(r *retry.R) {
// verify that the root is now corrected
provider, activeRoot := getCAProviderWithLock(s1)
require.NotNil(r, provider)
require.NotNil(r, activeRoot)

activeIntermediate, err := provider.ActiveIntermediate()
require.NoError(r, err)

intermediateCert, err := connect.ParseCert(activeIntermediate)
require.NoError(r, err)

// Force this to be derived just from the root, not the intermediate.
expect := connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId)

// The in-memory representation was saw the correction via a setCAProvider call.
require.Equal(r, expect, activeRoot.SigningKeyID)

// The state store saw the correction, too.
_, activePrimaryRoot, err := s1.fsm.State().CARootActive(nil)
require.NoError(r, err)
require.NotNil(r, activePrimaryRoot)
require.Equal(r, expect, activePrimaryRoot.SigningKeyID)
})
}

func TestLeader_SecondaryCA_FixSigningKeyID_via_IntermediateRefresh(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
Expand Down
6 changes: 6 additions & 0 deletions agent/proxycfg/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,12 @@ func (s *state) run() {
}

func (s *state) handleUpdate(u cache.UpdateEvent, snap *ConfigSnapshot) error {
s.logger.Trace("handleUpdate saw cache event",
"correlationID", u.CorrelationID,
"kind", s.kind,
"error", u.Err,
)

switch s.kind {
case structs.ServiceKindConnectProxy:
return s.handleUpdateConnectProxy(u, snap)
Expand Down

0 comments on commit 307035c

Please sign in to comment.