From 40d095fd1adb603be93b713d65cd385917e16436 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 13 Sep 2018 10:43:40 -0700 Subject: [PATCH 1/3] agent + consul --- client/allocrunner/testing.go | 2 +- client/client.go | 32 ++++++------ client/client_test.go | 8 +-- client/config/config.go | 5 ++ client/consul/consul_testing.go | 19 +++---- client/testing.go | 8 +-- command/agent/agent.go | 58 ++++++++++++++-------- command/agent/agent_endpoint.go | 4 +- command/agent/agent_test.go | 12 ++--- command/agent/alloc_endpoint_test.go | 2 +- command/agent/command.go | 28 +++++------ command/agent/consul/catalog_testing.go | 12 +++-- command/agent/consul/check_watcher.go | 29 ++++++----- command/agent/consul/check_watcher_test.go | 4 +- command/agent/consul/client.go | 26 +++++----- command/agent/consul/int_test.go | 2 +- command/agent/consul/script.go | 20 ++++---- command/agent/consul/script_test.go | 10 ++-- command/agent/consul/unit_test.go | 2 +- command/agent/consul/version_checker.go | 10 ++-- command/agent/http.go | 15 +++--- command/agent/metrics_endpoint.go | 27 +++++++--- command/agent/retry_join.go | 27 +++++----- command/agent/retry_join_test.go | 8 +-- helper/testlog/testlog.go | 15 +++++- nomad/testing.go | 3 +- 26 files changed, 221 insertions(+), 167 deletions(-) diff --git a/client/allocrunner/testing.go b/client/allocrunner/testing.go index 60a7b7b20bb..f9e8152bcf0 100644 --- a/client/allocrunner/testing.go +++ b/client/allocrunner/testing.go @@ -51,7 +51,7 @@ func TestAllocRunnerFromAlloc(t *testing.T, alloc *structs.Allocation, restarts alloc.Job.Type = structs.JobTypeBatch } vclient := vaultclient.NewMockVaultClient() - ar := NewAllocRunner(testlog.Logger(t), conf, db, upd.Update, alloc, vclient, consulApi.NewMockConsulServiceClient(t), NoopPrevAlloc{}) + ar := NewAllocRunner(testlog.Logger(t), conf, db, upd.Update, alloc, vclient, consulApi.NewMockConsulServiceClient(t, testlog.HCLogger(t)), NoopPrevAlloc{}) return upd, ar } diff --git a/client/client.go b/client/client.go index bda12e34379..633ff06fe01 100644 --- a/client/client.go +++ b/client/client.go @@ -16,28 +16,30 @@ import ( "time" metrics "github.com/armon/go-metrics" - "github.com/boltdb/bolt" consulapi "github.com/hashicorp/consul/api" - "github.com/hashicorp/consul/lib" multierror "github.com/hashicorp/go-multierror" + consulApi "github.com/hashicorp/nomad/client/consul" + cstructs "github.com/hashicorp/nomad/client/structs" + hstats "github.com/hashicorp/nomad/helper/stats" + nconfig "github.com/hashicorp/nomad/nomad/structs/config" + vaultapi "github.com/hashicorp/vault/api" + + "github.com/boltdb/bolt" + "github.com/hashicorp/consul/lib" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/allocrunner" "github.com/hashicorp/nomad/client/config" - consulApi "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/servers" "github.com/hashicorp/nomad/client/state" "github.com/hashicorp/nomad/client/stats" - cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/client/vaultclient" "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/pool" - hstats "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/helper/tlsutil" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" - nconfig "github.com/hashicorp/nomad/nomad/structs/config" - vaultapi "github.com/hashicorp/vault/api" 
"github.com/shirou/gopsutil/host" ) @@ -196,7 +198,7 @@ var ( ) // NewClient is used to create a new client from the given configuration -func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService consulApi.ConsulServiceAPI, logger *log.Logger) (*Client, error) { +func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService consulApi.ConsulServiceAPI) (*Client, error) { // Create the tls wrapper var tlsWrap tlsutil.RegionWrapper if cfg.TLSConfig.EnableRPC { @@ -219,7 +221,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic connPool: pool.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap), tlsWrap: tlsWrap, streamingRpcs: structs.NewStreamingRpcRegistry(), - logger: logger, + logger: cfg.Logger.ResetNamed("").StandardLogger(&hclog.StandardLoggerOptions{InferLevels: true}), allocs: make(map[string]*allocrunner.AllocRunner), allocUpdates: make(chan *structs.Allocation, 64), shutdownCh: make(chan struct{}), @@ -245,7 +247,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic } // Add the stats collector - statsCollector := stats.NewHostStatsCollector(logger, c.config.AllocDir) + statsCollector := stats.NewHostStatsCollector(c.logger, c.config.AllocDir) c.hostStatsCollector = statsCollector // Add the garbage collector @@ -257,7 +259,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic ParallelDestroys: cfg.GCParallelDestroys, ReservedDiskMB: cfg.Node.Reserved.DiskMB, } - c.garbageCollector = NewAllocGarbageCollector(logger, statsCollector, c, gcConfig) + c.garbageCollector = NewAllocGarbageCollector(c.logger, statsCollector, c, gcConfig) go c.garbageCollector.Run() // Setup the node @@ -287,7 +289,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic c.configLock.RLock() if len(c.configCopy.Servers) > 0 { if _, err := c.setServersImpl(c.configCopy.Servers, true); err != nil { - logger.Printf("[WARN] client: None of the configured servers are valid: %v", err) + c.logger.Printf("[WARN] client: None of the configured servers are valid: %v", err) } } c.configLock.RUnlock() @@ -308,13 +310,13 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic // Restore the state if err := c.restoreState(); err != nil { - logger.Printf("[ERR] client: failed to restore state: %v", err) - logger.Printf("[ERR] client: Nomad is unable to start due to corrupt state. "+ + c.logger.Printf("[ERR] client: failed to restore state: %v", err) + c.logger.Printf("[ERR] client: Nomad is unable to start due to corrupt state. "+ "The safest way to proceed is to manually stop running task processes "+ "and remove Nomad's state (%q) and alloc (%q) directories before "+ "restarting. Lost allocations will be rescheduled.", c.config.StateDir, c.config.AllocDir) - logger.Printf("[ERR] client: Corrupt state is often caused by a bug. Please " + + c.logger.Printf("[ERR] client: Corrupt state is often caused by a bug. 
Please " + "report as much information as possible to " + "https://github.com/hashicorp/nomad/issues") return nil, fmt.Errorf("failed to restore state") diff --git a/client/client_test.go b/client/client_test.go index 03b466cbf9e..06d1afcb7d1 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -602,11 +602,11 @@ func TestClient_SaveRestoreState(t *testing.T) { } // Create a new client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) + c1.config.Logger = logger catalog := consul.NewMockCatalog(logger) - mockService := consulApi.NewMockConsulServiceClient(t) - mockService.Logger = logger - c2, err := NewClient(c1.config, catalog, mockService, logger) + mockService := consulApi.NewMockConsulServiceClient(t, logger) + c2, err := NewClient(c1.config, catalog, mockService) if err != nil { t.Fatalf("err: %v", err) } diff --git a/client/config/config.go b/client/config/config.go index a1225d0b6d5..bbc0c2a6a9f 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -8,6 +8,8 @@ import ( "strings" "time" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" @@ -76,6 +78,9 @@ type Config struct { // LogOutput is the destination for logs LogOutput io.Writer + // Logger provides a logger to thhe client + Logger log.Logger + // Region is the clients region Region string diff --git a/client/consul/consul_testing.go b/client/consul/consul_testing.go index e9f6421129a..a7997a3ebd0 100644 --- a/client/consul/consul_testing.go +++ b/client/consul/consul_testing.go @@ -2,11 +2,11 @@ package consul import ( "fmt" - "log" "sync" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/command/agent/consul" - "github.com/hashicorp/nomad/helper/testlog" "github.com/mitchellh/go-testing-interface" ) @@ -34,17 +34,18 @@ type MockConsulServiceClient struct { Ops []MockConsulOp mu sync.Mutex - Logger *log.Logger + logger log.Logger // AllocRegistrationsFn allows injecting return values for the // AllocRegistrations function. 
AllocRegistrationsFn func(allocID string) (*consul.AllocRegistration, error) } -func NewMockConsulServiceClient(t testing.T) *MockConsulServiceClient { +func NewMockConsulServiceClient(t testing.T, logger log.Logger) *MockConsulServiceClient { + logger = logger.Named("mock_consul") m := MockConsulServiceClient{ Ops: make([]MockConsulOp, 0, 20), - Logger: testlog.Logger(t), + logger: logger, } return &m } @@ -52,7 +53,7 @@ func NewMockConsulServiceClient(t testing.T) *MockConsulServiceClient { func (m *MockConsulServiceClient) UpdateTask(old, new *consul.TaskServices) error { m.mu.Lock() defer m.mu.Unlock() - m.Logger.Printf("[TEST] mock_consul: UpdateTask(alloc: %s, task: %s)", new.AllocID[:6], new.Name) + m.logger.Trace("UpdateTask", "alloc_id", new.AllocID, "task", new.Name) m.Ops = append(m.Ops, NewMockConsulOp("update", new.AllocID, new.Name)) return nil } @@ -60,7 +61,7 @@ func (m *MockConsulServiceClient) UpdateTask(old, new *consul.TaskServices) erro func (m *MockConsulServiceClient) RegisterTask(task *consul.TaskServices) error { m.mu.Lock() defer m.mu.Unlock() - m.Logger.Printf("[TEST] mock_consul: RegisterTask(alloc: %s, task: %s)", task.AllocID, task.Name) + m.logger.Trace("RegisterTask", "alloc_id", task.AllocID, "task", task.Name) m.Ops = append(m.Ops, NewMockConsulOp("add", task.AllocID, task.Name)) return nil } @@ -68,14 +69,14 @@ func (m *MockConsulServiceClient) RegisterTask(task *consul.TaskServices) error func (m *MockConsulServiceClient) RemoveTask(task *consul.TaskServices) { m.mu.Lock() defer m.mu.Unlock() - m.Logger.Printf("[TEST] mock_consul: RemoveTask(%q, %q)", task.AllocID, task.Name) + m.logger.Trace("RemoveTask", "alloc_id", task.AllocID, "task", task.Name) m.Ops = append(m.Ops, NewMockConsulOp("remove", task.AllocID, task.Name)) } func (m *MockConsulServiceClient) AllocRegistrations(allocID string) (*consul.AllocRegistration, error) { m.mu.Lock() defer m.mu.Unlock() - m.Logger.Printf("[TEST] mock_consul: AllocRegistrations(%q)", allocID) + m.logger.Trace("AllocRegistrations", "alloc_id", allocID) m.Ops = append(m.Ops, NewMockConsulOp("alloc_registrations", allocID, "")) if m.AllocRegistrationsFn != nil { diff --git a/client/testing.go b/client/testing.go index 5b6b8765911..cde5e8b8010 100644 --- a/client/testing.go +++ b/client/testing.go @@ -36,11 +36,11 @@ func TestClient(t testing.T, cb func(c *config.Config)) *Client { cb(conf) } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) + conf.Logger = logger catalog := consul.NewMockCatalog(logger) - mockService := consulApi.NewMockConsulServiceClient(t) - mockService.Logger = logger - client, err := NewClient(conf, catalog, mockService, logger) + mockService := consulApi.NewMockConsulServiceClient(t, logger) + client, err := NewClient(conf, catalog, mockService) if err != nil { t.Fatalf("err: %v", err) } diff --git a/command/agent/agent.go b/command/agent/agent.go index 09c34450fef..2d41849af3a 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -4,7 +4,6 @@ import ( "fmt" "io" "io/ioutil" - "log" "net" "os" "path/filepath" @@ -14,12 +13,16 @@ import ( "sync/atomic" "time" + golog "log" + metrics "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" + uuidparse "github.com/hashicorp/go-uuid" + clientconfig "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/lib" - uuidparse "github.com/hashicorp/go-uuid" "github.com/hashicorp/nomad/client" - clientconfig "github.com/hashicorp/nomad/client/config" 
"github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad" @@ -50,8 +53,9 @@ type Agent struct { config *Config configLock sync.Mutex - logger *log.Logger - logOutput io.Writer + logger log.Logger + httpLogger log.Logger + logOutput io.Writer // consulService is Nomad's custom Consul client for managing services // and checks. @@ -75,14 +79,22 @@ type Agent struct { func NewAgent(config *Config, logOutput io.Writer, inmem *metrics.InmemSink) (*Agent, error) { a := &Agent{ config: config, - logger: log.New(logOutput, "", log.LstdFlags|log.Lmicroseconds), logOutput: logOutput, shutdownCh: make(chan struct{}), InmemSink: inmem, } + // Create the loggers + a.logger = log.New(&log.LoggerOptions{ + Name: "agent", + Level: log.LevelFromString(config.LogLevel), + Output: logOutput, + JSONFormat: false, // TODO(alex,hclog) Add a config option + }) + a.httpLogger = a.logger.ResetNamed("http") + // Global logger should match internal logger as much as possible - log.SetFlags(log.LstdFlags | log.Lmicroseconds) + golog.SetFlags(golog.LstdFlags | golog.Lmicroseconds) if err := a.setupConsul(config.Consul); err != nil { return nil, fmt.Errorf("Failed to initialize Consul client: %v", err) @@ -338,6 +350,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { a.config.AdvertiseAddrs.RPC) } + conf.Logger = a.logger conf.LogOutput = a.logOutput conf.LogLevel = a.config.LogLevel conf.DevMode = a.config.DevMode @@ -368,8 +381,8 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { } } if len(invalidConsulKeys) > 0 { - a.logger.Printf("[WARN] agent: Invalid keys: %v", strings.Join(invalidConsulKeys, ",")) - a.logger.Printf(`Nomad client ignores consul related configuration in client options. + a.logger.Warn("invalid consul keys", "keys", strings.Join(invalidConsulKeys, ",")) + a.logger.Warn(`Nomad client ignores consul related configuration in client options. 
Please refer to the guide https://www.nomadproject.io/docs/agent/configuration/consul.html to configure Nomad to work with Consul.`) } @@ -480,7 +493,8 @@ func (a *Agent) setupServer() error { } // Create the server - server, err := nomad.NewServer(conf, a.consulCatalog, a.logger) + // TODO(alex,hclog) + server, err := nomad.NewServer(conf, a.consulCatalog, a.logger.ResetNamed("").StandardLogger(&log.StandardLoggerOptions{InferLevels: true})) if err != nil { return fmt.Errorf("server setup failed: %v", err) } @@ -650,7 +664,7 @@ func (a *Agent) setupClient() error { } } - client, err := client.NewClient(conf, a.consulCatalog, a.consulService, a.logger) + client, err := client.NewClient(conf, a.consulCatalog, a.consulService) if err != nil { return fmt.Errorf("client setup failed: %v", err) } @@ -702,7 +716,7 @@ func (a *Agent) agentHTTPCheck(server bool) *structs.ServiceCheck { return &check } if a.config.TLSConfig.VerifyHTTPSClient { - a.logger.Printf("[WARN] agent: not registering Nomad HTTPS Health Check because verify_https_client enabled") + a.logger.Warn("not registering Nomad HTTPS Health Check because verify_https_client enabled") return nil } @@ -788,12 +802,12 @@ func (a *Agent) findLoopbackDevice() (string, string, string, error) { func (a *Agent) Leave() error { if a.client != nil { if err := a.client.Leave(); err != nil { - a.logger.Printf("[ERR] agent: client leave failed: %v", err) + a.logger.Error("client leave failed", "error", err) } } if a.server != nil { if err := a.server.Leave(); err != nil { - a.logger.Printf("[ERR] agent: server leave failed: %v", err) + a.logger.Error("server leave failed", "error", err) } } return nil @@ -808,23 +822,23 @@ func (a *Agent) Shutdown() error { return nil } - a.logger.Println("[INFO] agent: requesting shutdown") + a.logger.Info("requesting shutdown") if a.client != nil { if err := a.client.Shutdown(); err != nil { - a.logger.Printf("[ERR] agent: client shutdown failed: %v", err) + a.logger.Error("client shutdown failed", "error", err) } } if a.server != nil { if err := a.server.Shutdown(); err != nil { - a.logger.Printf("[ERR] agent: server shutdown failed: %v", err) + a.logger.Error("server shutdown failed", "error", err) } } if err := a.consulService.Shutdown(); err != nil { - a.logger.Printf("[ERR] agent: shutting down Consul client failed: %v", err) + a.logger.Error("shutting down Consul client failed", "error", err) } - a.logger.Println("[INFO] agent: shutdown complete") + a.logger.Info("shutdown complete") a.shutdown = true close(a.shutdownCh) return nil @@ -875,7 +889,7 @@ func (a *Agent) ShouldReload(newConfig *Config) (agent, http bool) { isEqual, err := a.config.TLSConfig.CertificateInfoIsEqual(newConfig.TLSConfig) if err != nil { - a.logger.Printf("[INFO] agent: error when parsing TLS certificate %v", err) + a.logger.Error("parsing TLS certificate", "error", err) return false, false } else if !isEqual { return true, true @@ -930,9 +944,9 @@ func (a *Agent) Reload(newConfig *Config) error { a.config.TLSConfig = newConfig.TLSConfig.Copy() if newConfig.TLSConfig.IsEmpty() { - a.logger.Println("[WARN] agent: Downgrading agent's existing TLS configuration to plaintext") + a.logger.Warn("downgrading agent's existing TLS configuration to plaintext") } else { - a.logger.Println("[INFO] agent: Upgrading from plaintext configuration to TLS") + a.logger.Info("upgrading from plaintext configuration to TLS") } return nil diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index ade1e0ffe06..cc1564e69f1 100644 
--- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -221,9 +221,9 @@ func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request) } // Set the servers list into the client - s.agent.logger.Printf("[TRACE] Adding servers %+q to the client's primary server list", servers) + s.agent.logger.Trace("adding servers to the client's primary server list", "servers", servers, "path", "/v1/agent/servers", "method", "PUT") if _, err := client.SetServers(servers); err != nil { - s.agent.logger.Printf("[ERR] Attempt to add servers %q to client failed: %v", servers, err) + s.agent.logger.Error("failed adding servers to client's server list", "servers", servers, "error", err, "path", "/v1/agent/servers", "method", "PUT") //TODO is this the right error to return? return nil, CodedError(400, err.Error()) } diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 871453fe747..07ebbdc4696 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -333,7 +333,7 @@ func TestAget_Client_TelemetryConfiguration(t *testing.T) { // API health check depending on configuration. func TestAgent_HTTPCheck(t *testing.T) { t.Parallel() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) agent := func() *Agent { return &Agent{ logger: logger, @@ -414,7 +414,7 @@ func TestAgent_HTTPCheckPath(t *testing.T) { // Agent.agentHTTPCheck only needs a config and logger a := &Agent{ config: DevConfig(), - logger: testlog.Logger(t), + logger: testlog.HCLogger(t), } if err := a.config.normalizeAddrs(); err != nil { t.Fatalf("error normalizing config: %v", err) @@ -632,7 +632,7 @@ func TestServer_Reload_TLS_WithNilConfiguration(t *testing.T) { t.Parallel() assert := assert.New(t) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) agent := &Agent{ logger: logger, @@ -656,7 +656,7 @@ func TestServer_Reload_TLS_UpgradeToTLS(t *testing.T) { dir := tmpDir(t) defer os.RemoveAll(dir) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) agentConfig := &Config{ TLSConfig: &sconfig.TLSConfig{}, @@ -698,7 +698,7 @@ func TestServer_Reload_TLS_DowngradeFromTLS(t *testing.T) { dir := tmpDir(t) defer os.RemoveAll(dir) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) agentConfig := &Config{ TLSConfig: &sconfig.TLSConfig{ @@ -930,7 +930,7 @@ func TestServer_ShouldReload_ReturnTrueForFileChanges(t *testing.T) { key = "../../helper/tlsutil/testdata/nomad-foo-key.pem" ) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) agentConfig := &Config{ TLSConfig: &sconfig.TLSConfig{ diff --git a/command/agent/alloc_endpoint_test.go b/command/agent/alloc_endpoint_test.go index 781da48cc3d..5d85894c2db 100644 --- a/command/agent/alloc_endpoint_test.go +++ b/command/agent/alloc_endpoint_test.go @@ -479,7 +479,7 @@ func TestHTTP_AllocSnapshot_Atomic(t *testing.T) { // require Snapshot fails if err := allocDir.Snapshot(ioutil.Discard); err != nil { - s.logger.Printf("[DEBUG] agent.test: snapshot returned error: %v", err) + t.Logf("[DEBUG] agent.test: snapshot returned error: %v", err) } else { t.Errorf("expected Snapshot() to fail but it did not") } diff --git a/command/agent/command.go b/command/agent/command.go index 8beadc77fb6..fed528d91a1 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -570,7 +570,7 @@ func (c *Command) handleRetryJoin(config *Config) error { joiner := retryJoiner{ discover: &discover.Discover{}, errCh: c.retryJoinErrCh, - logger: c.agent.logger, + logger: c.agent.logger.Named("joiner"), 
serverJoin: c.agent.server.Join, serverEnabled: true, } @@ -593,7 +593,7 @@ func (c *Command) handleRetryJoin(config *Config) error { config.Server.RetryInterval = 0 } - c.agent.logger.Printf("[WARN] agent: Using deprecated retry_join fields. Upgrade configuration to use server_join") + c.agent.logger.Warn("using deprecated retry_join fields. Upgrade configuration to use server_join") } if config.Server.Enabled && @@ -603,7 +603,7 @@ func (c *Command) handleRetryJoin(config *Config) error { joiner := retryJoiner{ discover: &discover.Discover{}, errCh: c.retryJoinErrCh, - logger: c.agent.logger, + logger: c.agent.logger.Named("joiner"), serverJoin: c.agent.server.Join, serverEnabled: true, } @@ -621,7 +621,7 @@ func (c *Command) handleRetryJoin(config *Config) error { joiner := retryJoiner{ discover: &discover.Discover{}, errCh: c.retryJoinErrCh, - logger: c.agent.logger, + logger: c.agent.logger.Named("joiner"), clientJoin: c.agent.client.SetServers, clientEnabled: true, } @@ -704,7 +704,7 @@ WAIT: // reloadHTTPServer shuts down the existing HTTP server and restarts it. This // is helpful when reloading the agent configuration. func (c *Command) reloadHTTPServer() error { - c.agent.logger.Println("[INFO] agent: Reloading HTTP server with new TLS configuration") + c.agent.logger.Info("reloading HTTP server with new TLS configuration") c.httpServer.Shutdown() @@ -741,23 +741,23 @@ func (c *Command) handleReload() { shouldReloadAgent, shouldReloadHTTP := c.agent.ShouldReload(newConf) if shouldReloadAgent { - c.agent.logger.Printf("[DEBUG] agent: starting reload of agent config") + c.agent.logger.Debug("starting reload of agent config") err := c.agent.Reload(newConf) if err != nil { - c.agent.logger.Printf("[ERR] agent: failed to reload the config: %v", err) + c.agent.logger.Error("failed to reload the config", "error", err) return } } if s := c.agent.Server(); s != nil { - c.agent.logger.Printf("[DEBUG] agent: starting reload of server config") + c.agent.logger.Debug("starting reload of server config") sconf, err := convertServerConfig(newConf, c.logOutput) if err != nil { - c.agent.logger.Printf("[ERR] agent: failed to convert server config: %v", err) + c.agent.logger.Error("failed to convert server config", "error", err) return } else { if err := s.Reload(sconf); err != nil { - c.agent.logger.Printf("[ERR] agent: reloading server config failed: %v", err) + c.agent.logger.Error("reloading server config failed", "error", err) return } } @@ -765,13 +765,13 @@ func (c *Command) handleReload() { if s := c.agent.Client(); s != nil { clientConfig, err := c.agent.clientConfig() - c.agent.logger.Printf("[DEBUG] agent: starting reload of client config") + c.agent.logger.Debug("starting reload of client config") if err != nil { - c.agent.logger.Printf("[ERR] agent: reloading client config failed: %v", err) + c.agent.logger.Error("reloading client config failed", "error", err) return } if err := c.agent.Client().Reload(clientConfig); err != nil { - c.agent.logger.Printf("[ERR] agent: reloading client config failed: %v", err) + c.agent.logger.Error("reloading client config failed", "error", err) return } } @@ -783,7 +783,7 @@ func (c *Command) handleReload() { if shouldReloadHTTP { err := c.reloadHTTPServer() if err != nil { - c.agent.logger.Printf("[ERR] http: failed to reload the config: %v", err) + c.agent.httpLogger.Error("reloading config failed", "error", err) return } } diff --git a/command/agent/consul/catalog_testing.go b/command/agent/consul/catalog_testing.go index 7cd2b833f81..621d4143992 
100644 --- a/command/agent/consul/catalog_testing.go +++ b/command/agent/consul/catalog_testing.go @@ -2,29 +2,31 @@ package consul import ( "fmt" - "log" "sync" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/consul/api" ) // MockCatalog can be used for testing where the CatalogAPI is needed. type MockCatalog struct { - logger *log.Logger + logger log.Logger } -func NewMockCatalog(l *log.Logger) *MockCatalog { +func NewMockCatalog(l log.Logger) *MockCatalog { + l = l.Named("mock_consul") return &MockCatalog{logger: l} } func (m *MockCatalog) Datacenters() ([]string, error) { dcs := []string{"dc1"} - m.logger.Printf("[DEBUG] mock_consul: Datacenters() -> (%q, nil)", dcs) + m.logger.Trace("Datacenters()", "dcs", dcs, "error", "nil") return dcs, nil } func (m *MockCatalog) Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error) { - m.logger.Printf("[DEBUG] mock_consul: Service(%q, %q, %#v) -> (nil, nil, nil)", service, tag, q) + m.logger.Trace("Service()", "service", service, "tag", tag, "query_options", q) return nil, nil, nil } diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index e9ae2dfe80a..d1f93be4e42 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -3,9 +3,10 @@ package consul import ( "context" "fmt" - "log" "time" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/consul/api" "github.com/hashicorp/nomad/nomad/structs" ) @@ -50,7 +51,7 @@ type checkRestart struct { // checks should be counted. graceUntil time.Time - logger *log.Logger + logger log.Logger } // apply restart state for check and restart task if necessary. Current @@ -62,8 +63,7 @@ type checkRestart struct { func (c *checkRestart) apply(now time.Time, status string) bool { healthy := func() { if !c.unhealthyState.IsZero() { - c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became healthy; canceling restart", - c.allocID, c.taskName, c.checkName) + c.logger.Debug("canceling restart because check became healthy") c.unhealthyState = time.Time{} } } @@ -89,8 +89,7 @@ func (c *checkRestart) apply(now time.Time, status string) bool { if c.unhealthyState.IsZero() { // First failure, set restart deadline if c.timeLimit != 0 { - c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became unhealthy. Restarting in %s if not healthy", - c.allocID, c.taskName, c.checkName, c.timeLimit) + c.logger.Debug("check became unhealthy. Will restart if check doesn't become healthy", "time_limit", c.timeLimit) } c.unhealthyState = now } @@ -101,7 +100,7 @@ func (c *checkRestart) apply(now time.Time, status string) bool { // Must test >= because if limit=1, restartAt == first failure if now.Equal(restartAt) || now.After(restartAt) { // hasn't become healthy by deadline, restart! - c.logger.Printf("[DEBUG] consul.health: restarting alloc %q task %q due to unhealthy check %q", c.allocID, c.taskName, c.checkName) + c.logger.Debug("restarting due to unhealthy check") // Tell TaskRunner to restart due to failure const failure = true @@ -139,17 +138,17 @@ type checkWatcher struct { // squelch repeated error messages. lastErr bool - logger *log.Logger + logger log.Logger } // newCheckWatcher creates a new checkWatcher but does not call its Run method.
-func newCheckWatcher(logger *log.Logger, consul ChecksAPI) *checkWatcher { +func newCheckWatcher(logger log.Logger, consul ChecksAPI) *checkWatcher { return &checkWatcher{ consul: consul, pollFreq: defaultPollFreq, checkUpdateCh: make(chan checkWatchUpdate, 8), done: make(chan struct{}), - logger: logger, + logger: logger.ResetNamed("consul.health"), } } @@ -193,8 +192,8 @@ func (w *checkWatcher) Run(ctx context.Context) { // Add/update a check checks[update.checkID] = update.checkRestart - w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", - update.checkRestart.allocID, update.checkRestart.taskName, update.checkRestart.checkName) + w.logger.Debug("watching check", "alloc_id", update.checkRestart.allocID, + "task", update.checkRestart.taskName, "check", update.checkRestart.checkName) // if first check was added make sure polling is enabled if len(checks) == 1 { @@ -215,7 +214,7 @@ func (w *checkWatcher) Run(ctx context.Context) { if err != nil { if !w.lastErr { w.lastErr = true - w.logger.Printf("[ERR] consul.health: error retrieving health checks: %q", err) + w.logger.Error("failed retrieving health checks", "error", err) } continue } @@ -239,7 +238,7 @@ func (w *checkWatcher) Run(ctx context.Context) { if !ok { // Only warn if outside grace period to avoid races with check registration if now.After(check.graceUntil) { - w.logger.Printf("[WARN] consul.health: watched check %q (%s) not found in Consul", check.checkName, cid) + w.logger.Warn("watched check not found in Consul", "check", check.checkName, "check_id", cid) } continue } @@ -286,7 +285,7 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S graceUntil: time.Now().Add(check.CheckRestart.Grace), timeLimit: check.Interval * time.Duration(check.CheckRestart.Limit-1), ignoreWarnings: check.CheckRestart.IgnoreWarnings, - logger: w.logger, + logger: w.logger.With("alloc_id", allocID, "task", taskName, "check", check.Name), } update := checkWatchUpdate{ diff --git a/command/agent/consul/check_watcher_test.go b/command/agent/consul/check_watcher_test.go index 7e0baf9e75c..cb5eb5cba7e 100644 --- a/command/agent/consul/check_watcher_test.go +++ b/command/agent/consul/check_watcher_test.go @@ -116,7 +116,7 @@ func (c *fakeChecksAPI) Checks() (map[string]*api.AgentCheck, error) { // logger and faster poll frequency. 
func testWatcherSetup(t *testing.T) (*fakeChecksAPI, *checkWatcher) { fakeAPI := newFakeChecksAPI() - cw := newCheckWatcher(testlog.Logger(t), fakeAPI) + cw := newCheckWatcher(testlog.HCLogger(t), fakeAPI) cw.pollFreq = 10 * time.Millisecond return fakeAPI, cw } @@ -142,7 +142,7 @@ func TestCheckWatcher_Skip(t *testing.T) { check := testCheck() check.CheckRestart = nil - cw := newCheckWatcher(testlog.Logger(t), newFakeChecksAPI()) + cw := newCheckWatcher(testlog.HCLogger(t), newFakeChecksAPI()) restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check) cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1) diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 7ca1fc845d7..884ca8668b6 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -3,7 +3,6 @@ package consul import ( "context" "fmt" - "log" "net" "net/url" "strconv" @@ -13,8 +12,10 @@ import ( "time" metrics "github.com/armon/go-metrics" - "github.com/hashicorp/consul/api" + log "github.com/hashicorp/go-hclog" cstructs "github.com/hashicorp/nomad/client/structs" + + "github.com/hashicorp/consul/api" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" ) @@ -193,7 +194,7 @@ func (s *ServiceRegistration) copy() *ServiceRegistration { // ServiceClient handles task and agent service registration with Consul. type ServiceClient struct { client AgentAPI - logger *log.Logger + logger log.Logger retryInterval time.Duration maxRetryInterval time.Duration periodicInterval time.Duration @@ -242,7 +243,8 @@ type ServiceClient struct { // Client, logger and takes whether the client is being used by a Nomad Client agent. // When being used by a Nomad client, this Consul client reconciles all services and // checks created by Nomad on behalf of running tasks. 
-func NewServiceClient(consulClient AgentAPI, logger *log.Logger, isNomadClient bool) *ServiceClient { +func NewServiceClient(consulClient AgentAPI, logger log.Logger, isNomadClient bool) *ServiceClient { + logger = logger.ResetNamed("consul.sync") return &ServiceClient{ client: consulClient, logger: logger, @@ -306,7 +308,7 @@ INIT: c.merge(ops) } } - c.logger.Printf("[TRACE] consul.sync: able to contact Consul") + c.logger.Trace("able to contact Consul") // Block until contact with Consul has been established // Start checkWatcher @@ -329,10 +331,10 @@ INIT: if err := c.sync(); err != nil { if failures == 0 { // Log on the first failure - c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err) + c.logger.Warn("failed to update services in Consul", "error", err) } else if failures%10 == 0 { // Log every 10th consecutive failure - c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err) + c.logger.Error("still unable to update services in Consul", "failures", failures, "error", err) } failures++ @@ -352,7 +354,7 @@ INIT: retryTimer.Reset(backoff) } else { if failures > 0 { - c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul") + c.logger.Info("successfully updated services in Consul") failures = 0 } @@ -531,8 +533,8 @@ func (c *ServiceClient) sync() error { } } - c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks", - sreg, creg, sdereg, cdereg) + c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg, + "registered_checks", creg, "deregistered_checks", cdereg) return nil } @@ -999,12 +1001,12 @@ func (c *ServiceClient) Shutdown() error { // deadline was reached for id := range c.agentServices { if err := c.client.ServiceDeregister(id); err != nil { - c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) + c.logger.Error("failed deregistering agent service", "service_id", id, "error", err) } } for id := range c.agentChecks { if err := c.client.CheckDeregister(id); err != nil { - c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) + c.logger.Error("failed deregistering agent check", "check_id", id, "error", err) } } diff --git a/command/agent/consul/int_test.go b/command/agent/consul/int_test.go index fb8c06cd99a..8febc7c9649 100644 --- a/command/agent/consul/int_test.go +++ b/command/agent/consul/int_test.go @@ -135,7 +135,7 @@ func TestConsul_Integration(t *testing.T) { consulClient, err := consulapi.NewClient(consulConfig) assert.Nil(err) - serviceClient := consul.NewServiceClient(consulClient.Agent(), logger, true) + serviceClient := consul.NewServiceClient(consulClient.Agent(), testlog.HCLogger(t), true) defer serviceClient.Shutdown() // just-in-case cleanup consulRan := make(chan struct{}) go func() { diff --git a/command/agent/consul/script.go b/command/agent/consul/script.go index 379025f76f7..be922d87fa6 100644 --- a/command/agent/consul/script.go +++ b/command/agent/consul/script.go @@ -2,10 +2,11 @@ package consul import ( "context" - "log" "time" metrics "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/consul/api" "github.com/hashicorp/nomad/client/driver" "github.com/hashicorp/nomad/nomad/structs" ) @@ -44,16 +45,17 @@ type scriptCheck struct { // lastCheckOk is true if the last check was ok; otherwise false lastCheckOk bool - logger
*log.Logger + logger log.Logger shutdownCh <-chan struct{} } // newScriptCheck creates a new scriptCheck. run() should be called once the // initial check is registered with Consul. func newScriptCheck(allocID, taskName, checkID string, check *structs.ServiceCheck, - exec driver.ScriptExecutor, agent heartbeater, logger *log.Logger, + exec driver.ScriptExecutor, agent heartbeater, logger log.Logger, shutdownCh <-chan struct{}) *scriptCheck { + logger = logger.ResetNamed("consul.checks").With("task", taskName, "alloc_id", allocID, "check", check.Name) return &scriptCheck{ allocID: allocID, taskName: taskName, @@ -108,8 +110,7 @@ func (s *scriptCheck) run() *scriptHandle { // Log deadline exceeded every time as it's a // distinct issue from checks returning // failures - s.logger.Printf("[WARN] consul.checks: check %q for task %q alloc %q timed out (%s)", - s.check.Name, s.taskName, s.allocID, s.check.Timeout) + s.logger.Warn("check timed out", "timeout", s.check.Timeout) } // cleanup context @@ -143,18 +144,15 @@ func (s *scriptCheck) run() *scriptHandle { if err != nil { if s.lastCheckOk { s.lastCheckOk = false - s.logger.Printf("[WARN] consul.checks: update for task %q alloc %q check %q failed: %v", - s.taskName, s.allocID, s.check.Name, err) + s.logger.Warn("updating check failed", "error", err) } else { - s.logger.Printf("[DEBUG] consul.checks: update for task %q alloc %q check %q still failing: %v", - s.taskName, s.allocID, s.check.Name, err) + s.logger.Debug("updating check still failing", "error", err) } } else if !s.lastCheckOk { // Succeeded for the first time or after failing; log s.lastCheckOk = true - s.logger.Printf("[INFO] consul.checks: update for task %q alloc %q check %q succeeded", - s.taskName, s.allocID, s.check.Name) + s.logger.Info("updating check succeeded") } select { diff --git a/command/agent/consul/script_test.go b/command/agent/consul/script_test.go index 3d4331ef09b..4a029ca0f06 100644 --- a/command/agent/consul/script_test.go +++ b/command/agent/consul/script_test.go @@ -60,7 +60,7 @@ func TestConsulScript_Exec_Cancel(t *testing.T) { exec := newBlockingScriptExec() // pass nil for heartbeater as it shouldn't be called - check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, nil, testlog.Logger(t), nil) + check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, nil, testlog.HCLogger(t), nil) handle := check.run() // wait until Exec is called @@ -112,7 +112,7 @@ func TestConsulScript_Exec_Timeout(t *testing.T) { exec := newBlockingScriptExec() hb := newFakeHeartbeater() - check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, hb, testlog.Logger(t), nil) + check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, hb, testlog.HCLogger(t), nil) handle := check.run() defer handle.cancel() // just-in-case cleanup <-exec.running @@ -161,7 +161,7 @@ func TestConsulScript_Exec_TimeoutCritical(t *testing.T) { Timeout: time.Nanosecond, } hb := newFakeHeartbeater() - check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, sleeperExec{}, hb, testlog.Logger(t), nil) + check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, sleeperExec{}, hb, testlog.HCLogger(t), nil) handle := check.run() defer handle.cancel() // just-in-case cleanup @@ -206,7 +206,7 @@ func TestConsulScript_Exec_Shutdown(t *testing.T) { hb := newFakeHeartbeater() shutdown := make(chan struct{}) exec := newSimpleExec(0, nil) - check := newScriptCheck("allocid", "testtask", "checkid", 
&serviceCheck, exec, hb, testlog.Logger(t), shutdown) + check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, hb, testlog.HCLogger(t), shutdown) handle := check.run() defer handle.cancel() // just-in-case cleanup @@ -243,7 +243,7 @@ func TestConsulScript_Exec_Codes(t *testing.T) { hb := newFakeHeartbeater() shutdown := make(chan struct{}) exec := newSimpleExec(code, err) - check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, hb, testlog.Logger(t), shutdown) + check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, hb, testlog.HCLogger(t), shutdown) handle := check.run() defer handle.cancel() diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 2cbb6488e23..28574621336 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -117,7 +117,7 @@ func setupFake(t *testing.T) *testFakeCtx { fc := NewMockAgent() tt := testTask() return &testFakeCtx{ - ServiceClient: NewServiceClient(fc, testlog.Logger(t), true), + ServiceClient: NewServiceClient(fc, testlog.HCLogger(t), true), FakeConsul: fc, Task: tt, MockExec: tt.DriverExec.(*mockExec), diff --git a/command/agent/consul/version_checker.go b/command/agent/consul/version_checker.go index 324a19a1248..970958072aa 100644 --- a/command/agent/consul/version_checker.go +++ b/command/agent/consul/version_checker.go @@ -2,16 +2,16 @@ package consul import ( "context" - "log" "strings" "time" + log "github.com/hashicorp/go-hclog" version "github.com/hashicorp/go-version" ) // checkConsulTLSSkipVerify logs if Consul does not support TLSSkipVerify on // checks and is intended to be run in a goroutine. -func checkConsulTLSSkipVerify(ctx context.Context, logger *log.Logger, client AgentAPI, done chan struct{}) { +func checkConsulTLSSkipVerify(ctx context.Context, logger log.Logger, client AgentAPI, done chan struct{}) { const ( baseline = time.Second limit = 20 * time.Second @@ -24,10 +24,10 @@ func checkConsulTLSSkipVerify(ctx context.Context, logger *log.Logger, client Ag self, err := client.Self() if err == nil { if supportsTLSSkipVerify(self) { - logger.Printf("[TRACE] consul.sync: supports TLSSkipVerify") + logger.Trace("Consul supports TLSSkipVerify") } else { - logger.Printf("[WARN] consul.sync: Consul does NOT support TLSSkipVerify; please upgrade to Consul %s or newer", - consulTLSSkipVerifyMinVersion) + logger.Warn("Consul does NOT support TLSSkipVerify; please upgrade Consul", + "min_version", consulTLSSkipVerifyMinVersion) } return } diff --git a/command/agent/http.go b/command/agent/http.go index ec8a58bdeec..857f5f8bcde 100644 --- a/command/agent/http.go +++ b/command/agent/http.go @@ -5,7 +5,6 @@ import ( "crypto/tls" "encoding/json" "fmt" - "log" "net" "net/http" "net/http/pprof" @@ -14,8 +13,10 @@ import ( "strings" "time" - "github.com/NYTimes/gziphandler" assetfs "github.com/elazarl/go-bindata-assetfs" + log "github.com/hashicorp/go-hclog" + + "github.com/NYTimes/gziphandler" "github.com/hashicorp/nomad/helper/tlsutil" "github.com/hashicorp/nomad/nomad/structs" "github.com/rs/cors" @@ -52,7 +53,7 @@ type HTTPServer struct { mux *http.ServeMux listener net.Listener listenerCh chan struct{} - logger *log.Logger + logger log.Logger Addr string } @@ -91,7 +92,7 @@ func NewHTTPServer(agent *Agent, config *Config) (*HTTPServer, error) { mux: mux, listener: ln, listenerCh: make(chan struct{}), - logger: agent.logger, + logger: agent.httpLogger, Addr: ln.Addr().String(), } 
srv.registerHandlers(config.EnableDebug) @@ -130,7 +131,7 @@ func (ln tcpKeepAliveListener) Accept() (c net.Conn, err error) { // Shutdown is used to shutdown the HTTP server func (s *HTTPServer) Shutdown() { if s != nil { - s.logger.Printf("[DEBUG] http: Shutting down http server") + s.logger.Debug("shutting down http server") s.listener.Close() <-s.listenerCh // block until http.Serve has returned. } @@ -278,14 +279,13 @@ func (s *HTTPServer) wrap(handler func(resp http.ResponseWriter, req *http.Reque reqURL := req.URL.String() start := time.Now() defer func() { - s.logger.Printf("[DEBUG] http: Request %v %v (%v)", req.Method, reqURL, time.Now().Sub(start)) + s.logger.Debug("request complete", "method", req.Method, "path", reqURL, "duration", time.Now().Sub(start)) }() obj, err := handler(resp, req) // Check for an error HAS_ERR: if err != nil { - s.logger.Printf("[ERR] http: Request %v, error: %v", reqURL, err) code := 500 errMsg := err.Error() if http, ok := err.(HTTPCodedError); ok { @@ -303,6 +303,7 @@ func (s *HTTPServer) wrap(handler func(resp http.ResponseWriter, req *http.Reque resp.WriteHeader(code) resp.Write([]byte(errMsg)) + s.logger.Error("request failed", "method", req.Method, "path", reqURL, "error", err, "code", code) return } diff --git a/command/agent/metrics_endpoint.go b/command/agent/metrics_endpoint.go index b0702c38a83..9e4f2259e34 100644 --- a/command/agent/metrics_endpoint.go +++ b/command/agent/metrics_endpoint.go @@ -2,11 +2,18 @@ package agent import ( "net/http" + "sync" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) +var ( + // Only create the prometheus handler once + promHandler http.Handler + promOnce sync.Once +) + // MetricsRequest returns metrics for the agent. Metrics are JSON by default // but Prometheus is an optional format. func (s *HTTPServer) MetricsRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) { @@ -15,16 +22,22 @@ func (s *HTTPServer) MetricsRequest(resp http.ResponseWriter, req *http.Request) } if format := req.URL.Query().Get("format"); format == "prometheus" { + s.prometheusHandler().ServeHTTP(resp, req) + return nil, nil + } + + return s.agent.InmemSink.DisplayMetrics(resp, req) +} + +func (s *HTTPServer) prometheusHandler() http.Handler { + promOnce.Do(func() { handlerOptions := promhttp.HandlerOpts{ - ErrorLog: s.logger, + ErrorLog: s.logger.Named("prometheus_handler").StandardLogger(nil), ErrorHandling: promhttp.ContinueOnError, DisableCompression: true, } - handler := promhttp.HandlerFor(prometheus.DefaultGatherer, handlerOptions) - handler.ServeHTTP(resp, req) - return nil, nil - } - - return s.agent.InmemSink.DisplayMetrics(resp, req) + promHandler = promhttp.HandlerFor(prometheus.DefaultGatherer, handlerOptions) + }) + return promHandler } diff --git a/command/agent/retry_join.go b/command/agent/retry_join.go index 2e8735be1d0..5250cd01982 100644 --- a/command/agent/retry_join.go +++ b/command/agent/retry_join.go @@ -2,9 +2,12 @@ package agent import ( "fmt" - "log" "strings" "time" + + golog "log" + + log "github.com/hashicorp/go-hclog" ) // DiscoverInterface is an interface for the Discover type in the go-discover @@ -15,7 +18,7 @@ type DiscoverInterface interface { // The config string must have the format 'provider=xxx key=val key=val ...' // where the keys and values are provider specific. The values are URL // encoded. 
- Addrs(string, *log.Logger) ([]string, error) + Addrs(string, *golog.Logger) ([]string, error) // Help describes the format of the configuration string for address // discovery and the various provider specific options. @@ -48,8 +51,8 @@ type retryJoiner struct { // limit has been reached errCh chan struct{} - // logger is the agent logger. - logger *log.Logger + // logger is the retry joiner's logger + logger log.Logger } // Validate ensures that the configuration passes validity checks for the @@ -100,8 +103,9 @@ func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) { attempt := 0 addrsToJoin := strings.Join(serverJoin.RetryJoin, " ") - r.logger.Printf("[INFO] agent: Joining cluster... %s", addrsToJoin) + r.logger.Info("starting retry join", "servers", addrsToJoin) + standardLogger := r.logger.StandardLogger(&log.StandardLoggerOptions{InferLevels: true}) for { var addrs []string var n int @@ -110,9 +114,9 @@ func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) { for _, addr := range serverJoin.RetryJoin { switch { case strings.HasPrefix(addr, "provider="): - servers, err := r.discover.Addrs(addr, r.logger) + servers, err := r.discover.Addrs(addr, standardLogger) if err != nil { - r.logger.Printf("[ERR] agent: Join error %s", err) + r.logger.Error("determining join addresses failed", "error", err) } else { addrs = append(addrs, servers...) } @@ -125,14 +129,14 @@ func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) { if r.serverEnabled && r.serverJoin != nil { n, err = r.serverJoin(addrs) if err == nil { - r.logger.Printf("[INFO] agent: Join completed. Server synced with %d initial servers", n) + r.logger.Info("retry join completed", "initial_servers", n, "agent_mode", "server") return } } if r.clientEnabled && r.clientJoin != nil { n, err = r.clientJoin(addrs) if err == nil { - r.logger.Printf("[INFO] agent: Join completed.
Client synced with %d initial servers", n) + r.logger.Info("retry join completed", "initial_servers", n, "agent_mode", "client") return } } @@ -140,14 +144,13 @@ func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) { attempt++ if serverJoin.RetryMaxAttempts > 0 && attempt > serverJoin.RetryMaxAttempts { - r.logger.Printf("[ERR] agent: max join retry exhausted, exiting") + r.logger.Error("max join retry exhausted, exiting") close(r.errCh) return } if err != nil { - r.logger.Printf("[WARN] agent: Join failed: %q, retrying in %v", err, - serverJoin.RetryInterval) + r.logger.Warn("join failed", "error", err, "retry", serverJoin.RetryInterval) } time.Sleep(serverJoin.RetryInterval) } diff --git a/command/agent/retry_join_test.go b/command/agent/retry_join_test.go index 153eed04f15..48a2e25143d 100644 --- a/command/agent/retry_join_test.go +++ b/command/agent/retry_join_test.go @@ -92,7 +92,7 @@ func TestRetryJoin_Server_NonCloud(t *testing.T) { discover: &MockDiscover{}, serverJoin: mockJoin, serverEnabled: true, - logger: testlog.Logger(t), + logger: testlog.HCLogger(t), errCh: make(chan struct{}), } @@ -123,7 +123,7 @@ func TestRetryJoin_Server_Cloud(t *testing.T) { discover: mockDiscover, serverJoin: mockJoin, serverEnabled: true, - logger: testlog.Logger(t), + logger: testlog.HCLogger(t), errCh: make(chan struct{}), } @@ -155,7 +155,7 @@ func TestRetryJoin_Server_MixedProvider(t *testing.T) { discover: mockDiscover, serverJoin: mockJoin, serverEnabled: true, - logger: testlog.Logger(t), + logger: testlog.HCLogger(t), errCh: make(chan struct{}), } @@ -186,7 +186,7 @@ func TestRetryJoin_Client(t *testing.T) { discover: &MockDiscover{}, clientJoin: mockJoin, clientEnabled: true, - logger: testlog.Logger(t), + logger: testlog.HCLogger(t), errCh: make(chan struct{}), } diff --git a/helper/testlog/testlog.go b/helper/testlog/testlog.go index 709bd9d5474..695c43e35b1 100644 --- a/helper/testlog/testlog.go +++ b/helper/testlog/testlog.go @@ -7,6 +7,8 @@ import ( "io" "log" "os" + + hclog "github.com/hashicorp/go-hclog" ) // UseStdout returns true if NOMAD_TEST_STDOUT=1 and sends logs to stdout. @@ -52,7 +54,18 @@ func WithPrefix(t LogPrinter, prefix string) *log.Logger { return New(t, prefix, log.Lmicroseconds) } -// NewLog logger with "TEST" prefix and the Lmicroseconds flag. +// Logger returns a new test logger with the Lmicroseconds flag set and no +// prefix. func Logger(t LogPrinter) *log.Logger { return WithPrefix(t, "") } +// HCLogger returns a new test hc-logger.
+func HCLogger(t LogPrinter) hclog.Logger { + opts := &hclog.LoggerOptions{ + Level: hclog.Trace, + Output: NewWriter(t), + IncludeLocation: true, + } + return hclog.New(opts) +} diff --git a/nomad/testing.go b/nomad/testing.go index 9b81f458d74..a18a929ec0d 100644 --- a/nomad/testing.go +++ b/nomad/testing.go @@ -77,7 +77,8 @@ func TestServer(t testing.T, cb func(*Config)) *Server { config.RaftConfig.StartAsLeader = !config.DevDisableBootstrap logger := testlog.WithPrefix(t, fmt.Sprintf("[%s] ", config.NodeName)) - catalog := consul.NewMockCatalog(logger) + hclogger := testlog.HCLogger(t) + catalog := consul.NewMockCatalog(hclogger) for i := 10; i >= 0; i-- { // Get random ports From 260b566c91f900a8935d9fe3fe3da60bf570307b Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Sat, 15 Sep 2018 16:23:13 -0700 Subject: [PATCH 2/3] server --- command/agent/agent.go | 8 +- command/agent/command.go | 2 +- nomad/acl_endpoint.go | 10 +- nomad/alloc_endpoint.go | 9 +- nomad/client_alloc_endpoint.go | 7 +- nomad/client_fs_endpoint.go | 7 +- nomad/client_rpc.go | 2 +- nomad/client_stats_endpoint.go | 7 +- nomad/config.go | 5 + nomad/core_sched.go | 70 +++---- nomad/deployment_endpoint.go | 5 +- nomad/deploymentwatcher/deployment_watcher.go | 25 +-- .../deploymentwatcher/deployments_watcher.go | 13 +- .../deployments_watcher_test.go | 2 +- nomad/drainer/drainer.go | 34 ++-- nomad/drainer/watch_jobs.go | 49 ++--- nomad/drainer/watch_jobs_test.go | 2 +- nomad/drainer/watch_nodes.go | 45 +++-- nomad/drainer/watch_nodes_test.go | 2 +- nomad/drainer_int_test.go | 7 +- nomad/eval_endpoint.go | 7 +- nomad/fsm.go | 181 ++++++++--------- nomad/fsm_test.go | 4 +- nomad/heartbeat.go | 116 ++++++----- nomad/job_endpoint.go | 35 ++-- nomad/leader.go | 111 +++++------ nomad/node_endpoint.go | 77 ++++---- nomad/operator_endpoint.go | 17 +- nomad/periodic.go | 39 ++-- nomad/periodic_endpoint.go | 7 +- nomad/periodic_test.go | 2 +- nomad/plan_apply.go | 71 ++++--- nomad/plan_apply_test.go | 6 +- nomad/plan_endpoint.go | 5 +- nomad/regions_endpoint.go | 9 +- nomad/rpc.go | 185 ++++++++++-------- nomad/search_endpoint.go | 8 +- nomad/serf.go | 30 +-- nomad/server.go | 165 ++++++++-------- nomad/server_setup_oss.go | 7 +- nomad/server_test.go | 2 +- nomad/state/state_store.go | 35 ++-- nomad/state/testing.go | 4 +- nomad/stats_fetcher.go | 18 +- nomad/status_endpoint.go | 7 +- nomad/system_endpoint.go | 5 +- nomad/testing.go | 7 +- nomad/vault.go | 47 +++-- nomad/vault_test.go | 46 ++--- nomad/worker.go | 40 ++-- nomad/worker_test.go | 5 +- scheduler/context.go | 11 +- scheduler/context_test.go | 2 +- scheduler/feasible.go | 11 +- scheduler/generic_sched.go | 35 ++-- scheduler/propertyset.go | 13 +- scheduler/rank.go | 8 +- scheduler/reconcile.go | 9 +- scheduler/reconcile_test.go | 140 ++++++------- scheduler/scheduler.go | 7 +- scheduler/spread.go | 2 +- scheduler/system_sched.go | 28 +-- scheduler/testing.go | 2 +- scheduler/util.go | 18 +- scheduler/util_test.go | 4 +- 65 files changed, 1015 insertions(+), 884 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 2d41849af3a..b7016e7451a 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -117,11 +117,12 @@ func NewAgent(config *Config, logOutput io.Writer, inmem *metrics.InmemSink) (*A // convertServerConfig takes an agent config and log output and returns a Nomad // Config. 
-func convertServerConfig(agentConfig *Config, logOutput io.Writer) (*nomad.Config, error) {
+func convertServerConfig(agentConfig *Config, logger log.Logger, logOutput io.Writer) (*nomad.Config, error) {
 	conf := agentConfig.NomadConfig
 	if conf == nil {
 		conf = nomad.DefaultConfig()
 	}
+	conf.Logger = logger
 	conf.LogOutput = logOutput
 	conf.DevMode = agentConfig.DevMode
 	conf.Build = agentConfig.Version.VersionNumber()
@@ -321,7 +322,7 @@ func convertServerConfig(agentConfig *Config, logOutput io.Writer) (*nomad.Confi
 // serverConfig is used to generate a new server configuration struct
 // for initializing a nomad server.
 func (a *Agent) serverConfig() (*nomad.Config, error) {
-	return convertServerConfig(a.config, a.logOutput)
+	return convertServerConfig(a.config, a.logger, a.logOutput)
 }
 
 // clientConfig is used to generate a new client configuration struct
@@ -493,8 +494,7 @@ func (a *Agent) setupServer() error {
 	}
 
 	// Create the server
-	// TODO(alex,hclog)
-	server, err := nomad.NewServer(conf, a.consulCatalog, a.logger.ResetNamed("").StandardLogger(&log.StandardLoggerOptions{InferLevels: true}))
+	server, err := nomad.NewServer(conf, a.consulCatalog)
 	if err != nil {
 		return fmt.Errorf("server setup failed: %v", err)
 	}
diff --git a/command/agent/command.go b/command/agent/command.go
index fed528d91a1..e15555c33e1 100644
--- a/command/agent/command.go
+++ b/command/agent/command.go
@@ -751,7 +751,7 @@ func (c *Command) handleReload() {
 	if s := c.agent.Server(); s != nil {
 		c.agent.logger.Debug("starting reload of server config")
-		sconf, err := convertServerConfig(newConf, c.logOutput)
+		sconf, err := convertServerConfig(newConf, c.agent.logger, c.logOutput)
 		if err != nil {
 			c.agent.logger.Error("failed to convert server config", "error", err)
 			return
diff --git a/nomad/acl_endpoint.go b/nomad/acl_endpoint.go
index 6c0d0d7fe85..7c382e4dcb9 100644
--- a/nomad/acl_endpoint.go
+++ b/nomad/acl_endpoint.go
@@ -9,6 +9,7 @@ import (
 	"time"
 
 	metrics "github.com/armon/go-metrics"
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/state"
@@ -28,7 +29,8 @@ const (
 
 // ACL endpoint is used for manipulating ACL tokens and policies
 type ACL struct {
-	srv *Server
+	srv    *Server
+	logger log.Logger
 }
 
 // UpsertPolicies is used to create or update a set of policies
@@ -420,7 +422,7 @@ func (a *ACL) fileBootstrapResetIndex() uint64 {
 	raw, err := ioutil.ReadFile(path)
 	if err != nil {
 		if !os.IsNotExist(err) {
-			a.srv.logger.Printf("[ERR] acl.bootstrap: failed to read %q: %v", path, err)
+			a.logger.Error("failed to read bootstrap file", "path", path, "error", err)
 		}
 		return 0
 	}
@@ -428,12 +430,12 @@ func (a *ACL) fileBootstrapResetIndex() uint64 {
 	// Attempt to parse the file
 	var resetIdx uint64
 	if _, err := fmt.Sscanf(string(raw), "%d", &resetIdx); err != nil {
-		a.srv.logger.Printf("[ERR] acl.bootstrap: failed to parse %q: %v", path, err)
+		a.logger.Error("failed to parse bootstrap file", "path", path, "error", err)
 		return 0
 	}
 
 	// Return the reset index
-	a.srv.logger.Printf("[WARN] acl.bootstrap: parsed %q: reset index %d", path, resetIdx)
+	a.logger.Warn("bootstrap file parsed", "path", path, "reset_index", resetIdx)
 	return resetIdx
 }
 
diff --git a/nomad/alloc_endpoint.go b/nomad/alloc_endpoint.go
index 405136ca8cc..dd1dfcc9611 100644
--- a/nomad/alloc_endpoint.go
+++ b/nomad/alloc_endpoint.go
@@ -4,9 +4,11 @@ import (
 	"fmt"
 	"time"
 
+	log "github.com/hashicorp/go-hclog"
+	multierror "github.com/hashicorp/go-multierror"
+
 	"github.com/armon/go-metrics"
 	"github.com/hashicorp/go-memdb"
-	multierror "github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/acl"
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
@@ -14,7 +16,8 @@ import (
 
 // Alloc endpoint is used for manipulating allocations
 type Alloc struct {
-	srv *Server
+	srv    *Server
+	logger log.Logger
 }
 
 // List is used to list the allocations in the system
@@ -225,7 +228,7 @@ func (a *Alloc) UpdateDesiredTransition(args *structs.AllocUpdateDesiredTransiti
 	// Commit this update via Raft
 	_, index, err := a.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args)
 	if err != nil {
-		a.srv.logger.Printf("[ERR] nomad.allocs: AllocUpdateDesiredTransitionRequest failed: %v", err)
+		a.logger.Error("AllocUpdateDesiredTransitionRequest failed", "error", err)
 		return err
 	}
 
diff --git a/nomad/client_alloc_endpoint.go b/nomad/client_alloc_endpoint.go
index a9c86c931e0..f682a159571 100644
--- a/nomad/client_alloc_endpoint.go
+++ b/nomad/client_alloc_endpoint.go
@@ -5,15 +5,18 @@ import (
 	"time"
 
 	metrics "github.com/armon/go-metrics"
-	"github.com/hashicorp/nomad/acl"
+	log "github.com/hashicorp/go-hclog"
 	cstructs "github.com/hashicorp/nomad/client/structs"
+
+	"github.com/hashicorp/nomad/acl"
 	"github.com/hashicorp/nomad/nomad/structs"
 )
 
 // ClientAllocations is used to forward RPC requests to the targed Nomad client's
 // Allocation endpoint.
 type ClientAllocations struct {
-	srv *Server
+	srv    *Server
+	logger log.Logger
 }
 
 // GarbageCollectAll is used to garbage collect all allocations on a client.
diff --git a/nomad/client_fs_endpoint.go b/nomad/client_fs_endpoint.go
index 6353765fadd..998d3038900 100644
--- a/nomad/client_fs_endpoint.go
+++ b/nomad/client_fs_endpoint.go
@@ -9,8 +9,10 @@ import (
 	"time"
 
 	metrics "github.com/armon/go-metrics"
-	"github.com/hashicorp/nomad/acl"
+	log "github.com/hashicorp/go-hclog"
 	cstructs "github.com/hashicorp/nomad/client/structs"
+
+	"github.com/hashicorp/nomad/acl"
 	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/ugorji/go/codec"
@@ -19,7 +21,8 @@ import (
 
 // FileSystem endpoint is used for accessing the logs and filesystem of
 // allocations from a Node.
 type FileSystem struct {
-	srv *Server
+	srv    *Server
+	logger log.Logger
 }
 
 func (f *FileSystem) register() {
diff --git a/nomad/client_rpc.go b/nomad/client_rpc.go
index 1493348ebd3..ca8db233619 100644
--- a/nomad/client_rpc.go
+++ b/nomad/client_rpc.go
@@ -45,7 +45,7 @@ func (s *Server) getNodeConn(nodeID string) (*nodeConnState, bool) {
 
 	// Shouldn't happen but rather be safe
 	if state == nil {
-		s.logger.Printf("[WARN] nomad.client_rpc: node %q exists in node connection map without any connection", nodeID)
+		s.logger.Named("client_rpc").Warn("node exists in node connection map without any connection", "node_id", nodeID)
 		return nil, false
 	}
 
diff --git a/nomad/client_stats_endpoint.go b/nomad/client_stats_endpoint.go
index 553dda14022..91540b241c3 100644
--- a/nomad/client_stats_endpoint.go
+++ b/nomad/client_stats_endpoint.go
@@ -5,14 +5,17 @@ import (
 	"time"
 
 	metrics "github.com/armon/go-metrics"
-	"github.com/hashicorp/nomad/client/structs"
+	log "github.com/hashicorp/go-hclog"
 	nstructs "github.com/hashicorp/nomad/nomad/structs"
+
+	"github.com/hashicorp/nomad/client/structs"
 )
 
 // ClientStats is used to forward RPC requests to the targed Nomad client's
 // ClientStats endpoint.
 type ClientStats struct {
-	srv *Server
+	srv    *Server
+	logger log.Logger
 }
 
 func (s *ClientStats) Stats(args *nstructs.NodeSpecificRequest, reply *structs.ClientStatsResponse) error {
diff --git a/nomad/config.go b/nomad/config.go
index e7aa945a28c..2e0b49ed7fb 100644
--- a/nomad/config.go
+++ b/nomad/config.go
@@ -8,6 +8,8 @@ import (
 	"runtime"
 	"time"
 
+	log "github.com/hashicorp/go-hclog"
+
 	"github.com/hashicorp/memberlist"
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/structs"
@@ -72,6 +74,9 @@ type Config struct {
 	// logs will go to stderr.
 	LogOutput io.Writer
 
+	// Logger is the logger used by the server.
+	Logger log.Logger
+
 	// ProtocolVersion is the protocol version to speak. This must be between
 	// ProtocolVersionMin and ProtocolVersionMax.
 	ProtocolVersion uint8
diff --git a/nomad/core_sched.go b/nomad/core_sched.go
index 6218eaf044f..5fecb56f23c 100644
--- a/nomad/core_sched.go
+++ b/nomad/core_sched.go
@@ -5,7 +5,9 @@ import (
 	"math"
 	"time"
 
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/scheduler"
@@ -22,15 +24,17 @@ var (
 // as "_core". It is used to run various administrative work
 // across the cluster.
 type CoreScheduler struct {
-	srv  *Server
-	snap *state.StateSnapshot
+	srv    *Server
+	snap   *state.StateSnapshot
+	logger log.Logger
 }
 
 // NewCoreScheduler is used to return a new system scheduler instance
 func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
 	s := &CoreScheduler{
-		srv:  srv,
-		snap: snap,
+		srv:    srv,
+		snap:   snap,
+		logger: srv.logger.ResetNamed("core.sched"),
 	}
 	return s
 }
@@ -84,14 +88,14 @@ func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
 		// The GC was forced, so set the threshold to its maximum so everything
 		// will GC.
 		oldThreshold = math.MaxUint64
-		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
+		c.logger.Debug("forced job GC")
 	} else {
 		// Get the time table to calculate GC cutoffs.
 		tt := c.srv.fsm.TimeTable()
 		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
 		oldThreshold = tt.NearestIndex(cutoff)
-		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
-			oldThreshold, c.srv.config.JobGCThreshold)
+		c.logger.Debug("job GC scanning before cutoff index",
+			"index", oldThreshold, "job_gc_threshold", c.srv.config.JobGCThreshold)
 	}
 
 	// Collect the allocations, evaluations and jobs to GC
@@ -110,7 +114,7 @@ OUTER:
 		ws := memdb.NewWatchSet()
 		evals, err := c.snap.EvalsByJob(ws, job.Namespace, job.ID)
 		if err != nil {
-			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
+			c.logger.Error("job GC failed to get evals for job", "job", job.ID, "error", err)
 			continue
 		}
 
@@ -143,8 +147,8 @@ OUTER:
 	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
 		return nil
 	}
-	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
-		len(gcJob), len(gcEval), len(gcAlloc))
+	c.logger.Debug("job GC found eligible objects",
+		"jobs", len(gcJob), "evals", len(gcEval), "allocs", len(gcAlloc))
 
 	// Reap the evals and allocs
 	if err := c.evalReap(gcEval, gcAlloc); err != nil {
@@ -161,7 +165,7 @@ func (c *CoreScheduler) jobReap(jobs []*structs.Job, leaderACL string) error {
 	for _, req := range c.partitionJobReap(jobs, leaderACL) {
 		var resp structs.JobBatchDeregisterResponse
 		if err := c.srv.RPC("Job.BatchDeregister", req, &resp); err != nil {
-			c.srv.logger.Printf("[ERR] sched.core: batch job reap failed: %v", err)
+			c.logger.Error("batch job reap failed", "error", err)
 			return err
 		}
 	}
@@ -221,7 +225,7 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
 		// The GC was forced, so set the threshold to its maximum so everything
 		// will GC.
 		oldThreshold = math.MaxUint64
-		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
+		c.logger.Debug("forced eval GC")
 	} else {
 		// Compute the old threshold limit for GC using the FSM
 		// time table. This is a rough mapping of a time to the
@@ -229,8 +233,8 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
 		tt := c.srv.fsm.TimeTable()
 		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
 		oldThreshold = tt.NearestIndex(cutoff)
-		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
-			oldThreshold, c.srv.config.EvalGCThreshold)
+		c.logger.Debug("eval GC scanning before cutoff index",
+			"index", oldThreshold, "eval_gc_threshold", c.srv.config.EvalGCThreshold)
 	}
 
 	// Collect the allocations and evaluations to GC
@@ -255,8 +259,8 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
 	if len(gcEval) == 0 && len(gcAlloc) == 0 {
 		return nil
 	}
-	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
-		len(gcEval), len(gcAlloc))
+	c.logger.Debug("eval GC found eligible objects",
+		"evals", len(gcEval), "allocs", len(gcAlloc))
 
 	return c.evalReap(gcEval, gcAlloc)
 }
@@ -314,8 +318,8 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
 		// Get the allocations by eval
 		allocs, err := c.snap.AllocsByEval(ws, eval.ID)
 		if err != nil {
-			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
-				eval.ID, err)
+			c.logger.Error("failed to get allocs for eval",
+				"eval_id", eval.ID, "error", err)
 			return false, nil, err
 		}
 
@@ -343,7 +347,7 @@ func (c *CoreScheduler) evalReap(evals, allocs []string) error {
 	for _, req := range c.partitionEvalReap(evals, allocs) {
 		var resp structs.GenericResponse
 		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
-			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
+			c.logger.Error("eval reap failed", "error", err)
 			return err
 		}
 	}
@@ -410,7 +414,7 @@ func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
 		// The GC was forced, so set the threshold to its maximum so everything
 		// will GC.
 		oldThreshold = math.MaxUint64
-		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
+		c.logger.Debug("forced node GC")
 	} else {
 		// Compute the old threshold limit for GC using the FSM
 		// time table. This is a rough mapping of a time to the
@@ -418,8 +422,8 @@ func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
 		tt := c.srv.fsm.TimeTable()
 		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
 		oldThreshold = tt.NearestIndex(cutoff)
-		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
-			oldThreshold, c.srv.config.NodeGCThreshold)
+		c.logger.Debug("node GC scanning before cutoff index",
+			"index", oldThreshold, "node_gc_threshold", c.srv.config.NodeGCThreshold)
 	}
 
 	// Collect the nodes to GC
@@ -441,8 +445,8 @@ OUTER:
 		ws := memdb.NewWatchSet()
 		allocs, err := c.snap.AllocsByNode(ws, node.ID)
 		if err != nil {
-			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
-				eval.ID, err)
+			c.logger.Error("failed to get allocs for node",
+				"node_id", node.ID, "error", err)
 			continue
 		}
 
@@ -464,7 +468,7 @@ OUTER:
 	if len(gcNode) == 0 {
 		return nil
 	}
-	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))
+	c.logger.Debug("node GC found eligible nodes", "nodes", len(gcNode))
 
 	// Call to the leader to issue the reap
 	for _, nodeID := range gcNode {
@@ -477,7 +481,7 @@
 		}
 		var resp structs.NodeUpdateResponse
 		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
-			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
+			c.logger.Error("node reap failed", "node_id", nodeID, "error", err)
 			return err
 		}
 	}
@@ -498,7 +502,7 @@ func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error {
 		// The GC was forced, so set the threshold to its maximum so everything
 		// will GC.
 		oldThreshold = math.MaxUint64
-		c.srv.logger.Println("[DEBUG] sched.core: forced deployment GC")
+		c.logger.Debug("forced deployment GC")
 	} else {
 		// Compute the old threshold limit for GC using the FSM
 		// time table. This is a rough mapping of a time to the
@@ -506,8 +510,8 @@ func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error {
 		tt := c.srv.fsm.TimeTable()
 		cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold)
 		oldThreshold = tt.NearestIndex(cutoff)
-		c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: scanning before index %d (%v)",
-			oldThreshold, c.srv.config.DeploymentGCThreshold)
+		c.logger.Debug("deployment GC scanning before cutoff index",
+			"index", oldThreshold, "deployment_gc_threshold", c.srv.config.DeploymentGCThreshold)
 	}
 
 	// Collect the deployments to GC
@@ -529,8 +533,8 @@ OUTER:
 		// Ensure there are no allocs referencing this deployment.
 		allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID)
 		if err != nil {
-			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for deployment %s: %v",
-				deploy.ID, err)
+			c.logger.Error("failed to get allocs for deployment",
+				"deployment_id", deploy.ID, "error", err)
 			continue
 		}
 
@@ -549,7 +553,7 @@ OUTER:
 	if len(gcDeployment) == 0 {
 		return nil
 	}
-	c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: %d deployments eligible", len(gcDeployment))
+	c.logger.Debug("deployment GC found eligible deployments", "deployments", len(gcDeployment))
 	return c.deploymentReap(gcDeployment)
 }
 
@@ -560,7 +564,7 @@ func (c *CoreScheduler) deploymentReap(deployments []string) error {
 	for _, req := range c.partitionDeploymentReap(deployments) {
 		var resp structs.GenericResponse
 		if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil {
-			c.srv.logger.Printf("[ERR] sched.core: deployment reap failed: %v", err)
+			c.logger.Error("deployment reap failed", "error", err)
 			return err
 		}
 	}
diff --git a/nomad/deployment_endpoint.go b/nomad/deployment_endpoint.go
index 23489be5510..27bc1059135 100644
--- a/nomad/deployment_endpoint.go
+++ b/nomad/deployment_endpoint.go
@@ -5,7 +5,9 @@ import (
 	"time"
 
 	metrics "github.com/armon/go-metrics"
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
 	"github.com/hashicorp/nomad/acl"
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
@@ -13,7 +15,8 @@ import (
 
 // Deployment endpoint is used for manipulating deployments
 type Deployment struct {
-	srv *Server
+	srv    *Server
+	logger log.Logger
 }
 
 // GetDeployment is used to request information about a specific deployment
diff --git a/nomad/deploymentwatcher/deployment_watcher.go b/nomad/deploymentwatcher/deployment_watcher.go
index 7550b5a94a5..6ebbdf1589c 100644
--- a/nomad/deploymentwatcher/deployment_watcher.go
+++ b/nomad/deploymentwatcher/deployment_watcher.go
@@ -3,13 +3,14 @@ package deploymentwatcher
 
 import (
 	"context"
 	"fmt"
-	"log"
 	"sync"
 	"time"
 
 	"golang.org/x/time/rate"
 
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
 	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/state"
@@ -91,7 +92,7 @@ type deploymentWatcher struct {
 	// by holding the lock or using the setter and getter methods.
 	latestEval uint64
 
-	logger *log.Logger
+	logger log.Logger
 	ctx    context.Context
 	exitFn context.CancelFunc
 	l      sync.RWMutex
@@ -100,7 +101,7 @@ type deploymentWatcher struct {
 // newDeploymentWatcher returns a deployment watcher that is used to watch
 // deployments and trigger the scheduler as needed.
 func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter,
-	logger *log.Logger, state *state.StateStore, d *structs.Deployment,
+	logger log.Logger, state *state.StateStore, d *structs.Deployment,
 	j *structs.Job, triggers deploymentTriggers) *deploymentWatcher {
 
 	ctx, exitFn := context.WithCancel(parent)
@@ -112,7 +113,7 @@ func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter,
 		j:                  j,
 		state:              state,
 		deploymentTriggers: triggers,
-		logger:             logger,
+		logger:             logger.With("deployment_id", d.ID, "job", j.NamespacedID()),
 		ctx:                ctx,
 		exitFn:             exitFn,
 	}
@@ -395,14 +396,14 @@ FAIL:
 			deadlineHit = true
 			fail, rback, err := w.shouldFail()
 			if err != nil {
-				w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine whether to rollback job for deployment %q: %v", w.deploymentID, err)
+				w.logger.Error("failed to determine whether to rollback job", "error", err)
 			}
 			if !fail {
-				w.logger.Printf("[DEBUG] nomad.deployment_watcher: skipping deadline for deployment %q", w.deploymentID)
+				w.logger.Debug("skipping deadline")
 				continue
 			}
 
-			w.logger.Printf("[DEBUG] nomad.deployment_watcher: deadline for deployment %q hit and rollback is %v", w.deploymentID, rback)
+			w.logger.Debug("deadline hit", "rollback", rback)
 			rollback = rback
 			break FAIL
 		case <-w.deploymentUpdateCh:
@@ -431,7 +432,7 @@ FAIL:
 				return
 			}
 
-			w.logger.Printf("[ERR] nomad.deployment_watcher: failed to retrieve allocations for deployment %q: %v", w.deploymentID, err)
+			w.logger.Error("failed to retrieve allocations", "error", err)
 			return
 		}
 		allocIndex = updates.index
@@ -444,7 +445,7 @@ FAIL:
 				return
 			}
 
-			w.logger.Printf("[ERR] nomad.deployment_watcher: failed handling allocation updates: %v", err)
+			w.logger.Error("failed handling allocation updates", "error", err)
 			return
 		}
 
@@ -474,7 +475,7 @@ FAIL:
 		var err error
 		j, err = w.latestStableJob()
 		if err != nil {
-			w.logger.Printf("[ERR] nomad.deployment_watcher: failed to lookup latest stable job for %q: %v", w.j.ID, err)
+			w.logger.Error("failed to lookup latest stable job", "error", err)
 		}
 
 		// Description should include that the job is being rolled back to
@@ -490,7 +491,7 @@ FAIL:
 	e := w.getEval()
 	u := w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc)
 	if index, err := w.upsertDeploymentStatusUpdate(u, e, j); err != nil {
-		w.logger.Printf("[ERR] nomad.deployment_watcher: failed to update deployment %q status: %v", w.deploymentID, err)
+		w.logger.Error("failed to update deployment status", "error", err)
 	} else {
 		w.setLatestEval(index)
 	}
@@ -685,7 +686,7 @@ func (w *deploymentWatcher) createBatchedUpdate(allowReplacements []string, forI
 
 	// Create the eval
 	if index, err := w.createUpdate(replacements, w.getEval()); err != nil {
-		w.logger.Printf("[ERR] nomad.deployment_watcher: failed to create evaluation for deployment %q: %v", w.deploymentID, err)
+		w.logger.Error("failed to create evaluation for deployment", "deployment_id", w.deploymentID, "error", err)
 	} else {
 		w.setLatestEval(index)
 	}
diff --git a/nomad/deploymentwatcher/deployments_watcher.go b/nomad/deploymentwatcher/deployments_watcher.go
index d4a1eb7bbf1..8cd695fcc66 100644
--- a/nomad/deploymentwatcher/deployments_watcher.go
+++ b/nomad/deploymentwatcher/deployments_watcher.go
@@ -3,13 +3,14 @@ package deploymentwatcher
 
 import (
 	"context"
 	"fmt"
-	"log"
 	"sync"
 	"time"
 
 	"golang.org/x/time/rate"
 
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
 )
@@ -58,7 +59,7 @@ type DeploymentRaftEndpoints interface {
 // transitions.
 type Watcher struct {
 	enabled bool
-	logger  *log.Logger
+	logger  log.Logger
 
 	// queryLimiter is used to limit the rate of blocking queries
 	queryLimiter *rate.Limiter
@@ -90,7 +91,7 @@ type Watcher struct {
 
 // NewDeploymentsWatcher returns a deployments watcher that is used to watch
 // deployments and trigger the scheduler as needed.
-func NewDeploymentsWatcher(logger *log.Logger,
+func NewDeploymentsWatcher(logger log.Logger,
 	raft DeploymentRaftEndpoints, stateQueriesPerSecond float64,
 	updateBatchDuration time.Duration) *Watcher {
 
@@ -98,7 +99,7 @@ func NewDeploymentsWatcher(logger *log.Logger,
 		raft:                raft,
 		queryLimiter:        rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100),
 		updateBatchDuration: updateBatchDuration,
-		logger:              logger,
+		logger:              logger.Named("deployments_watcher"),
 	}
 }
 
@@ -154,7 +155,7 @@ func (w *Watcher) watchDeployments(ctx context.Context) {
 				return
 			}
 
-			w.logger.Printf("[ERR] nomad.deployments_watcher: failed to retrieve deployments: %v", err)
+			w.logger.Error("failed to retrieve deployments", "error", err)
 		}
 
 		// Update the latest index
@@ -165,7 +166,7 @@ func (w *Watcher) watchDeployments(ctx context.Context) {
 		for _, d := range deployments {
 			if d.Active() {
 				if err := w.add(d); err != nil {
-					w.logger.Printf("[ERR] nomad.deployments_watcher: failed to track deployment %q: %v", d.ID, err)
+					w.logger.Error("failed to track deployment", "deployment_id", d.ID, "error", err)
 				}
 			} else {
 				w.remove(d)
diff --git a/nomad/deploymentwatcher/deployments_watcher_test.go b/nomad/deploymentwatcher/deployments_watcher_test.go
index b86ef0ca51c..6a033a0e978 100644
--- a/nomad/deploymentwatcher/deployments_watcher_test.go
+++ b/nomad/deploymentwatcher/deployments_watcher_test.go
@@ -19,7 +19,7 @@ import (
 
 func testDeploymentWatcher(t *testing.T, qps float64, batchDur time.Duration) (*Watcher, *mockBackend) {
 	m := newMockBackend(t)
-	w := NewDeploymentsWatcher(testlog.Logger(t), m, qps, batchDur)
+	w := NewDeploymentsWatcher(testlog.HCLogger(t), m, qps, batchDur)
 	return w, m
 }
 
diff --git a/nomad/drainer/drainer.go b/nomad/drainer/drainer.go
index 586b5e5ab84..84ae154a541 100644
--- a/nomad/drainer/drainer.go
+++ b/nomad/drainer/drainer.go
@@ -2,10 +2,11 @@ package drainer
 
 import (
 	"context"
-	"log"
 	"sync"
 	"time"
 
+	log "github.com/hashicorp/go-hclog"
+
 	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/state"
@@ -63,16 +64,16 @@ type NodeTracker interface {
 }
 
 // DrainingJobWatcherFactory returns a new DrainingJobWatcher
-type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger) DrainingJobWatcher
+type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger) DrainingJobWatcher
 
 // DrainingNodeWatcherFactory returns a new DrainingNodeWatcher
-type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher
+type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger, NodeTracker) DrainingNodeWatcher
 
 // DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier
 type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier
 
 // GetDrainingJobWatcher returns a draining job watcher
-func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) DrainingJobWatcher {
+func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) DrainingJobWatcher {
 	return NewDrainingJobWatcher(ctx, limiter, state, logger)
 }
 
@@ -83,7 +84,7 @@ func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier {
 
 // GetNodeWatcherFactory returns a DrainingNodeWatcherFactory
 func GetNodeWatcherFactory() DrainingNodeWatcherFactory {
-	return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) DrainingNodeWatcher {
+	return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) DrainingNodeWatcher {
 		return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker)
 	}
 }
@@ -109,7 +110,7 @@ type allocMigrateBatcher struct {
 
 // NodeDrainerConfig is used to configure a new node drainer.
 type NodeDrainerConfig struct {
-	Logger      *log.Logger
+	Logger      log.Logger
 	Raft        RaftApplier
 	JobFactory  DrainingJobWatcherFactory
 	NodeFactory DrainingNodeWatcherFactory
@@ -128,7 +129,7 @@ type NodeDrainerConfig struct {
 // nodes.
 type NodeDrainer struct {
 	enabled bool
-	logger  *log.Logger
+	logger  log.Logger
 
 	// nodes is the set of draining nodes
 	nodes map[string]*drainingNode
@@ -172,7 +173,7 @@ type NodeDrainer struct {
 func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer {
 	return &NodeDrainer{
 		raft:                    c.Raft,
-		logger:                  c.Logger,
+		logger:                  c.Logger.Named("drain"),
 		jobFactory:              c.JobFactory,
 		nodeFactory:             c.NodeFactory,
 		deadlineNotifierFactory: c.DrainDeadlineFactory,
@@ -246,17 +247,17 @@ func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
 	for _, node := range nodes {
 		draining, ok := n.nodes[node]
 		if !ok {
-			n.logger.Printf("[DEBUG] nomad.drain: skipping untracked deadlined node %q", node)
+			n.logger.Debug("skipping untracked deadlined node", "node_id", node)
 			continue
 		}
 
 		allocs, err := draining.RemainingAllocs()
 		if err != nil {
-			n.logger.Printf("[ERR] nomad.drain: failed to retrive allocs on deadlined node %q: %v", node, err)
+			n.logger.Error("failed to retrieve allocs on deadlined node", "node_id", node, "error", err)
 			continue
 		}
 
-		n.logger.Printf("[DEBUG] nomad.drain: node %q deadlined causing %d allocs to be force stopped", node, len(allocs))
+		n.logger.Debug("node deadlined causing allocs to be force stopped", "node_id", node, "num_allocs", len(allocs))
 		forceStop = append(forceStop, allocs...)
 	}
 	n.l.RUnlock()
@@ -272,7 +273,7 @@ func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
 	// Raft transaction size.
 	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) {
 		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
-			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
+			n.logger.Error("failed to unset drain for nodes", "error", err)
 		}
 	}
 }
@@ -308,7 +309,7 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
 
 		isDone, err := draining.IsDone()
 		if err != nil {
-			n.logger.Printf("[ERR] nomad.drain: error checking if node %q is done draining: %v", node, err)
+			n.logger.Error("error checking if node is done draining", "node_id", node, "error", err)
 			continue
 		}
 
@@ -320,7 +321,7 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
 
 		remaining, err := draining.RemainingAllocs()
 		if err != nil {
-			n.logger.Printf("[ERR] nomad.drain: node %q is done draining but encountered an error getting remaining allocs: %v", node, err)
+			n.logger.Error("node is done draining but encountered an error getting remaining allocs", "node_id", node, "error", err)
 			continue
 		}
 
@@ -333,8 +334,7 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
 		future := structs.NewBatchFuture()
 		n.drainAllocs(future, remainingAllocs)
 		if err := future.Wait(); err != nil {
-			n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done nodes: %v",
-				len(remainingAllocs), err)
+			n.logger.Error("failed to drain remaining allocs from done nodes", "num_allocs", len(remainingAllocs), "error", err)
 		}
 	}
 
@@ -347,7 +347,7 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
 	// Raft transaction size.
 	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) {
 		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
-			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
+			n.logger.Error("failed to unset drain for nodes", "error", err)
 		}
 	}
 }
diff --git a/nomad/drainer/watch_jobs.go b/nomad/drainer/watch_jobs.go
index fec7743e6a0..4f6b7e9cc14 100644
--- a/nomad/drainer/watch_jobs.go
+++ b/nomad/drainer/watch_jobs.go
@@ -3,11 +3,12 @@ package drainer
 
 import (
 	"context"
 	"fmt"
-	"log"
 	"sync"
 	"time"
 
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
 	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
@@ -43,7 +44,7 @@ type DrainingJobWatcher interface {
 // draining allocations have replacements
 type drainingJobWatcher struct {
 	ctx    context.Context
-	logger *log.Logger
+	logger log.Logger
 
 	// state is the state that is watched for state changes.
 	state *state.StateStore
@@ -67,7 +68,7 @@ type drainingJobWatcher struct {
 
 // NewDrainingJobWatcher returns a new job watcher. The caller is expected to
 // cancel the context to clean up the drainer.
-func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) *drainingJobWatcher {
+func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher {
 	// Create a context that can cancel the blocking query so that when a new
 	// job gets registered it is handled.
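The hclog conversion above follows one pattern everywhere: thread a single hclog.Logger down from the agent, have each subsystem derive a named sub-logger in its constructor, and replace Printf-style interpolation with key/value pairs. A minimal standalone sketch of that pattern follows; the type and function names in it are illustrative only and are not code from this patch.

	package main

	import (
		log "github.com/hashicorp/go-hclog"
	)

	// jobWatcher stands in for a subsystem such as drainingJobWatcher; it
	// only keeps the sub-logger handed to it by its constructor.
	type jobWatcher struct {
		logger log.Logger
	}

	// newJobWatcher derives a named sub-logger from its parent, mirroring
	// logger.Named("job_watcher") in NewDrainingJobWatcher below.
	func newJobWatcher(parent log.Logger) *jobWatcher {
		return &jobWatcher{logger: parent.Named("job_watcher")}
	}

	func main() {
		root := log.New(&log.LoggerOptions{
			Name:  "nomad",
			Level: log.Trace,
		})
		w := newJobWatcher(root)
		// Key/value pairs replace the old "%d"-style interpolation.
		w.logger.Trace("getting job allocs at index", "index", uint64(1))
	}
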
@@ -78,7 +79,7 @@ func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *st
 		queryCtx:    queryCtx,
 		queryCancel: queryCancel,
 		limiter:     limiter,
-		logger:      logger,
+		logger:      logger.Named("job_watcher"),
 		state:       state,
 		jobs:        make(map[structs.NamespacedID]struct{}, 64),
 		drainCh:     make(chan *DrainRequest),
@@ -101,7 +102,7 @@ func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) {
 		}
 
 		// Add the job and cancel the context
-		w.logger.Printf("[TRACE] nomad.drain.job_watcher: registering job %v", jns)
+		w.logger.Trace("registering job", "job", jns)
 		w.jobs[jns] = struct{}{}
 		updated = true
 	}
@@ -134,22 +135,22 @@ func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) {
 		Namespace: namespace,
 	}
 	delete(w.jobs, jns)
-	w.logger.Printf("[TRACE] nomad.drain.job_watcher: deregistering job %v", jns)
+	w.logger.Trace("deregistering job", "job", jns)
 }
 
 // watch is the long lived watching routine that detects job drain changes.
 func (w *drainingJobWatcher) watch() {
 	waitIndex := uint64(1)
 	for {
-		w.logger.Printf("[TRACE] nomad.drain.job_watcher: getting job allocs at index %d", waitIndex)
+		w.logger.Trace("getting job allocs at index", "index", waitIndex)
 		jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex)
-		w.logger.Printf("[TRACE] nomad.drain.job_watcher: got allocs for %d jobs at index %d: %v", len(jobAllocs), index, err)
+		w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index, "error", err)
 		if err != nil {
 			if err == context.Canceled {
 				// Determine if it is a cancel or a shutdown
 				select {
 				case <-w.ctx.Done():
-					w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down")
+					w.logger.Trace("shutting down")
 					return
 				default:
 					// The query context was cancelled;
@@ -160,10 +161,10 @@ func (w *drainingJobWatcher) watch() {
 				}
 			}
 
-			w.logger.Printf("[ERR] nomad.drain.job_watcher: error watching job allocs updates at index %d: %v", waitIndex, err)
+			w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err)
 			select {
 			case <-w.ctx.Done():
-				w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down")
+				w.logger.Trace("shutting down")
 				return
 			case <-time.After(stateReadErrorDelay):
 				continue
@@ -176,7 +177,7 @@
 		// Snapshot the state store
 		snap, err := w.state.Snapshot()
 		if err != nil {
-			w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to snapshot statestore: %v", err)
+			w.logger.Warn("failed to snapshot statestore", "error", err)
 			continue
 		}
 
@@ -185,22 +186,22 @@
 		for jns, allocs := range jobAllocs {
 			// Check if the job is still registered
 			if _, ok := currentJobs[jns]; !ok {
-				w.logger.Printf("[TRACE] nomad.drain.job_watcher: skipping job %v as it is no longer registered for draining", jns)
+				w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns)
 				continue
 			}
 
-			w.logger.Printf("[TRACE] nomad.drain.job_watcher: handling job %v", jns)
+			w.logger.Trace("handling job", "job", jns)
 
 			// Lookup the job
 			job, err := snap.JobByID(nil, jns.Namespace, jns.ID)
 			if err != nil {
-				w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to lookup job %v: %v", jns, err)
+				w.logger.Warn("failed to lookup job", "job", jns, "error", err)
 				continue
 			}
 
 			// Ignore purged jobs
 			if job == nil {
-				w.logger.Printf("[TRACE] nomad.drain.job_watcher: ignoring garbage collected job %q", jns)
+				w.logger.Trace("ignoring garbage collected job", "job", jns)
 				w.deregisterJob(jns.ID, jns.Namespace)
 				continue
 			}
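Where a component always logs the same identifiers, the patch binds them once with With() instead of repeating them at every call site, as newDeploymentWatcher does with logger.With("deployment_id", d.ID, "job", j.NamespacedID()). A short sketch of that idiom; the identifiers below are made up for illustration.

	package main

	import (
		log "github.com/hashicorp/go-hclog"
	)

	func main() {
		root := log.New(&log.LoggerOptions{Name: "nomad", Level: log.Debug})

		// Bind stable context once; every subsequent line carries both
		// keys without restating them at each call site.
		w := root.With("deployment_id", "11111111-2222-3333-4444-555555555555", "job", "default/example")

		w.Debug("deadline hit", "rollback", true)
		w.Error("failed to update deployment status", "error", "example error")
	}
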
@@ -213,11 +214,11 @@ func (w *drainingJobWatcher) watch() {
 
 			result, err := handleJob(snap, job, allocs, lastHandled)
 			if err != nil {
-				w.logger.Printf("[ERR] nomad.drain.job_watcher: handling drain for job %v failed: %v", jns, err)
+				w.logger.Error("handling drain for job failed", "job", jns, "error", err)
 				continue
 			}
 
-			w.logger.Printf("[TRACE] nomad.drain.job_watcher: result for job %v: %v", jns, result)
+			w.logger.Trace("received result for job", "job", jns, "result", result)
 
 			allDrain = append(allDrain, result.drain...)
 			allMigrated = append(allMigrated, result.migrated...)
@@ -231,12 +232,12 @@ func (w *drainingJobWatcher) watch() {
 		if len(allDrain) != 0 {
 			// Create the request
 			req := NewDrainRequest(allDrain)
-			w.logger.Printf("[TRACE] nomad.drain.job_watcher: sending drain request for %d allocs", len(allDrain))
+			w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain))
 
 			select {
 			case w.drainCh <- req:
 			case <-w.ctx.Done():
-				w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down")
+				w.logger.Trace("shutting down")
 				return
 			}
 
@@ -244,13 +245,13 @@ func (w *drainingJobWatcher) watch() {
 			select {
 			case <-req.Resp.WaitCh():
 			case <-w.ctx.Done():
-				w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down")
+				w.logger.Trace("shutting down")
 				return
 			}
 
 			// See if it successfully committed
 			if err := req.Resp.Error(); err != nil {
-				w.logger.Printf("[ERR] nomad.drain.job_watcher: failed to transition allocations: %v", err)
+				w.logger.Error("failed to transition allocations", "error", err)
 			}
 
 			// Wait until the new index
@@ -260,11 +261,11 @@ func (w *drainingJobWatcher) watch() {
 		}
 
 		if len(allMigrated) != 0 {
-			w.logger.Printf("[TRACE] nomad.drain.job_watcher: sending migrated for %d allocs", len(allMigrated))
+			w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated))
 			select {
 			case w.migratedCh <- allMigrated:
 			case <-w.ctx.Done():
-				w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down")
+				w.logger.Trace("shutting down")
 				return
 			}
 		}
diff --git a/nomad/drainer/watch_jobs_test.go b/nomad/drainer/watch_jobs_test.go
index ad0926c3c33..fb5744baaa9 100644
--- a/nomad/drainer/watch_jobs_test.go
+++ b/nomad/drainer/watch_jobs_test.go
@@ -38,7 +38,7 @@ func testDrainingJobWatcher(t *testing.T, state *state.StateStore) (*drainingJob
 	t.Helper()
 
 	limiter := rate.NewLimiter(100.0, 100)
-	logger := testlog.Logger(t)
+	logger := testlog.HCLogger(t)
 	ctx, cancel := context.WithCancel(context.Background())
 	w := NewDrainingJobWatcher(ctx, limiter, state, logger)
 	return w, cancel
diff --git a/nomad/drainer/watch_nodes.go b/nomad/drainer/watch_nodes.go
index ac23b16680b..65ec062061a 100644
--- a/nomad/drainer/watch_nodes.go
+++ b/nomad/drainer/watch_nodes.go
@@ -2,10 +2,11 @@ package drainer
 
 import (
 	"context"
-	"log"
 	"time"
 
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"golang.org/x/time/rate"
@@ -70,10 +71,10 @@ func (n *NodeDrainer) Update(node *structs.Node) {
 	// Register interest in the draining jobs.
 	jobs, err := draining.DrainingJobs()
 	if err != nil {
-		n.logger.Printf("[ERR] nomad.drain: error retrieving draining jobs on node %q: %v", node.ID, err)
+		n.logger.Error("error retrieving draining jobs on node", "node_id", node.ID, "error", err)
 		return
 	}
-	n.logger.Printf("[TRACE] nomad.drain: node %q has %d draining jobs on it", node.ID, len(jobs))
+	n.logger.Trace("node has draining jobs on it", "node_id", node.ID, "num_jobs", len(jobs))
 	n.jobWatcher.RegisterJobs(jobs)
 
 	// TODO Test at this layer as well that a node drain on a node without
@@ -82,7 +83,7 @@ func (n *NodeDrainer) Update(node *structs.Node) {
 	// nothing on it we unset drain
 	done, err := draining.IsDone()
 	if err != nil {
-		n.logger.Printf("[ERR] nomad.drain: failed to check if node %q is done draining: %v", node.ID, err)
+		n.logger.Error("failed to check if node is done draining", "node_id", node.ID, "error", err)
 		return
 	}
 
@@ -91,14 +92,12 @@ func (n *NodeDrainer) Update(node *structs.Node) {
 	// marking node as complete.
 	remaining, err := draining.RemainingAllocs()
 	if err != nil {
-		n.logger.Printf("[ERR] nomad.drain: error getting remaining allocs on drained node %q: %v",
-			node.ID, err)
+		n.logger.Error("error getting remaining allocs on drained node", "node_id", node.ID, "error", err)
 	} else if len(remaining) > 0 {
 		future := structs.NewBatchFuture()
 		n.drainAllocs(future, remaining)
 		if err := future.Wait(); err != nil {
-			n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done node %q: %v",
-				len(remaining), node.ID, err)
+			n.logger.Error("failed to drain remaining allocs from done node", "num_allocs", len(remaining), "node_id", node.ID, "error", err)
 		}
 	}
 
@@ -109,9 +108,9 @@ func (n *NodeDrainer) Update(node *structs.Node) {
 
 		index, err := n.raft.NodesDrainComplete([]string{node.ID}, event)
 		if err != nil {
-			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", node.ID, err)
+			n.logger.Error("failed to unset drain for node", "node_id", node.ID, "error", err)
 		} else {
-			n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", node.ID, index)
+			n.logger.Info("node completed draining at index", "node_id", node.ID, "index", index)
 		}
 	}
 }
@@ -120,7 +119,7 @@
 // changing their drain strategy.
 type nodeDrainWatcher struct {
 	ctx    context.Context
-	logger *log.Logger
+	logger log.Logger
 
 	// state is the state that is watched for state changes.
 	state *state.StateStore
@@ -134,11 +133,11 @@ type nodeDrainWatcher struct {
 }
 
 // NewNodeDrainWatcher returns a new node drain watcher.
-func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) *nodeDrainWatcher {
+func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) *nodeDrainWatcher {
 	w := &nodeDrainWatcher{
 		ctx:     ctx,
 		limiter: limiter,
-		logger:  logger,
+		logger:  logger.Named("node_watcher"),
 		tracker: tracker,
 		state:   state,
 	}
@@ -151,19 +150,19 @@ func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *stat
 func (w *nodeDrainWatcher) watch() {
 	nindex := uint64(1)
 	for {
-		w.logger.Printf("[TRACE] nomad.drain.node_watcher: getting nodes at index %d", nindex)
+		w.logger.Trace("getting nodes at index", "index", nindex)
 		nodes, index, err := w.getNodes(nindex)
-		w.logger.Printf("[TRACE] nomad.drain.node_watcher: got nodes %d at index %d: %v", len(nodes), nindex, err)
+		w.logger.Trace("got nodes at index", "num_nodes", len(nodes), "index", nindex, "error", err)
 		if err != nil {
 			if err == context.Canceled {
-				w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down")
+				w.logger.Trace("shutting down")
 				return
 			}
 
-			w.logger.Printf("[ERR] nomad.drain.node_watcher: error watching node updates at index %d: %v", nindex, err)
+			w.logger.Error("error watching node updates at index", "index", nindex, "error", err)
 			select {
 			case <-w.ctx.Done():
-				w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down")
+				w.logger.Trace("shutting down")
 				return
 			case <-time.After(stateReadErrorDelay):
 				continue
@@ -181,20 +180,20 @@ func (w *nodeDrainWatcher) watch() {
 		switch {
 		// If the node is tracked but not draining, untrack
 		case tracked && !newDraining:
-			w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", nodeID)
+			w.logger.Trace("tracked node is no longer draining", "node_id", nodeID)
 			w.tracker.Remove(nodeID)
 
 		// If the node is not being tracked but is draining, track
 		case !tracked && newDraining:
-			w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", nodeID)
+			w.logger.Trace("untracked node is draining", "node_id", nodeID)
 			w.tracker.Update(node)
 
 		// If the node is being tracked but has changed, update:
 		case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy):
-			w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", nodeID)
+			w.logger.Trace("tracked node has updated drain", "node_id", nodeID)
 			w.tracker.Update(node)
 		default:
-			w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", nodeID, node.ModifyIndex, tracked, newDraining)
+			w.logger.Trace("no changes for node", "node_id", nodeID, "node_modify_index", node.ModifyIndex, "tracked", tracked, "newly_draining", newDraining)
 		}
 
 		// TODO(schmichael) handle the case of a lost node
@@ -202,7 +201,7 @@ func (w *nodeDrainWatcher) watch() {
 		for nodeID := range tracked {
 			if _, ok := nodes[nodeID]; !ok {
-				w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer exists", nodeID)
+				w.logger.Trace("tracked node no longer exists", "node_id", nodeID)
 				w.tracker.Remove(nodeID)
 			}
 		}
diff --git a/nomad/drainer/watch_nodes_test.go b/nomad/drainer/watch_nodes_test.go
index 74a62e85253..43503e929eb 100644
--- a/nomad/drainer/watch_nodes_test.go
+++ b/nomad/drainer/watch_nodes_test.go
@@ -27,7 +27,7 @@ func testNodeDrainWatcher(t *testing.T) (*nodeDrainWatcher, *state.StateStore, *
 	}
 
 	limiter := rate.NewLimiter(100.0, 100)
-	logger := testlog.Logger(t)
+	logger := testlog.HCLogger(t)
 	m := NewMockNodeTracker()
 	w := NewNodeDrainWatcher(context.Background(), limiter, state, logger, m)
 	return w, state, m
diff --git a/nomad/drainer_int_test.go b/nomad/drainer_int_test.go
index 379aee34047..9c05bb46b5f 100644
--- a/nomad/drainer_int_test.go
+++ b/nomad/drainer_int_test.go
@@ -3,13 +3,14 @@ package nomad
 
 import (
 	"context"
 	"fmt"
-	"log"
 	"net/rpc"
 	"testing"
 	"time"
 
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
 	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
+
 	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/drainer"
@@ -22,7 +23,7 @@ import (
 
 func allocPromoter(errCh chan<- error, ctx context.Context,
 	state *state.StateStore, codec rpc.ClientCodec, nodeID string,
-	logger *log.Logger) {
+	logger log.Logger) {
 
 	nindex := uint64(1)
 	for {
@@ -54,7 +55,7 @@ func allocPromoter(errCh chan<- error, ctx context.Context,
 			Timestamp: now,
 		}
 		updates = append(updates, newAlloc)
-		logger.Printf("Marked deployment health for alloc %q", alloc.ID)
+		logger.Trace("marked deployment health for alloc", "alloc_id", alloc.ID)
 	}
 
 	if len(updates) == 0 {
diff --git a/nomad/eval_endpoint.go b/nomad/eval_endpoint.go
index bcbdfb7cec1..da7520e7125 100644
--- a/nomad/eval_endpoint.go
+++ b/nomad/eval_endpoint.go
@@ -4,9 +4,11 @@ import (
 	"fmt"
 	"time"
 
+	log "github.com/hashicorp/go-hclog"
+	multierror "github.com/hashicorp/go-multierror"
+
 	"github.com/armon/go-metrics"
 	"github.com/hashicorp/go-memdb"
-	multierror "github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/acl"
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
@@ -20,7 +22,8 @@ const (
 
 // Eval endpoint is used for eval interactions
 type Eval struct {
-	srv *Server
+	srv    *Server
+	logger log.Logger
 }
 
 // GetEval is used to request information about a specific evaluation
diff --git a/nomad/fsm.go b/nomad/fsm.go
index 0ef79d90923..d174b0f0c81 100644
--- a/nomad/fsm.go
+++ b/nomad/fsm.go
@@ -3,13 +3,14 @@ package nomad
 
 import (
 	"fmt"
 	"io"
-	"log"
 	"reflect"
 	"sync"
 	"time"
 
-	"github.com/armon/go-metrics"
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
+	"github.com/armon/go-metrics"
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
@@ -67,7 +68,7 @@ type nomadFSM struct {
 	evalBroker         *EvalBroker
 	blockedEvals       *BlockedEvals
 	periodicDispatcher *PeriodicDispatch
-	logger             *log.Logger
+	logger             log.Logger
 	state              *state.StateStore
 	timetable          *TimeTable
@@ -112,8 +113,8 @@ type FSMConfig struct {
 	// be added to.
Blocked *BlockedEvals - // LogOutput is the writer logs should be written to - LogOutput io.Writer + // Logger is the logger used by the FSM + Logger log.Logger // Region is the region of the server embedding the FSM Region string @@ -123,8 +124,8 @@ type FSMConfig struct { func NewFSM(config *FSMConfig) (*nomadFSM, error) { // Create a state store sconfig := &state.StateStoreConfig{ - LogOutput: config.LogOutput, - Region: config.Region, + Logger: config.Logger, + Region: config.Region, } state, err := state.NewStateStore(sconfig) if err != nil { @@ -135,7 +136,7 @@ func NewFSM(config *FSMConfig) (*nomadFSM, error) { evalBroker: config.EvalBroker, periodicDispatcher: config.Periodic, blockedEvals: config.Blocked, - logger: log.New(config.LogOutput, "", log.LstdFlags|log.Lmicroseconds), + logger: config.Logger.Named("fsm"), config: config, state: state, timetable: NewTimeTable(timeTableGranularity, timeTableLimit), @@ -255,7 +256,7 @@ func (n *nomadFSM) Apply(log *raft.Log) interface{} { // We didn't match anything, either panic or ignore if ignoreUnknown { - n.logger.Printf("[WARN] nomad.fsm: ignoring unknown message type (%d), upgrade to newer version", msgType) + n.logger.Warn("ignoring unknown message type, upgrade to newer version", "msg_type", msgType) return nil } @@ -266,14 +267,14 @@ func (n *nomadFSM) applyUpsertNode(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "register_node"}, time.Now()) var req structs.NodeRegisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } // Handle upgrade paths req.Node.Canonicalize() if err := n.state.UpsertNode(index, req.Node); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertNode failed: %v", err) + n.logger.Error("UpsertNode failed", "error", err) return err } @@ -290,11 +291,11 @@ func (n *nomadFSM) applyDeregisterNode(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_node"}, time.Now()) var req structs.NodeDeregisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.DeleteNode(index, req.NodeID); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteNode failed: %v", err) + n.logger.Error("DeleteNode failed", "error", err) return err } return nil @@ -304,11 +305,11 @@ func (n *nomadFSM) applyStatusUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "node_status_update"}, time.Now()) var req structs.NodeUpdateStatusRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.UpdateNodeStatus(index, req.NodeID, req.Status, req.NodeEvent); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateNodeStatus failed: %v", err) + n.logger.Error("UpdateNodeStatus failed", "error", err) return err } @@ -318,7 +319,7 @@ func (n *nomadFSM) applyStatusUpdate(buf []byte, index uint64) interface{} { ws := memdb.NewWatchSet() node, err := n.state.NodeByID(ws, req.NodeID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: looking up node %q failed: %v", req.NodeID, err) + n.logger.Error("looking up node failed", "node_id", req.NodeID, "error", err) return err } @@ -332,7 +333,7 @@ func (n *nomadFSM) 
applyDrainUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "node_drain_update"}, time.Now()) var req structs.NodeUpdateDrainRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } // COMPAT Remove in version 0.10 @@ -350,7 +351,7 @@ func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} { } if err := n.state.UpdateNodeDrain(index, req.NodeID, req.DrainStrategy, req.MarkEligible, req.NodeEvent); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateNodeDrain failed: %v", err) + n.logger.Error("UpdateNodeDrain failed", "error", err) return err } return nil @@ -360,11 +361,11 @@ func (n *nomadFSM) applyBatchDrainUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_node_drain_update"}, time.Now()) var req structs.BatchNodeUpdateDrainRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.BatchUpdateNodeDrain(index, req.Updates, req.NodeEvents); err != nil { - n.logger.Printf("[ERR] nomad.fsm: BatchUpdateNodeDrain failed: %v", err) + n.logger.Error("BatchUpdateNodeDrain failed", "error", err) return err } return nil @@ -374,18 +375,18 @@ func (n *nomadFSM) applyNodeEligibilityUpdate(buf []byte, index uint64) interfac defer metrics.MeasureSince([]string{"nomad", "fsm", "node_eligibility_update"}, time.Now()) var req structs.NodeUpdateEligibilityRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } // Lookup the existing node node, err := n.state.NodeByID(nil, req.NodeID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateNodeEligibility failed to lookup node %q: %v", req.NodeID, err) + n.logger.Error("UpdateNodeEligibility failed to lookup node", "node_id", req.NodeID, "error", err) return err } if err := n.state.UpdateNodeEligibility(index, req.NodeID, req.Eligibility, req.NodeEvent); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateNodeEligibility failed: %v", err) + n.logger.Error("UpdateNodeEligibility failed", "error", err) return err } @@ -403,7 +404,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "register_job"}, time.Now()) var req structs.JobRegisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } /* Handle upgrade paths: @@ -416,7 +417,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { req.Job.Canonicalize() if err := n.state.UpsertJob(index, req.Job); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertJob failed: %v", err) + n.logger.Error("UpsertJob failed", "error", err) return err } @@ -424,8 +425,8 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { // possibility that the periodic spec was removed and then we should stop // tracking it. 
if err := n.periodicDispatcher.Add(req.Job); err != nil { - n.logger.Printf("[ERR] nomad.fsm: periodicDispatcher.Add failed: %v", err) - return fmt.Errorf("failed adding job to periodic dispatcher: %v", err) + n.logger.Error("periodicDispatcher.Add failed", "error", err) + return fmt.Errorf("failed adding job to periodic dispatcher", "error", err) } // Create a watch set @@ -439,7 +440,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { if req.Job.IsPeriodicActive() { prevLaunch, err := n.state.PeriodicLaunchByID(ws, req.Namespace, req.Job.ID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: PeriodicLaunchByID failed: %v", err) + n.logger.Error("PeriodicLaunchByID failed", "error", err) return err } @@ -452,7 +453,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { Launch: time.Now(), } if err := n.state.UpsertPeriodicLaunch(index, launch); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertPeriodicLaunch failed: %v", err) + n.logger.Error("UpsertPeriodicLaunch failed", "error", err) return err } } @@ -463,7 +464,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { if parentID != "" { parent, err := n.state.JobByID(ws, req.Namespace, parentID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: JobByID(%v) lookup for parent failed: %v", parentID, err) + n.logger.Error("JobByID lookup for parent failed", "parent_id", parentID, "namespace", req.Namespace, "error", err) return err } else if parent == nil { // The parent has been deregistered. @@ -473,7 +474,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { if parent.IsPeriodic() && !parent.IsParameterized() { t, err := n.periodicDispatcher.LaunchTime(req.Job.ID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: LaunchTime(%v) failed: %v", req.Job.ID, err) + n.logger.Error("LaunchTime failed", "job", req.Job.NamespacedID(), "error", err) return err } @@ -483,7 +484,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { Launch: t, } if err := n.state.UpsertPeriodicLaunch(index, launch); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertPeriodicLaunch failed: %v", err) + n.logger.Error("UpsertPeriodicLaunch failed", "error", err) return err } } @@ -496,11 +497,11 @@ func (n *nomadFSM) applyDeregisterJob(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_job"}, time.Now()) var req structs.JobDeregisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.handleJobDeregister(index, req.JobID, req.Namespace, req.Purge); err != nil { - n.logger.Printf("[ERR] nomad.fsm: deregistering job failed: %v", err) + n.logger.Error("deregistering job failed", "error", err) return err } @@ -511,12 +512,12 @@ func (n *nomadFSM) applyBatchDeregisterJob(buf []byte, index uint64) interface{} defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_deregister_job"}, time.Now()) var req structs.JobBatchDeregisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } for jobNS, options := range req.Jobs { if err := n.handleJobDeregister(index, jobNS.ID, jobNS.Namespace, options.Purge); err != nil { - n.logger.Printf("[ERR] nomad.fsm: deregistering %v failed: %v", jobNS, err) + n.logger.Error("deregistering job 
failed", "job", jobNS, "error", err) return err } } @@ -528,13 +529,13 @@ func (n *nomadFSM) applyBatchDeregisterJob(buf []byte, index uint64) interface{} func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, purge bool) error { // If it is periodic remove it from the dispatcher if err := n.periodicDispatcher.Remove(namespace, jobID); err != nil { - n.logger.Printf("[ERR] nomad.fsm: periodicDispatcher.Remove failed: %v", err) + n.logger.Error("periodicDispatcher.Remove failed", "error", err) return err } if purge { if err := n.state.DeleteJob(index, namespace, jobID); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteJob failed: %v", err) + n.logger.Error("DeleteJob failed", "error", err) return err } @@ -547,7 +548,7 @@ func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, pu ws := memdb.NewWatchSet() current, err := n.state.JobByID(ws, namespace, jobID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: JobByID lookup failed: %v", err) + n.logger.Error("JobByID lookup failed", "error", err) return err } @@ -559,7 +560,7 @@ func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, pu stopped.Stop = true if err := n.state.UpsertJob(index, stopped); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertJob failed: %v", err) + n.logger.Error("UpsertJob failed", "error", err) return err } } @@ -571,14 +572,14 @@ func (n *nomadFSM) applyUpdateEval(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "update_eval"}, time.Now()) var req structs.EvalUpdateRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } return n.upsertEvals(index, req.Evals) } func (n *nomadFSM) upsertEvals(index uint64, evals []*structs.Evaluation) error { if err := n.state.UpsertEvals(index, evals); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertEvals failed: %v", err) + n.logger.Error("UpsertEvals failed", "error", err) return err } @@ -616,11 +617,11 @@ func (n *nomadFSM) applyDeleteEval(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "delete_eval"}, time.Now()) var req structs.EvalDeleteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.DeleteEval(index, req.Evals, req.Allocs); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteEval failed: %v", err) + n.logger.Error("DeleteEval failed", "error", err) return err } return nil @@ -630,7 +631,7 @@ func (n *nomadFSM) applyAllocUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update"}, time.Now()) var req structs.AllocUpdateRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } // Attach the job to all the allocations. 
It is pulled out in the @@ -664,7 +665,7 @@ func (n *nomadFSM) applyAllocUpdate(buf []byte, index uint64) interface{} { } if err := n.state.UpsertAllocs(index, req.Alloc); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertAllocs failed: %v", err) + n.logger.Error("UpsertAllocs failed", "error", err) return err } return nil @@ -674,7 +675,7 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_client_update"}, time.Now()) var req structs.AllocUpdateRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if len(req.Alloc) == 0 { return nil @@ -693,14 +694,14 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} // Update all the client allocations if err := n.state.UpdateAllocsFromClient(index, req.Alloc); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateAllocFromClient failed: %v", err) + n.logger.Error("UpdateAllocFromClient failed", "error", err) return err } // Update any evals if len(req.Evals) > 0 { if err := n.upsertEvals(index, req.Evals); err != nil { - n.logger.Printf("[ERR] nomad.fsm: applyAllocClientUpdate failed to update evaluations: %v", err) + n.logger.Error("applyAllocClientUpdate failed to update evaluations", "error", err) return err } } @@ -713,7 +714,7 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} nodeID := alloc.NodeID node, err := n.state.NodeByID(ws, nodeID) if err != nil || node == nil { - n.logger.Printf("[ERR] nomad.fsm: looking up node %q failed: %v", nodeID, err) + n.logger.Error("looking up node failed", "node_id", nodeID, "error", err) return err } @@ -721,7 +722,7 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} // Unblock any associated quota quota, err := n.allocQuota(alloc.ID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: looking up quota associated with alloc %q failed: %v", alloc.ID, err) + n.logger.Error("looking up quota associated with alloc failed", "alloc_id", alloc.ID, "error", err) return err } @@ -738,11 +739,11 @@ func (n *nomadFSM) applyAllocUpdateDesiredTransition(buf []byte, index uint64) i defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update_desired_transition"}, time.Now()) var req structs.AllocUpdateDesiredTransitionRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.UpdateAllocsDesiredTransitions(index, req.Allocs, req.Evals); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateAllocsDesiredTransitions failed: %v", err) + n.logger.Error("UpdateAllocsDesiredTransitions failed", "error", err) return err } @@ -763,12 +764,12 @@ func (n *nomadFSM) applyUpsertNodeEvent(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_node_events"}, time.Now()) var req structs.EmitNodeEventsRequest if err := structs.Decode(buf, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: failed to decode EmitNodeEventsRequest: %v", err) + n.logger.Error("failed to decode EmitNodeEventsRequest", "error", err) return err } if err := n.state.UpsertNodeEvents(index, req.NodeEvents); err != nil { - n.logger.Printf("[ERR] nomad.fsm: failed to add node events: %v", err) + n.logger.Error("failed to add node events", "error", err) return err } @@ 
@@ -781,11 +782,11 @@ func (n *nomadFSM) applyUpsertVaultAccessor(buf []byte, index uint64) interface{ defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_vault_accessor"}, time.Now()) var req structs.VaultAccessorsRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertVaultAccessor(index, req.Accessors); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertVaultAccessor failed: %v", err) + n.logger.Error("UpsertVaultAccessor failed", "error", err) return err } @@ -797,11 +798,11 @@ func (n *nomadFSM) applyDeregisterVaultAccessor(buf []byte, index uint64) interf defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_vault_accessor"}, time.Now()) var req structs.VaultAccessorsRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteVaultAccessors(index, req.Accessors); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeregisterVaultAccessor failed: %v", err) + n.logger.Error("DeregisterVaultAccessor failed", "error", err) return err } @@ -813,11 +814,11 @@ func (n *nomadFSM) applyPlanResults(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_plan_results"}, time.Now()) var req structs.ApplyPlanResultsRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertPlanResults(index, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: ApplyPlan failed: %v", err) + n.logger.Error("ApplyPlan failed", "error", err) return err } @@ -830,11 +831,11 @@ func (n *nomadFSM) applyDeploymentStatusUpdate(buf []byte, index uint64) interfa defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_status_update"}, time.Now()) var req structs.DeploymentStatusUpdateRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateDeploymentStatus(index, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertDeploymentStatusUpdate failed: %v", err) + n.logger.Error("UpsertDeploymentStatusUpdate failed", "error", err) return err } @@ -847,11 +848,11 @@ func (n *nomadFSM) applyDeploymentPromotion(buf []byte, index uint64) interface{ defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_promotion"}, time.Now()) var req structs.ApplyDeploymentPromoteRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateDeploymentPromotion(index, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertDeploymentPromotion failed: %v", err) + n.logger.Error("UpsertDeploymentPromotion failed", "error", err) return err } @@ -865,11 +866,11 @@ func (n *nomadFSM) applyDeploymentAllocHealth(buf []byte, index uint64) interfac defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_alloc_health"}, time.Now()) var req structs.ApplyDeploymentAllocHealthRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateDeploymentAllocHealth(index, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertDeploymentAllocHealth failed: %v", err) + n.logger.Error("UpsertDeploymentAllocHealth failed", "error", err) return err } @@ -882,11 +883,11 @@ func (n *nomadFSM) applyDeploymentDelete(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_delete"}, time.Now()) var req structs.DeploymentDeleteRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteDeployment(index, req.Deployments); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteDeployment failed: %v", err) + n.logger.Error("DeleteDeployment failed", "error", err) return err } @@ -898,11 +899,11 @@ func (n *nomadFSM) applyJobStability(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_job_stability"}, time.Now()) var req structs.JobStabilityRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateJobStability(index, req.Namespace, req.JobID, req.JobVersion, req.Stable); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateJobStability failed: %v", err) + n.logger.Error("UpdateJobStability failed", "error", err) return err } @@ -914,11 +915,11 @@ func (n *nomadFSM) applyACLPolicyUpsert(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_policy_upsert"}, time.Now()) var req structs.ACLPolicyUpsertRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertACLPolicies(index, req.Policies); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertACLPolicies failed: %v", err) + n.logger.Error("UpsertACLPolicies failed", "error", err) return err } return nil @@ -929,11 +930,11 @@ func (n *nomadFSM) applyACLPolicyDelete(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_policy_delete"}, time.Now()) var req structs.ACLPolicyDeleteRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteACLPolicies(index, req.Names); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteACLPolicies failed: %v", err) + n.logger.Error("DeleteACLPolicies failed", "error", err) return err } return nil @@ -944,11 +945,11 @@ func (n *nomadFSM) applyACLTokenUpsert(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_upsert"}, time.Now()) var req structs.ACLTokenUpsertRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertACLTokens(index, req.Tokens); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertACLTokens failed: %v", err) + n.logger.Error("UpsertACLTokens failed", "error", err) return err } return nil @@ -959,11 +960,11 @@ func (n *nomadFSM) applyACLTokenDelete(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_delete"}, time.Now()) var req structs.ACLTokenDeleteRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteACLTokens(index, req.AccessorIDs); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteACLTokens failed: %v", err) + n.logger.Error("DeleteACLTokens failed", "error", err) return err } return nil @@ -974,11 +975,11 @@ func (n *nomadFSM) applyACLTokenBootstrap(buf []byte, index uint64) interface{} defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_bootstrap"}, time.Now()) var req structs.ACLTokenBootstrapRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.BootstrapACLTokens(index, req.ResetIndex, req.Token); err != nil { - n.logger.Printf("[ERR] nomad.fsm: BootstrapACLToken failed: %v", err) + n.logger.Error("BootstrapACLToken failed", "error", err) return err } return nil @@ -987,7 +988,7 @@ func (n *nomadFSM) applyACLTokenBootstrap(buf []byte, index uint64) interface{} func (n *nomadFSM) applyAutopilotUpdate(buf []byte, index uint64) interface{} { var req structs.AutopilotSetConfigRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } defer metrics.MeasureSince([]string{"nomad", "fsm", "autopilot"}, time.Now()) @@ -1020,8 +1021,8 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { // Create a new state store config := &state.StateStoreConfig{ - LogOutput: n.config.LogOutput, - Region: n.config.Region, + Logger: n.config.Logger, + Region: n.config.Region, } newState, err := state.NewStateStore(config) if err != nil { @@ -1060,7 +1061,7 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { switch snapType { case TimeTableSnapshot: if err := n.timetable.Deserialize(dec); err != nil { return fmt.Errorf("time table deserialize failed: %v", err) } case NodeSnapshot: @@ -1245,7 +1246,7 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { // remove this block. index, err := newState.Index("job_summary") if err != nil { return fmt.Errorf("couldn't fetch index of job summary table: %v", err) } // If the index is 0 that means there is no job summary in the snapshot so @@ -1257,7 +1258,7 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { return fmt.Errorf("unable to query latest index: %v", index) } if err := newState.ReconcileJobSummaries(latestIndex); err != nil { return fmt.Errorf("error reconciling summaries: %v", err) } } @@ -1290,12 +1291,12 @@ func (n *nomadFSM) failLeakedDeployments(state *state.StateStore) error { // and thus the older deployment leaks and then the job is removed. iter, err := state.Deployments(nil) if err != nil { return fmt.Errorf("failed to query deployments: %v", err) } dindex, err := state.Index("deployment") if err != nil { return fmt.Errorf("couldn't fetch index of deployments table: %v", err) } for { @@ -1347,7 +1348,7 @@ func (n *nomadFSM) reconcileQueuedAllocations(index uint64) error { snap, err := n.state.Snapshot() if err != nil { return fmt.Errorf("unable to create snapshot: %v", err) } // Invoking the scheduler for every job so that we can populate the number diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index f2fa2d1cf38..840e9c348e0 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -3,7 +3,6 @@ package nomad import ( "bytes" "fmt" - "os" "reflect" "strings" "testing" @@ -12,6 +11,7 @@ import ( "github.com/google/go-cmp/cmp" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/state" @@ -52,7 +52,7 @@ func testFSM(t *testing.T) *nomadFSM { EvalBroker: broker, Periodic: dispatcher, Blocked: NewBlockedEvals(broker), - LogOutput: os.Stderr, + Logger: testlog.HCLogger(t), Region: "global", } fsm, err := NewFSM(fsmConfig) diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go index 9a55bb256a9..fe3b829ffb8 100644 --- a/nomad/heartbeat.go +++ b/nomad/heartbeat.go @@ -2,11 +2,14 @@ package nomad import ( "errors" + "sync" "time" + log "github.com/hashicorp/go-hclog" + memdb "github.com/hashicorp/go-memdb" + "github.com/armon/go-metrics" "github.com/hashicorp/consul/lib" - memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/nomad/structs" ) @@ -26,12 +29,33 @@ var ( heartbeatNotLeaderErr = errors.New(heartbeatNotLeader) ) +// nodeHeartbeater is used to track expiration times of node heartbeats. If it +// detects an expired node, the node status is updated to be 'down'. +type nodeHeartbeater struct { + *Server + logger log.Logger + + // heartbeatTimers track the expiration time of each heartbeat that has + // a TTL. On expiration, the node status is updated to be 'down'. + heartbeatTimers map[string]*time.Timer + heartbeatTimersLock sync.Mutex +} + +// newNodeHeartbeater returns a new node heartbeater used to detect and act on +// failed node heartbeats. +func newNodeHeartbeater(s *Server) *nodeHeartbeater { + return &nodeHeartbeater{ + Server: s, + logger: s.logger.Named("heartbeat"), + } +} + // initializeHeartbeatTimers is used when a leader is newly elected to create // a new map to track heartbeat expiration and to reset all the timers from // the previously known set of timers.
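The structured form adopted throughout this patch moves the subsystem name into the logger itself (for example "nomad.heartbeat") and the variable data into key/value pairs, which is why the bracketed level prefixes and %q interpolation disappear from the messages. A minimal sketch, assuming only go-hclog's documented API (the node ID is a placeholder):

package main

import hclog "github.com/hashicorp/go-hclog"

func main() {
	root := hclog.New(&hclog.LoggerOptions{Name: "nomad", Level: hclog.Debug})

	// Before: logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id)
	hb := root.Named("heartbeat")
	hb.Warn("node TTL expired", "node_id", "c3f2...")
	// Emits roughly: 2018-09-13T10:43:40 [WARN]  nomad.heartbeat: node TTL expired: node_id=c3f2...
}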
-func (s *Server) initializeHeartbeatTimers() error { +func (h *nodeHeartbeater) initializeHeartbeatTimers() error { // Scan all nodes and reset their timer - snap, err := s.fsm.State().Snapshot() + snap, err := h.fsm.State().Snapshot() if err != nil { return err } @@ -43,8 +67,8 @@ func (s *Server) initializeHeartbeatTimers() error { return err } - s.heartbeatTimersLock.Lock() - defer s.heartbeatTimersLock.Unlock() + h.heartbeatTimersLock.Lock() + defer h.heartbeatTimersLock.Unlock() // Handle each node for { @@ -56,77 +80,77 @@ func (s *Server) initializeHeartbeatTimers() error { if node.TerminalStatus() { continue } - s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL) + h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL) } return nil } // resetHeartbeatTimer is used to reset the TTL of a heartbeat. // This can be used for new heartbeats and existing ones. -func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) { - s.heartbeatTimersLock.Lock() - defer s.heartbeatTimersLock.Unlock() +func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) { + h.heartbeatTimersLock.Lock() + defer h.heartbeatTimersLock.Unlock() // Do not create a timer for the node since we are not the leader. This // check avoids the race in which leadership is lost but a timer is created // on this server since it was servicing an RPC during a leadership loss. - if !s.IsLeader() { - s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring resetting node %q TTL since this node is not the leader", id) + if !h.IsLeader() { + h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id) return 0, heartbeatNotLeaderErr } // Compute the target TTL value - n := len(s.heartbeatTimers) - ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n) + n := len(h.heartbeatTimers) + ttl := lib.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, n) ttl += lib.RandomStagger(ttl) // Reset the TTL - s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace) + h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace) return ttl, nil } // resetHeartbeatTimerLocked is used to reset a heartbeat timer // assuming the heartbeatTimerLock is already held -func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) { +func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) { // Ensure a timer map exists - if s.heartbeatTimers == nil { - s.heartbeatTimers = make(map[string]*time.Timer) + if h.heartbeatTimers == nil { + h.heartbeatTimers = make(map[string]*time.Timer) } // Renew the heartbeat timer if it exists - if timer, ok := s.heartbeatTimers[id]; ok { + if timer, ok := h.heartbeatTimers[id]; ok { timer.Reset(ttl) return } // Create a new timer to track expiration of this heartbeat timer := time.AfterFunc(ttl, func() { - s.invalidateHeartbeat(id) + h.invalidateHeartbeat(id) }) - s.heartbeatTimers[id] = timer + h.heartbeatTimers[id] = timer } // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we // need to invalidate the heartbeat. 
-func (s *Server) invalidateHeartbeat(id string) { +func (h *nodeHeartbeater) invalidateHeartbeat(id string) { defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now()) // Clear the heartbeat timer - s.heartbeatTimersLock.Lock() - if timer, ok := s.heartbeatTimers[id]; ok { + h.heartbeatTimersLock.Lock() + if timer, ok := h.heartbeatTimers[id]; ok { timer.Stop() - delete(s.heartbeatTimers, id) + delete(h.heartbeatTimers, id) } - s.heartbeatTimersLock.Unlock() + h.heartbeatTimersLock.Unlock() // Do not invalidate the node since we are not the leader. This check avoids // the race in which leadership is lost but a timer is created on this // server since it was servicing an RPC during a leadership loss. - if !s.IsLeader() { - s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring node %q TTL since this node is not the leader", id) + if !h.IsLeader() { + h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id) return } - s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id) + h.logger.Warn("node TTL expired", "node_id", id) // Make a request to update the node status req := structs.NodeUpdateStatusRequest{ @@ -134,54 +158,54 @@ func (s *Server) invalidateHeartbeat(id string) { Status: structs.NodeStatusDown, NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed), WriteRequest: structs.WriteRequest{ - Region: s.config.Region, + Region: h.config.Region, }, } var resp structs.NodeUpdateResponse - if err := s.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil { - s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err) + if err := h.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil { + h.logger.Error("update node status failed", "error", err) } } // clearHeartbeatTimer is used to clear the heartbeat time for // a single heartbeat. This is used when a heartbeat is destroyed // explicitly and no longer needed. -func (s *Server) clearHeartbeatTimer(id string) error { - s.heartbeatTimersLock.Lock() - defer s.heartbeatTimersLock.Unlock() +func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error { + h.heartbeatTimersLock.Lock() + defer h.heartbeatTimersLock.Unlock() - if timer, ok := s.heartbeatTimers[id]; ok { + if timer, ok := h.heartbeatTimers[id]; ok { timer.Stop() - delete(s.heartbeatTimers, id) + delete(h.heartbeatTimers, id) } return nil } // clearAllHeartbeatTimers is used when a leader is stepping // down and we no longer need to track any heartbeat timers. 
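The timer bookkeeping above follows a reset-or-create shape: one time.AfterFunc per node, renewed on each heartbeat, firing the invalidation callback once the TTL lapses. A reduced, self-contained model of that pattern (ttlTracker is illustrative, not Nomad's type):

package heartbeat

import (
	"sync"
	"time"
)

type ttlTracker struct {
	mu     sync.Mutex
	timers map[string]*time.Timer
}

// reset renews the TTL timer for id, creating it on first sight.
func (t *ttlTracker) reset(id string, ttl time.Duration, onExpire func(string)) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.timers == nil {
		t.timers = make(map[string]*time.Timer)
	}
	if timer, ok := t.timers[id]; ok {
		timer.Reset(ttl) // heartbeat arrived in time; push expiry out
		return
	}
	t.timers[id] = time.AfterFunc(ttl, func() { onExpire(id) })
}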
-func (s *Server) clearAllHeartbeatTimers() error { - s.heartbeatTimersLock.Lock() - defer s.heartbeatTimersLock.Unlock() +func (h *nodeHeartbeater) clearAllHeartbeatTimers() error { + h.heartbeatTimersLock.Lock() + defer h.heartbeatTimersLock.Unlock() - for _, t := range s.heartbeatTimers { + for _, t := range h.heartbeatTimers { t.Stop() } - s.heartbeatTimers = nil + h.heartbeatTimers = nil return nil } // heartbeatStats is a long running routine used to capture // the number of active heartbeats being tracked -func (s *Server) heartbeatStats() { +func (h *nodeHeartbeater) heartbeatStats() { for { select { case <-time.After(5 * time.Second): - s.heartbeatTimersLock.Lock() - num := len(s.heartbeatTimers) - s.heartbeatTimersLock.Unlock() + h.heartbeatTimersLock.Lock() + num := len(h.heartbeatTimers) + h.heartbeatTimersLock.Unlock() metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num)) - case <-s.shutdownCh: + case <-h.shutdownCh: return } } diff --git a/nomad/job_endpoint.go b/nomad/job_endpoint.go index d1e2af642ad..5f2dfe653be 100644 --- a/nomad/job_endpoint.go +++ b/nomad/job_endpoint.go @@ -7,10 +7,12 @@ import ( "strings" "time" + log "github.com/hashicorp/go-hclog" + memdb "github.com/hashicorp/go-memdb" + "github.com/armon/go-metrics" "github.com/golang/snappy" "github.com/hashicorp/consul/lib" - memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/client/driver" @@ -50,7 +52,8 @@ var ( // Job endpoint is used for job interactions type Job struct { - srv *Server + srv *Server + logger log.Logger } // Register is used to upsert a job for scheduling @@ -90,10 +93,10 @@ func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegis // Check if override is set and we do not have permissions if args.PolicyOverride { if !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySentinelOverride) { - j.srv.logger.Printf("[WARN] nomad.job: policy override attempted without permissions for Job %q", args.Job.ID) + j.logger.Warn("policy override attempted without permissions for job", "job", args.Job.ID) return structs.ErrPermissionDenied } - j.srv.logger.Printf("[WARN] nomad.job: policy override set for Job %q", args.Job.ID) + j.logger.Warn("policy override set for job", "job", args.Job.ID) } } @@ -186,11 +189,11 @@ func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegis // Commit this update via Raft fsmErr, index, err := j.srv.raftApply(structs.JobRegisterRequestType, args) if err, ok := fsmErr.(error); ok && err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Register failed: %v", err) + j.logger.Error("registering job failed", "error", err, "fsm", true) return err } if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Register failed: %v", err) + j.logger.Error("registering job failed", "error", err, "raft", true) return err } @@ -226,7 +229,7 @@ func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegis // but that the EvalUpdate does not. 
_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err) + j.logger.Error("eval create failed", "error", err, "method", "register") return err } @@ -495,7 +498,7 @@ func (j *Job) Stable(args *structs.JobStabilityRequest, reply *structs.JobStabil // Commit this stability request via Raft _, modifyIndex, err := j.srv.raftApply(structs.JobStabilityRequestType, args) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Job stability request failed: %v", err) + j.logger.Error("submitting job stability request failed", "error", err) return err } @@ -585,7 +588,7 @@ func (j *Job) Evaluate(args *structs.JobEvaluateRequest, reply *structs.JobRegis _, evalIndex, err := j.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, updateTransitionReq) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err) + j.logger.Error("eval create failed", "error", err, "method", "evaluate") return err } @@ -630,7 +633,7 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD // Commit this update via Raft _, index, err := j.srv.raftApply(structs.JobDeregisterRequestType, args) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Deregister failed: %v", err) + j.logger.Error("deregister failed", "error", err) return err } @@ -663,7 +666,7 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD // Commit this evaluation via Raft _, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err) + j.logger.Error("eval create failed", "error", err, "method", "deregister") return err } @@ -748,7 +751,7 @@ func (j *Job) BatchDeregister(args *structs.JobBatchDeregisterRequest, reply *st // Commit this update via Raft _, index, err := j.srv.raftApply(structs.JobBatchDeregisterRequestType, args) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: batch deregister failed: %v", err) + j.logger.Error("batch deregister failed", "error", err) return err } @@ -1211,7 +1214,7 @@ func (j *Job) Plan(args *structs.JobPlanRequest, reply *structs.JobPlanResponse) } // Create the scheduler and run it - sched, err := scheduler.NewScheduler(eval.Type, j.srv.logger, snap, planner) + sched, err := scheduler.NewScheduler(eval.Type, j.logger, snap, planner) if err != nil { return err } @@ -1433,11 +1436,11 @@ func (j *Job) Dispatch(args *structs.JobDispatchRequest, reply *structs.JobDispa // Commit this update via Raft fsmErr, jobCreateIndex, err := j.srv.raftApply(structs.JobRegisterRequestType, regReq) if err, ok := fsmErr.(error); ok && err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Dispatched job register failed: %v", err) + j.logger.Error("dispatched job register failed", "error", err, "fsm", true) return err } if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Dispatched job register failed: %v", err) + j.logger.Error("dispatched job register failed", "error", err, "raft", true) return err } @@ -1466,7 +1469,7 @@ func (j *Job) Dispatch(args *structs.JobDispatchRequest, reply *structs.JobDispa // Commit this evaluation via Raft _, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err) + j.logger.Error("eval create failed", "error", err, "method", "dispatch") return err } diff --git a/nomad/leader.go b/nomad/leader.go index 
9f64062d47a..0ea07c8443e 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -3,7 +3,6 @@ package nomad import ( "bytes" "context" - "errors" "fmt" "math/rand" "net" @@ -15,6 +14,7 @@ import ( "strings" "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-version" "github.com/hashicorp/nomad/helper/uuid" @@ -54,7 +54,7 @@ func (s *Server) monitorLeadership() { switch { case isLeader: if weAreLeaderCh != nil { - s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running") + s.logger.Error("attempted to start the leader loop while running") continue } @@ -64,19 +64,19 @@ func (s *Server) monitorLeadership() { defer leaderLoop.Done() s.leaderLoop(ch) }(weAreLeaderCh) - s.logger.Printf("[INFO] nomad: cluster leadership acquired") + s.logger.Info("cluster leadership acquired") default: if weAreLeaderCh == nil { - s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running") + s.logger.Error("attempted to stop the leader loop while not running") continue } - s.logger.Printf("[DEBUG] nomad: shutting down leader loop") + s.logger.Debug("shutting down leader loop") close(weAreLeaderCh) leaderLoop.Wait() weAreLeaderCh = nil - s.logger.Printf("[INFO] nomad: cluster leadership lost") + s.logger.Info("cluster leadership lost") } case <-s.shutdownCh: @@ -100,7 +100,7 @@ RECONCILE: start := time.Now() barrier := s.raft.Barrier(barrierWriteTimeout) if err := barrier.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err) + s.logger.Error("failed to wait for barrier", "error", err) goto WAIT } metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start) @@ -108,12 +108,12 @@ RECONCILE: // Check if we need to handle initial leadership actions if !establishedLeader { if err := s.establishLeadership(stopCh); err != nil { - s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err) + s.logger.Error("failed to establish leadership", "error", err) // Immediately revoke leadership since we didn't successfully // establish leadership. if err := s.revokeLeadership(); err != nil { - s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err) + s.logger.Error("failed to revoke leadership", "error", err) } goto WAIT @@ -122,14 +122,14 @@ RECONCILE: establishedLeader = true defer func() { if err := s.revokeLeadership(); err != nil { - s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err) + s.logger.Error("failed to revoke leadership", "error", err) } }() } // Reconcile any missing data if err := s.reconcile(); err != nil { - s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err) + s.logger.Error("failed to reconcile", "error", err) goto WAIT } @@ -250,7 +250,7 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error { // are available to be initialized. Otherwise initialization may use stale // data. if err := s.initializeHeartbeatTimers(); err != nil { - s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err) + s.logger.Error("heartbeat timer setup failed", "error", err) return err } @@ -364,6 +364,7 @@ func (s *Server) restoreRevokingAccessors() error { // dispatcher is maintained only by the leader, so it must be restored anytime a // leadership transition takes place. 
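The monitorLeadership changes above keep the original start/stop discipline: a single leaderLoop goroutine guarded by a channel handle and a WaitGroup, so loops can never overlap across rapid leadership flaps. A reduced model of that discipline (monitor and its parameters are illustrative names):

package leader

import "sync"

func monitor(leaderCh <-chan bool, shutdownCh <-chan struct{}, loop func(stop chan struct{})) {
	var stop chan struct{}
	var wg sync.WaitGroup
	for {
		select {
		case isLeader := <-leaderCh:
			if isLeader && stop == nil {
				stop = make(chan struct{})
				wg.Add(1)
				go func(ch chan struct{}) { defer wg.Done(); loop(ch) }(stop)
			} else if !isLeader && stop != nil {
				close(stop) // tell the running loop to exit...
				wg.Wait()   // ...and block until it actually has
				stop = nil
			}
		case <-shutdownCh:
			return
		}
	}
}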
func (s *Server) restorePeriodicDispatcher() error { + logger := s.logger.Named("periodic") ws := memdb.NewWatchSet() iter, err := s.fsm.State().JobsByPeriodic(ws, true) if err != nil { @@ -381,7 +382,7 @@ } if err := s.periodicDispatcher.Add(job); err != nil { - s.logger.Printf("[ERR] nomad.periodic: %v", err) + logger.Error("failed to add job to periodic dispatcher", "error", err) continue } @@ -405,7 +406,7 @@ // nextLaunch is the next launch that should occur. nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation())) if err != nil { - s.logger.Printf("[ERR] nomad.periodic: failed to determine next periodic launch for job %s: %v", job.NamespacedID(), err) + logger.Error("failed to determine next periodic launch for job", "job", job.NamespacedID(), "error", err) continue } @@ -417,12 +418,10 @@ } if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil { - msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err) - s.logger.Printf("[ERR] nomad.periodic: %s", msg) - return errors.New(msg) + logger.Error("force run of periodic job failed", "job", job.NamespacedID(), "error", err) + return fmt.Errorf("force run of periodic job %q failed: %v", job.NamespacedID(), err) } - s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+ - " run during leadership establishment", job.ID) + logger.Debug("forced run of periodic job during leadership establishment", "job", job.NamespacedID()) } return nil @@ -444,7 +443,7 @@ func (s *Server) schedulePeriodic(stopCh chan struct{}) { getLatest := func() (uint64, bool) { snapshotIndex, err := s.fsm.State().LatestIndex() if err != nil { - s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err) + s.logger.Error("failed to determine state store's index", "error", err) return 0, false } @@ -512,7 +511,7 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) { updateEval := eval.Copy() updateEval.Status = structs.EvalStatusFailed updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit) - s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval) + s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString()) // Create a follow-up evaluation that will be used to retry the // scheduling for the job after the cluster is hopefully more stable @@ -526,7 +525,7 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) { Evals: []*structs.Evaluation{updateEval, followupEval}, } if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { - s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err) + s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err) continue } @@ -564,7 +563,7 @@ func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) { Evals: cancel, } if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { - s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err) + s.logger.Error("failed to update duplicate evals", "evals", log.Fmt("%#v", cancel), "error", err) continue } } @@ -599,13 +598,13 @@ func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
timer.Reset(s.config.StatsCollectionInterval) state, err := s.State().Snapshot() if err != nil { - s.logger.Printf("[ERR] nomad: failed to get state: %v", err) + s.logger.Error("failed to get state", "error", err) continue } ws := memdb.NewWatchSet() iter, err := state.JobSummaries(ws) if err != nil { - s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err) + s.logger.Error("failed to get job summaries", "error", err) continue } @@ -717,7 +716,7 @@ func (s *Server) revokeLeadership() error { // Clear the heartbeat timers on either shutdown or step down, // since we are no longer responsible for TTL expirations. if err := s.clearAllHeartbeatTimers(); err != nil { - s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err) + s.logger.Error("clearing heartbeat timers failed", "error", err) return err } @@ -760,8 +759,7 @@ func (s *Server) reconcileMember(member serf.Member) error { err = s.removeRaftPeer(member, parts) } if err != nil { - s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v", - member, err) + s.logger.Error("failed to reconcile member", "member", member, "error", err) return err } return nil @@ -775,7 +773,7 @@ func (s *Server) reconcileJobSummaries() error { if err != nil { return fmt.Errorf("unable to read latest index: %v", err) } - s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index) + s.logger.Debug("leader reconciling job summaries", "index", index) args := &structs.GenericResponse{} msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag @@ -794,7 +792,8 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { for _, member := range members { valid, p := isNomadServer(member) if valid && member.Name != m.Name && p.Bootstrap { - s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. 
Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name) + s.logger.Error("skipping adding Raft peer because an existing peer is in bootstrap mode and only one server should be in bootstrap mode", + "existing_peer", member.Name, "joining_peer", m.Name) return nil } } @@ -806,13 +805,13 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String() configFuture := s.raft.GetConfiguration() if err := configFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err) + s.logger.Error("failed to get raft configuration", "error", err) return err } if m.Name == s.config.NodeName { if l := len(configFuture.Configuration().Servers); l < 3 { - s.logger.Printf("[DEBUG] consul: Skipping self join check for %q since the cluster is too small", m.Name) + s.logger.Debug("skipping self join check for peer since the cluster is too small", "peer", m.Name) return nil } } @@ -842,12 +841,12 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { if err := future.Error(); err != nil { return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err) } - s.logger.Printf("[INFO] nomad: removed server with duplicate address: %s", server.Address) + s.logger.Info("removed server with duplicate address", "address", server.Address) } else { if err := future.Error(); err != nil { return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err) } - s.logger.Printf("[INFO] nomad: removed server with duplicate ID: %s", server.ID) + s.logger.Info("removed server with duplicate ID", "id", server.ID) } } } @@ -857,19 +856,19 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { case minRaftProtocol >= 3: addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0) if err := addFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err) + s.logger.Error("failed to add raft peer", "error", err) return err } case minRaftProtocol == 2 && parts.RaftVersion >= 3: addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0) if err := addFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err) + s.logger.Error("failed to add raft peer", "error", err) return err } default: addFuture := s.raft.AddPeer(raft.ServerAddress(addr)) if err := addFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err) + s.logger.Error("failed to add raft peer", "error", err) return err } } @@ -887,7 +886,7 @@ func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error { // log entries. 
configFuture := s.raft.GetConfiguration() if err := configFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err) + s.logger.Error("failed to get raft configuration", "error", err) return err } @@ -900,21 +899,19 @@ func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error { for _, server := range configFuture.Configuration().Servers { // If we understand the new add/remove APIs and the server was added by ID, use the new remove API if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) { - s.logger.Printf("[INFO] nomad: removing server by ID: %q", server.ID) + s.logger.Info("removing server by ID", "id", server.ID) future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0) if err := future.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v", - server.ID, err) + s.logger.Error("failed to remove raft peer", "id", server.ID, "error", err) return err } break } else if server.Address == raft.ServerAddress(addr) { // If not, use the old remove API - s.logger.Printf("[INFO] nomad: removing server by address: %q", server.Address) + s.logger.Info("removing server by address", "address", server.Address) future := s.raft.RemovePeer(raft.ServerAddress(addr)) if err := future.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v", - addr, err) + s.logger.Error("failed to remove raft peer", "address", addr, "error", err) return err } break @@ -934,7 +931,7 @@ func (s *Server) replicateACLPolicies(stopCh chan struct{}) { }, } limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) - s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region) + s.logger.Debug("starting ACL policy replication from authoritative region", "authoritative_region", req.Region) START: for { @@ -951,7 +948,7 @@ START: err := s.forwardRegion(s.config.AuthoritativeRegion, "ACL.ListPolicies", &req, &resp) if err != nil { - s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err) + s.logger.Error("failed to fetch policies from authoritative region", "error", err) goto ERR_WAIT } @@ -965,7 +962,7 @@ START: } _, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args) if err != nil { - s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err) + s.logger.Error("failed to delete policies", "error", err) goto ERR_WAIT } } @@ -985,7 +982,7 @@ START: var reply structs.ACLPolicySetResponse if err := s.forwardRegion(s.config.AuthoritativeRegion, "ACL.GetPolicies", &req, &reply); err != nil { - s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err) + s.logger.Error("failed to fetch policies from authoritative region", "error", err) goto ERR_WAIT } for _, policy := range reply.Policies { @@ -1000,7 +997,7 @@ START: } _, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args) if err != nil { - s.logger.Printf("[ERR] nomad: failed to update policies: %v", err) + s.logger.Error("failed to update policies", "error", err) goto ERR_WAIT } } @@ -1076,7 +1073,7 @@ func (s *Server) replicateACLTokens(stopCh chan struct{}) { }, } limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) - s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region) + s.logger.Debug("starting ACL token replication from authoritative region", "authoritative_region", req.Region) START: for { @@ -1093,7
+1090,7 @@ START: err := s.forwardRegion(s.config.AuthoritativeRegion, "ACL.ListTokens", &req, &resp) if err != nil { - s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err) + s.logger.Error("failed to fetch tokens from authoritative region", "error", err) goto ERR_WAIT } @@ -1107,7 +1104,7 @@ START: } _, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args) if err != nil { - s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err) + s.logger.Error("failed to delete tokens", "error", err) goto ERR_WAIT } } @@ -1127,7 +1124,7 @@ START: var reply structs.ACLTokenSetResponse if err := s.forwardRegion(s.config.AuthoritativeRegion, "ACL.GetTokens", &req, &reply); err != nil { - s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err) + s.logger.Error("failed to fetch tokens from authoritative region", "error", err) goto ERR_WAIT } for _, token := range reply.Tokens { @@ -1142,7 +1139,7 @@ START: } _, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args) if err != nil { - s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err) + s.logger.Error("failed to update tokens", "error", err) goto ERR_WAIT } } @@ -1212,7 +1209,7 @@ func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig { state := s.fsm.State() _, config, err := state.AutopilotConfig() if err != nil { - s.logger.Printf("[ERR] autopilot: failed to get config: %v", err) + s.logger.Named("autopilot").Error("failed to get autopilot config", "error", err) return nil } if config != nil { @@ -1220,14 +1217,14 @@ func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig { } if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion) { - s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String()) + s.logger.Named("autopilot").Warn("can't initialize until all servers are above minimum version", "min_version", minAutopilotVersion) return nil } config = s.config.AutopilotConfig req := structs.AutopilotSetConfigRequest{Config: *config} if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil { - s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err) + s.logger.Named("autopilot").Error("failed to initialize config", "error", err) return nil } diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 9bc5fdfb4aa..84f51ee0831 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -9,6 +9,9 @@ import ( "golang.org/x/sync/errgroup" + log "github.com/hashicorp/go-hclog" + vapi "github.com/hashicorp/vault/api" + "github.com/armon/go-metrics" "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-multierror" @@ -17,7 +20,6 @@ import ( "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" - vapi "github.com/hashicorp/vault/api" ) const ( @@ -48,7 +50,8 @@ const ( // Node endpoint is used for client interactions type Node struct { - srv *Server + srv *Server + logger log.Logger // ctx provides context regarding the underlying connection ctx *RPCContext @@ -158,7 +161,7 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp // Commit this update via Raft _, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: Register failed: %v", err) + n.logger.Error("register failed", "error", err) return err } reply.NodeModifyIndex = index @@ -172,7 +175,7 @@ func (n 
*Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp if structs.ShouldDrainNode(args.Node.Status) || transitionToReady { evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } reply.EvalIDs = evalIDs @@ -183,7 +186,7 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp if !args.Node.TerminalStatus() { ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) + n.logger.Error("heartbeat reset failed", "error", err) return err } reply.HeartbeatTTL = ttl @@ -199,7 +202,7 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + n.logger.Error("failed to populate NodeUpdateResponse", "error", err) return err } @@ -279,7 +282,7 @@ func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.No // Commit this update via Raft _, index, err := n.srv.raftApply(structs.NodeDeregisterRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: Deregister failed: %v", err) + n.logger.Error("deregister failed", "error", err) return err } @@ -289,21 +292,21 @@ func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.No // Create the evaluations for this node evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } // Determine if there are any Vault accessors on the node accessors, err := snap.VaultAccessorsByNode(ws, args.NodeID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: looking up accessors for node %q failed: %v", args.NodeID, err) + n.logger.Error("looking up accessors for node failed", "node_id", args.NodeID, "error", err) return err } if l := len(accessors); l != 0 { - n.srv.logger.Printf("[DEBUG] nomad.client: revoking %d accessors on node %q due to deregister", l, args.NodeID) + n.logger.Debug("revoking accessors on node due to deregister", "num_accessors", l, "node_id", args.NodeID) if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: revoking accessors for node %q failed: %v", args.NodeID, err) + n.logger.Error("revoking accessors for node failed", "node_id", args.NodeID, "error", err) return err } } @@ -381,7 +384,7 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct _, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: status update failed: %v", err) + n.logger.Error("status update failed", "error", err) return err } reply.NodeModifyIndex = index @@ -392,7 +395,7 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct if structs.ShouldDrainNode(args.Status) || transitionToReady { evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } 
reply.EvalIDs = evalIDs @@ -405,21 +408,21 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct // Determine if there are any Vault accessors on the node accessors, err := n.srv.State().VaultAccessorsByNode(ws, args.NodeID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: looking up accessors for node %q failed: %v", args.NodeID, err) + n.logger.Error("looking up accessors for node failed", "node_id", args.NodeID, "error", err) return err } if l := len(accessors); l != 0 { - n.srv.logger.Printf("[DEBUG] nomad.client: revoking %d accessors on node %q due to down state", l, args.NodeID) + n.logger.Debug("revoking accessors on node due to down state", "num_accessors", l, "node_id", args.NodeID) if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: revoking accessors for node %q failed: %v", args.NodeID, err) + n.logger.Error("revoking accessors for node failed", "node_id", args.NodeID, "error", err) return err } } default: ttl, err := n.srv.resetHeartbeatTimer(args.NodeID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) + n.logger.Error("heartbeat reset failed", "error", err) return err } reply.HeartbeatTTL = ttl @@ -430,7 +433,7 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + n.logger.Error("failed to populate NodeUpdateResponse", "error", err) return err } @@ -511,7 +514,7 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, // Commit this update via Raft _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) + n.logger.Error("drain update failed", "error", err) return err } reply.NodeModifyIndex = index @@ -521,7 +524,7 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.MarkEligible && args.DrainStrategy == nil { evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } reply.EvalIDs = evalIDs @@ -599,12 +602,12 @@ func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest, // Commit this update via Raft outErr, index, err := n.srv.raftApply(structs.NodeUpdateEligibilityRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eligibility update failed: %v", err) + n.logger.Error("eligibility update failed", "error", err) return err } if outErr != nil { if err, ok := outErr.(error); ok && err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eligibility update failed: %v", err) + n.logger.Error("eligibility update failed", "error", err) return err } } @@ -614,7 +617,7 @@ func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest, if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.Eligibility == structs.NodeSchedulingEligible { evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", 
"error", err) return err } reply.EvalIDs = evalIDs @@ -662,7 +665,7 @@ func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUp // Create the evaluation evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } reply.EvalIDs = evalIDs @@ -674,7 +677,7 @@ func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUp n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + n.logger.Error("failed to populate NodeUpdateResponse", "error", err) return err } return nil @@ -1012,11 +1015,11 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene if existingAlloc, _ := n.srv.State().AllocByID(nil, alloc.ID); existingAlloc != nil { job, err := n.srv.State().JobByID(nil, existingAlloc.Namespace, existingAlloc.JobID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: UpdateAlloc unable to find job ID %q :%v", existingAlloc.JobID, err) + n.logger.Error("UpdateAlloc unable to find job", "job", existingAlloc.JobID, "error", err) continue } if job == nil { - n.srv.logger.Printf("[DEBUG] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID) + n.logger.Debug("UpdateAlloc unable to find job", "job", existingAlloc.JobID) continue } taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) @@ -1092,7 +1095,7 @@ func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Alloc } if len(trimmedEvals) > 0 { - n.srv.logger.Printf("[DEBUG] nomad.client: Adding %v evaluations for rescheduling failed allocations", len(trimmedEvals)) + n.logger.Debug("adding evaluations for rescheduling failed allocations", "num_evals", len(trimmedEvals)) } // Prepare the batch update batch := &structs.AllocUpdateRequest{ @@ -1105,7 +1108,7 @@ func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Alloc var mErr multierror.Error _, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, batch) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: alloc update failed: %v", err) + n.logger.Error("alloc update failed", "error", err) mErr.Errors = append(mErr.Errors, err) } @@ -1122,7 +1125,7 @@ func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Alloc ws := memdb.NewWatchSet() accessors, err := n.srv.State().VaultAccessorsByAlloc(ws, alloc.ID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: looking up accessors for alloc %q failed: %v", alloc.ID, err) + n.logger.Error("looking up Vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err) mErr.Errors = append(mErr.Errors, err) } @@ -1130,9 +1133,9 @@ func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Alloc } if l := len(revoke); l != 0 { - n.srv.logger.Printf("[DEBUG] nomad.client: revoking %d accessors due to terminal allocations", l) + n.logger.Debug("revoking accessors due to terminal allocations", "num_accessors", l) if err := n.srv.vault.RevokeTokens(context.Background(), revoke, true); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: batched accessor revocation failed: %v", err) + n.logger.Error("batched Vault accessor revocation failed", "error", err) mErr.Errors = append(mErr.Errors, err) } } @@ -1316,7 +1319,7 @@ func (n 
*Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError) } - n.srv.logger.Printf("[ERR] nomad.client: DeriveVaultToken failed (recoverable %v): %v", recoverable, e) + n.logger.Error("DeriveVaultToken failed", "recoverable", recoverable, "error", e) } if done, err := n.srv.forward("Node.DeriveVaultToken", args, args, reply); done { @@ -1487,10 +1490,10 @@ func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, // If there was an error revoke the created tokens if createErr != nil { - n.srv.logger.Printf("[ERR] nomad.node: Vault token creation for alloc %q failed: %v", alloc.ID, createErr) + n.logger.Error("Vault token creation for alloc failed", "alloc_id", alloc.ID, "error", createErr) if revokeErr := n.srv.vault.RevokeTokens(context.Background(), accessors, false); revokeErr != nil { - n.srv.logger.Printf("[ERR] nomad.node: Vault token revocation for alloc %q failed: %v", alloc.ID, revokeErr) + n.logger.Error("Vault token revocation for alloc failed", "alloc_id", alloc.ID, "error", revokeErr) } if rerr, ok := createErr.(*structs.RecoverableError); ok { @@ -1506,7 +1509,7 @@ func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, req := structs.VaultAccessorsRequest{Accessors: accessors} _, index, err := n.srv.raftApply(structs.VaultAccessorRegisterRequestType, &req) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: Register Vault accessors for alloc %q failed: %v", alloc.ID, err) + n.logger.Error("registering Vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err) // Determine if we can recover from the error retry := false @@ -1542,7 +1545,7 @@ func (n *Node) EmitEvents(args *structs.EmitNodeEventsRequest, reply *structs.Em _, index, err := n.srv.raftApply(structs.UpsertNodeEventsType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.node upserting node events failed: %v", err) + n.logger.Error("upserting node events failed", "error", err) return err } diff --git a/nomad/operator_endpoint.go b/nomad/operator_endpoint.go index e6c992132e1..39615b3fa89 100644 --- a/nomad/operator_endpoint.go +++ b/nomad/operator_endpoint.go @@ -4,6 +4,8 @@ import ( "fmt" "net" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/consul/agent/consul/autopilot" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" @@ -12,7 +14,8 @@ import ( // Operator endpoint is used to perform low-level operator tasks for Nomad. type Operator struct { - srv *Server + srv *Server + logger log.Logger } // RaftGetConfiguration is used to retrieve the current Raft configuration. @@ -117,12 +120,11 @@ REMOVE: // pass. 
 	future := op.srv.raft.RemovePeer(args.Address)
 	if err := future.Error(); err != nil {
-		op.srv.logger.Printf("[WARN] nomad.operator: Failed to remove Raft peer %q: %v",
-			args.Address, err)
+		op.logger.Warn("failed to remove Raft peer", "peer", args.Address, "error", err)
 		return err
 	}

-	op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer %q", args.Address)
+	op.logger.Warn("removed Raft peer", "peer", args.Address)
 	return nil
 }

@@ -182,12 +184,11 @@ REMOVE:
 		future = op.srv.raft.RemovePeer(address)
 	}
 	if err := future.Error(); err != nil {
-		op.srv.logger.Printf("[WARN] nomad.operator: Failed to remove Raft peer with id %q: %v",
-			args.ID, err)
+		op.logger.Warn("failed to remove Raft peer", "peer_id", args.ID, "error", err)
 		return err
 	}

-	op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer with id %q", args.ID)
+	op.logger.Warn("removed Raft peer", "peer_id", args.ID)
 	return nil
 }

@@ -238,7 +239,7 @@ func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRe
 	// Apply the update
 	resp, _, err := op.srv.raftApply(structs.AutopilotRequestType, args)
 	if err != nil {
-		op.srv.logger.Printf("[ERR] nomad.operator: Apply failed: %v", err)
+		op.logger.Error("failed applying AutoPilot configuration", "error", err)
 		return err
 	}
 	if respErr, ok := resp.(error); ok {
diff --git a/nomad/periodic.go b/nomad/periodic.go
index 05f953189c6..edd295461da 100644
--- a/nomad/periodic.go
+++ b/nomad/periodic.go
@@ -4,13 +4,14 @@ import (
 	"container/heap"
 	"context"
 	"fmt"
-	"log"
 	"strconv"
 	"strings"
 	"sync"
 	"time"

+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/structs"
 )
@@ -27,7 +28,7 @@ type PeriodicDispatch struct {
 	updateCh chan struct{}
 	stopFn   context.CancelFunc
-	logger   *log.Logger
+	logger   log.Logger
 	l        sync.RWMutex
 }

@@ -144,13 +145,13 @@ func (s *Server) RunningChildren(job *structs.Job) (bool, error) {

 // NewPeriodicDispatch returns a periodic dispatcher that is used to track and
 // launch periodic jobs.
-func NewPeriodicDispatch(logger *log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch {
+func NewPeriodicDispatch(logger log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch {
 	return &PeriodicDispatch{
 		dispatcher: dispatcher,
 		tracked:    make(map[structs.NamespacedID]*structs.Job),
 		heap:       NewPeriodicHeap(),
 		updateCh:   make(chan struct{}, 1),
-		logger:     logger,
+		logger:     logger.Named("periodic"),
 	}
 }

@@ -229,12 +230,12 @@ func (p *PeriodicDispatch) Add(job *structs.Job) error {
 		if err := p.heap.Update(job, next); err != nil {
 			return fmt.Errorf("failed to update job %q (%s) launch time: %v", job.ID, job.Namespace, err)
 		}
-		p.logger.Printf("[DEBUG] nomad.periodic: updated periodic job %q (%s)", job.ID, job.Namespace)
+		p.logger.Debug("updated periodic job", "job", job.NamespacedID())
 	} else {
 		if err := p.heap.Push(job, next); err != nil {
 			return fmt.Errorf("failed to add job %v: %v", job.ID, err)
 		}
-		p.logger.Printf("[DEBUG] nomad.periodic: registered periodic job %q (%s)", job.ID, job.Namespace)
+		p.logger.Debug("registered periodic job", "job", job.NamespacedID())
 	}

 	// Signal an update.
@@ -281,7 +282,7 @@ func (p *PeriodicDispatch) removeLocked(jobID structs.NamespacedID) error {
 	default:
 	}

-	p.logger.Printf("[DEBUG] nomad.periodic: deregistered periodic job %q (%s)", jobID.ID, jobID.Namespace)
+	p.logger.Debug("deregistered periodic job", "job", job.NamespacedID())
 	return nil
 }

@@ -328,7 +329,7 @@ func (p *PeriodicDispatch) run(ctx context.Context) {
 		} else {
 			launchDur := launch.Sub(time.Now().In(job.Periodic.GetLocation()))
 			launchCh = time.After(launchDur)
-			p.logger.Printf("[DEBUG] nomad.periodic: launching job %q (%s) in %s", job.ID, job.Namespace, launchDur)
+			p.logger.Debug("scheduled periodic job launch", "launch_delay", launchDur, "job", job.NamespacedID())
 		}

 		select {
@@ -349,9 +350,9 @@ func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) {

 	nextLaunch, err := job.Periodic.Next(launchTime)
 	if err != nil {
-		p.logger.Printf("[ERR] nomad.periodic: failed to parse next periodic launch for job %s: %v", job.NamespacedID(), err)
+		p.logger.Error("failed to parse next periodic launch", "job", job.NamespacedID(), "error", err)
 	} else if err := p.heap.Update(job, nextLaunch); err != nil {
-		p.logger.Printf("[ERR] nomad.periodic: failed to update next launch of periodic job %s: %v", job.NamespacedID(), err)
+		p.logger.Error("failed to update next launch of periodic job", "job", job.NamespacedID(), "error", err)
 	}

 	// If the job prohibits overlapping and there are running children, we skip
@@ -359,23 +360,19 @@ func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) {
 	if job.Periodic.ProhibitOverlap {
 		running, err := p.dispatcher.RunningChildren(job)
 		if err != nil {
-			msg := fmt.Sprintf("[ERR] nomad.periodic: failed to determine if"+
-				" periodic job %q (%s) has running children: %v", job.ID, job.Namespace, err)
-			p.logger.Println(msg)
+			p.logger.Error("failed to determine if periodic job has running children", "job", job.NamespacedID(), "error", err)
 			p.l.Unlock()
 			return
 		}

 		if running {
-			msg := fmt.Sprintf("[DEBUG] nomad.periodic: skipping launch of"+
-				" periodic job %q (%s) because job prohibits overlap", job.ID, job.Namespace)
-			p.logger.Println(msg)
+			p.logger.Debug("skipping launch of periodic job because job prohibits overlap", "job", job.NamespacedID())
 			p.l.Unlock()
 			return
 		}
 	}

-	p.logger.Printf("[DEBUG] nomad.periodic: launching job %q (%v) at %v", job.ID, job.Namespace, launchTime)
+	p.logger.Debug("launching job", "job", job.NamespacedID(), "launch_time", launchTime)
 	p.l.Unlock()
 	p.createEval(job, launchTime)
 }
@@ -409,8 +406,7 @@ func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time)

 	eval, err := p.dispatcher.DispatchJob(derived)
 	if err != nil {
-		p.logger.Printf("[ERR] nomad.periodic: failed to dispatch job %q (%s): %v",
-			periodicJob.ID, periodicJob.Namespace, err)
+		p.logger.Error("failed to dispatch job", "job", periodicJob.NamespacedID(), "error", err)
 		return nil, err
 	}

@@ -425,9 +421,8 @@ func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) (

 	// Have to recover in case the job copy panics.
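hclog consumes the trailing arguments of a log call as alternating key/value pairs, so a key passed without its value silently misaligns every pair that follows; conversions like the RunningChildren error call in the dispatch hunk above are easy places to drop a value. A small standalone sketch of the failure mode, with hypothetical values:

    package main

    import hclog "github.com/hashicorp/go-hclog"

    func main() {
    	logger := hclog.Default().Named("periodic")

    	// Correct: every key is followed by its value.
    	logger.Error("failed to determine if periodic job has running children",
    		"job", "example@default", // hypothetical NamespacedID rendering
    		"error", "rpc timed out") // hypothetical error text

    	// Broken: "job" has no value, so hclog pairs "job" with "error"
    	// and the real error text is left dangling at the end.
    	logger.Error("failed to determine if periodic job has running children",
    		"job",
    		"error", "rpc timed out")
    }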
 	defer func() {
 		if r := recover(); r != nil {
-			p.logger.Printf("[ERR] nomad.periodic: deriving job from"+
-				" periodic job %q (%s) failed; deregistering from periodic runner: %v",
-				periodicJob.ID, periodicJob.Namespace, r)
+			p.logger.Error("deriving child job from periodic job failed; deregistering from periodic runner",
+				"job", periodicJob.NamespacedID(), "error", r)

 			p.Remove(periodicJob.Namespace, periodicJob.ID)
 			derived = nil
diff --git a/nomad/periodic_endpoint.go b/nomad/periodic_endpoint.go
index 2cf355b6ef0..ecac2565421 100644
--- a/nomad/periodic_endpoint.go
+++ b/nomad/periodic_endpoint.go
@@ -4,15 +4,18 @@ import (
 	"fmt"
 	"time"

-	"github.com/armon/go-metrics"
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
+	"github.com/armon/go-metrics"
 	"github.com/hashicorp/nomad/acl"
 	"github.com/hashicorp/nomad/nomad/structs"
 )

 // Periodic endpoint is used for periodic job interactions
 type Periodic struct {
-	srv *Server
+	srv    *Server
+	logger log.Logger
 }

 // Force is used to force a new instance of a periodic job
diff --git a/nomad/periodic_test.go b/nomad/periodic_test.go
index a4f56cb564b..e8985ed3a4d 100644
--- a/nomad/periodic_test.go
+++ b/nomad/periodic_test.go
@@ -78,7 +78,7 @@ func (t times) Less(i, j int) bool { return t[i].Before(t[j]) }
 // testPeriodicDispatcher returns an enabled PeriodicDispatcher which uses the
 // MockJobEvalDispatcher.
 func testPeriodicDispatcher(t *testing.T) (*PeriodicDispatch, *MockJobEvalDispatcher) {
-	logger := testlog.Logger(t)
+	logger := testlog.HCLogger(t)
 	m := NewMockJobEvalDispatcher()
 	d := NewPeriodicDispatch(logger, m)
 	d.SetEnabled(true)
diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go
index 089af0f5853..12f4dc02a2c 100644
--- a/nomad/plan_apply.go
+++ b/nomad/plan_apply.go
@@ -2,18 +2,45 @@ package nomad

 import (
 	"fmt"
-	"log"
 	"runtime"
 	"time"

-	"github.com/armon/go-metrics"
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
+	"github.com/armon/go-metrics"
 	"github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/raft"
 )

+// planner is used to manage the submitted allocation plans that are waiting
+// to be assessed by the leader
+type planner struct {
+	*Server
+	log log.Logger
+
+	// planQueue is used to manage the submitted allocation
+	// plans that are waiting to be assessed by the leader
+	planQueue *PlanQueue
+}
+
+// newPlanner returns a new planner to be used for managing allocation plans.
+func newPlanner(s *Server) (*planner, error) {
+	// Create a plan queue
+	planQueue, err := NewPlanQueue()
+	if err != nil {
+		return nil, err
+	}
+
+	return &planner{
+		Server:    s,
+		log:       s.logger.Named("planner"),
+		planQueue: planQueue,
+	}, nil
+}
+
 // planApply is a long lived goroutine that reads plan allocations from
 // the plan queue, determines if they can be applied safely and applies
 // them via Raft.
@@ -40,7 +67,7 @@ import (
 //  the Raft log is updated. This means our schedulers will stall,
 //  but there are many of those and only a single plan verifier.
 //
-func (s *Server) planApply() {
+func (p *planner) planApply() {
 	// waitCh is used to track an outstanding application while snap
 	// holds an optimistic state which includes that plan application.
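periodic_test.go above switches to testlog.HCLogger(t); the real helper lives in helper/testlog (touched earlier in this patch). A plausible shape for it, assumed rather than copied from the patch, is an hclog logger whose output is routed through the test harness so log lines only surface for failing tests:

    package testlog

    import (
    	"testing"

    	hclog "github.com/hashicorp/go-hclog"
    )

    // writer adapts t.Logf to io.Writer so log lines land in `go test`
    // output and are attributed to the test that produced them.
    type writer struct{ t *testing.T }

    func (w writer) Write(p []byte) (int, error) {
    	w.t.Logf("%s", p)
    	return len(p), nil
    }

    // HCLogger returns an hclog.Logger that writes through the test harness.
    func HCLogger(t *testing.T) hclog.Logger {
    	return hclog.New(&hclog.LoggerOptions{
    		Level:  hclog.Trace,
    		Output: writer{t},
    	})
    }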
 	var waitCh chan struct{}
@@ -56,7 +83,7 @@ func (s *Server) planApply() {

 	for {
 		// Pull the next pending plan, exit if we are no longer leader
-		pending, err := s.planQueue.Dequeue(0)
+		pending, err := p.planQueue.Dequeue(0)
 		if err != nil {
 			return
 		}
@@ -72,18 +99,18 @@ func (s *Server) planApply() {
 		// Snapshot the state so that we have a consistent view of the world
 		// if no snapshot is available
 		if waitCh == nil || snap == nil {
-			snap, err = s.fsm.State().Snapshot()
+			snap, err = p.fsm.State().Snapshot()
 			if err != nil {
-				s.logger.Printf("[ERR] nomad.planner: failed to snapshot state: %v", err)
+				p.logger.Error("failed to snapshot state", "error", err)
 				pending.respond(nil, err)
 				continue
 			}
 		}

 		// Evaluate the plan
-		result, err := evaluatePlan(pool, snap, pending.plan, s.logger)
+		result, err := evaluatePlan(pool, snap, pending.plan, p.logger)
 		if err != nil {
-			s.logger.Printf("[ERR] nomad.planner: failed to evaluate plan: %v", err)
+			p.logger.Error("failed to evaluate plan", "error", err)
 			pending.respond(nil, err)
 			continue
 		}
@@ -98,30 +125,30 @@ func (s *Server) planApply() {
 		// This also limits how out of date our snapshot can be.
 		if waitCh != nil {
 			<-waitCh
-			snap, err = s.fsm.State().Snapshot()
+			snap, err = p.fsm.State().Snapshot()
 			if err != nil {
-				s.logger.Printf("[ERR] nomad.planner: failed to snapshot state: %v", err)
+				p.logger.Error("failed to snapshot state", "error", err)
 				pending.respond(nil, err)
 				continue
 			}
 		}

 		// Dispatch the Raft transaction for the plan
-		future, err := s.applyPlan(pending.plan, result, snap)
+		future, err := p.applyPlan(pending.plan, result, snap)
 		if err != nil {
-			s.logger.Printf("[ERR] nomad.planner: failed to submit plan: %v", err)
+			p.logger.Error("failed to submit plan", "error", err)
 			pending.respond(nil, err)
 			continue
 		}

 		// Respond to the plan in async
 		waitCh = make(chan struct{})
-		go s.asyncPlanWait(waitCh, future, result, pending)
+		go p.asyncPlanWait(waitCh, future, result, pending)
 	}
 }

 // applyPlan is used to apply the plan result and to return the alloc index
-func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
+func (p *planner) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
 	// Determine the minimum number of updates, could be more if there
 	// are multiple updates per node
 	minUpdates := len(result.NodeUpdate)
@@ -155,14 +182,14 @@ func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap
 	}

 	// Dispatch the Raft transaction
-	future, err := s.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
+	future, err := p.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
 	if err != nil {
 		return nil, err
 	}

 	// Optimistically apply to our state view
 	if snap != nil {
-		nextIdx := s.raft.AppliedIndex() + 1
+		nextIdx := p.raft.AppliedIndex() + 1
 		if err := snap.UpsertPlanResults(nextIdx, &req); err != nil {
 			return future, err
 		}
@@ -171,14 +198,14 @@ func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap
 }

 // asyncPlanWait is used to apply and respond to a plan async
-func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
+func (p *planner) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
 	result *structs.PlanResult, pending *pendingPlan) {
 	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
 	defer close(waitCh)

 	// Wait for the plan to apply
 	if err := future.Error(); err != nil {
-		s.logger.Printf("[ERR] nomad.planner: failed to apply plan: %v", err)
+		p.logger.Error("failed to apply plan", "error", err)
 		pending.respond(nil, err)
 		return
 	}
@@ -199,7 +226,7 @@ func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
 // evaluatePlan is used to determine what portions of a plan
 // can be applied if any. Returns if there should be a plan application
 // which may be partial or if there was an error
-func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) {
+func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) {
 	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

 	// Check if the plan exceeds quota
@@ -215,7 +242,7 @@ func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.P
 			return nil, err
 		}

-		logger.Printf("[DEBUG] nomad.planner: plan for evaluation %q exceeds quota limit. Forcing refresh to %d", plan.EvalID, index)
+		logger.Debug("plan for evaluation exceeds quota limit. Forcing state refresh", "eval_id", plan.EvalID, "refresh_index", index)
 		return &structs.PlanResult{RefreshIndex: index}, nil
 	}

 // evaluatePlanPlacements is used to determine what portions of a plan can be
 // applied if any, looking for node over commitment. Returns if there should be
 // a plan application which may be partial or if there was an error
-func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) {
+func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) {
 	// Create a result holder for the plan
 	result := &structs.PlanResult{
 		NodeUpdate: make(map[string][]*structs.Allocation),
@@ -265,7 +292,7 @@ func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan
 		if !fit {
 			// Log the reason why the node's allocations could not be made
 			if reason != "" {
-				logger.Printf("[DEBUG] nomad.planner: plan for node %q rejected because: %v", nodeID, reason)
+				logger.Debug("plan for node rejected", "node_id", nodeID, "reason", reason)
 			}
 			// Set that this is a partial commit
 			partialCommit = true
diff --git a/nomad/plan_apply_test.go b/nomad/plan_apply_test.go
index a6e6529d283..ee36ced35fb 100644
--- a/nomad/plan_apply_test.go
+++ b/nomad/plan_apply_test.go
@@ -253,7 +253,7 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) {
 	pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize)
 	defer pool.Shutdown()

-	result, err := evaluatePlan(pool, snap, plan, testlog.Logger(t))
+	result, err := evaluatePlan(pool, snap, plan, testlog.HCLogger(t))
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -300,7 +300,7 @@ func TestPlanApply_EvalPlan_Partial(t *testing.T) {
 	pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize)
 	defer pool.Shutdown()

-	result, err := evaluatePlan(pool, snap, plan, testlog.Logger(t))
+	result, err := evaluatePlan(pool, snap, plan, testlog.HCLogger(t))
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -361,7 +361,7 @@ func TestPlanApply_EvalPlan_Partial_AllAtOnce(t *testing.T) {
 	pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize)
 	defer pool.Shutdown()

-	result, err := evaluatePlan(pool, snap, plan, testlog.Logger(t))
+	result, err := evaluatePlan(pool, snap, plan, testlog.HCLogger(t))
 	if err != nil {
t.Fatalf("err: %v", err) } diff --git a/nomad/plan_endpoint.go b/nomad/plan_endpoint.go index 556c016df2a..bacea404d0a 100644 --- a/nomad/plan_endpoint.go +++ b/nomad/plan_endpoint.go @@ -3,13 +3,16 @@ package nomad import ( "time" + log "github.com/hashicorp/go-hclog" + "github.com/armon/go-metrics" "github.com/hashicorp/nomad/nomad/structs" ) // Plan endpoint is used for plan interactions type Plan struct { - srv *Server + srv *Server + logger log.Logger } // Submit is used to submit a plan to the leader diff --git a/nomad/regions_endpoint.go b/nomad/regions_endpoint.go index b9d2cd0f14b..84afad2e72b 100644 --- a/nomad/regions_endpoint.go +++ b/nomad/regions_endpoint.go @@ -1,10 +1,15 @@ package nomad -import "github.com/hashicorp/nomad/nomad/structs" +import ( + log "github.com/hashicorp/go-hclog" + + "github.com/hashicorp/nomad/nomad/structs" +) // Region is used to query and list the known regions type Region struct { - srv *Server + srv *Server + logger log.Logger } // List is used to list all of the known regions. No leader forwarding is diff --git a/nomad/rpc.go b/nomad/rpc.go index 2d6302afa24..39baca7e0cf 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -14,8 +14,10 @@ import ( "time" metrics "github.com/armon/go-metrics" - "github.com/hashicorp/consul/lib" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + + "github.com/hashicorp/consul/lib" "github.com/hashicorp/nomad/helper/pool" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -43,6 +45,18 @@ const ( enqueueLimit = 30 * time.Second ) +type rpcHandler struct { + *Server + logger log.Logger +} + +func newRpcHandler(s *Server) *rpcHandler { + return &rpcHandler{ + Server: s, + logger: s.logger.Named("rpc"), + } +} + // RPCContext provides metadata about the RPC connection. type RPCContext struct { // Conn exposes the raw connection. 
@@ -63,20 +77,20 @@ type RPCContext struct {
 }

 // listen is used to listen for incoming RPC connections
-func (s *Server) listen(ctx context.Context) {
-	defer close(s.listenerCh)
+func (r *rpcHandler) listen(ctx context.Context) {
+	defer close(r.listenerCh)
 	for {
 		select {
 		case <-ctx.Done():
-			s.logger.Println("[INFO] nomad.rpc: Closing server RPC connection")
+			r.logger.Info("closing server RPC connection")
 			return
 		default:
 		}

 		// Accept a connection
-		conn, err := s.rpcListener.Accept()
+		conn, err := r.rpcListener.Accept()
 		if err != nil {
-			if s.shutdown {
+			if r.shutdown {
 				return
 			}

@@ -86,32 +100,32 @@ func (s *Server) listen(ctx context.Context) {
 			default:
 			}

-			s.logger.Printf("[ERR] nomad.rpc: failed to accept RPC conn: %v", err)
+			r.logger.Error("failed to accept RPC conn", "error", err)
 			continue
 		}

-		go s.handleConn(ctx, conn, &RPCContext{Conn: conn})
+		go r.handleConn(ctx, conn, &RPCContext{Conn: conn})
 		metrics.IncrCounter([]string{"nomad", "rpc", "accept_conn"}, 1)
 	}
 }

 // handleConn is used to determine if this is a Raft or
 // Nomad type RPC connection and invoke the correct handler
-func (s *Server) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) {
+func (r *rpcHandler) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) {
 	// Read a single byte
 	buf := make([]byte, 1)
 	if _, err := conn.Read(buf); err != nil {
 		if err != io.EOF {
-			s.logger.Printf("[ERR] nomad.rpc: failed to read byte: %v", err)
+			r.logger.Error("failed to read first RPC byte", "error", err)
 		}
 		conn.Close()
 		return
 	}

 	// Enforce TLS if EnableRPC is set
-	if s.config.TLSConfig.EnableRPC && !rpcCtx.TLS && pool.RPCType(buf[0]) != pool.RpcTLS {
-		if !s.config.TLSConfig.RPCUpgradeMode {
-			s.logger.Printf("[WARN] nomad.rpc: Non-TLS connection attempted from %s with RequireTLS set", conn.RemoteAddr().String())
+	if r.config.TLSConfig.EnableRPC && !rpcCtx.TLS && pool.RPCType(buf[0]) != pool.RpcTLS {
+		if !r.config.TLSConfig.RPCUpgradeMode {
+			r.logger.Warn("non-TLS connection attempted with RequireTLS set", "remote_addr", conn.RemoteAddr())
 			conn.Close()
 			return
 		}
@@ -122,39 +136,39 @@ func (s *Server) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCConte
 	case pool.RpcNomad:
 		// Create an RPC Server and handle the request
 		server := rpc.NewServer()
-		s.setupRpcServer(server, rpcCtx)
-		s.handleNomadConn(ctx, conn, server)
+		r.setupRpcServer(server, rpcCtx)
+		r.handleNomadConn(ctx, conn, server)

 		// Remove any potential mapping between a NodeID to this connection and
 		// close the underlying connection.
-		s.removeNodeConn(rpcCtx)
+		r.removeNodeConn(rpcCtx)

 	case pool.RpcRaft:
 		metrics.IncrCounter([]string{"nomad", "rpc", "raft_handoff"}, 1)
-		s.raftLayer.Handoff(ctx, conn)
+		r.raftLayer.Handoff(ctx, conn)

 	case pool.RpcMultiplex:
-		s.handleMultiplex(ctx, conn, rpcCtx)
+		r.handleMultiplex(ctx, conn, rpcCtx)

 	case pool.RpcTLS:
-		if s.rpcTLS == nil {
-			s.logger.Printf("[WARN] nomad.rpc: TLS connection attempted, server not configured for TLS")
+		if r.rpcTLS == nil {
+			r.logger.Warn("TLS connection attempted, server not configured for TLS")
 			conn.Close()
 			return
 		}
-		conn = tls.Server(conn, s.rpcTLS)
+		conn = tls.Server(conn, r.rpcTLS)

 		// Force a handshake so we can get information about the TLS connection
 		// state.
 		tlsConn, ok := conn.(*tls.Conn)
 		if !ok {
-			s.logger.Printf("[ERR] nomad.rpc: expected TLS connection but got %T", conn)
+			r.logger.Error("expected TLS connection", "got", log.Fmt("%T", conn))
 			conn.Close()
 			return
 		}

 		if err := tlsConn.Handshake(); err != nil {
-			s.logger.Printf("[WARN] nomad.rpc: failed TLS handshake from connection from %v: %v", tlsConn.RemoteAddr(), err)
+			r.logger.Warn("failed TLS handshake", "remote_addr", tlsConn.RemoteAddr(), "error", err)
 			conn.Close()
 			return
 		}
@@ -167,16 +181,16 @@ func (s *Server) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCConte
 		state := tlsConn.ConnectionState()
 		rpcCtx.VerifiedChains = state.VerifiedChains

-		s.handleConn(ctx, conn, rpcCtx)
+		r.handleConn(ctx, conn, rpcCtx)

 	case pool.RpcStreaming:
-		s.handleStreamingConn(conn)
+		r.handleStreamingConn(conn)

 	case pool.RpcMultiplexV2:
-		s.handleMultiplexV2(ctx, conn, rpcCtx)
+		r.handleMultiplexV2(ctx, conn, rpcCtx)

 	default:
-		s.logger.Printf("[ERR] nomad.rpc: unrecognized RPC byte: %v", buf[0])
+		r.logger.Error("unrecognized RPC byte", "byte", buf[0])
 		conn.Close()
 		return
 	}
@@ -184,19 +198,19 @@ func (s *Server) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCConte

 // handleMultiplex is used to multiplex a single incoming connection
 // using the Yamux multiplexer
-func (s *Server) handleMultiplex(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) {
+func (r *rpcHandler) handleMultiplex(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) {
 	defer func() {
 		// Remove any potential mapping between a NodeID to this connection and
 		// close the underlying connection.
-		s.removeNodeConn(rpcCtx)
+		r.removeNodeConn(rpcCtx)
 		conn.Close()
 	}()

 	conf := yamux.DefaultConfig()
-	conf.LogOutput = s.config.LogOutput
+	conf.LogOutput = r.config.LogOutput
 	server, err := yamux.Server(conn, conf)
 	if err != nil {
-		s.logger.Printf("[ERR] nomad.rpc: multiplex failed to create yamux server: %v", err)
+		r.logger.Error("multiplex failed to create yamux server", "error", err)
 		return
 	}

@@ -205,7 +219,7 @@ func (s *Server) handleMultiplex(ctx context.Context, conn net.Conn, rpcCtx *RPC
 	// Create the RPC server for this connection
 	rpcServer := rpc.NewServer()
-	s.setupRpcServer(rpcServer, rpcCtx)
+	r.setupRpcServer(rpcServer, rpcCtx)

 	for {
 		// stop handling connections if context was cancelled
@@ -216,31 +230,31 @@ func (s *Server) handleMultiplex(ctx context.Context, conn net.Conn, rpcCtx *RPC
 		sub, err := server.Accept()
 		if err != nil {
 			if err != io.EOF {
-				s.logger.Printf("[ERR] nomad.rpc: multiplex conn accept failed: %v", err)
+				r.logger.Error("multiplex conn accept failed", "error", err)
 			}
 			return
 		}
-		go s.handleNomadConn(ctx, sub, rpcServer)
+		go r.handleNomadConn(ctx, sub, rpcServer)
 	}
 }

 // handleNomadConn is used to service a single Nomad RPC connection
-func (s *Server) handleNomadConn(ctx context.Context, conn net.Conn, server *rpc.Server) {
+func (r *rpcHandler) handleNomadConn(ctx context.Context, conn net.Conn, server *rpc.Server) {
 	defer conn.Close()
 	rpcCodec := pool.NewServerCodec(conn)
 	for {
 		select {
 		case <-ctx.Done():
-			s.logger.Println("[INFO] nomad.rpc: Closing server RPC connection")
+			r.logger.Info("closing server RPC connection")
 			return
-		case <-s.shutdownCh:
+		case <-r.shutdownCh:
 			return
 		default:
 		}

 		if err := server.ServeRequest(rpcCodec); err != nil {
 			if err != io.EOF && !strings.Contains(err.Error(), "closed") {
-				s.logger.Printf("[ERR] nomad.rpc: RPC error: %v (%v)", err, conn)
+				r.logger.Error("RPC error", "error", err, "connection", conn)
 				metrics.IncrCounter([]string{"nomad", "rpc", "request_error"}, 1)
 			}
 			return
@@ -250,7 +264,7 @@ func (s *Server) handleNomadConn(ctx context.Context, conn net.Conn, server *rpc
 }

 // handleStreamingConn is used to handle a single Streaming Nomad RPC connection.
-func (s *Server) handleStreamingConn(conn net.Conn) {
+func (r *rpcHandler) handleStreamingConn(conn net.Conn) {
 	defer conn.Close()

 	// Decode the header
@@ -258,7 +272,7 @@ func (s *Server) handleStreamingConn(conn net.Conn) {
 	decoder := codec.NewDecoder(conn, structs.MsgpackHandle)
 	if err := decoder.Decode(&header); err != nil {
 		if err != io.EOF && !strings.Contains(err.Error(), "closed") {
-			s.logger.Printf("[ERR] nomad.rpc: Streaming RPC error: %v (%v)", err, conn)
+			r.logger.Error("streaming RPC error", "error", err, "connection", conn)
 			metrics.IncrCounter([]string{"nomad", "streaming_rpc", "request_error"}, 1)
 		}

 	}

 	ack := structs.StreamingRpcAck{}
-	handler, err := s.streamingRpcs.GetHandler(header.Method)
+	handler, err := r.streamingRpcs.GetHandler(header.Method)
 	if err != nil {
-		s.logger.Printf("[ERR] nomad.rpc: Streaming RPC error: %v (%v)", err, conn)
+		r.logger.Error("streaming RPC error", "error", err, "connection", conn)
 		metrics.IncrCounter([]string{"nomad", "streaming_rpc", "request_error"}, 1)
 		ack.Error = err.Error()
 	}
@@ -292,19 +306,19 @@ func (s *Server) handleStreamingConn(conn net.Conn) {
 // handleMultiplexV2 is used to multiplex a single incoming connection
 // using the Yamux multiplexer. Version 2 handling allows a single connection to
 // switch streams between regulars RPCs and Streaming RPCs.
-func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) {
+func (r *rpcHandler) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) {
 	defer func() {
 		// Remove any potential mapping between a NodeID to this connection and
 		// close the underlying connection.
-		s.removeNodeConn(rpcCtx)
+		r.removeNodeConn(rpcCtx)
 		conn.Close()
 	}()

 	conf := yamux.DefaultConfig()
-	conf.LogOutput = s.config.LogOutput
+	conf.LogOutput = r.config.LogOutput
 	server, err := yamux.Server(conn, conf)
 	if err != nil {
-		s.logger.Printf("[ERR] nomad.rpc: multiplex_v2 failed to create yamux server: %v", err)
+		r.logger.Error("multiplex_v2 failed to create yamux server", "error", err)
 		return
 	}

@@ -313,7 +327,7 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R
 	// Create the RPC server for this connection
 	rpcServer := rpc.NewServer()
-	s.setupRpcServer(rpcServer, rpcCtx)
+	r.setupRpcServer(rpcServer, rpcCtx)

 	for {
 		// stop handling connections if context was cancelled
@@ -325,7 +339,7 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R
 		sub, err := server.Accept()
 		if err != nil {
 			if err != io.EOF {
-				s.logger.Printf("[ERR] nomad.rpc: multiplex_v2 conn accept failed: %v", err)
+				r.logger.Error("multiplex_v2 conn accept failed", "error", err)
 			}
 			return
 		}
@@ -334,7 +348,7 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R
 		buf := make([]byte, 1)
 		if _, err := sub.Read(buf); err != nil {
 			if err != io.EOF {
-				s.logger.Printf("[ERR] nomad.rpc: multiplex_v2 failed to read byte: %v", err)
+				r.logger.Error("multiplex_v2 failed to read first byte", "error", err)
 			}
 			return
 		}
@@ -342,12 +356,12 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R
 		// Determine which handler to use
 		switch pool.RPCType(buf[0]) {
 		case pool.RpcNomad:
-			go s.handleNomadConn(ctx, sub, rpcServer)
+			go r.handleNomadConn(ctx, sub, rpcServer)
 		case pool.RpcStreaming:
-			go s.handleStreamingConn(sub)
+			go r.handleStreamingConn(sub)
 		default:
-			s.logger.Printf("[ERR] nomad.rpc: multiplex_v2 unrecognized RPC byte: %v", buf[0])
+			r.logger.Error("multiplex_v2 unrecognized first RPC byte", "byte", buf[0])
 			return
 		}
 	}
@@ -356,7 +370,7 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R

 // forward is used to forward to a remote region or to forward to the local leader
 // Returns a bool of if forwarding was performed, as well as any error
-func (s *Server) forward(method string, info structs.RPCInfo, args interface{}, reply interface{}) (bool, error) {
+func (r *rpcHandler) forward(method string, info structs.RPCInfo, args interface{}, reply interface{}) (bool, error) {
 	var firstCheck time.Time

 	region := info.RequestRegion()
@@ -365,10 +379,10 @@ func (s *Server) forward(method string, info structs.RPCInfo, args interface{},
 	}

 	// Handle region forwarding
-	if region != s.config.Region {
+	if region != r.config.Region {
 		// Mark that we are forwarding the RPC
 		info.SetForwarded()
-		err := s.forwardRegion(region, method, args, reply)
+		err := r.forwardRegion(region, method, args, reply)
 		return true, err
 	}

@@ -379,7 +393,7 @@ func (s *Server) forward(method string, info structs.RPCInfo, args interface{},

CHECK_LEADER:
 	// Find the leader
-	isLeader, remoteServer := s.getLeader()
+	isLeader, remoteServer := r.getLeader()

 	// Handle the case we are the leader
 	if isLeader {
@@ -390,7 +404,7 @@ CHECK_LEADER:
 	if remoteServer != nil {
 		// Mark that we are forwarding the RPC
 		info.SetForwarded()
-		err := s.forwardLeader(remoteServer, method, args, reply)
+		err := r.forwardLeader(remoteServer, method, args, reply)
 		return true, err
 	}

@@ -398,12 +412,12 @@ CHECK_LEADER:
 	if firstCheck.IsZero() {
 		firstCheck = time.Now()
 	}
-	if time.Now().Sub(firstCheck) < s.config.RPCHoldTimeout {
-		jitter := lib.RandomStagger(s.config.RPCHoldTimeout / structs.JitterFraction)
+	if time.Now().Sub(firstCheck) < r.config.RPCHoldTimeout {
+		jitter := lib.RandomStagger(r.config.RPCHoldTimeout / structs.JitterFraction)
 		select {
 		case <-time.After(jitter):
 			goto CHECK_LEADER
-		case <-s.shutdownCh:
+		case <-r.shutdownCh:
 		}
 	}

@@ -436,49 +450,48 @@ func (s *Server) getLeader() (bool, *serverParts) {
 }

 // forwardLeader is used to forward an RPC call to the leader, or fail if no leader
-func (s *Server) forwardLeader(server *serverParts, method string, args interface{}, reply interface{}) error {
+func (r *rpcHandler) forwardLeader(server *serverParts, method string, args interface{}, reply interface{}) error {
 	// Handle a missing server
 	if server == nil {
 		return structs.ErrNoLeader
 	}
-	return s.connPool.RPC(s.config.Region, server.Addr, server.MajorVersion, method, args, reply)
+	return r.connPool.RPC(r.config.Region, server.Addr, server.MajorVersion, method, args, reply)
 }

 // forwardServer is used to forward an RPC call to a particular server
-func (s *Server) forwardServer(server *serverParts, method string, args interface{}, reply interface{}) error {
+func (r *rpcHandler) forwardServer(server *serverParts, method string, args interface{}, reply interface{}) error {
 	// Handle a missing server
 	if server == nil {
 		return errors.New("must be given a valid server address")
 	}
-	return s.connPool.RPC(s.config.Region, server.Addr, server.MajorVersion, method, args, reply)
+	return r.connPool.RPC(r.config.Region, server.Addr, server.MajorVersion, method, args, reply)
 }

 // forwardRegion is used to forward an RPC call to a remote region, or fail if no servers
-func (s *Server) forwardRegion(region, method string, args interface{}, reply interface{}) error {
+func (r *rpcHandler) forwardRegion(region, method string, args interface{}, reply interface{}) error {
 	// Bail if we can't find any servers
-	s.peerLock.RLock()
-	servers := s.peers[region]
+	r.peerLock.RLock()
+	servers := r.peers[region]
 	if len(servers) == 0 {
-		s.peerLock.RUnlock()
-		s.logger.Printf("[WARN] nomad.rpc: RPC request for region '%s', no path found",
-			region)
+		r.peerLock.RUnlock()
+		r.logger.Warn("no path found to region", "region", region)
 		return structs.ErrNoRegionPath
 	}

 	// Select a random addr
 	offset := rand.Intn(len(servers))
 	server := servers[offset]
-	s.peerLock.RUnlock()
+	r.peerLock.RUnlock()

 	// Forward to remote Nomad
 	metrics.IncrCounter([]string{"nomad", "rpc", "cross-region", region}, 1)
-	return s.connPool.RPC(region, server.Addr, server.MajorVersion, method, args, reply)
+	return r.connPool.RPC(region, server.Addr, server.MajorVersion, method, args, reply)
 }

 // streamingRpc creates a connection to the given server and conducts the
 // initial handshake, returning the connection or an error. It is the callers
 // responsibility to close the connection if there is no returned error.
-func (s *Server) streamingRpc(server *serverParts, method string) (net.Conn, error) {
+func (r *rpcHandler) streamingRpc(server *serverParts, method string) (net.Conn, error) {
 	// Try to dial the server
 	conn, err := net.DialTimeout("tcp", server.Addr.String(), 10*time.Second)
 	if err != nil {
@@ -491,7 +504,7 @@ func (s *Server) streamingRpc(server *serverParts, method string) (net.Conn, err
 		tcp.SetNoDelay(true)
 	}

-	if err := s.streamingRpcImpl(conn, server.Region, method); err != nil {
+	if err := r.streamingRpcImpl(conn, server.Region, method); err != nil {
 		return nil, err
 	}

@@ -502,11 +515,11 @@ func (s *Server) streamingRpc(server *serverParts, method string) (net.Conn, err
 // the handshake to establish a streaming RPC for the given method. If an error
 // is returned, the underlying connection has been closed. Otherwise it is
 // assumed that the connection has been hijacked by the RPC method.
-func (s *Server) streamingRpcImpl(conn net.Conn, region, method string) error {
+func (r *rpcHandler) streamingRpcImpl(conn net.Conn, region, method string) error {
 	// Check if TLS is enabled
-	s.tlsWrapLock.RLock()
-	tlsWrap := s.tlsWrap
-	s.tlsWrapLock.RUnlock()
+	r.tlsWrapLock.RLock()
+	tlsWrap := r.tlsWrap
+	r.tlsWrapLock.RUnlock()

 	if tlsWrap != nil {
 		// Switch the connection into TLS mode
@@ -565,7 +578,7 @@ func (s *Server) raftApplyFuture(t structs.MessageType, msg interface{}) (raft.A

 	// Warn if the command is very large
 	if n := len(buf); n > raftWarnSize {
-		s.logger.Printf("[WARN] nomad: Attempting to apply large raft entry (type %d) (%d bytes)", t, n)
+		s.logger.Warn("attempting to apply large raft entry", "raft_type", t, "bytes", n)
 	}

 	future := s.raft.Apply(buf, enqueueLimit)
@@ -589,13 +602,13 @@ func (s *Server) raftApply(t structs.MessageType, msg interface{}) (interface{},
 }

 // setQueryMeta is used to populate the QueryMeta data for an RPC call
-func (s *Server) setQueryMeta(m *structs.QueryMeta) {
-	if s.IsLeader() {
+func (r *rpcHandler) setQueryMeta(m *structs.QueryMeta) {
+	if r.IsLeader() {
 		m.LastContact = 0
 		m.KnownLeader = true
 	} else {
-		m.LastContact = time.Now().Sub(s.raft.LastContact())
-		m.KnownLeader = (s.raft.Leader() != "")
+		m.LastContact = time.Now().Sub(r.raft.LastContact())
+		m.KnownLeader = (r.raft.Leader() != "")
 	}
 }

@@ -615,7 +628,7 @@ type blockingOptions struct {

 // blockingRPC is used for queries that need to wait for a
 // minimum index. This is used to block and wait for changes.
-func (s *Server) blockingRPC(opts *blockingOptions) error {
+func (r *rpcHandler) blockingRPC(opts *blockingOptions) error {
 	ctx := context.Background()
 	var cancel context.CancelFunc
 	var state *state.StateStore
@@ -641,7 +654,7 @@ func (s *Server) blockingRPC(opts *blockingOptions) error {

RUN_QUERY:
 	// Update the query meta data
-	s.setQueryMeta(opts.queryMeta)
+	r.setQueryMeta(opts.queryMeta)

 	// Increment the rpc query counter
 	metrics.IncrCounter([]string{"nomad", "rpc", "query"}, 1)

 	// We capture the state store and its abandon channel but pass a snapshot to
 	// the blocking query function. We operate on the snapshot to allow separate
 	// calls to the state store not all wrapped within the same transaction.
-	state = s.fsm.State()
+	state = r.fsm.State()
 	abandonCh := state.AbandonCh()
 	snap, _ := state.Snapshot()
 	stateSnap := &snap.StateStore
diff --git a/nomad/search_endpoint.go b/nomad/search_endpoint.go
index 9eb892dd31c..4e1b3acfa8d 100644
--- a/nomad/search_endpoint.go
+++ b/nomad/search_endpoint.go
@@ -1,11 +1,14 @@ package nomad

 import (
+	"fmt"
 	"strings"
 	"time"

 	metrics "github.com/armon/go-metrics"
+	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
+
 	"github.com/hashicorp/nomad/acl"
 	"github.com/hashicorp/nomad/nomad/state"
 	"github.com/hashicorp/nomad/nomad/structs"
@@ -31,7 +34,8 @@ var (

 // Search endpoint is used to look up matches for a given prefix and context
 type Search struct {
-	srv *Server
+	srv    *Server
+	logger log.Logger
 }

 // getMatches extracts matches for an iterator, and returns a list of ids for
@@ -60,7 +64,7 @@ func (s *Search) getMatches(iter memdb.ResultIterator, prefix string) ([]string,
 		default:
 			matchID, ok := getEnterpriseMatch(raw)
 			if !ok {
-				s.srv.logger.Printf("[ERR] nomad.resources: unexpected type for resources context: %T", t)
+				s.logger.Error("unexpected type for resources context", "type", fmt.Sprintf("%T", t))
 				continue
 			}

diff --git a/nomad/serf.go b/nomad/serf.go
index 2bfc11ba1c7..65eba6044ee 100644
--- a/nomad/serf.go
+++ b/nomad/serf.go
@@ -5,6 +5,8 @@ import (
 	"sync/atomic"
 	"time"

+	log "github.com/hashicorp/go-hclog"
+
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/raft"
 	"github.com/hashicorp/serf/serf"
@@ -38,7 +40,7 @@ func (s *Server) serfEventHandler() {
 			case serf.EventMemberUpdate, serf.EventUser, serf.EventQuery: // Ignore
 			default:
-				s.logger.Printf("[WARN] nomad: unhandled serf event: %#v", e)
+				s.logger.Warn("unhandled serf event", "event", log.Fmt("%#v", e))
 			}

 		case <-s.shutdownCh:
@@ -52,10 +54,10 @@ func (s *Server) nodeJoin(me serf.MemberEvent) {
 	for _, m := range me.Members {
 		ok, parts := isNomadServer(m)
 		if !ok {
-			s.logger.Printf("[WARN] nomad: non-server in gossip pool: %s", m.Name)
+			s.logger.Warn("non-server in gossip pool", "member", m.Name)
 			continue
 		}
-		s.logger.Printf("[INFO] nomad: adding server %s", parts)
+		s.logger.Info("adding server", "server", parts)

 		// Check if this server is known
 		found := false
@@ -102,7 +104,7 @@ func (s *Server) maybeBootstrap() {
 		panic("neither raftInmem or raftStore is initialized")
 	}
 	if err != nil {
-		s.logger.Printf("[ERR] nomad: failed to read last raft index: %v", err)
+		s.logger.Error("failed to read last raft index", "error", err)
 		return
 	}

@@ -125,11 +127,11 @@ func (s *Server) maybeBootstrap() {
 			continue
 		}
 		if p.Expect != 0 && p.Expect != int(atomic.LoadInt32(&s.config.BootstrapExpect)) {
-			s.logger.Printf("[ERR] nomad: peer %v has a conflicting expect value. All nodes should expect the same number.", member)
+			s.logger.Error("peer has a conflicting expect value. All nodes should expect the same number", "member", member)
 			return
 		}
 		if p.Bootstrap {
-			s.logger.Printf("[ERR] nomad: peer %v has bootstrap mode. Expect disabled.", member)
+			s.logger.Error("peer has bootstrap mode. Expect disabled", "member", member)
 			return
 		}
 		servers = append(servers, *p)
@@ -154,8 +156,7 @@ func (s *Server) maybeBootstrap() {
 		if err := s.connPool.RPC(s.config.Region, server.Addr, server.MajorVersion,
 			"Status.Peers", req, &peers); err != nil {
 			nextRetry := (1 << attempt) * peerRetryBase
-			s.logger.Printf("[ERR] nomad: Failed to confirm peer status for %s: %v. Retrying in "+
-				"%v...", server.Name, err, nextRetry.String())
+			s.logger.Error("failed to confirm peer status", "peer", server.Name, "error", err, "retry", nextRetry)
 			time.Sleep(nextRetry)
 		} else {
 			break
@@ -174,7 +175,8 @@ func (s *Server) maybeBootstrap() {
 	// correctness because no server in the existing cluster will vote
 	// for this server, but it makes things much more stable.
 	if len(peers) > 0 {
-		s.logger.Printf("[INFO] nomad: Existing Raft peers reported by %s (%v), disabling bootstrap mode", server.Name, server.Addr)
+		s.logger.Info("disabling bootstrap mode because existing Raft peers were reported by peer",
+			"peer_name", server.Name, "peer_address", server.Addr)
 		atomic.StoreInt32(&s.config.BootstrapExpect, 0)
 		return
 	}
@@ -186,7 +188,7 @@ func (s *Server) maybeBootstrap() {
 	var addrs []string
 	minRaftVersion, err := s.autopilot.MinRaftProtocol()
 	if err != nil {
-		s.logger.Printf("[ERR] nomad: Failed to read server raft versions: %v", err)
+		s.logger.Error("failed to read server raft versions", "error", err)
 	}

 	for _, server := range servers {
@@ -204,11 +206,11 @@ func (s *Server) maybeBootstrap() {
 		}
 		configuration.Servers = append(configuration.Servers, peer)
 	}
-	s.logger.Printf("[INFO] nomad: Found expected number of peers (%s), attempting to bootstrap cluster...",
-		strings.Join(addrs, ","))
+	s.logger.Info("found expected number of peers, attempting to bootstrap cluster...",
+		"peers", strings.Join(addrs, ","))
 	future := s.raft.BootstrapCluster(configuration)
 	if err := future.Error(); err != nil {
-		s.logger.Printf("[ERR] nomad: Failed to bootstrap cluster: %v", err)
+		s.logger.Error("failed to bootstrap cluster", "error", err)
 	}

 	// Bootstrapping complete, or failed for some reason, don't enter this again
@@ -222,7 +224,7 @@ func (s *Server) nodeFailed(me serf.MemberEvent) {
 		if !ok {
 			continue
 		}
-		s.logger.Printf("[INFO] nomad: removing server %s", parts)
+		s.logger.Info("removing server", "server", parts)

 		// Remove the server if known
 		s.peerLock.Lock()
diff --git a/nomad/server.go b/nomad/server.go
index ff1733dfe13..f22137ac755 100644
--- a/nomad/server.go
+++ b/nomad/server.go
@@ -5,7 +5,6 @@ import (
 	"crypto/tls"
 	"fmt"
 	"io/ioutil"
-	"log"
 	"net"
 	"net/rpc"
 	"os"
@@ -16,11 +15,14 @@ import (
 	"sync/atomic"
 	"time"

-	"github.com/hashicorp/consul/agent/consul/autopilot"
 	consulapi "github.com/hashicorp/consul/api"
-	"github.com/hashicorp/consul/lib"
+	log "github.com/hashicorp/go-hclog"
 	multierror "github.com/hashicorp/go-multierror"
 	lru "github.com/hashicorp/golang-lru"
+	raftboltdb "github.com/hashicorp/raft-boltdb"
+
+	"github.com/hashicorp/consul/agent/consul/autopilot"
+	"github.com/hashicorp/consul/lib"
 	"github.com/hashicorp/nomad/command/agent/consul"
 	"github.com/hashicorp/nomad/helper/codec"
 	"github.com/hashicorp/nomad/helper/pool"
@@ -33,7 +35,6 @@ import (
 	"github.com/hashicorp/nomad/nomad/structs/config"
 	"github.com/hashicorp/nomad/scheduler"
 	"github.com/hashicorp/raft"
-	raftboltdb "github.com/hashicorp/raft-boltdb"
 	"github.com/hashicorp/serf/serf"
 )

@@ -90,7 +91,7 @@ const (
 type Server struct {
 	config *Config
-	logger *log.Logger
+	logger log.Logger

 	// Connection pool to other Nomad servers
 	connPool *pool.ConnPool
@@ -119,6 +120,10 @@ type Server struct {
 	tlsWrap     tlsutil.RegionWrapper
 	tlsWrapLock sync.RWMutex

+	// TODO(alex,hclog): Can I move more into the handler?
+	// rpcHandler is used to serve and handle RPCs
+	*rpcHandler
+
 	// rpcServer is the static RPC server that is used by the local agent.
 	rpcServer *rpc.Server
@@ -183,14 +188,13 @@ type Server struct {
 	// periodicDispatcher is used to track and create evaluations for periodic jobs.
 	periodicDispatcher *PeriodicDispatch

-	// planQueue is used to manage the submitted allocation
-	// plans that are waiting to be assessed by the leader
-	planQueue *PlanQueue
+	// planner is used to manage the submitted allocation plans that are waiting
+	// to be assessed by the leader
+	*planner

-	// heartbeatTimers track the expiration time of each heartbeat that has
-	// a TTL. On expiration, the node status is updated to be 'down'.
-	heartbeatTimers     map[string]*time.Timer
-	heartbeatTimersLock sync.Mutex
+	// nodeHeartbeater is used to track expiration times of node heartbeats. If it
+	// detects an expired node, the node status is updated to be 'down'.
+	*nodeHeartbeater

 	// consulCatalog is used for discovering other Nomad Servers via Consul
 	consulCatalog consul.CatalogAPI
@@ -247,7 +251,7 @@ type endpoints struct {

 // NewServer is used to construct a new Nomad server from the
 // configuration, potentially returning an error
-func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logger) (*Server, error) {
+func NewServer(config *Config, consulCatalog consul.CatalogAPI) (*Server, error) {
 	// Check the protocol version
 	if err := config.CheckVersion(); err != nil {
 		return nil, err
@@ -266,12 +270,6 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg
 	// Create a new blocked eval tracker.
 	blockedEvals := NewBlockedEvals(evalBroker)

-	// Create a plan queue
-	planQueue, err := NewPlanQueue()
-	if err != nil {
-		return nil, err
-	}
-
 	// Configure TLS
 	tlsConf, err := tlsutil.NewTLSConfiguration(config.TLSConfig, true, true)
 	if err != nil {
@@ -293,7 +291,7 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg
 		config:        config,
 		consulCatalog: consulCatalog,
 		connPool:      pool.NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, tlsWrap),
-		logger:        logger,
+		logger:        config.Logger.ResetNamed("nomad"),
 		tlsWrap:       tlsWrap,
 		rpcServer:     rpc.NewServer(),
 		streamingRpcs: structs.NewStreamingRpcRegistry(),
@@ -304,36 +302,48 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg
 		eventCh:       make(chan serf.Event, 256),
 		evalBroker:    evalBroker,
 		blockedEvals:  blockedEvals,
-		planQueue:     planQueue,
 		rpcTLS:        incomingTLS,
 		aclCache:      aclCache,
 		shutdownCh:    make(chan struct{}),
 	}

+	// Create the RPC handler
+	s.rpcHandler = newRpcHandler(s)
+
+	// Create the planner
+	planner, err := newPlanner(s)
+	if err != nil {
+		return nil, err
+	}
+	s.planner = planner
+
+	// Create the node heartbeater
+	s.nodeHeartbeater = newNodeHeartbeater(s)
+
 	// Create the periodic dispatcher for launching periodic jobs.
 	s.periodicDispatcher = NewPeriodicDispatch(s.logger, s)

 	// Initialize the stats fetcher that autopilot will use.
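NewServer above derives its logger with config.Logger.ResetNamed("nomad"). The distinction matters: Named appends a segment to the current logger name, while ResetNamed replaces the whole name, so server logs come out under "nomad.*" regardless of what name the agent gave the logger it passed in. A minimal illustration:

    package main

    import hclog "github.com/hashicorp/go-hclog"

    func main() {
    	// Stand-in for the logger an agent might hand to NewServer.
    	root := hclog.New(&hclog.LoggerOptions{Name: "agent"})

    	a := root.Named("nomad")      // logs under "agent.nomad"
    	b := root.ResetNamed("nomad") // logs under "nomad"

    	a.Info("named sub-logger")
    	b.Info("reset-named logger")
    }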
-	s.statsFetcher = NewStatsFetcher(logger, s.connPool, s.config.Region)
+	s.statsFetcher = NewStatsFetcher(s.logger, s.connPool, s.config.Region)

 	// Setup Vault
 	if err := s.setupVaultClient(); err != nil {
 		s.Shutdown()
-		s.logger.Printf("[ERR] nomad: failed to setup Vault client: %v", err)
+		s.logger.Error("failed to setup Vault client", "error", err)
 		return nil, fmt.Errorf("Failed to setup Vault client: %v", err)
 	}

 	// Initialize the RPC layer
 	if err := s.setupRPC(tlsWrap); err != nil {
 		s.Shutdown()
-		s.logger.Printf("[ERR] nomad: failed to start RPC layer: %s", err)
+		s.logger.Error("failed to start RPC layer", "error", err)
 		return nil, fmt.Errorf("Failed to start RPC layer: %v", err)
 	}

 	// Initialize the Raft server
 	if err := s.setupRaft(); err != nil {
 		s.Shutdown()
-		s.logger.Printf("[ERR] nomad: failed to start Raft: %s", err)
+		s.logger.Error("failed to start Raft", "error", err)
 		return nil, fmt.Errorf("Failed to start Raft: %v", err)
 	}

@@ -341,24 +351,26 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg
 	s.serf, err = s.setupSerf(config.SerfConfig, s.eventCh, serfSnapshot)
 	if err != nil {
 		s.Shutdown()
-		s.logger.Printf("[ERR] nomad: failed to start serf WAN: %s", err)
+		s.logger.Error("failed to start serf WAN", "error", err)
 		return nil, fmt.Errorf("Failed to start serf: %v", err)
 	}

 	// Initialize the scheduling workers
 	if err := s.setupWorkers(); err != nil {
 		s.Shutdown()
-		s.logger.Printf("[ERR] nomad: failed to start workers: %s", err)
+		s.logger.Error("failed to start workers", "error", err)
 		return nil, fmt.Errorf("Failed to start workers: %v", err)
 	}

 	// Setup the Consul syncer
 	if err := s.setupConsulSyncer(); err != nil {
-		return nil, fmt.Errorf("failed to create server Consul syncer: %v", err)
+		s.logger.Error("failed to create server Consul syncer", "error", err)
+		return nil, fmt.Errorf("failed to create server Consul syncer: %v", err)
 	}

 	// Setup the deployment watcher.
 	if err := s.setupDeploymentWatcher(); err != nil {
+		s.logger.Error("failed to create deployment watcher", "error", err)
 		return nil, fmt.Errorf("failed to create deployment watcher: %v", err)
 	}

@@ -383,7 +395,7 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg
 	go evalBroker.EmitStats(time.Second, s.shutdownCh)

 	// Emit metrics for the plan queue
-	go planQueue.EmitStats(time.Second, s.shutdownCh)
+	go s.planQueue.EmitStats(time.Second, s.shutdownCh)

 	// Emit metrics for the blocked eval tracker.
 	go blockedEvals.EmitStats(time.Second, s.shutdownCh)
@@ -413,7 +425,7 @@ func (s *Server) createRPCListener() (*net.TCPListener, error) {
 	s.listenerCh = make(chan struct{})
 	listener, err := net.ListenTCP("tcp", s.config.RPCAddr)
 	if err != nil {
-		s.logger.Printf("[ERR] nomad: error when initializing TLS listener %s", err)
+		s.logger.Error("failed to initialize TLS listener", "error", err)
 		return listener, err
 	}

@@ -445,23 +457,23 @@ func getTLSConf(enableRPC bool, tlsConf *tlsutil.Config) (*tls.Config, tlsutil.R

 // reloadTLSConnections updates a server's TLS configuration and reloads RPC
 // connections.
 func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error {
-	s.logger.Printf("[INFO] nomad: reloading server connections due to configuration changes")
+	s.logger.Info("reloading server connections due to configuration changes")

 	// Check if we can reload the RPC listener
 	if s.rpcListener == nil || s.rpcCancel == nil {
-		s.logger.Println("[WARN] nomad: Unable to reload configuration due to uninitialized rpc listner")
+		s.logger.Warn("unable to reload configuration due to uninitialized rpc listener")
 		return fmt.Errorf("can't reload uninitialized RPC listener")
 	}

 	tlsConf, err := tlsutil.NewTLSConfiguration(newTLSConfig, true, true)
 	if err != nil {
-		s.logger.Printf("[ERR] nomad: unable to create TLS configuration %s", err)
+		s.logger.Error("unable to create TLS configuration", "error", err)
 		return err
 	}

 	incomingTLS, tlsWrap, err := getTLSConf(newTLSConfig.EnableRPC, tlsConf)
 	if err != nil {
-		s.logger.Printf("[ERR] nomad: unable to reset TLS context %s", err)
+		s.logger.Error("unable to reset TLS context", "error", err)
 		return err
 	}

@@ -482,7 +494,7 @@ func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error {
 	s.connPool.ReloadTLS(tlsWrap)

 	if err := s.rpcListener.Close(); err != nil {
-		s.logger.Printf("[ERR] nomad: Unable to close rpc listener %s", err)
+		s.logger.Error("unable to close rpc listener", "error", err)
 		return err
 	}

@@ -504,13 +516,13 @@ func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error {
 	s.raftLayer.ReloadTLS(wrapper)
 	s.raftTransport.CloseStreams()

-	s.logger.Printf("[DEBUG] nomad: finished reloading server connections")
+	s.logger.Debug("finished reloading server connections")
 	return nil
 }

 // Shutdown is used to shutdown the server
 func (s *Server) Shutdown() error {
-	s.logger.Printf("[INFO] nomad: shutting down server")
+	s.logger.Info("shutting down server")
 	s.shutdownLock.Lock()
 	defer s.shutdownLock.Unlock()

@@ -530,7 +542,7 @@ func (s *Server) Shutdown() error {
 		s.raftLayer.Close()
 		future := s.raft.Shutdown()
 		if err := future.Error(); err != nil {
-			s.logger.Printf("[WARN] nomad: Error shutting down raft: %s", err)
+			s.logger.Warn("error shutting down raft", "error", err)
 		}
 		if s.raftStore != nil {
 			s.raftStore.Close()
@@ -570,13 +582,13 @@ func (s *Server) IsShutdown() bool {

 // Leave is used to prepare for a graceful shutdown of the server
 func (s *Server) Leave() error {
-	s.logger.Printf("[INFO] nomad: server starting leave")
+	s.logger.Info("server starting leave")
 	s.left = true

 	// Check the number of known peers
 	numPeers, err := s.numPeers()
 	if err != nil {
-		s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err)
+		s.logger.Error("failed to check raft peers during leave", "error", err)
 		return err
 	}

@@ -596,12 +608,12 @@ func (s *Server) Leave() error {
 		if minRaftProtocol >= 2 && s.config.RaftConfig.ProtocolVersion >= 3 {
 			future := s.raft.RemoveServer(raft.ServerID(s.config.NodeID), 0, 0)
 			if err := future.Error(); err != nil {
-				s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err)
+				s.logger.Error("failed to remove ourself as raft peer", "error", err)
 			}
 		} else {
 			future := s.raft.RemovePeer(addr)
 			if err := future.Error(); err != nil {
-				s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err)
+				s.logger.Error("failed to remove ourself as raft peer", "error", err)
 			}
 		}
 	}

@@ -609,7 +621,7 @@ func (s *Server) Leave() error {
 	// Leave the gossip pool
 	if s.serf != nil {
 		if err := s.serf.Leave(); err != nil {
-			s.logger.Printf("[ERR] nomad: failed to leave Serf cluster: %v", err)
+			s.logger.Error("failed to leave Serf cluster", "error", err)
 		}
 	}

@@ -626,7 +638,7 @@ func (s *Server) Leave() error {
 		// Get the latest configuration.
 		future := s.raft.GetConfiguration()
 		if err := future.Error(); err != nil {
-			s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
+			s.logger.Error("failed to get raft configuration", "error", err)
 			break
 		}

@@ -656,7 +668,7 @@ func (s *Server) Leave() error {
 		// may not realize that it has been removed. Need to revisit this
 		// and the warning here.
 		if !left {
-			s.logger.Printf("[WARN] nomad: failed to leave raft configuration gracefully, timeout")
+			s.logger.Warn("failed to leave raft configuration gracefully, timeout")
 		}
 	}
 	return nil
@@ -680,12 +692,12 @@ func (s *Server) Reload(newConfig *Config) error {

 	shouldReloadTLS, err := tlsutil.ShouldReloadRPCConnections(s.config.TLSConfig, newConfig.TLSConfig)
 	if err != nil {
-		s.logger.Printf("[ERR] nomad: error checking whether to reload TLS configuration: %s", err)
+		s.logger.Error("error checking whether to reload TLS configuration", "error", err)
 	}

 	if shouldReloadTLS {
 		if err := s.reloadTLSConnections(newConfig.TLSConfig); err != nil {
-			s.logger.Printf("[ERR] nomad: error reloading server TLS configuration: %s", err)
+			s.logger.Error("error reloading server TLS configuration", "error", err)
 			multierror.Append(&mErr, err)
 		}
 	}
@@ -771,7 +783,7 @@ func (s *Server) setupBootstrapHandler() error {
 		}
 		consulQueryCount++

-		s.logger.Printf("[DEBUG] server.nomad: lost contact with Nomad quorum, falling back to Consul for server list")
+		s.logger.Debug("lost contact with Nomad quorum, falling back to Consul for server list")

 		dcs, err := s.consulCatalog.Datacenters()
 		if err != nil {
@@ -803,7 +815,7 @@ func (s *Server) setupBootstrapHandler() error {
 			consulServices, _, err := s.consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts)
 			if err != nil {
 				err := fmt.Errorf("failed to query service %q in Consul datacenter %q: %v", nomadServerServiceName, dc, err)
-				s.logger.Printf("[WARN] server.nomad: %v", err)
+				s.logger.Warn("failed to query Nomad service in Consul datacenter", "service_name", nomadServerServiceName, "dc", dc, "error", err)
 				mErr.Errors = append(mErr.Errors, err)
 				continue
 			}
@@ -831,7 +843,7 @@ func (s *Server) setupBootstrapHandler() error {
 			// Log the error and return nil so future handlers
 			// can attempt to register the `nomad` service.
 			pollInterval := peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)
-			s.logger.Printf("[TRACE] server.nomad: no Nomad Servers advertising service %+q in Consul datacenters %+q, sleeping for %v", nomadServerServiceName, dcs, pollInterval)
+			s.logger.Trace("no Nomad Servers advertising Nomad service in Consul datacenters", "service_name", nomadServerServiceName, "datacenters", dcs, "retry", pollInterval)
 			peersTimeout.Reset(pollInterval)
 			return nil
 		}
@@ -843,7 +855,7 @@ func (s *Server) setupBootstrapHandler() error {
 		}

 		peersTimeout.Reset(maxStaleLeadership)
-		s.logger.Printf("[INFO] server.nomad: successfully contacted %d Nomad Servers", numServersContacted)
+		s.logger.Info("successfully contacted Nomad servers", "num_servers", numServersContacted)

 		return nil
 	}
@@ -860,7 +872,7 @@ func (s *Server) setupBootstrapHandler() error {
 				// Only log if it worked last time
 				if lastOk {
 					lastOk = false
-					s.logger.Printf("[ERR] consul: error looking up Nomad servers: %v", err)
+					s.logger.Error("error looking up Nomad servers in Consul", "error", err)
 				}
 				d = defaultConsulDiscoveryIntervalRetry
 			}
@@ -998,27 +1010,27 @@ func (s *Server) setupRpcServer(server *rpc.Server, ctx *RPCContext) {
 	// Add the static endpoints to the RPC server.
 	if s.staticEndpoints.Status == nil {
 		// Initialize the list just once
-		s.staticEndpoints.ACL = &ACL{s}
-		s.staticEndpoints.Alloc = &Alloc{s}
-		s.staticEndpoints.Eval = &Eval{s}
-		s.staticEndpoints.Job = &Job{s}
-		s.staticEndpoints.Node = &Node{srv: s} // Add but don't register
-		s.staticEndpoints.Deployment = &Deployment{srv: s}
-		s.staticEndpoints.Operator = &Operator{s}
-		s.staticEndpoints.Periodic = &Periodic{s}
-		s.staticEndpoints.Plan = &Plan{s}
-		s.staticEndpoints.Region = &Region{s}
-		s.staticEndpoints.Status = &Status{s}
-		s.staticEndpoints.System = &System{s}
-		s.staticEndpoints.Search = &Search{s}
+		s.staticEndpoints.ACL = &ACL{srv: s, logger: s.logger.Named("acl")}
+		s.staticEndpoints.Alloc = &Alloc{srv: s, logger: s.logger.Named("alloc")}
+		s.staticEndpoints.Eval = &Eval{srv: s, logger: s.logger.Named("eval")}
+		s.staticEndpoints.Job = &Job{srv: s, logger: s.logger.Named("job")}
+		s.staticEndpoints.Node = &Node{srv: s, logger: s.logger.Named("client")} // Add but don't register
+		s.staticEndpoints.Deployment = &Deployment{srv: s, logger: s.logger.Named("deployment")}
+		s.staticEndpoints.Operator = &Operator{srv: s, logger: s.logger.Named("operator")}
+		s.staticEndpoints.Periodic = &Periodic{srv: s, logger: s.logger.Named("periodic")}
+		s.staticEndpoints.Plan = &Plan{srv: s, logger: s.logger.Named("plan")}
+		s.staticEndpoints.Region = &Region{srv: s, logger: s.logger.Named("region")}
+		s.staticEndpoints.Status = &Status{srv: s, logger: s.logger.Named("status")}
+		s.staticEndpoints.System = &System{srv: s, logger: s.logger.Named("system")}
+		s.staticEndpoints.Search = &Search{srv: s, logger: s.logger.Named("search")}
 		s.staticEndpoints.Enterprise = NewEnterpriseEndpoints(s)

 		// Client endpoints
-		s.staticEndpoints.ClientStats = &ClientStats{s}
-		s.staticEndpoints.ClientAllocations = &ClientAllocations{s}
+		s.staticEndpoints.ClientStats = &ClientStats{srv: s, logger: s.logger.Named("client_stats")}
+		s.staticEndpoints.ClientAllocations = &ClientAllocations{srv: s, logger: s.logger.Named("client_allocs")}

 		// Streaming endpoints
-		s.staticEndpoints.FileSystem = &FileSystem{s}
+		s.staticEndpoints.FileSystem = &FileSystem{srv: s, logger: s.logger.Named("client_fs")}
 		s.staticEndpoints.FileSystem.register()
 	}

@@ -1041,7 +1053,7 @@ func (s *Server) setupRpcServer(server *rpc.Server, ctx *RPCContext) {
 	server.Register(s.staticEndpoints.FileSystem)

 	// Create new dynamic endpoints and add them to the RPC server.
-	node := &Node{srv: s, ctx: ctx}
+	node := &Node{srv: s, ctx: ctx, logger: s.logger.Named("client")}

 	// Register the dynamic endpoints
 	server.Register(node)
@@ -1053,7 +1065,7 @@ func (s *Server) setupRaft() error {
 	defer func() {
 		if s.raft == nil && s.raftStore != nil {
 			if err := s.raftStore.Close(); err != nil {
-				s.logger.Printf("[ERR] nomad: failed to close Raft store: %v", err)
+				s.logger.Error("failed to close Raft store", "error", err)
 			}
 		}
 	}()
@@ -1063,7 +1075,7 @@ func (s *Server) setupRaft() error {
 		EvalBroker: s.evalBroker,
 		Periodic:   s.periodicDispatcher,
 		Blocked:    s.blockedEvals,
-		LogOutput:  s.config.LogOutput,
+		Logger:     s.logger,
 		Region:     s.Region(),
 	}
 	var err error
@@ -1152,10 +1164,10 @@ func (s *Server) setupRaft() error {
 			if err := os.Remove(peersFile); err != nil {
 				return fmt.Errorf("failed to delete peers.json, please delete manually (see peers.info for details): %v", err)
 			}
-			s.logger.Printf("[INFO] nomad: deleted peers.json file (see peers.info for details)")
+			s.logger.Info("deleted peers.json file (see peers.info for details)")
 		}
 	} else if _, err := os.Stat(peersFile); err == nil {
-		s.logger.Printf("[INFO] nomad: found peers.json file, recovering Raft configuration...")
+		s.logger.Info("found peers.json file, recovering Raft configuration...")
 		configuration, err := raft.ReadPeersJSON(peersFile)
 		if err != nil {
 			return fmt.Errorf("recovery failed to parse peers.json: %v", err)
@@ -1171,7 +1183,7 @@ func (s *Server) setupRaft() error {
 		if err := os.Remove(peersFile); err != nil {
 			return fmt.Errorf("recovery failed to delete peers.json, please delete manually (see peers.info for details): %v", err)
 		}
-		s.logger.Printf("[INFO] nomad: deleted peers.json file after successful recovery")
+		s.logger.Info("deleted peers.json file after successful recovery")
 	}

@@ -1269,7 +1281,7 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
 func (s *Server) setupWorkers() error {
 	// Check if all the schedulers are disabled
 	if len(s.config.EnabledSchedulers) == 0 || s.config.NumSchedulers == 0 {
-		s.logger.Printf("[WARN] nomad: no enabled schedulers")
+		s.logger.Warn("no enabled schedulers")
 		return nil
 	}

@@ -1297,8 +1309,7 @@ func (s *Server) setupWorkers() error {
 			s.workers = append(s.workers, w)
 		}
 	}
-	s.logger.Printf("[INFO] nomad: starting %d scheduling worker(s) for %v",
-		s.config.NumSchedulers, s.config.EnabledSchedulers)
+	s.logger.Info("starting scheduling worker(s)", "num_workers", s.config.NumSchedulers, "schedulers", s.config.EnabledSchedulers)
 	return nil
 }

diff --git a/nomad/server_setup_oss.go b/nomad/server_setup_oss.go
index 9755e46ee4f..b73d1fa4543 100644
--- a/nomad/server_setup_oss.go
+++ b/nomad/server_setup_oss.go
@@ -2,14 +2,17 @@
 package nomad

-import "github.com/hashicorp/consul/agent/consul/autopilot"
+import (
+	"github.com/hashicorp/consul/agent/consul/autopilot"
+	log "github.com/hashicorp/go-hclog"
+)

 type EnterpriseState struct{}

 func (s *Server) setupEnterprise(config *Config) error {
 	// Set up the OSS version of autopilot
 	apDelegate := &AutopilotDelegate{s}
-	s.autopilot = autopilot.NewAutopilot(s.logger, apDelegate, config.AutopilotInterval, config.ServerHealthInterval)
+	s.autopilot = autopilot.NewAutopilot(s.logger.StandardLogger(&log.StandardLoggerOptions{InferLevels: true}), apDelegate, config.AutopilotInterval, config.ServerHealthInterval)

 	return nil
 }
diff --git a/nomad/server_test.go b/nomad/server_test.go
index 30633b9e87b..c3d057f43ef 100644
--- a/nomad/server_test.go
+++ b/nomad/server_test.go
@@ -531,7 +531,7 @@ func TestServer_InvalidSchedulers(t *testing.T) {

 	// Set the config to not have the core scheduler
 	config := DefaultConfig()
-	logger := testlog.Logger(t)
+	logger := testlog.HCLogger(t)
 	s := &Server{
 		config: config,
 		logger: logger,
diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go
index a1e25b8f80c..9b7cbc06070 100644
--- a/nomad/state/state_store.go
+++ b/nomad/state/state_store.go
@@ -3,13 +3,13 @@ package state
 import (
 	"context"
 	"fmt"
-	"io"
-	"log"
 	"sort"
 	"time"

-	"github.com/hashicorp/go-memdb"
+	log "github.com/hashicorp/go-hclog"
 	multierror "github.com/hashicorp/go-multierror"
+
+	"github.com/hashicorp/go-memdb"
 	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/nomad/structs"
 )
@@ -33,8 +33,8 @@ type IndexEntry struct {

 // StateStoreConfig is used to configure a new state store
 type StateStoreConfig struct {
-	// LogOutput is used to configure the output of the state store's logs
-	LogOutput io.Writer
+	// Logger is used to output the state store's logs
+	Logger log.Logger

 	// Region is the region of the server embedding the state store.
 	Region string
@@ -48,7 +48,7 @@ type StateStoreConfig struct {
 // returned as a result of a read against the state store should be
 // considered a constant and NEVER modified in place.
 type StateStore struct {
-	logger *log.Logger
+	logger log.Logger
 	db     *memdb.MemDB

 	// config is the passed in configuration
@@ -69,7 +69,7 @@ func NewStateStore(config *StateStoreConfig) (*StateStore, error) {

 	// Create the state store
 	s := &StateStore{
-		logger:    log.New(config.LogOutput, "", log.LstdFlags|log.Lmicroseconds),
+		logger:    config.Logger.Named("state_store"),
 		db:        db,
 		config:    config,
 		abandonCh: make(chan struct{}),
@@ -1605,7 +1605,7 @@ func (s *StateStore) nestedUpsertEval(txn *memdb.Txn, index uint64, eval *struct
 				hasSummaryChanged = true
 			}
 		} else {
-			s.logger.Printf("[ERR] state_store: unable to update queued for job %q and task group %q", eval.JobID, tg)
+			s.logger.Error("unable to update queued for job and task group", "job_id", eval.JobID, "task_group", tg, "namespace", eval.Namespace)
 		}
 	}

@@ -1681,9 +1681,8 @@ func (s *StateStore) updateEvalModifyIndex(txn *memdb.Txn, index uint64, evalID
 		return fmt.Errorf("eval lookup failed: %v", err)
 	}
 	if existing == nil {
-		err := fmt.Errorf("unable to find eval id %q", evalID)
-		s.logger.Printf("[ERR] state_store: %v", err)
-		return err
+		s.logger.Error("unable to find eval", "eval_id", evalID)
+		return fmt.Errorf("unable to find eval id %q", evalID)
 	}
 	eval := existing.(*structs.Evaluation).Copy()
 	// Update the indexes
@@ -3012,7 +3011,7 @@ func (s *StateStore) ReconcileJobSummaries(index uint64) error {
 			case structs.AllocClientStatusPending:
 				tg.Starting += 1
 			default:
-				s.logger.Printf("[ERR] state_store: invalid client status: %v in allocation %q", alloc.ClientStatus, alloc.ID)
+				s.logger.Error("invalid client status set on allocation", "client_status", alloc.ClientStatus, "alloc_id", alloc.ID)
 			}
 			summary.Summary[alloc.TaskGroup] = tg
 		}
@@ -3448,8 +3447,8 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat
 	if existingAlloc == nil {
 		switch alloc.DesiredStatus {
 		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
-			s.logger.Printf("[ERR] state_store: new allocation inserted into state store with id: %v and state: %v",
-				alloc.ID, alloc.DesiredStatus)
+			s.logger.Error("new
allocation inserted into state store with bad desired status", + "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus) } switch alloc.ClientStatus { case structs.AllocClientStatusPending: @@ -3460,8 +3459,8 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat summaryChanged = true case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed, structs.AllocClientStatusComplete: - s.logger.Printf("[ERR] state_store: new allocation inserted into state store with id: %v and state: %v", - alloc.ID, alloc.ClientStatus) + s.logger.Error("new allocation inserted into state store with bad client status", + "alloc_id", alloc.ID, "client_status", alloc.ClientStatus) } } else if existingAlloc.ClientStatus != alloc.ClientStatus { // Incrementing the client of the bin of the current state @@ -3488,8 +3487,8 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat tgSummary.Lost -= 1 case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete: default: - s.logger.Printf("[ERR] state_store: invalid old state of allocation with id: %v, and state: %v", - existingAlloc.ID, existingAlloc.ClientStatus) + s.logger.Error("invalid old client status for allocation", + "alloc_id", existingAlloc.ID, "client_status", existingAlloc.ClientStatus) } summaryChanged = true } diff --git a/nomad/state/testing.go b/nomad/state/testing.go index ee7dce1d6c7..5bf2350166a 100644 --- a/nomad/state/testing.go +++ b/nomad/state/testing.go @@ -7,8 +7,8 @@ import ( func TestStateStore(t testing.T) *StateStore { config := &StateStoreConfig{ - LogOutput: testlog.NewWriter(t), - Region: "global", + Logger: testlog.HCLogger(t), + Region: "global", } state, err := NewStateStore(config) if err != nil { diff --git a/nomad/stats_fetcher.go b/nomad/stats_fetcher.go index 0ae05e227ea..475d013d833 100644 --- a/nomad/stats_fetcher.go +++ b/nomad/stats_fetcher.go @@ -2,9 +2,10 @@ package nomad import ( "context" - "log" "sync" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/consul/agent/consul/autopilot" "github.com/hashicorp/nomad/helper/pool" "github.com/hashicorp/serf/serf" @@ -18,7 +19,7 @@ import ( // a single in-flight RPC to any given server, so goroutines don't accumulate // as we run the health check fairly frequently. type StatsFetcher struct { - logger *log.Logger + logger log.Logger pool *pool.ConnPool region string inflight map[string]struct{} @@ -26,9 +27,9 @@ } // NewStatsFetcher returns a stats fetcher.
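NewStatsFetcher, whose signature change follows, shows the constructor pattern used throughout these files (the state store above does the same with `Named("state_store")`): the component receives the already-configured parent logger and scopes it exactly once. A short sketch under the same go-hclog API; `fetcher` and its names are hypothetical:

```go
package main

import (
	log "github.com/hashicorp/go-hclog"
)

// fetcher is a hypothetical stand-in for StatsFetcher: it stores a
// sub-logger that was scoped once in its constructor.
type fetcher struct {
	logger log.Logger
}

func newFetcher(logger log.Logger) *fetcher {
	// Named appends a segment, so a root "nomad" logger emits lines
	// prefixed "nomad.stats_fetcher" from this component onward.
	return &fetcher{logger: logger.Named("stats_fetcher")}
}

func main() {
	root := log.New(&log.LoggerOptions{Name: "nomad"})
	f := newFetcher(root)
	f.logger.Warn("failed retrieving server health", "server", "node-1")
}
```

This is why the hand-written "[WARN] nomad: ..." prefixes disappear from each message in the hunks above.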
-func NewStatsFetcher(logger *log.Logger, pool *pool.ConnPool, region string) *StatsFetcher { +func NewStatsFetcher(logger log.Logger, pool *pool.ConnPool, region string) *StatsFetcher { return &StatsFetcher{ - logger: logger, + logger: logger.Named("stats_fetcher"), pool: pool, region: region, inflight: make(map[string]struct{}), @@ -44,8 +45,7 @@ func (f *StatsFetcher) fetch(server *serverParts, replyCh chan *autopilot.Server var reply autopilot.ServerStats err := f.pool.RPC(f.region, server.Addr, server.MajorVersion, "Status.RaftStats", &args, &reply) if err != nil { - f.logger.Printf("[WARN] nomad: error getting server health from %q: %v", - server.Name, err) + f.logger.Warn("failed retrieving server health", "server", server.Name, "error", err) } else { replyCh <- &reply } @@ -73,8 +73,7 @@ func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[str f.inflightLock.Lock() for _, server := range servers { if _, ok := f.inflight[server.ID]; ok { - f.logger.Printf("[WARN] nomad: error getting server health from %q: last request still outstanding", - server.Name) + f.logger.Warn("failed retrieving server health; last request still outstanding", "server", server.Name) } else { workItem := &workItem{ server: server, @@ -96,8 +95,7 @@ func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[str replies[workItem.server.ID] = reply case <-ctx.Done(): - f.logger.Printf("[WARN] nomad: error getting server health from %q: %v", - workItem.server.Name, ctx.Err()) + f.logger.Warn("failed retrieving server health", "server", workItem.server.Name, "error", ctx.Err()) } } return replies diff --git a/nomad/status_endpoint.go b/nomad/status_endpoint.go index a79bbd22050..3830bca62d2 100644 --- a/nomad/status_endpoint.go +++ b/nomad/status_endpoint.go @@ -1,19 +1,20 @@ package nomad import ( + "errors" "fmt" "strconv" - "errors" + log "github.com/hashicorp/go-hclog" "github.com/hashicorp/consul/agent/consul/autopilot" - "github.com/hashicorp/nomad/nomad/structs" ) // Status endpoint is used to check on server status type Status struct { - srv *Server + srv *Server + logger log.Logger } // Version is used to allow clients to determine the capabilities diff --git a/nomad/system_endpoint.go b/nomad/system_endpoint.go index 0c290db31c0..6f26ff1c2b4 100644 --- a/nomad/system_endpoint.go +++ b/nomad/system_endpoint.go @@ -3,12 +3,15 @@ package nomad import ( "fmt" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/nomad/structs" ) // System endpoint is used to call invoke system tasks. 
type System struct { - srv *Server + srv *Server + logger log.Logger } // GarbageCollect is used to trigger the system to immediately garbage collect nodes, evals diff --git a/nomad/testing.go b/nomad/testing.go index a18a929ec0d..9696aa386fe 100644 --- a/nomad/testing.go +++ b/nomad/testing.go @@ -37,6 +37,7 @@ func TestACLServer(t testing.T, cb func(*Config)) (*Server, *structs.ACLToken) { func TestServer(t testing.T, cb func(*Config)) *Server { // Setup the default settings config := DefaultConfig() + config.Logger = testlog.HCLogger(t) config.Build = "0.8.0+unittest" config.DevMode = true nodeNum := atomic.AddUint32(&nodeNumber, 1) @@ -76,9 +77,7 @@ func TestServer(t testing.T, cb func(*Config)) *Server { // Enable raft as leader if we have bootstrap on config.RaftConfig.StartAsLeader = !config.DevDisableBootstrap - logger := testlog.WithPrefix(t, fmt.Sprintf("[%s] ", config.NodeName)) - hclogger := testlog.HCLogger(t) - catalog := consul.NewMockCatalog(hclogger) + catalog := consul.NewMockCatalog(config.Logger) for i := 10; i >= 0; i-- { // Get random ports @@ -90,7 +89,7 @@ func TestServer(t testing.T, cb func(*Config)) *Server { config.SerfConfig.MemberlistConfig.BindPort = ports[1] // Create server - server, err := NewServer(config, catalog, logger) + server, err := NewServer(config, catalog) if err == nil { return server } else if i == 0 { diff --git a/nomad/vault.go b/nomad/vault.go index fc60075fb09..332d9fc993a 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -4,7 +4,6 @@ import ( "context" "errors" "fmt" - "log" "math/rand" "sync" "sync/atomic" @@ -13,10 +12,12 @@ import ( "gopkg.in/tomb.v2" metrics "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" + vapi "github.com/hashicorp/vault/api" + "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" - vapi "github.com/hashicorp/vault/api" "github.com/mitchellh/mapstructure" "golang.org/x/sync/errgroup" @@ -206,7 +207,7 @@ type vaultClient struct { lastRenewed time.Time tomb *tomb.Tomb - logger *log.Logger + logger log.Logger // stats stores the stats stats *VaultStats @@ -219,7 +220,7 @@ type vaultClient struct { // NewVaultClient returns a Vault client from the given config. If the client // couldn't be made an error is returned. -func NewVaultClient(c *config.VaultConfig, logger *log.Logger, purgeFn PurgeVaultAccessorFn) (*vaultClient, error) { +func NewVaultClient(c *config.VaultConfig, logger log.Logger, purgeFn PurgeVaultAccessorFn) (*vaultClient, error) { if c == nil { return nil, fmt.Errorf("must pass valid VaultConfig") } @@ -230,7 +231,7 @@ func NewVaultClient(c *config.VaultConfig, logger *log.Logger, purgeFn PurgeVaul v := &vaultClient{ config: c, - logger: logger, + logger: logger.Named("vault"), limiter: rate.NewLimiter(requestRateLimit, int(requestRateLimit)), revoking: make(map[*structs.VaultAccessor]time.Time), purgeFn: purgeFn, @@ -387,7 +388,7 @@ func (v *vaultClient) buildClient() error { // Create the Vault API client client, err := vapi.NewClient(apiConf) if err != nil { - v.logger.Printf("[ERR] vault: failed to create Vault client. Not retrying: %v", err) + v.logger.Error("failed to create Vault client and not retrying", "error", err) return err } @@ -417,8 +418,7 @@ OUTER: // Ensure the API is reachable if !initStatus { if _, err := v.client.Sys().InitStatus(); err != nil { - v.logger.Printf("[WARN] vault: failed to contact Vault API. 
Retrying in %v: %v", - v.config.ConnectionRetryIntv, err) + v.logger.Warn("failed to contact Vault API", "retry", v.config.ConnectionRetryIntv, "error", err) retryTimer.Reset(v.config.ConnectionRetryIntv) continue OUTER } @@ -426,7 +426,7 @@ OUTER: } // Retry validating the token till success if err := v.parseSelfToken(); err != nil { - v.logger.Printf("[ERR] vault: failed to validate self token/role. Retrying in %v: %v", v.config.ConnectionRetryIntv, err) + v.logger.Error("failed to validate self token/role", "retry", v.config.ConnectionRetryIntv, "error", err) retryTimer.Reset(v.config.ConnectionRetryIntv) v.l.Lock() v.connEstablished = true @@ -444,10 +444,9 @@ OUTER: // If we are given a non-root token, start renewing it if v.tokenData.Root && v.tokenData.CreationTTL == 0 { - v.logger.Printf("[DEBUG] vault: not renewing token as it is root") + v.logger.Debug("not renewing token as it is root") } else { - v.logger.Printf("[DEBUG] vault: token lease duration is %v", - time.Duration(v.tokenData.CreationTTL)*time.Second) + v.logger.Debug("starting renewal loop", "creation_ttl", time.Duration(v.tokenData.CreationTTL)*time.Second) v.tomb.Go(wrapNilError(v.renewalLoop)) } @@ -484,7 +483,7 @@ func (v *vaultClient) renewalLoop() { // Set the timer to poke us after half of that time is up. durationUntilRenew := currentExpiration.Sub(time.Now()) / 2 - v.logger.Printf("[INFO] vault: renewing token in %v", durationUntilRenew) + v.logger.Info("successfully renewed token", "next_renewal", durationUntilRenew) authRenewTimer.Reset(durationUntilRenew) // Reset any backoff @@ -504,7 +503,7 @@ func (v *vaultClient) renewalLoop() { // Set base values and add some backoff - v.logger.Printf("[WARN] vault: got error or bad auth, so backing off: %v", err) + v.logger.Warn("got error or bad auth, so backing off", "error", err) switch { case backoff < 5: backoff = 5 @@ -521,7 +520,7 @@ func (v *vaultClient) renewalLoop() { if maxBackoff < 0 { // We have failed to renew the token past its expiration. Stop // renewing with Vault. - v.logger.Printf("[ERR] vault: failed to renew Vault token before lease expiration. Shutting down Vault client") + v.logger.Error("failed to renew Vault token before lease expiration. Shutting down Vault client") v.l.Lock() v.connEstablished = false v.connEstablishedErr = err @@ -533,7 +532,7 @@ func (v *vaultClient) renewalLoop() { } durationUntilRetry := time.Duration(backoff) * time.Second - v.logger.Printf("[INFO] vault: backing off for %v", durationUntilRetry) + v.logger.Info("backing off renewal", "retry", durationUntilRetry) authRenewTimer.Reset(durationUntilRetry) } @@ -557,7 +556,7 @@ func (v *vaultClient) renew() error { } v.lastRenewed = time.Now() - v.logger.Printf("[DEBUG] vault: successfully renewed server token") + v.logger.Debug("successfully renewed server token") return nil } @@ -699,12 +698,12 @@ func (v *vaultClient) validateCapabilities(role string, root bool) error { if structs.VaultUnrecoverableError.MatchString(err.Error()) { // Since we can't read permissions, we just log a warning that we // can't tell if the Vault token will work - msg := fmt.Sprintf("Can not lookup token capabilities. "+ + msg := fmt.Sprintf("can not lookup token capabilities. "+ "As such certain operations may fail in the future. 
"+ "Please give Nomad a Vault token with one of the following "+ "capabilities %q on %q so that the required capabilities can be verified", vaultCapabilitiesCapability, vaultCapabilitiesLookupPath) - v.logger.Printf("[WARN] vault: %s", msg) + v.logger.Warn(msg) return nil } else { multierror.Append(&mErr, err) @@ -913,7 +912,7 @@ func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, ta validationErr = fmt.Errorf("Vault returned WrapInfo without WrappedAccessor. Secret warnings: %v", secret.Warnings) } if validationErr != nil { - v.logger.Printf("[WARN] vault: failed to CreateToken: %v", err) + v.logger.Warn("ailed to CreateToken", "error", err) return nil, structs.NewRecoverableError(validationErr, true) } @@ -1005,7 +1004,7 @@ func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.Vau return nil } - v.logger.Printf("[WARN] vault: failed to revoke tokens. Will reattempt until TTL: %v", err) + v.logger.Warn("failed to revoke tokens. Will reattempt until TTL", "error", err) v.storeForRevocation(accessors) return nil } else if !committed { @@ -1015,7 +1014,7 @@ func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.Vau } if err := v.purgeFn(accessors); err != nil { - v.logger.Printf("[ERR] vault: failed to purge Vault accessors: %v", err) + v.logger.Error("failed to purge Vault accessors", "error", err) v.storeForRevocation(accessors) return nil } @@ -1138,7 +1137,7 @@ func (v *vaultClient) revokeDaemon() { } if err := v.parallelRevoke(context.Background(), revoking); err != nil { - v.logger.Printf("[WARN] vault: background token revocation errored: %v", err) + v.logger.Warn("background token revocation errored", "error", err) v.revLock.Unlock() continue } @@ -1149,7 +1148,7 @@ func (v *vaultClient) revokeDaemon() { // Call the passed in token revocation function if err := v.purgeFn(revoking); err != nil { // Can continue since revocation is idempotent - v.logger.Printf("[ERR] vault: token revocation errored: %v", err) + v.logger.Error("token revocation errored", "error", err) continue } diff --git a/nomad/vault_test.go b/nomad/vault_test.go index c6368b981f0..f7fdbddb82f 100644 --- a/nomad/vault_test.go +++ b/nomad/vault_test.go @@ -149,7 +149,7 @@ func testVaultRoleAndToken(v *testutil.TestVault, t *testing.T, vaultPolicies ma func TestVaultClient_BadConfig(t *testing.T) { t.Parallel() conf := &config.VaultConfig{} - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) // Should be no error since Vault is not enabled _, err := NewVaultClient(nil, logger, nil) @@ -178,7 +178,7 @@ func TestVaultClient_EstablishConnection(t *testing.T) { t.Parallel() for i := 10; i >= 0; i-- { v := testutil.NewTestVaultDelayed(t) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) v.Config.ConnectionRetryIntv = 100 * time.Millisecond client, err := NewVaultClient(v.Config, logger, nil) if err != nil { @@ -246,7 +246,7 @@ func TestVaultClient_ValidateRole(t *testing.T) { } v.Config.Token = testVaultRoleAndToken(v, t, vaultPolicies, data, nil) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) v.Config.ConnectionRetryIntv = 100 * time.Millisecond client, err := NewVaultClient(v.Config, logger, nil) if err != nil { @@ -285,7 +285,7 @@ func TestVaultClient_ValidateRole_NonExistant(t *testing.T) { v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) v.Config.Token = v.RootToken - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) v.Config.ConnectionRetryIntv = 100 * time.Millisecond v.Config.Role = 
"test-nonexistent" client, err := NewVaultClient(v.Config, logger, nil) @@ -334,7 +334,7 @@ func TestVaultClient_ValidateToken(t *testing.T) { } v.Config.Token = testVaultRoleAndToken(v, t, vaultPolicies, data, []string{"token-lookup", "nomad-role-create"}) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) v.Config.ConnectionRetryIntv = 100 * time.Millisecond client, err := NewVaultClient(v.Config, logger, nil) if err != nil { @@ -377,7 +377,7 @@ func TestVaultClient_SetActive(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -413,7 +413,7 @@ func TestVaultClient_SetConfig(t *testing.T) { // Set the configs token in a new test role v2.Config.Token = defaultTestVaultWhitelistRoleAndToken(v2, t, 20) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -469,7 +469,7 @@ func TestVaultClient_SetConfig_Disable(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -507,7 +507,7 @@ func TestVaultClient_RenewalLoop(t *testing.T) { v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -565,7 +565,7 @@ func TestVaultClient_LookupToken_Invalid(t *testing.T) { } // Enable vault but use a bad address so it never establishes a conn - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(conf, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -584,7 +584,7 @@ func TestVaultClient_LookupToken_Root(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -649,7 +649,7 @@ func TestVaultClient_LookupToken_Role(t *testing.T) { // Set the configs token in a new test role v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -711,7 +711,7 @@ func TestVaultClient_LookupToken_RateLimit(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -771,7 +771,7 @@ func TestVaultClient_CreateToken_Root(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -819,7 +819,7 @@ func TestVaultClient_CreateToken_Whitelist_Role(t *testing.T) { v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) 
client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -870,7 +870,7 @@ func TestVaultClient_CreateToken_Root_Target_Role(t *testing.T) { v.Config.Role = "test" // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -929,7 +929,7 @@ func TestVaultClient_CreateToken_Blacklist_Role(t *testing.T) { v.Config.Role = "test" // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -978,7 +978,7 @@ func TestVaultClient_CreateToken_Role_InvalidToken(t *testing.T) { v.Config.Token = "foo-bar" // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1016,7 +1016,7 @@ func TestVaultClient_CreateToken_Role_Unrecoverable(t *testing.T) { v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1050,7 +1050,7 @@ func TestVaultClient_CreateToken_Prestart(t *testing.T) { Addr: "http://127.0.0.1:0", } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(vconfig, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1082,7 +1082,7 @@ func TestVaultClient_RevokeTokens_PreEstablishs(t *testing.T) { Token: uuid.Generate(), Addr: "http://127.0.0.1:0", } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(vconfig, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1130,7 +1130,7 @@ func TestVaultClient_RevokeTokens_Root(t *testing.T) { return nil } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, purge) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1198,7 +1198,7 @@ func TestVaultClient_RevokeTokens_Role(t *testing.T) { return nil } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, purge) if err != nil { t.Fatalf("failed to build vault client: %v", err) diff --git a/nomad/worker.go b/nomad/worker.go index 6908188fbaf..b6bab0bbc94 100644 --- a/nomad/worker.go +++ b/nomad/worker.go @@ -2,13 +2,14 @@ package nomad import ( "fmt" - "log" "strings" "sync" "time" - "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + + "github.com/armon/go-metrics" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/scheduler" ) @@ -54,7 +55,7 @@ const ( // of the scheduler with the plumbing required to make it all work. 
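The Worker changes that follow lean on two hclog features: ResetNamed, which replaces the inherited name rather than appending another segment, and log.Fmt, which defers %#v expansion until a line is actually emitted. A sketch of both, with placeholder names and values:

```go
package main

import (
	log "github.com/hashicorp/go-hclog"
)

func main() {
	// Assume the server logger is already scoped, e.g. "nomad.raft".
	srvLogger := log.New(&log.LoggerOptions{Name: "nomad", Level: log.Debug}).Named("raft")

	// Named would stack another segment: "nomad.raft.worker".
	srvLogger.Named("worker").Info("named logger")

	// ResetNamed, as NewWorker below does, swaps the whole prefix: "worker".
	w := srvLogger.ResetNamed("worker")

	// log.Fmt keeps the call site cheap: the %#v formatting only runs if
	// the line passes the level filter and is actually written.
	type eval struct{ ID string }
	w.Debug("updated evaluation", "eval", log.Fmt("%#v", eval{ID: "example"}))
}
```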
type Worker struct { srv *Server - logger *log.Logger + logger log.Logger start time.Time paused bool @@ -75,7 +76,7 @@ type Worker struct { func NewWorker(srv *Server) (*Worker, error) { w := &Worker{ srv: srv, - logger: srv.logger, + logger: srv.logger.ResetNamed("worker"), start: time.Now(), } w.pauseCond = sync.NewCond(&w.pauseLock) @@ -159,7 +160,7 @@ REQ: metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start) if err != nil { if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() { - w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err) + w.logger.Error("failed to dequeue evaluation", "error", err) } // Adjust the backoff based on the error. If it is a scheduler version @@ -179,7 +180,7 @@ REQ: // Check if we got a response if resp.Eval != nil { - w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID) + w.logger.Debug("dequeued evaluation", "eval_id", resp.Eval.ID) return resp.Eval, resp.Token, resp.GetWaitIndex(), false } @@ -215,10 +216,9 @@ func (w *Worker) sendAck(evalID, token string, ack bool) { // Make the RPC call err := w.srv.RPC(endpoint, &req, &resp) if err != nil { - w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v", - verb, evalID, err) + w.logger.Error(fmt.Sprintf("failed to %s evaluation", verb), "eval_id", evalID, "error", err) } else { - w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID) + w.logger.Debug(fmt.Sprintf("%s evaluation", verb), "eval_id", evalID) } } @@ -320,14 +320,13 @@ func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler. SUBMIT: // Make the RPC call if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil { - w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v", - plan.EvalID, err) + w.logger.Error("failed to submit plan for evaluation", "eval_id", plan.EvalID, "error", err) if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { goto SUBMIT } return nil, nil, err } else { - w.logger.Printf("[DEBUG] worker: submitted plan at index %d for evaluation %s", resp.Index, plan.EvalID) + w.logger.Debug("submitted plan for evaluation", "plan_resp_index", resp.Index, "eval_id", plan.EvalID) w.backoffReset() } @@ -344,7 +343,7 @@ SUBMIT: var state scheduler.State if result.RefreshIndex != 0 { // Wait for the raft log to catchup to the evaluation - w.logger.Printf("[DEBUG] worker: refreshing state to index %d for %q", result.RefreshIndex, plan.EvalID) + w.logger.Debug("refreshing state", "refresh_index", result.RefreshIndex, "eval_id", plan.EvalID) if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil { return nil, nil, err } @@ -386,14 +385,13 @@ func (w *Worker) UpdateEval(eval *structs.Evaluation) error { SUBMIT: // Make the RPC call if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil { - w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v", - eval, err) + w.logger.Error("failed to update evaluation", "eval", log.Fmt("%#v", eval), "error", err) if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { goto SUBMIT } return err } else { - w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval) + w.logger.Debug("updated evaluation", "eval", log.Fmt("%#v", eval)) w.backoffReset() } return nil @@ -424,14 +422,13 @@ func (w *Worker) CreateEval(eval *structs.Evaluation) error { SUBMIT: // Make the RPC call if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil { - w.logger.Printf("[ERR] worker: failed to create 
evaluation %#v: %v", - eval, err) + w.logger.Error("failed to create evaluation", "eval", log.Fmt("%#v", eval), "error", err) if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { goto SUBMIT } return err } else { - w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval) + w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval)) w.backoffReset() } return nil @@ -486,14 +483,13 @@ func (w *Worker) ReblockEval(eval *structs.Evaluation) error { SUBMIT: // Make the RPC call if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil { - w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v", - eval, err) + w.logger.Error("failed to reblock evaluation", "eval", log.Fmt("%#v", eval), "error", err) if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { goto SUBMIT } return err } else { - w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval) + w.logger.Debug("reblocked evaluation", "eval", log.Fmt("%#v", eval)) w.backoffReset() } return nil diff --git a/nomad/worker_test.go b/nomad/worker_test.go index 297102dfd87..2f3e3172831 100644 --- a/nomad/worker_test.go +++ b/nomad/worker_test.go @@ -1,14 +1,15 @@ package nomad import ( - "log" "reflect" "strings" "sync" "testing" "time" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -35,7 +36,7 @@ func (n *NoopScheduler) Process(eval *structs.Evaluation) error { } func init() { - scheduler.BuiltinSchedulers["noop"] = func(logger *log.Logger, s scheduler.State, p scheduler.Planner) scheduler.Scheduler { + scheduler.BuiltinSchedulers["noop"] = func(logger log.Logger, s scheduler.State, p scheduler.Planner) scheduler.Scheduler { n := &NoopScheduler{ state: s, planner: p, diff --git a/scheduler/context.go b/scheduler/context.go index 569a1347f5c..031a3b45a51 100644 --- a/scheduler/context.go +++ b/scheduler/context.go @@ -1,10 +1,11 @@ package scheduler import ( - "log" "regexp" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/go-version" "github.com/hashicorp/nomad/nomad/structs" ) @@ -18,7 +19,7 @@ type Context interface { Plan() *structs.Plan // Logger provides a way to log - Logger() *log.Logger + Logger() log.Logger // Metrics returns the current metrics Metrics() *structs.AllocMetric @@ -67,13 +68,13 @@ type EvalContext struct { EvalCache state State plan *structs.Plan - logger *log.Logger + logger log.Logger metrics *structs.AllocMetric eligibility *EvalEligibility } // NewEvalContext constructs a new EvalContext -func NewEvalContext(s State, p *structs.Plan, log *log.Logger) *EvalContext { +func NewEvalContext(s State, p *structs.Plan, log log.Logger) *EvalContext { ctx := &EvalContext{ state: s, plan: p, @@ -91,7 +92,7 @@ func (e *EvalContext) Plan() *structs.Plan { return e.plan } -func (e *EvalContext) Logger() *log.Logger { +func (e *EvalContext) Logger() log.Logger { return e.logger } diff --git a/scheduler/context_test.go b/scheduler/context_test.go index b499ff2810a..f6430244784 100644 --- a/scheduler/context_test.go +++ b/scheduler/context_test.go @@ -18,7 +18,7 @@ func testContext(t testing.TB) (*state.StateStore, *EvalContext) { NodeAllocation: make(map[string][]*structs.Allocation), } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) ctx := NewEvalContext(state, plan, logger) return state, ctx diff --git 
a/scheduler/feasible.go b/scheduler/feasible.go index d25680a7ce9..eb74fa1f99c 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -136,9 +136,7 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { // to verify the client supports this. if driverInfo, ok := option.Drivers[driver]; ok { if driverInfo == nil { - c.ctx.Logger(). - Printf("[WARN] scheduler.DriverChecker: node %v has no driver info set for %v", - option.ID, driver) + c.ctx.Logger().Named("driver_checker").Warn("node has no driver info set", "node_id", option.ID, "driver", driver) return false } @@ -152,9 +150,7 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { enabled, err := strconv.ParseBool(value) if err != nil { - c.ctx.Logger(). - Printf("[WARN] scheduler.DriverChecker: node %v has invalid driver setting %v: %v", - option.ID, driverStr, value) + c.ctx.Logger().Named("driver_checker").Warn("node has invalid driver setting", "node_id", option.ID, "driver", driver, "val", value) return false } @@ -241,8 +237,7 @@ func (iter *DistinctHostsIterator) satisfiesDistinctHosts(option *structs.Node) // Get the proposed allocations proposed, err := iter.ctx.ProposedAllocs(option.ID) if err != nil { - iter.ctx.Logger().Printf( - "[ERR] scheduler.dynamic-constraint: failed to get proposed allocations: %v", err) + iter.ctx.Logger().Named("distinct_hosts").Error("failed to get proposed allocations", "error", err) return false } diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 59c1233b68c..eb2b8023fcc 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -2,10 +2,11 @@ package scheduler import ( "fmt" - "log" "time" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" @@ -72,7 +73,7 @@ func (s *SetStatusError) Error() string { // most workloads. It also supports a 'batch' mode to optimize for fast decision // making at the cost of quality. 
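GenericScheduler, defined next, gains per-evaluation context by calling With once at the top of Process; every later log line then carries eval_id, job_id, and namespace without repeating them at each call site. A small sketch of the pattern (IDs are placeholders):

```go
package main

import (
	log "github.com/hashicorp/go-hclog"
)

func main() {
	base := log.New(&log.LoggerOptions{Name: "nomad", Level: log.Debug}).Named("service_sched")

	// Mirrors what Process does with the real eval's fields.
	logger := base.With(
		"eval_id", "0b0e0898-placeholder",
		"job_id", "example",
		"namespace", "default",
	)

	// The pairs ride along on every subsequent line automatically.
	logger.Debug("failed to place all allocations, blocked eval created",
		"blocked_eval_id", "6c3f-placeholder")
}
```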
type GenericScheduler struct { - logger *log.Logger + logger log.Logger state State planner Planner batch bool @@ -94,9 +95,9 @@ type GenericScheduler struct { } // NewServiceScheduler is a factory function to instantiate a new service scheduler -func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler { +func NewServiceScheduler(logger log.Logger, state State, planner Planner) Scheduler { s := &GenericScheduler{ - logger: logger, + logger: logger.Named("service_sched"), state: state, planner: planner, batch: false, @@ -105,9 +106,9 @@ func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Sched } // NewBatchScheduler is a factory function to instantiate a new batch scheduler -func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler { +func NewBatchScheduler(logger log.Logger, state State, planner Planner) Scheduler { s := &GenericScheduler{ - logger: logger, + logger: logger.Named("batch_sched"), state: state, planner: planner, batch: true, @@ -120,6 +121,9 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { // Store the evaluation s.eval = eval + // Update our logger with the eval's information + s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace) + // Verify the evaluation trigger reason is understood switch eval.TriggeredBy { case structs.EvalTriggerJobRegister, structs.EvalTriggerJobDeregister, @@ -243,7 +247,7 @@ func (s *GenericScheduler) process() (bool, error) { // Compute the target job allocations if err := s.computeJobAllocs(); err != nil { - s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err) + s.logger.Error("failed to compute job allocations", "error", err) return false, err } @@ -252,10 +256,10 @@ func (s *GenericScheduler) process() (bool, error) { // current evaluation is already a blocked eval, we reuse it. if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil { if err := s.createBlockedEval(false); err != nil { - s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err) + s.logger.Error("failed to make blocked eval", "error", err) return false, err } - s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID) + s.logger.Debug("failed to place all allocations, blocked eval created", "blocked_eval_id", s.blocked.ID) } // If the plan is a no-op, we can bail. 
If AnnotatePlan is set submit the plan @@ -270,10 +274,10 @@ func (s *GenericScheduler) process() (bool, error) { eval.PreviousEval = s.eval.ID // TODO(preetha) this should be batching evals before inserting them if err := s.planner.CreateEval(eval); err != nil { - s.logger.Printf("[ERR] sched: %#v failed to make next eval for rescheduling: %v", s.eval, err) + s.logger.Error("failed to make next eval for rescheduling", "error", err) return false, err } - s.logger.Printf("[DEBUG] sched: %#v: found reschedulable allocs, next eval '%s' created", s.eval, eval.ID) + s.logger.Debug("found reschedulable allocs, followup eval created", "followup_eval_id", eval.ID) } } @@ -290,7 +294,7 @@ func (s *GenericScheduler) process() (bool, error) { // If we got a state refresh, try again since we have stale data if newState != nil { - s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval) + s.logger.Debug("refresh forced") s.state = newState return false, nil } @@ -298,8 +302,7 @@ func (s *GenericScheduler) process() (bool, error) { // Try again if the plan was not fully committed, potential conflict fullCommit, expected, actual := result.FullCommit(s.plan) if !fullCommit { - s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed", - s.eval, expected, actual) + s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual) if newState == nil { return false, fmt.Errorf("missing state refresh after partial commit") } @@ -332,11 +335,11 @@ func (s *GenericScheduler) computeJobAllocs() error { // nodes to lost updateNonTerminalAllocsToLost(s.plan, tainted, allocs) - reconciler := NewAllocReconciler(s.ctx.Logger(), + reconciler := NewAllocReconciler(s.logger, genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID), s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted, s.eval.ID) results := reconciler.Compute() - s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, results) + s.logger.Debug("reconciled current state with desired state", "results", log.Fmt("%#v", results)) if s.eval.AnnotatePlan { s.plan.Annotations = &structs.PlanAnnotations{ diff --git a/scheduler/propertyset.go b/scheduler/propertyset.go index 8ac9b48c6d9..d0483b78133 100644 --- a/scheduler/propertyset.go +++ b/scheduler/propertyset.go @@ -4,6 +4,7 @@ import ( "fmt" "strconv" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" @@ -14,6 +15,9 @@ type propertySet struct { // ctx is used to lookup the plan and state ctx Context + // logger is the logger for the property set + logger log.Logger + // jobID is the job we are operating on jobID string @@ -55,6 +59,7 @@ func NewPropertySet(ctx Context, job *structs.Job) *propertySet { jobID: job.ID, namespace: job.Namespace, existingValues: make(map[string]uint64), + logger: ctx.Logger().Named("property_set"), } return p @@ -81,7 +86,7 @@ func (p *propertySet) setConstraint(constraint *structs.Constraint, taskGroup st c, err := strconv.ParseUint(v, 10, 64) if err != nil { p.errorBuilding = fmt.Errorf("failed to convert RTarget %q to uint64: %v", v, err) - p.ctx.Logger().Printf("[ERR] scheduler.dynamic-constraint: %v", p.errorBuilding) + p.logger.Error("failed to convert RTarget to uint64", "RTarget", v, "error", err) return } @@ -130,7 +135,7 @@ func (p *propertySet) populateExisting() { allocs, err := p.ctx.State().AllocsByJob(ws, p.namespace, p.jobID, false) if err != nil { p.errorBuilding = fmt.Errorf("failed to get job's allocations: %v", err) - 
p.ctx.Logger().Printf("[ERR] scheduler.dynamic-constraint: %v", p.errorBuilding) + p.logger.Error("failed to get job's allocations", "job", p.jobID, "namespace", p.namespace, "error", err) return } @@ -141,7 +146,7 @@ func (p *propertySet) populateExisting() { nodes, err := p.buildNodeMap(allocs) if err != nil { p.errorBuilding = err - p.ctx.Logger().Printf("[ERR] scheduler.dynamic-constraint: %v", err) + p.logger.Error("failed to build node map", "error", err) return } @@ -179,7 +184,7 @@ func (p *propertySet) PopulateProposed() { nodes, err := p.buildNodeMap(both) if err != nil { p.errorBuilding = err - p.ctx.Logger().Printf("[ERR] scheduler.dynamic-constraint: %v", err) + p.logger.Error("failed to build node map", "error", err) return } diff --git a/scheduler/rank.go b/scheduler/rank.go index 56620075332..3fc09598947 100644 --- a/scheduler/rank.go +++ b/scheduler/rank.go @@ -179,9 +179,7 @@ OUTER: // Get the proposed allocations proposed, err := option.ProposedAllocs(iter.ctx) if err != nil { - iter.ctx.Logger().Printf( - "[ERR] sched.binpack: failed to get proposed allocations: %v", - err) + iter.ctx.Logger().Named("binpack").Error("failed retrieving proposed allocations", "error", err) continue } @@ -292,9 +290,7 @@ func (iter *JobAntiAffinityIterator) Next() *RankedNode { // Get the proposed allocations proposed, err := option.ProposedAllocs(iter.ctx) if err != nil { - iter.ctx.Logger().Printf( - "[ERR] sched.job-anti-aff: failed to get proposed allocations: %v", - err) + iter.ctx.Logger().Named("job_anti_affinity").Error("failed retrieving proposed allocations", "error", err) continue } diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 6c1611afe52..9826751f319 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -2,11 +2,12 @@ package scheduler import ( "fmt" - "log" "time" "sort" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" @@ -38,7 +39,7 @@ type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job, type allocReconciler struct { // logger is used to log debug information. 
Logging should be kept at a // minimal here - logger *log.Logger + logger log.Logger // canInplace is used to check if the allocation can be inplace upgraded allocUpdateFn allocUpdateType @@ -155,11 +156,11 @@ func (r *reconcileResults) Changes() int { // NewAllocReconciler creates a new reconciler that should be used to determine // the changes required to bring the cluster state inline with the declared jobspec -func NewAllocReconciler(logger *log.Logger, allocUpdateFn allocUpdateType, batch bool, +func NewAllocReconciler(logger log.Logger, allocUpdateFn allocUpdateType, batch bool, jobID string, job *structs.Job, deployment *structs.Deployment, existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler { return &allocReconciler{ - logger: logger, + logger: logger.Named("reconciler"), allocUpdateFn: allocUpdateFn, batch: batch, jobID: jobID, diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index d096d195aab..28df0b17069 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -278,7 +278,7 @@ func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) { // existing allocations func TestReconciler_Place_NoExisting(t *testing.T) { job := mock.Job() - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, nil, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, nil, nil, "") r := reconciler.Compute() // Assert the correct results @@ -314,7 +314,7 @@ func TestReconciler_Place_Existing(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -352,7 +352,7 @@ func TestReconciler_ScaleDown_Partial(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -391,7 +391,7 @@ func TestReconciler_ScaleDown_Zero(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -431,7 +431,7 @@ func TestReconciler_ScaleDown_Zero_DuplicateNames(t *testing.T) { expectedStopped = append(expectedStopped, i%2) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -466,7 +466,7 @@ func TestReconciler_Inplace(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert 
the correct results @@ -504,7 +504,7 @@ func TestReconciler_Inplace_ScaleUp(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -544,7 +544,7 @@ func TestReconciler_Inplace_ScaleDown(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -581,7 +581,7 @@ func TestReconciler_Destructive(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -617,7 +617,7 @@ func TestReconciler_Destructive_ScaleUp(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -656,7 +656,7 @@ func TestReconciler_Destructive_ScaleDown(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -701,7 +701,7 @@ func TestReconciler_LostNode(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -751,7 +751,7 @@ func TestReconciler_LostNode_ScaleUp(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -801,7 +801,7 @@ func TestReconciler_LostNode_ScaleDown(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -847,7 +847,7 @@ func TestReconciler_DrainNode(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := 
reconciler.Compute() // Assert the correct results @@ -900,7 +900,7 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -954,7 +954,7 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -999,7 +999,7 @@ func TestReconciler_RemovedTG(t *testing.T) { newName := "different" job.TaskGroups[0].Name = newName - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -1061,7 +1061,7 @@ func TestReconciler_JobStopped(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, c.jobID, c.job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -1101,7 +1101,7 @@ func TestReconciler_MultiTG(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -1153,7 +1153,7 @@ func TestReconciler_MultiTG_SingleUpdateStanza(t *testing.T) { DesiredTotal: 10, } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -1229,7 +1229,7 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) { // Mark one as complete allocs[5].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate()) + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate()) r := reconciler.Compute() // Two reschedule attempts were already made, one more can be made at a future time @@ -1308,7 +1308,7 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) { FinishedAt: now.Add(10 * time.Second)}} } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate()) + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate()) r := reconciler.Compute() // Verify that two follow up evals were created @@ -1401,7 +1401,7 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) { // Mark one as complete 
allocs[5].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "") reconciler.now = now r := reconciler.Compute() @@ -1475,7 +1475,7 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, uuid.Generate()) + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, uuid.Generate()) r := reconciler.Compute() // Should place a new placement and create a follow up eval for the delayed reschedule @@ -1543,7 +1543,7 @@ func TestReconciler_Service_ClientStatusComplete(t *testing.T) { // Mark one as client status complete allocs[4].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Should place a new placement for the alloc that was marked complete @@ -1599,7 +1599,7 @@ func TestReconciler_Service_DesiredStop_ClientStatusComplete(t *testing.T) { allocs[4].ClientStatus = structs.AllocClientStatusFailed allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Should place a new placement for the alloc that was marked stopped @@ -1676,7 +1676,7 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Verify that no follow up evals were created @@ -1753,7 +1753,7 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) { FinishedAt: now.Add(-4 * time.Second)}} allocs[1].ClientStatus = structs.AllocClientStatusFailed - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") reconciler.now = now r := reconciler.Compute() @@ -1833,7 +1833,7 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) { allocs[1].ClientStatus = structs.AllocClientStatusFailed allocs[1].FollowupEvalID = evalID - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, evalID) + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, evalID) reconciler.now = now.Add(-30 * time.Second) r := reconciler.Compute() @@ -1941,7 +1941,7 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := 
NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") r := reconciler.Compute() // Verify that no follow up evals were created @@ -2063,7 +2063,7 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") reconciler.now = now r := reconciler.Compute() @@ -2189,7 +2189,7 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") reconciler.now = now r := reconciler.Compute() @@ -2253,7 +2253,7 @@ func TestReconciler_DontReschedule_PreviouslyRescheduled(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Should place 1 - one is a new placement to make up the desired count of 5 @@ -2340,7 +2340,7 @@ func TestReconciler_CancelDeployment_JobStop(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, c.jobID, c.job, c.deployment, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, c.deployment, allocs, nil, "") r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate @@ -2417,7 +2417,7 @@ func TestReconciler_CancelDeployment_JobUpdate(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, c.deployment, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, c.deployment, allocs, nil, "") r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate @@ -2466,7 +2466,7 @@ func TestReconciler_CreateDeployment_RollingUpgrade_Destructive(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() d := structs.NewDeployment(job) @@ -2509,7 +2509,7 @@ func TestReconciler_CreateDeployment_RollingUpgrade_Inplace(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() d := structs.NewDeployment(job) @@ -2551,7 +2551,7 @@ func TestReconciler_CreateDeployment_NewerCreateIndex(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := 
NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() d := structs.NewDeployment(job) @@ -2595,7 +2595,7 @@ func TestReconciler_DontCreateDeployment_NoChanges(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -2673,7 +2673,7 @@ func TestReconciler_PausedOrFailedDeployment_NoMoreCanaries(t *testing.T) { d.TaskGroups[canary.TaskGroup].PlacedCanaries = []string{canary.ID} mockUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{canary.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -2738,7 +2738,7 @@ func TestReconciler_PausedOrFailedDeployment_NoMorePlacements(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -2812,7 +2812,7 @@ func TestReconciler_PausedOrFailedDeployment_NoMoreDestructiveUpdates(t *testing allocs = append(allocs, newAlloc) mockUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{newAlloc.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -2884,7 +2884,7 @@ func TestReconciler_DrainNode_Canary(t *testing.T) { tainted[n.ID] = n mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -2956,7 +2956,7 @@ func TestReconciler_LostNode_Canary(t *testing.T) { tainted[n.ID] = n mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -3022,7 +3022,7 @@ func TestReconciler_StopOldCanaries(t *testing.T) { allocs = append(allocs, canary) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3075,7 +3075,7 @@ func 
TestReconciler_NewCanaries(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3123,7 +3123,7 @@ func TestReconciler_NewCanaries_CountGreater(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3174,7 +3174,7 @@ func TestReconciler_NewCanaries_MultiTG(t *testing.T) { } } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3227,7 +3227,7 @@ func TestReconciler_NewCanaries_ScaleUp(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3275,7 +3275,7 @@ func TestReconciler_NewCanaries_ScaleDown(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3352,7 +3352,7 @@ func TestReconciler_NewCanaries_FillNames(t *testing.T) { allocs = append(allocs, canary) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -3421,7 +3421,7 @@ func TestReconciler_PromoteCanaries_Unblock(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -3495,7 +3495,7 @@ func TestReconciler_PromoteCanaries_CanariesEqualCount(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -3594,7 +3594,7 @@ func TestReconciler_DeploymentLimit_HealthAccounting(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), 
mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -3675,7 +3675,7 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -3760,7 +3760,7 @@ func TestReconciler_FailedDeployment_TaintedNodes(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -3816,7 +3816,7 @@ func TestReconciler_CompleteDeployment(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -3871,7 +3871,7 @@ func TestReconciler_MarkDeploymentComplete_FailedAllocations(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -3966,7 +3966,7 @@ func TestReconciler_FailedDeployment_CancelCanaries(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -4035,7 +4035,7 @@ func TestReconciler_FailedDeployment_NewJob(t *testing.T) { jobNew := job.Copy() jobNew.Version += 100 - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, jobNew, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, jobNew, d, allocs, nil, "") r := reconciler.Compute() dnew := structs.NewDeployment(jobNew) @@ -4088,7 +4088,7 @@ func TestReconciler_MarkDeploymentComplete(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -4157,7 +4157,7 @@ func TestReconciler_JobChange_ScaleUp_SecondEval(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), 
mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -4192,7 +4192,7 @@ func TestReconciler_RollingUpgrade_MissingAllocs(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() d := structs.NewDeployment(job) @@ -4244,7 +4244,7 @@ func TestReconciler_Batch_Rerun(t *testing.T) { job2 := job.Copy() job2.CreateIndex++ - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, true, job2.ID, job2, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job2.ID, job2, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -4305,7 +4305,7 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) { StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert that no rescheduled placements were created @@ -4360,7 +4360,7 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) { allocs[i].DesiredTransition.Reschedule = helper.BoolToPtr(true) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert that no rescheduled placements were created @@ -4443,7 +4443,7 @@ func TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) { allocs = append(allocs, new) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, jobv2, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, jobv2, d, allocs, nil, "") r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -4505,7 +4505,7 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert that rescheduled placements were created @@ -4567,7 +4567,7 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) { // Mark DesiredTransition ForceReschedule allocs[0].DesiredTransition = structs.DesiredTransition{ForceReschedule: helper.BoolToPtr(true)} - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Verify that no follow up evals were created diff --git a/scheduler/scheduler.go b/scheduler/scheduler.go index df76e1b017b..b9f0f5e3aa7 100644 --- a/scheduler/scheduler.go +++ b/scheduler/scheduler.go @@ -2,7 +2,8 @@ package scheduler import ( "fmt" - "log" + 
+ log "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/nomad/state" @@ -27,7 +28,7 @@ var BuiltinSchedulers = map[string]Factory{ // NewScheduler is used to instantiate and return a new scheduler // given the scheduler name, initial state, and planner. -func NewScheduler(name string, logger *log.Logger, state State, planner Planner) (Scheduler, error) { +func NewScheduler(name string, logger log.Logger, state State, planner Planner) (Scheduler, error) { // Lookup the factory function factory, ok := BuiltinSchedulers[name] if !ok { @@ -40,7 +41,7 @@ func NewScheduler(name string, logger *log.Logger, state State, planner Planner) } // Factory is used to instantiate a new Scheduler -type Factory func(*log.Logger, State, Planner) Scheduler +type Factory func(log.Logger, State, Planner) Scheduler // Scheduler is the top level instance for a scheduler. A scheduler is // meant to only encapsulate business logic, pushing the various plumbing diff --git a/scheduler/spread.go b/scheduler/spread.go index f4e2b0c0fc5..39186c4fc09 100644 --- a/scheduler/spread.go +++ b/scheduler/spread.go @@ -127,7 +127,7 @@ func (iter *SpreadIterator) Next() *RankedNode { usedCount += 1 // Set score to -1 if there were errors in building this attribute if errorMsg != "" { - iter.ctx.Logger().Printf("[WARN] sched: error building spread attributes for task group %v:%v", tgName, errorMsg) + iter.ctx.Logger().Named("spread").Warn("error building spread attributes for task group", "task_group", tgName, "error", errorMsg) totalSpreadScore -= 1.0 continue } diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index 7a84bd8a9e1..53131e3b425 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -2,9 +2,10 @@ package scheduler import ( "fmt" - "log" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" ) @@ -19,7 +20,7 @@ const ( // SystemScheduler is used for 'system' jobs. This scheduler is // designed for services that should be run on every client. type SystemScheduler struct { - logger *log.Logger + logger log.Logger state State planner Planner @@ -41,9 +42,9 @@ type SystemScheduler struct { // NewSystemScheduler is a factory function to instantiate a new system // scheduler. 
-func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler { +func NewSystemScheduler(logger log.Logger, state State, planner Planner) Scheduler { return &SystemScheduler{ - logger: logger, + logger: logger.Named("system_sched"), state: state, planner: planner, } @@ -54,6 +55,9 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) error { // Store the evaluation s.eval = eval + // Update our logger with the eval's information + s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace) + // Verify the evaluation trigger reason is understood switch eval.TriggeredBy { case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, @@ -123,7 +127,7 @@ func (s *SystemScheduler) process() (bool, error) { // Compute the target job allocations if err := s.computeJobAllocs(); err != nil { - s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err) + s.logger.Error("failed to compute job allocations", "error", err) return false, err } @@ -138,10 +142,10 @@ func (s *SystemScheduler) process() (bool, error) { if s.limitReached && s.nextEval == nil { s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger) if err := s.planner.CreateEval(s.nextEval); err != nil { - s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err) + s.logger.Error("failed to make next eval for rolling update", "error", err) return false, err } - s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID) + s.logger.Debug("rolling update limit reached, next eval created", "next_eval_id", s.nextEval.ID) } // Submit the plan @@ -157,7 +161,7 @@ func (s *SystemScheduler) process() (bool, error) { // If we got a state refresh, try again since we have stale data if newState != nil { - s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval) + s.logger.Debug("refresh forced") s.state = newState return false, nil } @@ -165,8 +169,7 @@ func (s *SystemScheduler) process() (bool, error) { // Try again if the plan was not fully committed, potential conflict fullCommit, expected, actual := result.FullCommit(s.plan) if !fullCommit { - s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed", - s.eval, expected, actual) + s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual) return false, nil } @@ -201,7 +204,10 @@ func (s *SystemScheduler) computeJobAllocs() error { // Diff the required and existing allocations diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs) - s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff) + s.logger.Debug("reconciled current state with desired state", + "place", len(diff.place), "update", len(diff.update), + "migrate", len(diff.migrate), "stop", len(diff.stop), + "ignore", len(diff.ignore), "lost", len(diff.lost)) // Add all the allocs to stop for _, e := range diff.stop { diff --git a/scheduler/testing.go b/scheduler/testing.go index 47a6caaeb00..0410b1190d7 100644 --- a/scheduler/testing.go +++ b/scheduler/testing.go @@ -203,7 +203,7 @@ func (h *Harness) Snapshot() State { // Scheduler is used to return a new scheduler from // a snapshot of current state using the harness for planning. 
func (h *Harness) Scheduler(factory Factory) Scheduler { - logger := testlog.Logger(h.t) + logger := testlog.HCLogger(h.t) return factory(logger, h.Snapshot(), h) } diff --git a/scheduler/util.go b/scheduler/util.go index b08f81b3e80..19d2cdc3372 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -2,11 +2,12 @@ package scheduler import ( "fmt" - "log" "math/rand" "reflect" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/structs" ) @@ -431,12 +432,12 @@ func networkPortMap(n *structs.NetworkResource) map[string]int { } // setStatus is used to update the status of the evaluation -func setStatus(logger *log.Logger, planner Planner, +func setStatus(logger log.Logger, planner Planner, eval, nextEval, spawnedBlocked *structs.Evaluation, tgMetrics map[string]*structs.AllocMetric, status, desc string, queuedAllocs map[string]int, deploymentID string) error { - logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status) + logger.Debug("setting eval status", "status", status) newEval := eval.Copy() newEval.Status = status newEval.StatusDescription = desc @@ -495,8 +496,7 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, // Get the existing node node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID) if err != nil { - ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", - eval, update.Alloc.NodeID, err) + ctx.Logger().Error("failed to get node", "node_id", update.Alloc.NodeID, "error", err) continue } if node == nil { @@ -550,7 +550,7 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, } if len(updates) > 0 { - ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates)) + ctx.Logger().Debug("made in-place updates", "in_place", inplaceCount, "total_updates", len(updates)) } return updates[:n], updates[n:] } @@ -682,7 +682,7 @@ func desiredUpdates(diff *diffResult, inplaceUpdates, // adjustQueuedAllocations decrements the number of allocations pending per task // group based on the number of allocations successfully placed -func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) { +func adjustQueuedAllocations(logger log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) { if result == nil { return } @@ -703,7 +703,7 @@ func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, que if _, ok := queuedAllocs[allocation.TaskGroup]; ok { queuedAllocs[allocation.TaskGroup]-- } else { - logger.Printf("[ERR] sched: allocation %q placed but not in list of unplaced allocations", allocation.TaskGroup) + logger.Error("allocation placed but task group is not in list of unplaced allocations", "task_group", allocation.TaskGroup) } } } @@ -764,7 +764,7 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy ws := memdb.NewWatchSet() node, err := ctx.State().NodeByID(ws, existing.NodeID) if err != nil { - ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", evalID, existing.NodeID, err) + ctx.Logger().Error("failed to get node", "node_id", existing.NodeID, "error", err) return true, false, nil } if node == nil { diff --git a/scheduler/util_test.go b/scheduler/util_test.go index 3ce672ca7dc..5db5da49a83 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -622,7 +622,7 @@ func TestEvictAndPlace_LimitEqualToAllocs(t *testing.T) { func TestSetStatus(t *testing.T) { h := NewHarness(t) -
logger := testlog.Logger(t) + logger := testlog.HCLogger(t) eval := mock.Eval() status := "a" desc := "b" @@ -1079,7 +1079,7 @@ func TestDesiredUpdates(t *testing.T) { } func TestUtil_AdjustQueuedAllocations(t *testing.T) { - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) alloc1 := mock.Alloc() alloc2 := mock.Alloc() alloc2.CreateIndex = 4 From 32f9da9e07b4fd5111f8311afa09f2717e59dcfe Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Sat, 15 Sep 2018 16:42:38 -0700 Subject: [PATCH 3/3] small fixes --- .../taskrunner/task_runner_test.go | 6 +- command/agent/consul/client.go | 2 +- nomad/drainer/draining_node_test.go | 10 +-- nomad/drainer/watch_nodes_test.go | 11 +-- nomad/fsm.go | 73 +++++++++---------- nomad/leader.go | 4 +- nomad/server.go | 2 +- plugins/shared/loader/loader_test.go | 8 +- 8 files changed, 49 insertions(+), 67 deletions(-) diff --git a/client/allocrunner/taskrunner/task_runner_test.go b/client/allocrunner/taskrunner/task_runner_test.go index a0aa7c79d59..3503d3ee5e0 100644 --- a/client/allocrunner/taskrunner/task_runner_test.go +++ b/client/allocrunner/taskrunner/task_runner_test.go @@ -134,7 +134,7 @@ func testTaskRunnerFromAlloc(t *testing.T, restarts bool, alloc *structs.Allocat vclient := vaultclient.NewMockVaultClient() cclient := consul.NewMockAgent() - serviceClient := consul.NewServiceClient(cclient, logger, true) + serviceClient := consul.NewServiceClient(cclient, testlog.HCLogger(t), true) go serviceClient.Run() tr := NewTaskRunner(logger, conf, db, upd.Update, taskDir, alloc, task, vclient, serviceClient) if !restarts { @@ -633,7 +633,7 @@ func TestTaskRunner_UnregisterConsul_Retries(t *testing.T) { ctx := testTaskRunnerFromAlloc(t, true, alloc) // Use mockConsulServiceClient - consul := consulApi.NewMockConsulServiceClient(t) + consul := consulApi.NewMockConsulServiceClient(t, testlog.HCLogger(t)) ctx.tr.consul = consul ctx.tr.MarkReceived() @@ -1851,7 +1851,7 @@ func TestTaskRunner_CheckWatcher_Restart(t *testing.T) { // backed by a mock consul whose checks are always unhealthy. 
consulAgent := consul.NewMockAgent() consulAgent.SetStatus("critical") - consulClient := consul.NewServiceClient(consulAgent, ctx.tr.logger, true) + consulClient := consul.NewServiceClient(consulAgent, testlog.HCLogger(t), true) go consulClient.Run() defer consulClient.Shutdown() diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 884ca8668b6..30e3f9f6798 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -533,7 +533,7 @@ func (c *ServiceClient) sync() error { } } - c.logger.Debug("sync complete", "registed_services", sreg, "deregistered_services", sdereg, + c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg, "registered_checks", creg, "deregistered_checks", cdereg) return nil } diff --git a/nomad/drainer/draining_node_test.go b/nomad/drainer/draining_node_test.go index da94699fe25..1e3a951198b 100644 --- a/nomad/drainer/draining_node_test.go +++ b/nomad/drainer/draining_node_test.go @@ -4,7 +4,6 @@ import ( "testing" "time" - "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -15,14 +14,7 @@ import ( // testDrainingNode creates a *drainingNode with a 1h deadline but no allocs func testDrainingNode(t *testing.T) *drainingNode { t.Helper() - - sconfig := &state.StateStoreConfig{ - LogOutput: testlog.NewWriter(t), - Region: "global", - } - state, err := state.NewStateStore(sconfig) - require.Nil(t, err) - + state := state.TestStateStore(t) node := mock.Node() node.DrainStrategy = &structs.DrainStrategy{ DrainSpec: structs.DrainSpec{ diff --git a/nomad/drainer/watch_nodes_test.go b/nomad/drainer/watch_nodes_test.go index 43503e929eb..ca64e2dd134 100644 --- a/nomad/drainer/watch_nodes_test.go +++ b/nomad/drainer/watch_nodes_test.go @@ -16,16 +16,7 @@ import ( func testNodeDrainWatcher(t *testing.T) (*nodeDrainWatcher, *state.StateStore, *MockNodeTracker) { t.Helper() - - sconfig := &state.StateStoreConfig{ - LogOutput: testlog.NewWriter(t), - Region: "global", - } - state, err := state.NewStateStore(sconfig) - if err != nil { - t.Fatalf("failed to create state store: %v", err) - } - + state := state.TestStateStore(t) limiter := rate.NewLimiter(100.0, 100) logger := testlog.HCLogger(t) m := NewMockNodeTracker() diff --git a/nomad/fsm.go b/nomad/fsm.go index d174b0f0c81..14585429bb8 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -267,7 +267,7 @@ func (n *nomadFSM) applyUpsertNode(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "register_node"}, time.Now()) var req structs.NodeRegisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } // Handle upgrade paths @@ -291,7 +291,7 @@ func (n *nomadFSM) applyDeregisterNode(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_node"}, time.Now()) var req structs.NodeDeregisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteNode(index, req.NodeID); err != nil { @@ -305,7 +305,7 @@ func (n *nomadFSM) applyStatusUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "node_status_update"}, time.Now()) var req 
structs.NodeUpdateStatusRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateNodeStatus(index, req.NodeID, req.Status, req.NodeEvent); err != nil { @@ -333,7 +333,7 @@ func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "node_drain_update"}, time.Now()) var req structs.NodeUpdateDrainRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } // COMPAT Remove in version 0.10 @@ -361,7 +361,7 @@ func (n *nomadFSM) applyBatchDrainUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_node_drain_update"}, time.Now()) var req structs.BatchNodeUpdateDrainRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.BatchUpdateNodeDrain(index, req.Updates, req.NodeEvents); err != nil { @@ -375,7 +375,7 @@ func (n *nomadFSM) applyNodeEligibilityUpdate(buf []byte, index uint64) interfac defer metrics.MeasureSince([]string{"nomad", "fsm", "node_eligibility_update"}, time.Now()) var req structs.NodeUpdateEligibilityRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } // Lookup the existing node @@ -404,7 +404,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "register_job"}, time.Now()) var req structs.JobRegisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } /* Handle upgrade paths: @@ -426,7 +426,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { // tracking it. 
if err := n.periodicDispatcher.Add(req.Job); err != nil { n.logger.Error("periodicDispatcher.Add failed", "error", err) - return fmt.Errorf("failed adding job to periodic dispatcher", "error", err) + return fmt.Errorf("failed adding job to periodic dispatcher: %v", err) } // Create a watch set @@ -497,7 +497,7 @@ func (n *nomadFSM) applyDeregisterJob(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_job"}, time.Now()) var req structs.JobDeregisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.handleJobDeregister(index, req.JobID, req.Namespace, req.Purge); err != nil { @@ -512,7 +512,7 @@ func (n *nomadFSM) applyBatchDeregisterJob(buf []byte, index uint64) interface{} defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_deregister_job"}, time.Now()) var req structs.JobBatchDeregisterRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } for jobNS, options := range req.Jobs { @@ -572,7 +572,7 @@ func (n *nomadFSM) applyUpdateEval(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "update_eval"}, time.Now()) var req structs.EvalUpdateRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } return n.upsertEvals(index, req.Evals) } @@ -617,7 +617,7 @@ func (n *nomadFSM) applyDeleteEval(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "delete_eval"}, time.Now()) var req structs.EvalDeleteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteEval(index, req.Evals, req.Allocs); err != nil { @@ -631,7 +631,7 @@ func (n *nomadFSM) applyAllocUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update"}, time.Now()) var req structs.AllocUpdateRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } // Attach the job to all the allocations. 
It is pulled out in the @@ -675,7 +675,7 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_client_update"}, time.Now()) var req structs.AllocUpdateRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if len(req.Alloc) == 0 { return nil @@ -739,7 +739,7 @@ func (n *nomadFSM) applyAllocUpdateDesiredTransition(buf []byte, index uint64) i defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update_desired_transition"}, time.Now()) var req structs.AllocUpdateDesiredTransitionRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateAllocsDesiredTransitions(index, req.Allocs, req.Evals); err != nil { @@ -764,8 +764,7 @@ func (n *nomadFSM) applyUpsertNodeEvent(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_node_events"}, time.Now()) var req structs.EmitNodeEventsRequest if err := structs.Decode(buf, &req); err != nil { - n.logger.Error("failed to decode EmitNodeEventsRequest", "error", err) - return err + panic(fmt.Errorf("failed to decode EmitNodeEventsRequest: %v", err)) } if err := n.state.UpsertNodeEvents(index, req.NodeEvents); err != nil { @@ -782,7 +781,7 @@ func (n *nomadFSM) applyUpsertVaultAccessor(buf []byte, index uint64) interface{ defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_vault_accessor"}, time.Now()) var req structs.VaultAccessorsRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertVaultAccessor(index, req.Accessors); err != nil { @@ -798,7 +797,7 @@ func (n *nomadFSM) applyDeregisterVaultAccessor(buf []byte, index uint64) interf defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_vault_accessor"}, time.Now()) var req structs.VaultAccessorsRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteVaultAccessors(index, req.Accessors); err != nil { @@ -814,7 +813,7 @@ func (n *nomadFSM) applyPlanResults(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_plan_results"}, time.Now()) var req structs.ApplyPlanResultsRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertPlanResults(index, &req); err != nil { @@ -831,7 +830,7 @@ func (n *nomadFSM) applyDeploymentStatusUpdate(buf []byte, index uint64) interfa defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_status_update"}, time.Now()) var req structs.DeploymentStatusUpdateRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateDeploymentStatus(index, &req); err != nil { @@ -848,7 +847,7 @@ func (n *nomadFSM) applyDeploymentPromotion(buf []byte, index uint64) interface{ defer metrics.MeasureSince([]string{"nomad", "fsm", 
"apply_deployment_promotion"}, time.Now()) var req structs.ApplyDeploymentPromoteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateDeploymentPromotion(index, &req); err != nil { @@ -866,7 +865,7 @@ func (n *nomadFSM) applyDeploymentAllocHealth(buf []byte, index uint64) interfac defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_alloc_health"}, time.Now()) var req structs.ApplyDeploymentAllocHealthRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateDeploymentAllocHealth(index, &req); err != nil { @@ -883,7 +882,7 @@ func (n *nomadFSM) applyDeploymentDelete(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_delete"}, time.Now()) var req structs.DeploymentDeleteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteDeployment(index, req.Deployments); err != nil { @@ -899,7 +898,7 @@ func (n *nomadFSM) applyJobStability(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_job_stability"}, time.Now()) var req structs.JobStabilityRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateJobStability(index, req.Namespace, req.JobID, req.JobVersion, req.Stable); err != nil { @@ -915,7 +914,7 @@ func (n *nomadFSM) applyACLPolicyUpsert(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_policy_upsert"}, time.Now()) var req structs.ACLPolicyUpsertRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertACLPolicies(index, req.Policies); err != nil { @@ -930,7 +929,7 @@ func (n *nomadFSM) applyACLPolicyDelete(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_policy_delete"}, time.Now()) var req structs.ACLPolicyDeleteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteACLPolicies(index, req.Names); err != nil { @@ -945,7 +944,7 @@ func (n *nomadFSM) applyACLTokenUpsert(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_upsert"}, time.Now()) var req structs.ACLTokenUpsertRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertACLTokens(index, req.Tokens); err != nil { @@ -960,7 +959,7 @@ func (n *nomadFSM) applyACLTokenDelete(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_delete"}, time.Now()) var req structs.ACLTokenDeleteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) 
+ panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteACLTokens(index, req.AccessorIDs); err != nil { @@ -975,7 +974,7 @@ func (n *nomadFSM) applyACLTokenBootstrap(buf []byte, index uint64) interface{} defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_bootstrap"}, time.Now()) var req structs.ACLTokenBootstrapRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.BootstrapACLTokens(index, req.ResetIndex, req.Token); err != nil { @@ -988,7 +987,7 @@ func (n *nomadFSM) applyACLTokenBootstrap(buf []byte, index uint64) interface{} func (n *nomadFSM) applyAutopilotUpdate(buf []byte, index uint64) interface{} { var req structs.AutopilotSetConfigRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request", "error", err)) + panic(fmt.Errorf("failed to decode request: %v", err)) } defer metrics.MeasureSince([]string{"nomad", "fsm", "autopilot"}, time.Now()) @@ -1061,7 +1060,7 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { switch snapType { case TimeTableSnapshot: if err := n.timetable.Deserialize(dec); err != nil { - return fmt.Errorf("time table deserialize failed", "error", err) + return fmt.Errorf("time table deserialize failed: %v", err) } case NodeSnapshot: @@ -1246,7 +1245,7 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { // remove this block. index, err := newState.Index("job_summary") if err != nil { - return fmt.Errorf("couldn't fetch index of job summary table", "error", err) + return fmt.Errorf("couldn't fetch index of job summary table: %v", err) } // If the index is 0 that means there is no job summary in the snapshot so @@ -1258,7 +1257,7 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { return fmt.Errorf("unable to query latest index: %v", index) } if err := newState.ReconcileJobSummaries(latestIndex); err != nil { - return fmt.Errorf("error reconciling summaries", "error", err) + return fmt.Errorf("error reconciling summaries: %v", err) } } @@ -1291,12 +1290,12 @@ func (n *nomadFSM) failLeakedDeployments(state *state.StateStore) error { // and thus the older deployment leaks and then the job is removed. 
iter, err := state.Deployments(nil) if err != nil { - return fmt.Errorf("failed to query deployments", "error", err) + return fmt.Errorf("failed to query deployments: %v", err) } dindex, err := state.Index("deployment") if err != nil { - return fmt.Errorf("couldn't fetch index of deployments table", "error", err) + return fmt.Errorf("couldn't fetch index of deployments table: %v", err) } for { @@ -1348,7 +1347,7 @@ func (n *nomadFSM) reconcileQueuedAllocations(index uint64) error { snap, err := n.state.Snapshot() if err != nil { - return fmt.Errorf("unable to create snapshot", "error", err) + return fmt.Errorf("unable to create snapshot: %v", err) } // Invoking the scheduler for every job so that we can populate the number diff --git a/nomad/leader.go b/nomad/leader.go index 0ea07c8443e..ef0e622cf9b 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -931,7 +931,7 @@ func (s *Server) replicateACLPolicies(stopCh chan struct{}) { }, } limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) - s.logger.Debug("starting ACL policy replication from authoritative region", "authorative_region", req.Region) + s.logger.Debug("starting ACL policy replication from authoritative region", "authoritative_region", req.Region) START: for { @@ -1073,7 +1073,7 @@ func (s *Server) replicateACLTokens(stopCh chan struct{}) { }, } limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) - s.logger.Debug("starting ACL token replication from authoritative region", "authorative_region", req.Region) + s.logger.Debug("starting ACL token replication from authoritative region", "authoritative_region", req.Region) START: for { diff --git a/nomad/server.go b/nomad/server.go index f22137ac755..af10e14ec87 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -365,7 +365,7 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI) (*Server, error) // Setup the Consul syncer if err := s.setupConsulSyncer(); err != nil { s.logger.Error("failed to create server consul syncer", "error", err) - return nil, fmt.Errorf("failed to create server Consul syncer: %v", "error", err) + return nil, fmt.Errorf("failed to create server Consul syncer: %v", err) } // Setup the deployment watcher. 
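The nomad/fsm.go hunks above all fix one bug class: hclog-style key/value arguments were being passed to fmt.Errorf, whose format string has no verbs for them, so Go appends %!(EXTRA ...) noise to the message instead of including the wrapped error. A minimal standalone illustration of the broken and corrected patterns (not part of the patch):

package main

import (
	"errors"
	"fmt"
)

func main() {
	err := errors.New("unexpected EOF")

	// Broken pattern removed above: extra arguments with no matching verbs.
	// Prints: failed to decode request%!(EXTRA string=error, *errors.errorString=unexpected EOF)
	fmt.Println(fmt.Errorf("failed to decode request", "error", err))

	// Corrected pattern: the underlying error is formatted into the message.
	// Prints: failed to decode request: unexpected EOF
	fmt.Println(fmt.Errorf("failed to decode request: %v", err))
}

go vet flags the broken form ("Errorf call has arguments but no formatting directives"), which is likely how these calls were caught after the structured-logging conversion.
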
diff --git a/plugins/shared/loader/loader_test.go b/plugins/shared/loader/loader_test.go index 4df18e1fb9b..0b8db21c2eb 100644 --- a/plugins/shared/loader/loader_test.go +++ b/plugins/shared/loader/loader_test.go @@ -236,7 +236,7 @@ func TestPluginLoader_External_Config_Bad(t *testing.T) { Config: map[string]interface{}{ "foo": "1", "bar": "2", - "non-existant": "3", + "non-existent": "3", }, }, }, @@ -244,7 +244,7 @@ func TestPluginLoader_External_Config_Bad(t *testing.T) { _, err := NewPluginLoader(lconfig) require.Error(err) - require.Contains(err.Error(), "No argument or block type is named \"non-existant\"") + require.Contains(err.Error(), "No argument or block type is named \"non-existent\"") } func TestPluginLoader_External_VersionOverlap(t *testing.T) { @@ -452,7 +452,7 @@ func TestPluginLoader_Internal_Config_Bad(t *testing.T) { Config: map[string]interface{}{ "foo": "1", "bar": "2", - "non-existant": "3", + "non-existent": "3", }, }, }, @@ -460,7 +460,7 @@ func TestPluginLoader_Internal_Config_Bad(t *testing.T) { _, err := NewPluginLoader(lconfig) require.Error(err) - require.Contains(err.Error(), "No argument or block type is named \"non-existant\"") + require.Contains(err.Error(), "No argument or block type is named \"non-existent\"") } func TestPluginLoader_InternalOverrideExternal(t *testing.T) {
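
The series as a whole moves test call sites from testlog.Logger(t) to testlog.HCLogger(t). The real helper lives in helper/testlog/testlog.go (touched in patch 1/3); as a rough sketch of the idea only, assuming nothing beyond the public go-hclog API (level choice and naming may differ from the shipped helper), it routes hclog output through the test's own logger so lines are captured per test:

package testlog

import (
	"testing"

	hclog "github.com/hashicorp/go-hclog"
)

// writer adapts *testing.T to io.Writer so hclog output lands in `go test` logs.
type writer struct{ t *testing.T }

func (w *writer) Write(p []byte) (n int, err error) {
	w.t.Logf("%s", p)
	return len(p), nil
}

// HCLogger returns an hclog.Logger wired to the given test.
func HCLogger(t *testing.T) hclog.Logger {
	return hclog.New(&hclog.LoggerOptions{
		Level:  hclog.Trace,
		Output: &writer{t: t},
	})
}

Anything that accepts an hclog.Logger can then be exercised in tests, e.g. NewAllocReconciler(testlog.HCLogger(t), ...) in the reconciler tests above.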