Skip to content

Commit

Permalink
ui: Compute longer Histogram Windows
Browse files Browse the repository at this point in the history
Adds a derived configuration value "HistogramWindowDuration" to the server
config, which specifies the approximate duration for which individual samples
are retained in our sliding window histograms.

Previously, histogram windows were initialized using the MetricSampleWindow
value. However, this resulted in cockroachdb#12998; this is caused because the histogram
window is rotated when querying a histogram, not when recording a
sample. In many cases, the timing is such that all samples recorded are
discarded right before the histogram is queried, resulting in a zero value.

Because of this, and additionally issue cockroachdb#12835, we are lengthening the histogram
window duration for all histogram metrics. This includes all existing histograms
that were using MetricSampleWindow, and a few histograms which were using
constant values of 1 minute. The new value is arbitrarily chosen to be six times
the MetricSampleWindow configuration setting; under default settings, that
equates to a window of one minute.

The work here can be considered a temporary fix to alleviate the problems in a
couple of issues; the histogram system is going to be greatly overhauled in
issue cockroachdb#7896, part of which will be the removal of sliding-window histograms.

Fixes cockroachdb#12835
Fixes cockroachdb#12998
  • Loading branch information
Matt Tracy committed Feb 3, 2017
1 parent c10e3e8 commit c15bfe6
Show file tree
Hide file tree
Showing 13 changed files with 89 additions and 46 deletions.
8 changes: 4 additions & 4 deletions pkg/kv/txn_coord_sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,15 @@ var (
)

// MakeTxnMetrics returns a TxnMetrics struct that contains metrics whose
// windowed portions retain data for approximately sampleInterval.
func MakeTxnMetrics(sampleInterval time.Duration) TxnMetrics {
// windowed portions retain data for approximately histogramWindow.
func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics {
return TxnMetrics{
Aborts: metric.NewCounterWithRates(metaAbortsRates),
Commits: metric.NewCounterWithRates(metaCommitsRates),
Commits1PC: metric.NewCounterWithRates(metaCommits1PCRates),
Abandons: metric.NewCounterWithRates(metaAbandonsRates),
Durations: metric.NewLatency(metaDurationsHistograms, sampleInterval),
Restarts: metric.NewHistogram(metaRestartsHistogram, sampleInterval, 100, 3),
Durations: metric.NewLatency(metaDurationsHistograms, histogramWindow),
Restarts: metric.NewHistogram(metaRestartsHistogram, histogramWindow, 100, 3),
}
}

Expand Down
27 changes: 27 additions & 0 deletions pkg/server/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,33 @@ type Config struct {
enginesCreated bool
}

// HistogramWindowInterval is used to determine the approximate length of time
// that individual samples are retained in in-memory histograms. Currently,
// it is set to the arbitrary length of six times the Metrics sample interval.
//
// The length of the window must be longer than the sampling interval due to
// issue #12998, which was causing histograms to return zero values when sampled
// because all samples had been evicted.
//
// Note that this is only intended to be a temporary fix for the above issue,
// as our current handling of metric histograms have numerous additional
// problems. These are tracked in github issue #7896, which has been given
// a relatively high priority in light of recent confusion around histogram
// metrics. For more information on the issues underlying our histogram system
// and the proposed fixes, please see issue #7896.
func (cfg Config) HistogramWindowInterval() time.Duration {
hwi := cfg.MetricsSampleInterval * 6

// Rudimentary overflow detection; this can result if MetricsSampleInterval
// is set to an extremely large number, likely in the context of a test or
// an intentional attempt to disable metrics collection. Just return
// cfg.MetricsSampleInterval in this case.
if hwi < cfg.MetricsSampleInterval {
return cfg.MetricsSampleInterval
}
return hwi
}

// GetTotalMemory returns either the total system memory or if possible the
// cgroups available memory.
func GetTotalMemory() (int64, error) {
Expand Down
7 changes: 4 additions & 3 deletions pkg/server/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@ type nodeMetrics struct {
Err *metric.Counter
}

func makeNodeMetrics(reg *metric.Registry, sampleInterval time.Duration) nodeMetrics {
func makeNodeMetrics(reg *metric.Registry, histogramWindow time.Duration) nodeMetrics {
nm := nodeMetrics{
Latency: metric.NewLatency(metaExecLatency, sampleInterval),
Latency: metric.NewLatency(metaExecLatency, histogramWindow),
Success: metric.NewCounter(metaExecSuccess),
Err: metric.NewCounter(metaExecError),
}
Expand Down Expand Up @@ -201,6 +201,7 @@ func bootstrapCluster(
cfg.TestingKnobs = storage.StoreTestingKnobs{}
cfg.ScanInterval = 10 * time.Minute
cfg.MetricsSampleInterval = time.Duration(math.MaxInt64)
cfg.HistogramWindowInterval = time.Duration(math.MaxInt64)
cfg.ConsistencyCheckInterval = 10 * time.Minute
cfg.AmbientCtx.Tracer = tracing.NewTracer()
// Create a KV DB with a local sender.
Expand Down Expand Up @@ -272,7 +273,7 @@ func NewNode(
storeCfg: cfg,
stopper: stopper,
recorder: recorder,
metrics: makeNodeMetrics(reg, cfg.MetricsSampleInterval),
metrics: makeNodeMetrics(reg, cfg.HistogramWindowInterval),
stores: storage.NewStores(cfg.AmbientCtx, cfg.Clock),
txnMetrics: txnMetrics,
eventLogger: eventLogger,
Expand Down
1 change: 1 addition & 0 deletions pkg/server/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ func createTestNode(
cfg.DB = client.NewDB(sender)
cfg.Transport = storage.NewDummyRaftTransport()
cfg.MetricsSampleInterval = metric.TestSampleInterval
cfg.HistogramWindowInterval = metric.TestSampleInterval
active, renewal := storage.NodeLivenessDurations(
storage.RaftElectionTimeout(cfg.RaftTickInterval, cfg.RaftElectionTimeoutTicks))
cfg.NodeLiveness = storage.NewNodeLiveness(
Expand Down
34 changes: 20 additions & 14 deletions pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
s.distSender = kv.NewDistSender(distSenderCfg, s.gossip)
s.registry.AddMetricStruct(s.distSender.Metrics())

txnMetrics := kv.MakeTxnMetrics(s.cfg.MetricsSampleInterval)
txnMetrics := kv.MakeTxnMetrics(s.cfg.HistogramWindowInterval())
s.registry.AddMetricStruct(txnMetrics)
s.txnCoordSender = kv.NewTxnCoordSender(
s.cfg.AmbientCtx,
Expand Down Expand Up @@ -236,7 +236,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
roachpb.RegisterExternalServer(s.grpc, s.kvDB)

// Set up internal memory metrics for use by internal SQL executors.
s.internalMemMetrics = sql.MakeMemMetrics("internal")
s.internalMemMetrics = sql.MakeMemMetrics("internal", cfg.HistogramWindowInterval())
s.registry.AddMetricStruct(s.internalMemMetrics)

// Set up Lease Manager
Expand All @@ -259,21 +259,21 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
distsqlrun.RegisterDistSQLServer(s.grpc, s.distSQLServer)

// Set up admin memory metrics for use by admin SQL executors.
s.adminMemMetrics = sql.MakeMemMetrics("admin")
s.adminMemMetrics = sql.MakeMemMetrics("admin", cfg.HistogramWindowInterval())
s.registry.AddMetricStruct(s.adminMemMetrics)

// Set up Executor
execCfg := sql.ExecutorConfig{
AmbientCtx: s.cfg.AmbientCtx,
NodeID: &s.nodeIDContainer,
DB: s.db,
Gossip: s.gossip,
DistSender: s.distSender,
RPCContext: s.rpcContext,
LeaseManager: s.leaseMgr,
Clock: s.clock,
DistSQLSrv: s.distSQLServer,
MetricsSampleInterval: s.cfg.MetricsSampleInterval,
AmbientCtx: s.cfg.AmbientCtx,
NodeID: &s.nodeIDContainer,
DB: s.db,
Gossip: s.gossip,
DistSender: s.distSender,
RPCContext: s.rpcContext,
LeaseManager: s.leaseMgr,
Clock: s.clock,
DistSQLSrv: s.distSQLServer,
HistogramWindowInterval: s.cfg.HistogramWindowInterval(),
}
if s.cfg.TestingKnobs.SQLExecutor != nil {
execCfg.TestingKnobs = s.cfg.TestingKnobs.SQLExecutor.(*sql.ExecutorTestingKnobs)
Expand All @@ -290,7 +290,12 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
s.registry.AddMetricStruct(s.sqlExecutor)

s.pgServer = pgwire.MakeServer(
s.cfg.AmbientCtx, s.cfg.Config, s.sqlExecutor, &s.internalMemMetrics, s.cfg.SQLMemoryPoolSize,
s.cfg.AmbientCtx,
s.cfg.Config,
s.sqlExecutor,
&s.internalMemMetrics,
s.cfg.SQLMemoryPoolSize,
s.cfg.HistogramWindowInterval(),
)
s.registry.AddMetricStruct(s.pgServer.Metrics())

Expand All @@ -312,6 +317,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
ConsistencyCheckInterval: s.cfg.ConsistencyCheckInterval,
ConsistencyCheckPanicOnFailure: s.cfg.ConsistencyCheckPanicOnFailure,
MetricsSampleInterval: s.cfg.MetricsSampleInterval,
HistogramWindowInterval: s.cfg.HistogramWindowInterval(),
StorePool: s.storePool,
SQLExecutor: sql.InternalExecutor{
LeaseManager: s.leaseMgr,
Expand Down
6 changes: 3 additions & 3 deletions pkg/sql/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,8 @@ type ExecutorConfig struct {

TestingKnobs *ExecutorTestingKnobs
SchemaChangerTestingKnobs *SchemaChangerTestingKnobs
// MetricsSampleInterval is (server.Context).MetricsSampleInterval.
MetricsSampleInterval time.Duration
// HistogramWindowInterval is (server.Context).HistogramWindowInterval.
HistogramWindowInterval time.Duration
}

var _ base.ModuleTestingKnobs = &ExecutorTestingKnobs{}
Expand Down Expand Up @@ -308,7 +308,7 @@ func NewExecutor(cfg ExecutorConfig, stopper *stop.Stopper) *Executor {
stopper: stopper,
reCache: parser.NewRegexpCache(512),

Latency: metric.NewLatency(MetaLatency, cfg.MetricsSampleInterval),
Latency: metric.NewLatency(MetaLatency, cfg.HistogramWindowInterval),
TxnBeginCount: metric.NewCounter(MetaTxnBegin),
TxnCommitCount: metric.NewCounter(MetaTxnCommit),
TxnAbortCount: metric.NewCounter(MetaTxnAbort),
Expand Down
8 changes: 4 additions & 4 deletions pkg/sql/mem_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ var _ metric.Struct = MemoryMetrics{}
const log10int64times1000 = 19 * 1000

// MakeMemMetrics instantiates the metric objects for an SQL endpoint.
func MakeMemMetrics(endpoint string) MemoryMetrics {
func MakeMemMetrics(endpoint string, histogramWindow time.Duration) MemoryMetrics {
prefix := "sql.mon." + endpoint
MetaMemMaxBytes := metric.Metadata{Name: prefix + ".max"}
MetaMemCurBytes := metric.Metadata{Name: prefix + ".cur"}
Expand All @@ -67,11 +67,11 @@ func MakeMemMetrics(endpoint string) MemoryMetrics {
MetaMemMaxSessionBytes := metric.Metadata{Name: prefix + ".session.max"}
MetaMemSessionCurBytes := metric.Metadata{Name: prefix + ".session.cur"}
return MemoryMetrics{
MaxBytesHist: metric.NewHistogram(MetaMemMaxBytes, time.Minute, log10int64times1000, 3),
MaxBytesHist: metric.NewHistogram(MetaMemMaxBytes, histogramWindow, log10int64times1000, 3),
CurBytesCount: metric.NewCounter(MetaMemCurBytes),
TxnMaxBytesHist: metric.NewHistogram(MetaMemMaxTxnBytes, time.Minute, log10int64times1000, 3),
TxnMaxBytesHist: metric.NewHistogram(MetaMemMaxTxnBytes, histogramWindow, log10int64times1000, 3),
TxnCurBytesCount: metric.NewCounter(MetaMemTxnCurBytes),
SessionMaxBytesHist: metric.NewHistogram(MetaMemMaxSessionBytes, time.Minute, log10int64times1000, 3),
SessionMaxBytesHist: metric.NewHistogram(MetaMemMaxSessionBytes, histogramWindow, log10int64times1000, 3),
SessionCurBytesCount: metric.NewCounter(MetaMemSessionCurBytes),
}
}
11 changes: 7 additions & 4 deletions pkg/sql/pgwire/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,15 @@ type ServerMetrics struct {
internalMemMetrics *sql.MemoryMetrics
}

func makeServerMetrics(internalMemMetrics *sql.MemoryMetrics) ServerMetrics {
func makeServerMetrics(
internalMemMetrics *sql.MemoryMetrics, histogramWindow time.Duration,
) ServerMetrics {
return ServerMetrics{
Conns: metric.NewCounter(MetaConns),
BytesInCount: metric.NewCounter(MetaBytesIn),
BytesOutCount: metric.NewCounter(MetaBytesOut),
ConnMemMetrics: sql.MakeMemMetrics("conns"),
SQLMemMetrics: sql.MakeMemMetrics("client"),
ConnMemMetrics: sql.MakeMemMetrics("conns", histogramWindow),
SQLMemMetrics: sql.MakeMemMetrics("client", histogramWindow),
internalMemMetrics: internalMemMetrics,
}
}
Expand All @@ -131,12 +133,13 @@ func MakeServer(
executor *sql.Executor,
internalMemMetrics *sql.MemoryMetrics,
maxSQLMem int64,
histogramWindow time.Duration,
) *Server {
server := &Server{
AmbientCtx: ambientCtx,
cfg: cfg,
executor: executor,
metrics: makeServerMetrics(internalMemMetrics),
metrics: makeServerMetrics(internalMemMetrics, histogramWindow),
}
server.sqlMemoryPool = mon.MakeMonitor("sql",
server.metrics.SQLMemMetrics.CurBytesCount,
Expand Down
6 changes: 3 additions & 3 deletions pkg/sql/pgwire/v3_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ import (
)

func makeTestV3Conn(c net.Conn) v3Conn {
metrics := makeServerMetrics(nil)
metrics := makeServerMetrics(nil, metric.TestSampleInterval)
mon := mon.MakeUnlimitedMonitor(context.Background(), "test", nil, nil, 1000)
exec := sql.NewExecutor(
sql.ExecutorConfig{
AmbientCtx: log.AmbientContext{Tracer: tracing.NewTracer()},
MetricsSampleInterval: metric.TestSampleInterval,
AmbientCtx: log.AmbientContext{Tracer: tracing.NewTracer()},
HistogramWindowInterval: metric.TestSampleInterval,
},
nil, /* stopper */
)
Expand Down
12 changes: 6 additions & 6 deletions pkg/storage/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ type StoreMetrics struct {
}
}

func newStoreMetrics(sampleInterval time.Duration) *StoreMetrics {
func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
storeRegistry := metric.NewRegistry()
sm := &StoreMetrics{
registry: storeRegistry,
Expand Down Expand Up @@ -683,23 +683,23 @@ func newStoreMetrics(sampleInterval time.Duration) *StoreMetrics {
// gives sane (though mathematically nonsensical) results when exposed
// at the moment.
MuReplicaNanos: metric.NewHistogram(
metaMuReplicaNanos, sampleInterval,
metaMuReplicaNanos, histogramWindow,
time.Second.Nanoseconds(), 1,
),
MuCommandQueueNanos: metric.NewHistogram(
metaMuCommandQueueNanos, sampleInterval,
metaMuCommandQueueNanos, histogramWindow,
time.Second.Nanoseconds(), 1,
),
MuRaftNanos: metric.NewHistogram(
metaMuRaftNanos, sampleInterval,
metaMuRaftNanos, histogramWindow,
time.Second.Nanoseconds(), 1,
),
MuStoreNanos: metric.NewHistogram(
metaMuStoreNanos, sampleInterval,
metaMuStoreNanos, histogramWindow,
time.Second.Nanoseconds(), 1,
),
MuSchedulerNanos: metric.NewHistogram(
metaMuSchedulerNanos, time.Minute,
metaMuSchedulerNanos, histogramWindow,
time.Second.Nanoseconds(), 1,
),

Expand Down
8 changes: 6 additions & 2 deletions pkg/storage/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ func TestStoreConfig(clock *hlc.Clock) StoreConfig {
ScanInterval: 10 * time.Minute,
ConsistencyCheckInterval: 10 * time.Minute,
ConsistencyCheckPanicOnFailure: true,
MetricsSampleInterval: time.Hour,
MetricsSampleInterval: metric.TestSampleInterval,
HistogramWindowInterval: metric.TestSampleInterval,
EnableCoalescedHeartbeats: true,
EnableEpochRangeLeases: true,
}
Expand Down Expand Up @@ -661,6 +662,9 @@ type StoreConfig struct {
// MetricsSampleInterval is (server.Context).MetricsSampleInterval
MetricsSampleInterval time.Duration

// HistogramWindowInterval is (server.Context).HistogramWindowInterval
HistogramWindowInterval time.Duration

// EnableCoalescedHeartbeats controls whether heartbeats are coalesced.
EnableCoalescedHeartbeats bool

Expand Down Expand Up @@ -840,7 +844,7 @@ func NewStore(cfg StoreConfig, eng engine.Engine, nodeDesc *roachpb.NodeDescript
engine: eng,
allocator: MakeAllocator(cfg.StorePool),
nodeDesc: nodeDesc,
metrics: newStoreMetrics(cfg.MetricsSampleInterval),
metrics: newStoreMetrics(cfg.HistogramWindowInterval),
}
// EnableCoalescedHeartbeats is enabled by TestStoreConfig, so in that case
// ignore the environment variable. Otherwise, use whatever the environment
Expand Down
1 change: 1 addition & 0 deletions pkg/testutils/localtestcluster/local_test_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ func (ltc *LocalTestCluster) Start(t testing.TB, baseCtx *base.Config, initSende
)
cfg.Transport = transport
cfg.MetricsSampleInterval = metric.TestSampleInterval
cfg.HistogramWindowInterval = metric.TestSampleInterval
ltc.Store = storage.NewStore(cfg, ltc.Eng, nodeDesc)
if err := ltc.Store.Bootstrap(roachpb.StoreIdent{NodeID: nodeID, StoreID: 1}); err != nil {
t.Fatalf("unable to start local test cluster: %s", err)
Expand Down
6 changes: 3 additions & 3 deletions pkg/util/metric/metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,10 +187,10 @@ func NewHistogram(metadata Metadata, duration time.Duration, maxVal int64, sigFi
// with one digit of precision (i.e. errors of <10ms at 100ms, <6s at 60s).
//
// The windowed portion of the Histogram retains values for approximately
// sampleDuration.
func NewLatency(metadata Metadata, sampleDuration time.Duration) *Histogram {
// histogramWindow.
func NewLatency(metadata Metadata, histogramWindow time.Duration) *Histogram {
return NewHistogram(
metadata, sampleDuration, MaxLatency.Nanoseconds(), 1,
metadata, histogramWindow, MaxLatency.Nanoseconds(), 1,
)
}

Expand Down

0 comments on commit c15bfe6

Please sign in to comment.