diff --git a/Makefile b/Makefile index e7ef845c988d..96cede686950 100644 --- a/Makefile +++ b/Makefile @@ -1211,7 +1211,7 @@ ARCHIVE_EXTRAS = \ $(BUILDINFO) \ $(SQLPARSER_TARGETS) \ $(OPTGEN_TARGETS) \ - pkg/ui/assets.ccl.installed pkg/ui/assets.oss.installed + pkg/ui/distccl/assets pkg/ui/distoss/assets pkg/ui/dist_vendor # TODO(benesch): Make this recipe use `git ls-files --recurse-submodules` # instead of scripts/ls-files.sh once Git v2.11 is widely deployed. @@ -1442,7 +1442,7 @@ ui-watch ui-watch-secure: $(UI_CCL_DLLS) pkg/ui/yarn.opt.installed ui-clean: ## Remove build artifacts. find pkg/ui/distccl/assets pkg/ui/distoss/assets -mindepth 1 -not -name .gitkeep -delete rm -rf pkg/ui/assets.ccl.installed pkg/ui/assets.oss.installed - rm -rf pkg/ui/dist/* + rm -rf pkg/ui/dist_vendor/* rm -f $(UI_PROTOS_CCL) $(UI_PROTOS_OSS) rm -f pkg/ui/workspaces/db-console/*manifest.json rm -rf pkg/ui/workspaces/cluster-ui/dist diff --git a/pkg/server/debug/BUILD.bazel b/pkg/server/debug/BUILD.bazel index 6f731cd5e7a5..38120156700d 100644 --- a/pkg/server/debug/BUILD.bazel +++ b/pkg/server/debug/BUILD.bazel @@ -29,6 +29,8 @@ go_library( "//pkg/util/log/severity", "//pkg/util/stop", "//pkg/util/timeutil", + "//pkg/util/tracing", + "//pkg/util/tracing/tracingui", "//pkg/util/uint128", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_pebble//tool", diff --git a/pkg/server/debug/server.go b/pkg/server/debug/server.go index 788b99b47b3c..15cdef61d07d 100644 --- a/pkg/server/debug/server.go +++ b/pkg/server/debug/server.go @@ -32,6 +32,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingui" "github.com/cockroachdb/errors" pebbletool "github.com/cockroachdb/pebble/tool" metrics "github.com/rcrowley/go-metrics" @@ -61,14 +63,18 @@ var _ = func() *settings.StringSetting { // Server serves the /debug/* family of tools. type Server struct { - st *cluster.Settings - mux *http.ServeMux - spy logSpy + ambientCtx log.AmbientContext + st *cluster.Settings + mux *http.ServeMux + spy logSpy } // NewServer sets up a debug server. func NewServer( - st *cluster.Settings, hbaConfDebugFn http.HandlerFunc, profiler pprofui.Profiler, + ambientContext log.AmbientContext, + st *cluster.Settings, + hbaConfDebugFn http.HandlerFunc, + profiler pprofui.Profiler, ) *Server { mux := http.NewServeMux() @@ -137,9 +143,10 @@ func NewServer( }) return &Server{ - st: st, - mux: mux, - spy: spy, + ambientCtx: ambientContext, + st: st, + mux: mux, + spy: spy, } } @@ -232,6 +239,12 @@ func (ds *Server) RegisterClosedTimestampSideTransport( }) } +// RegisterTracez registers the /debug/tracez handler, which renders snapshots +// of active spans. +func (ds *Server) RegisterTracez(tr *tracing.Tracer) { + tracingui.RegisterHTTPHandlers(ds.ambientCtx, ds.mux, tr) +} + // ServeHTTP serves various tools under the /debug endpoint. func (ds *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) { handler, _ := ds.mux.Handler(r) diff --git a/pkg/server/server.go b/pkg/server/server.go index b66783293f9c..4203ed2fa1d7 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -834,7 +834,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) { } sStatus.setStmtDiagnosticsRequester(sqlServer.execCfg.StmtDiagnosticsRecorder) sStatus.baseStatusServer.sqlServer = sqlServer - debugServer := debug.NewServer(st, sqlServer.pgServer.HBADebugFn(), sStatus) + debugServer := debug.NewServer(cfg.BaseConfig.AmbientCtx, st, sqlServer.pgServer.HBADebugFn(), sStatus) node.InitLogger(sqlServer.execCfg) *lateBoundServer = Server{ @@ -1958,6 +1958,7 @@ func (s *Server) PreStart(ctx context.Context) error { return errors.Wrapf(err, "failed to register engines with debug server") } s.debug.RegisterClosedTimestampSideTransport(s.ctSender, s.node.storeCfg.ClosedTimestampReceiver) + s.debug.RegisterTracez(s.cfg.Tracer) s.ctSender.Run(ctx, state.nodeID) diff --git a/pkg/server/tenant.go b/pkg/server/tenant.go index c74b18a3913b..bd7c9a2809bc 100644 --- a/pkg/server/tenant.go +++ b/pkg/server/tenant.go @@ -211,7 +211,7 @@ func StartTenant( ) mux := http.NewServeMux() - debugServer := debug.NewServer(args.Settings, s.pgServer.HBADebugFn(), s.execCfg.SQLStatusServer) + debugServer := debug.NewServer(baseCfg.AmbientCtx, args.Settings, s.pgServer.HBADebugFn(), s.execCfg.SQLStatusServer) mux.Handle("/", debugServer) mux.Handle("/_status/", gwMux) mux.HandleFunc("/health", func(w http.ResponseWriter, req *http.Request) { diff --git a/pkg/ui/.gitignore b/pkg/ui/.gitignore index 0d5500eded18..a583d996bffd 100644 --- a/pkg/ui/.gitignore +++ b/pkg/ui/.gitignore @@ -14,6 +14,7 @@ assets.oss.installed # Generated intermediates .cache-loader dist*/** +dist_vendor !dist*/_empty_assets/assets/index.html !distccl/assets/.gitkeep !distccl/distccl.go diff --git a/pkg/ui/BUILD.bazel b/pkg/ui/BUILD.bazel index 862ffdf0aed6..144d5d3725a3 100644 --- a/pkg/ui/BUILD.bazel +++ b/pkg/ui/BUILD.bazel @@ -1,8 +1,16 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("@build_bazel_rules_nodejs//:index.bzl", "nodejs_binary") go_library( name = "ui", - srcs = ["ui.go"], + srcs = [ + "tracing_ui.go", + "ui.go", + ], + embedsrcs = [ + "dist_vendor/list.min.js", + "templates/tracing/html_template.html", + ], importpath = "github.com/cockroachdb/cockroach/pkg/ui", visibility = ["//visibility:public"], deps = [ @@ -35,3 +43,11 @@ EOF """, visibility = ["//pkg/ui:__subpackages__"], ) + +genrule( + name = "listjs", + srcs = ["@npm//:node_modules/list.js/dist/list.min.js"], + outs = ["dist_vendor/list.min.js"], + cmd = "cp ./$(location @npm//:node_modules/list.js/dist/list.min.js) $@", + tools = ["@npm//list.js"], +) diff --git a/pkg/ui/templates/tracing/html_template.html b/pkg/ui/templates/tracing/html_template.html new file mode 100644 index 000000000000..f66b7a4d6ad0 --- /dev/null +++ b/pkg/ui/templates/tracing/html_template.html @@ -0,0 +1,140 @@ + + + + + + + + + +Take a snapshot of current operations +
+Stored snapshots (ID: capture time): +{{$id := .SnapshotID}} +{{range $i, $s := .AllSnapshots}} + + {{if eq $s.ID $id}} + [current] {{$s.ID}}: {{formatTimeNoMillis .CapturedAt}} + {{else}} + {{$s.ID}}: {{formatTimeNoMillis .CapturedAt}} + {{end}} + +{{end}} +
+ +

Spans currently open: {{len .SpansList.Spans}}. Snapshot captured at: {{formatTime .CapturedAt}} UTC. Page generated at: {{formatTime .Now}} UTC.

+{{if ne .Err nil}} +

There was an error producing this snapshot; it might be incomplete: {{.Err}}

+{{end}} + +
+ + + + + + + + + + + + +
+ + + + + +
+
+ + + diff --git a/pkg/ui/tracing_ui.go b/pkg/ui/tracing_ui.go new file mode 100644 index 000000000000..000ee4f99ce7 --- /dev/null +++ b/pkg/ui/tracing_ui.go @@ -0,0 +1,36 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package ui + +import ( + "embed" + "io/fs" +) + +// This file deals with embedding assets used by /debug/tracez. + +//go:embed dist_vendor/list.min.js +var listJS embed.FS + +//go:embed templates/tracing/html_template.html +// SpansTableTemplateSrc contains a template used by /debug/tracez +var SpansTableTemplateSrc string + +// ListJS exposes list.js package +var ListJS fs.FS + +func init() { + f, err := fs.Sub(listJS, "dist_vendor") + if err != nil { + panic(err) + } + ListJS = f +} diff --git a/pkg/ui/workspaces/db-console/BUILD.bazel b/pkg/ui/workspaces/db-console/BUILD.bazel index 75b99f44106d..494064a475c6 100644 --- a/pkg/ui/workspaces/db-console/BUILD.bazel +++ b/pkg/ui/workspaces/db-console/BUILD.bazel @@ -130,6 +130,7 @@ DEPENDENCIES = [ "@npm//highlight.js", "@npm//lodash", "@npm//long", + "@npm//list.js", "@npm//mini-create-react-context", "@npm//moment", "@npm//nvd3", diff --git a/pkg/ui/workspaces/db-console/package.json b/pkg/ui/workspaces/db-console/package.json index 1d1f0d06e2af..600d687f10b7 100644 --- a/pkg/ui/workspaces/db-console/package.json +++ b/pkg/ui/workspaces/db-console/package.json @@ -29,6 +29,7 @@ "connected-react-router": "^6.9.1", "create-react-context": "^0.3.0", "highlight.js": "^10.6.0", + "list.js": "^2.3.1", "lodash": "^4.17.21", "long": "^4.0.0", "mini-create-react-context": "^0.3.2", diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx index 4b827609d132..6bcd38d4dc89 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx @@ -330,6 +330,7 @@ export default function Debug() { + { new RemoveBrokenDependenciesPlugin(), new CopyWebpackPlugin([ { from: path.resolve(__dirname, "favicon.ico"), to: "favicon.ico" }, + { + from: path.resolve( + !isBazelBuild ? __dirname : "", + !isBazelBuild ? "../.." : "", + "node_modules/list.js/dist/list.min.js", + ), + to: path.resolve(__dirname, "../../dist_vendor/list.min.js"), + }, ]), // use WebpackBar instead of webpack dashboard to fit multiple webpack dev server outputs (db-console and cluster-ui) new WebpackBar({ diff --git a/pkg/ui/yarn-vendor b/pkg/ui/yarn-vendor index b7cef07d6748..dfceabccc717 160000 --- a/pkg/ui/yarn-vendor +++ b/pkg/ui/yarn-vendor @@ -1 +1 @@ -Subproject commit b7cef07d6748b5af824fb1c1af90cc17f3009c56 +Subproject commit dfceabccc717aff312f17423089652e9e697355b diff --git a/pkg/ui/yarn.lock b/pkg/ui/yarn.lock index 0295930e2b57..c0198cdf4cb7 100644 --- a/pkg/ui/yarn.lock +++ b/pkg/ui/yarn.lock @@ -11461,6 +11461,13 @@ linkify-it@^2.0.0: dependencies: uc.micro "^1.0.1" +list.js@^2.3.1: + version "2.3.1" + resolved "https://registry.yarnpkg.com/list.js/-/list.js-2.3.1.tgz#48961989ffe52b0505e352f7a521f819f51df7e7" + integrity sha512-jnmm7DYpKtH3DxtO1E2VNCC9Gp7Wrp/FWA2JxQrZUhVJ2RCQBd57pCN6W5w6jpsfWZV0PCAbTX2NOPgyFeeZZg== + dependencies: + string-natural-compare "^2.0.2" + listr-silent-renderer@^1.1.1: version "1.1.1" resolved "https://registry.yarnpkg.com/listr-silent-renderer/-/listr-silent-renderer-1.1.1.tgz#924b5a3757153770bf1a8e3fbf74b8bbf3f9242e" @@ -16399,6 +16406,11 @@ string-length@^3.1.0: astral-regex "^1.0.0" strip-ansi "^5.2.0" +string-natural-compare@^2.0.2: + version "2.0.3" + resolved "https://registry.yarnpkg.com/string-natural-compare/-/string-natural-compare-2.0.3.tgz#9dbe1dd65490a5fe14f7a5c9bc686fc67cb9c6e4" + integrity sha512-4Kcl12rNjc+6EKhY8QyDVuQTAlMWwRiNbsxnVwBUKFr7dYPQuXVrtNU4sEkjF9LHY0AY6uVbB3ktbkIH4LC+BQ== + string-replace-webpack-plugin@^0.1.3: version "0.1.3" resolved "https://registry.yarnpkg.com/string-replace-webpack-plugin/-/string-replace-webpack-plugin-0.1.3.tgz#73c657e759d66cfe80ae1e0cf091aa256d0e715c" diff --git a/pkg/util/stop/stopper.go b/pkg/util/stop/stopper.go index ab30e9308d2f..bc0a7ae660b1 100644 --- a/pkg/util/stop/stopper.go +++ b/pkg/util/stop/stopper.go @@ -461,6 +461,11 @@ func (s *Stopper) RunAsyncTaskEx(ctx context.Context, opt TaskOpts, f func(conte } // If the caller has a span, the task gets a child span. + // + // Note that we have to create the child in this parent goroutine; we can't + // defer the creation to the spawned async goroutine since the parent span + // might get Finish()ed by then. However, we'll update the child'd goroutine + // ID. var sp *tracing.Span switch opt.SpanOpt { case FollowsFromSpan: @@ -480,6 +485,7 @@ func (s *Stopper) RunAsyncTaskEx(ctx context.Context, opt TaskOpts, f func(conte defer s.runPostlude() if sp != nil { defer sp.Finish() + sp.UpdateGoroutineIDToCurrent() } if alloc != nil { defer alloc.Release() diff --git a/pkg/util/tracing/BUILD.bazel b/pkg/util/tracing/BUILD.bazel index fa4c8a23bc83..3210ab249ec2 100644 --- a/pkg/util/tracing/BUILD.bazel +++ b/pkg/util/tracing/BUILD.bazel @@ -14,6 +14,7 @@ go_library( "tags.go", "test_utils.go", "tracer.go", + "tracer_snapshots.go", "utils.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/util/tracing", diff --git a/pkg/util/tracing/crdbspan.go b/pkg/util/tracing/crdbspan.go index 5cb5c93ab98f..40e096255635 100644 --- a/pkg/util/tracing/crdbspan.go +++ b/pkg/util/tracing/crdbspan.go @@ -35,7 +35,6 @@ type crdbSpan struct { traceID tracingpb.TraceID // probabilistically unique spanID tracingpb.SpanID // probabilistically unique parentSpanID tracingpb.SpanID - goroutineID uint64 operation string // name of operation associated with the span startTime time.Time @@ -66,6 +65,10 @@ type childRef struct { type crdbSpanMu struct { syncutil.Mutex + // goroutineID is the ID of the goroutine that created this span, or the goroutine that + // subsequently adopted it through Span.UpdateGoroutineIDToCurrent()). + goroutineID uint64 + // parent is the span's local parent, if any. parent is not set if the span is // a root or the parent span is remote. // @@ -527,7 +530,7 @@ func (s *crdbSpan) getRecordingNoChildrenLocked( TraceID: s.traceID, SpanID: s.spanID, ParentSpanID: s.parentSpanID, - GoroutineID: s.goroutineID, + GoroutineID: s.mu.goroutineID, Operation: s.operation, StartTime: s.startTime, Duration: s.mu.duration, @@ -727,6 +730,13 @@ func (s *crdbSpan) withLock(f func()) { f() } +// setGoroutineID updates the span's goroutine ID. +func (s *crdbSpan) setGoroutineID(gid int64) { + s.mu.Lock() + defer s.mu.Unlock() + s.mu.goroutineID = uint64(gid) +} + var sortPool = sync.Pool{ New: func() interface{} { return &Recording{} diff --git a/pkg/util/tracing/span.go b/pkg/util/tracing/span.go index c2c738274f92..70578a035207 100644 --- a/pkg/util/tracing/span.go +++ b/pkg/util/tracing/span.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" + "github.com/petermattis/goid" "go.opentelemetry.io/otel/attribute" oteltrace "go.opentelemetry.io/otel/trace" ) @@ -288,6 +289,16 @@ func (sp *Span) IsSterile() bool { return sp.i.sterile } +// UpdateGoroutineIDToCurrent updates the span's goroutine ID to the current +// goroutine. This should be called when a different goroutine takes ownership +// of a span. +func (sp *Span) UpdateGoroutineIDToCurrent() { + if sp.detectUseAfterFinish() { + return + } + sp.i.crdb.setGoroutineID(goid.Get()) +} + // SpanMeta is information about a Span that is not local to this // process. Typically, SpanMeta is populated from information // about a Span on the other end of an RPC, and is used to derive diff --git a/pkg/util/tracing/tracer.go b/pkg/util/tracing/tracer.go index 1bc4d6aec6ff..91021a7ad676 100644 --- a/pkg/util/tracing/tracer.go +++ b/pkg/util/tracing/tracer.go @@ -28,6 +28,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/iterutil" "github.com/cockroachdb/cockroach/pkg/util/netutil/addr" "github.com/cockroachdb/cockroach/pkg/util/randutil" + "github.com/cockroachdb/cockroach/pkg/util/ring" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" @@ -59,6 +60,9 @@ const ( // maxLogsPerSpanExternal limits the number of logs in a Span for external // tracers (net/trace, OpenTelemetry); use a comfortable limit. maxLogsPerSpanExternal = 1000 + // maxSnapshots limits the number of snapshots that a Tracer will hold in + // memory. Beyond this limit, each new snapshot evicts the oldest one. + maxSnapshots = 10 ) // These constants are used to form keys to represent tracing context @@ -239,7 +243,14 @@ type Tracer struct { // activeSpans is a map that references all non-Finish'ed local root spans, // i.e. those for which no WithParent() option was supplied. - activeSpansRegistry *spanRegistry + activeSpansRegistry *SpanRegistry + snapshotsMu struct { + syncutil.Mutex + // snapshots stores the activeSpansRegistry snapshots taken during the + // Tracer's lifetime. The ring buffer will contain snapshots with contiguous + // IDs, from the oldest one to + maxSnapshots - 1. + snapshots ring.Buffer // snapshotWithID + } testingMu syncutil.Mutex // protects testingRecordAsyncSpans testingRecordAsyncSpans bool // see TestingRecordAsyncSpans @@ -263,7 +274,7 @@ type Tracer struct { stack string } -// spanRegistry is a map that references all non-Finish'ed local root spans, +// SpanRegistry is a map that references all non-Finish'ed local root spans, // i.e. those for which no WithLocalParent() option was supplied. The // map is keyed on the span ID, which is deterministically unique. // @@ -272,30 +283,30 @@ type Tracer struct { // // The map can be introspected by `Tracer.VisitSpans`. A Span can also be // retrieved from its ID by `Tracer.GetActiveSpanByID`. -type spanRegistry struct { +type SpanRegistry struct { mu struct { syncutil.Mutex m map[tracingpb.SpanID]*crdbSpan } } -func makeSpanRegistry() *spanRegistry { - r := &spanRegistry{} +func makeSpanRegistry() *SpanRegistry { + r := &SpanRegistry{} r.mu.m = make(map[tracingpb.SpanID]*crdbSpan) return r } -func (r *spanRegistry) removeSpanLocked(id tracingpb.SpanID) { +func (r *SpanRegistry) removeSpanLocked(id tracingpb.SpanID) { delete(r.mu.m, id) } -func (r *spanRegistry) addSpan(s *crdbSpan) { +func (r *SpanRegistry) addSpan(s *crdbSpan) { r.mu.Lock() defer r.mu.Unlock() r.addSpanLocked(s) } -func (r *spanRegistry) addSpanLocked(s *crdbSpan) { +func (r *SpanRegistry) addSpanLocked(s *crdbSpan) { // Ensure that the registry does not grow unboundedly in case there is a leak. // When the registry reaches max size, each new span added kicks out an // arbitrary existing span. We rely on map iteration order here to make this @@ -310,7 +321,7 @@ func (r *spanRegistry) addSpanLocked(s *crdbSpan) { } // getSpanByID looks up a span in the registry. Returns nil if not found. -func (r *spanRegistry) getSpanByID(id tracingpb.SpanID) RegistrySpan { +func (r *SpanRegistry) getSpanByID(id tracingpb.SpanID) RegistrySpan { r.mu.Lock() defer r.mu.Unlock() crdbSpan, ok := r.mu.m[id] @@ -321,7 +332,10 @@ func (r *spanRegistry) getSpanByID(id tracingpb.SpanID) RegistrySpan { return crdbSpan } -func (r *spanRegistry) visitSpans(visitor func(span RegistrySpan) error) error { +// VisitSpans calls the visitor callback for every local root span in the +// registry. Iterations stops when the visitor returns an error. If that error +// is iterutils.StopIteration(), then VisitSpans() returns nil. +func (r *SpanRegistry) VisitSpans(visitor func(span RegistrySpan) error) error { r.mu.Lock() sl := make([]*crdbSpan, 0, len(r.mu.m)) for _, sp := range r.mu.m { @@ -343,7 +357,7 @@ func (r *spanRegistry) visitSpans(visitor func(span RegistrySpan) error) error { // testingAll returns (pointers to) all the spans in the registry, in an // arbitrary order. Since spans can generally finish at any point and use of a // finished span is not permitted, this method is only suitable for tests. -func (r *spanRegistry) testingAll() []*crdbSpan { +func (r *SpanRegistry) testingAll() []*crdbSpan { r.mu.Lock() defer r.mu.Unlock() res := make([]*crdbSpan, 0, len(r.mu.m)) @@ -358,7 +372,7 @@ func (r *spanRegistry) testingAll() []*crdbSpan { // removing the parent from the registry, the children are accessible in the // registry through that parent; if we didn't do this swap when the parent is // removed, the children would not be part of the registry anymore. -func (r *spanRegistry) swap(parentID tracingpb.SpanID, children []*crdbSpan) { +func (r *SpanRegistry) swap(parentID tracingpb.SpanID, children []*crdbSpan) { r.mu.Lock() defer r.mu.Unlock() r.removeSpanLocked(parentID) @@ -857,7 +871,6 @@ child operation: %s, tracer created at: traceID = tracingpb.TraceID(randutil.FastInt63()) } spanID := tracingpb.SpanID(randutil.FastInt63()) - goroutineID := uint64(goid.Get()) // Now allocate the main *Span and contained crdbSpan. // Allocate these together to save on individual allocs. @@ -876,15 +889,15 @@ child operation: %s, tracer created at: }{} helper.crdbSpan = crdbSpan{ - tracer: t, - traceID: traceID, - spanID: spanID, - goroutineID: goroutineID, - startTime: startTime, - logTags: opts.LogTags, + tracer: t, + traceID: traceID, + spanID: spanID, + startTime: startTime, + logTags: opts.LogTags, mu: crdbSpanMu{ - duration: -1, // unfinished - tags: helper.tagsAlloc[:0], + duration: -1, // unfinished + tags: helper.tagsAlloc[:0], + goroutineID: uint64(goid.Get()), }, } helper.crdbSpan.operation = opName @@ -1139,7 +1152,7 @@ func (t *Tracer) GetActiveSpanByID(spanID tracingpb.SpanID) RegistrySpan { // VisitSpans invokes the visitor with all active Spans. The function will // gracefully exit if the visitor returns iterutil.StopIteration(). func (t *Tracer) VisitSpans(visitor func(span RegistrySpan) error) error { - return t.activeSpansRegistry.visitSpans(visitor) + return t.activeSpansRegistry.VisitSpans(visitor) } // TestingRecordAsyncSpans is a test-only helper that configures @@ -1180,6 +1193,11 @@ func (t *Tracer) PanicOnUseAfterFinish() bool { return t.panicOnUseAfterFinish } +// SpanRegistry exports the registry containing all currently-open spans. +func (t *Tracer) SpanRegistry() *SpanRegistry { + return t.activeSpansRegistry +} + // ForkSpan forks the current span, if any[1]. Forked spans "follow from" the // original, and are typically used to trace operations that may outlive the // parent (think async tasks). See the package-level documentation for more diff --git a/pkg/util/tracing/tracer_snapshots.go b/pkg/util/tracing/tracer_snapshots.go new file mode 100644 index 000000000000..8f4ce784202f --- /dev/null +++ b/pkg/util/tracing/tracer_snapshots.go @@ -0,0 +1,195 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package tracing + +import ( + "bufio" + "runtime" + "strconv" + "strings" + "time" + + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/errors" +) + +// SpansSnapshot represents a snapshot of all the open spans at a certain point +// in time. +type SpansSnapshot struct { + // CapturedAt is the time when the snapshot was collected. + CapturedAt time.Time + // Traces contains the collected traces. Each "local route" corresponds to one + // trace. "Detached recording" spans are included in the parent's trace. + Traces []Recording + // Stacks is a map from groutine ID to the goroutine's stack trace. All + // goroutines running at the time when this snapshot is produced are + // represented here. A goroutine referenced by a span's GoroutineID field will + // not be present if it has finished since the respective span was started. + Stacks map[int]string + // Err is set if an error was encountered when producing the snapshot. The + // snapshot will be incomplete in this case. + Err error +} + +// SnapshotID identifies a spans snapshot. The ID can be used to retrieve a +// specific snapshot from the Tracer. +type SnapshotID int +type snapshotWithID struct { + ID SnapshotID + SpansSnapshot +} + +// SaveSnapshot collects a snapshot of the currently open spans (i.e. the +// contents of the Tracer's active spans registry) saves it in-memory in the +// Tracer. The snapshot's ID is returned; the snapshot can be retrieved +// subsequently through t.GetSnapshot(id). +// +// Snapshots also include a dump of all the goroutine stack traces. +func (t *Tracer) SaveSnapshot() SnapshotID { + snap := t.generateSnapshot() + t.snapshotsMu.Lock() + defer t.snapshotsMu.Unlock() + + snapshots := &t.snapshotsMu.snapshots + + if snapshots.Len() == maxSnapshots { + snapshots.RemoveFirst() + } + var id SnapshotID + if snapshots.Len() == 0 { + id = 1 + } else { + id = snapshots.GetLast().(snapshotWithID).ID + 1 + } + snapshots.AddLast(snapshotWithID{ + ID: id, + SpansSnapshot: snap, + }) + return id +} + +var errSnapshotTooOld = errors.Newf( + "the requested snapshot is too old and has been deleted. "+ + "Only the last %d snapshots are stored.", maxSnapshots) + +var errSnapshotDoesntExist = errors.New("the requested snapshot doesn't exist") + +// GetSnapshot returns the snapshot with the given ID. If the ID is below the +// minimum stored snapshot, then the requested snapshot must have been +// garbage-collected and errSnapshotTooOld is returned. If the snapshot id is +// beyond the maximum stored ID, errSnapshotDoesntExist is returned. +// +// Note that SpansSpanshot has an Err field through which errors are returned. +// In these error cases, the snapshot will be incomplete. +func (t *Tracer) GetSnapshot(id SnapshotID) (SpansSnapshot, error) { + t.snapshotsMu.Lock() + defer t.snapshotsMu.Unlock() + snapshots := &t.snapshotsMu.snapshots + + if snapshots.Len() == 0 { + return SpansSnapshot{}, errSnapshotDoesntExist + } + minID := snapshots.GetFirst().(snapshotWithID).ID + if id < minID { + return SpansSnapshot{}, errSnapshotTooOld + } + maxID := snapshots.GetLast().(snapshotWithID).ID + if id > maxID { + return SpansSnapshot{}, errSnapshotDoesntExist + } + + return snapshots.Get(int(id - minID)).(snapshotWithID).SpansSnapshot, nil +} + +// SnapshotInfo represents minimal info about a stored snapshot, as returned by +// Tracer.GetSnapshots(). +type SnapshotInfo struct { + ID SnapshotID + CapturedAt time.Time +} + +// GetSnapshots returns info on all stored span snapshots. +func (t *Tracer) GetSnapshots() []SnapshotInfo { + t.snapshotsMu.Lock() + defer t.snapshotsMu.Unlock() + snapshots := &t.snapshotsMu.snapshots + + res := make([]SnapshotInfo, snapshots.Len()) + for i := 0; i < snapshots.Len(); i++ { + s := snapshots.Get(i).(snapshotWithID) + res[i] = SnapshotInfo{ + ID: s.ID, + CapturedAt: s.CapturedAt, + } + } + return res +} + +// generateSnapshot produces a snapshot of all the currently open spans and +// all the current goroutine stacktraces. +// +// Note that SpansSpanshot has an Err field through which errors are returned. +// In these error cases, the snapshot will be incomplete. +func (t *Tracer) generateSnapshot() SpansSnapshot { + capturedAt := timeutil.Now() + // Collect the traces. + traces := make([]Recording, 0, 1000) + _ = t.SpanRegistry().VisitSpans(func(sp RegistrySpan) error { + rec := sp.GetFullRecording(RecordingVerbose) + traces = append(traces, rec) + return nil + }) + + // Collect and parse the goroutine stack traces. + + // We don't know how big the traces are, so grow a few times if they don't + // fit. Start large, though. + var stacks []byte + for n := 1 << 20; /* 1mb */ n <= (1 << 29); /* 512mb */ n *= 2 { + stacks = make([]byte, n) + nbytes := runtime.Stack(stacks, true /* all */) + if nbytes < len(stacks) { + break + } + } + + splits := strings.Split(string(stacks), "\n\n") + stackMap := make(map[int]string, len(splits)) + var parseErr error + for _, s := range splits { + // Parse the goroutine ID. The first line of each stack is expected to look like: + // goroutine 115 [chan receive]: + scanner := bufio.NewScanner(strings.NewReader(s)) + scanner.Split(bufio.ScanWords) + // Skip the word "goroutine". + if !scanner.Scan() { + parseErr = errors.Errorf("unexpected end of string") + break + } + // Scan the goroutine ID. + if !scanner.Scan() { + parseErr = errors.Errorf("unexpected end of string") + break + } + goroutineID, err := strconv.Atoi(scanner.Text()) + if err != nil { + panic(err) + } + stackMap[goroutineID] = s + } + + return SpansSnapshot{ + CapturedAt: capturedAt, + Traces: traces, + Stacks: stackMap, + Err: parseErr, + } +} diff --git a/pkg/util/tracing/tracingpb/recorded_span.go b/pkg/util/tracing/tracingpb/recorded_span.go index b398962c941d..2bb4862a5104 100644 --- a/pkg/util/tracing/tracingpb/recorded_span.go +++ b/pkg/util/tracing/tracingpb/recorded_span.go @@ -25,6 +25,25 @@ type TraceID uint64 // SpanID is a probabilistically-unique span id. type SpanID uint64 +// Recording represents a group of RecordedSpans rooted at a fixed root span, as +// returned by GetRecording. Spans are sorted by StartTime. +type Recording []RecordedSpan + +// Less implements sort.Interface. +func (r Recording) Less(i, j int) bool { + return r[i].StartTime.Before(r[j].StartTime) +} + +// Swap implements sort.Interface. +func (r Recording) Swap(i, j int) { + r[i], r[j] = r[j], r[i] +} + +// Len implements sort.Interface. +func (r Recording) Len() int { + return len(r) +} + // LogMessageField is the field name used for the log message in a LogRecord. const LogMessageField = "event" diff --git a/pkg/util/tracing/tracingui/BUILD.bazel b/pkg/util/tracing/tracingui/BUILD.bazel new file mode 100644 index 000000000000..b225899b887e --- /dev/null +++ b/pkg/util/tracing/tracingui/BUILD.bazel @@ -0,0 +1,17 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "tracingui", + srcs = ["span_registry_ui.go"], + importpath = "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingui", + visibility = ["//visibility:public"], + deps = [ + "//pkg/ui", + "//pkg/util/log", + "//pkg/util/sysutil", + "//pkg/util/timeutil", + "//pkg/util/tracing", + "//pkg/util/tracing/tracingpb", + "@com_github_cockroachdb_errors//:errors", + ], +) diff --git a/pkg/util/tracing/tracingui/span_registry_ui.go b/pkg/util/tracing/tracingui/span_registry_ui.go new file mode 100644 index 000000000000..cdf23044f7a0 --- /dev/null +++ b/pkg/util/tracing/tracingui/span_registry_ui.go @@ -0,0 +1,277 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package tracingui + +import ( + "context" + "fmt" + "html/template" + "net/http" + "net/url" + "strconv" + "time" + + "github.com/cockroachdb/cockroach/pkg/ui" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/sysutil" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" + "github.com/cockroachdb/errors" +) + +// This file deals with producing the /debug/tracez page, which lists +// a snapshot of the spans in the Tracer's active spans registry. + +const dateAndtimeFormat = "2006-01-02 15:04:05.000" +const timeFormat = "15:04:05.000" + +// RegisterHTTPHandlers registers the /debug/tracez handlers, and helpers. +func RegisterHTTPHandlers(ambientCtx log.AmbientContext, mux *http.ServeMux, tr *tracing.Tracer) { + fileServer := http.StripPrefix("/debug/assets/", http.FileServer(http.FS(ui.ListJS))) + mux.HandleFunc("/debug/tracez", + func(w http.ResponseWriter, req *http.Request) { + serveHTTP(ambientCtx.AnnotateCtx(context.Background()), w, req, tr) + }) + mux.HandleFunc("/debug/show-trace", + func(w http.ResponseWriter, req *http.Request) { + serveHTTPTrace(ambientCtx.AnnotateCtx(req.Context()), w, req, tr) + }) + mux.HandleFunc("/debug/assets/list.min.js", fileServer.ServeHTTP) +} + +type pageData struct { + SnapshotID tracing.SnapshotID + Now time.Time + CapturedAt time.Time + Err error + AllSnapshots []tracing.SnapshotInfo + SpansList spansList +} + +type spansList struct { + Spans []tracingpb.RecordedSpan + // Stacks contains stack traces for the goroutines referenced by the Spans + // through their GoroutineID field. + Stacks map[int]string // GoroutineID to stack trace +} + +var spansTableTemplate *template.Template + +func init() { + spansTableTemplate = template.Must(template.New("spans-list").Funcs( + template.FuncMap{ + "formatTime": formatTime, + "formatTimeNoMillis": formatTimeNoMillis, + "since": func(t time.Time, capturedAt time.Time) string { + return fmt.Sprintf("(%s ago)", formatDuration(capturedAt.Sub(t))) + }, + "timeRaw": func(t time.Time) int64 { return t.UnixMicro() }, + }, + ).Parse(ui.SpansTableTemplateSrc)) +} + +func formatTime(t time.Time) string { + t = t.UTC() + if t.Truncate(24*time.Hour) == timeutil.Now().Truncate(24*time.Hour) { + return t.Format(timeFormat) + } + return t.Format(dateAndtimeFormat) +} + +func formatTimeNoMillis(t time.Time) string { + t = t.UTC() + if t.Truncate(24*time.Hour) == timeutil.Now().Truncate(24*time.Hour) { + const format = "15:04:05" + return t.Format(format) + } + const format = "2006-01-02 15:04:05" + return t.Format(format) +} + +// formatDuration formats a duration in one of the following formats, depending +// on its magnitude. +// 0.001s +// 1.000s +// 1m01s +// 1h05m01s +// 1d02h05m +func formatDuration(d time.Duration) string { + d = d.Round(time.Millisecond) + days := d / (24 * time.Hour) + d -= days * 24 * time.Hour + h := d / time.Hour + d -= h * time.Hour + m := d / time.Minute + d -= m * time.Minute + s := d / time.Second + d -= s * time.Second + millis := d / time.Millisecond + if days != 0 { + return fmt.Sprintf("%dd%02dh%02dm", days, h, m) + } + if h != 0 { + return fmt.Sprintf("%dh%02dm%02ds", h, m, s) + } + if m != 0 { + return fmt.Sprintf("%dm%02ds", m, s) + } + return fmt.Sprintf("%d.%03ds", s, millis) +} + +func serveHTTP(ctx context.Context, w http.ResponseWriter, r *http.Request, tr *tracing.Tracer) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + var pageErr error + if err := r.ParseForm(); err != nil { + log.Warningf(ctx, "error parsing /tracez form: %s", err.Error()) + pageErr = err + } + + snapID := r.Form.Get("snap") + var snapshot tracing.SpansSnapshot + var snapshotID tracing.SnapshotID + switch snapID { + case "new": + // Capture a new snapshot and return a redirect to that snapshot's ID. + snapshotID = tr.SaveSnapshot() + newURL, err := r.URL.Parse(fmt.Sprintf("?snap=%d", snapshotID)) + if err != nil { + pageErr = err + break + } + http.Redirect(w, r, newURL.String(), http.StatusFound) + return + case "": + default: + id, err := strconv.Atoi(snapID) + if err != nil { + pageErr = errors.Errorf("invalid snapshot ID: %s", snapID) + break + } + snapshotID = tracing.SnapshotID(id) + snapshot, err = tr.GetSnapshot(snapshotID) + if err != nil { + pageErr = err + break + } + } + + // Flatten the recordings. + spans := make([]tracingpb.RecordedSpan, 0, len(snapshot.Traces)*3) + for _, r := range snapshot.Traces { + spans = append(spans, r...) + } + + // Copy the stack traces and augment the map. + stacks := make(map[int]string, len(snapshot.Stacks)) + for k, v := range snapshot.Stacks { + stacks[k] = v + } + // Fill in messages for the goroutines for which we don't have a stack trace. + for _, s := range spans { + gid := int(s.GoroutineID) + if _, ok := stacks[gid]; !ok { + stacks[gid] = "Goroutine not found. Goroutine must have finished since the span was created." + } + } + + if pageErr != nil { + snapshot.Err = pageErr + } + err := spansTableTemplate.ExecuteTemplate(w, "spans-list", pageData{ + Now: timeutil.Now(), + CapturedAt: snapshot.CapturedAt, + SnapshotID: snapshotID, + AllSnapshots: tr.GetSnapshots(), + Err: snapshot.Err, + SpansList: spansList{ + Spans: spans, + Stacks: stacks, + }, + }) + if err != nil { + // We can get a "connection reset by peer" error if the browser requesting + // the page has gone away. + if !sysutil.IsErrConnectionReset(err) { + log.Warningf(ctx, "error executing tracez template: %s", err) + http.Error(w, err.Error(), http.StatusInternalServerError) + } + } +} + +func serveHTTPTrace( + ctx context.Context, w http.ResponseWriter, r *http.Request, tr *tracing.Tracer, +) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + var err error + if err = r.ParseForm(); err != nil { + goto Error + } + + { + var traceID tracingpb.TraceID + tid := r.Form.Get("trace") + if tid == "" { + err = errors.Errorf("trace ID missing; use ?trace=") + goto Error + } + var id int + id, err = strconv.Atoi(tid) + if err != nil { + err = errors.Errorf("invalid trace ID: %s", tid) + goto Error + } + traceID = tracingpb.TraceID(id) + + var snapshotID tracing.SnapshotID + var snapshot tracing.SpansSnapshot + snapID := r.Form.Get("snap") + if snapID != "" { + var id int + id, err = strconv.Atoi(snapID) + if err != nil { + err = errors.Errorf("invalid snapshot ID: %s", snapID) + goto Error + } + snapshotID = tracing.SnapshotID(id) + snapshot, err = tr.GetSnapshot(snapshotID) + if err != nil { + goto Error + } + } else { + // If no snapshot is specified, we'll take a new one now and redirect to + // it. + snapshotID = tr.SaveSnapshot() + var newURL *url.URL + newURL, err = r.URL.Parse(fmt.Sprintf("?trace=%d&snap=%d", traceID, snapshotID)) + if err != nil { + goto Error + } + http.Redirect(w, r, newURL.String(), http.StatusFound) + return + } + + for _, r := range snapshot.Traces { + if r[0].TraceID == traceID { + _, err = w.Write([]byte("
" + r.String() + "\n
")) + if err != nil { + goto Error + } + return + } + } + err = errors.Errorf("trace %d not found in snapshot", traceID) + goto Error + } + +Error: + http.Error(w, err.Error(), http.StatusInternalServerError) +}