diff --git a/src/control/SConscript b/src/control/SConscript
index e8865f368bb..06410fee53a 100644
--- a/src/control/SConscript
+++ b/src/control/SConscript
@@ -106,7 +106,8 @@ def install_go_bin(env, name, libs=None, install_man=False):
         build_path = join('$BUILD_DIR/src/control', f'{name}.8')
         menv = env.Clone()
         # This runs code from the build area so needs LD_LIBRARY_PATH set.
-        menv.d_enable_ld_path(["cart", "gurt", "client/api", "common", "client/dfs", "utils"])
+        menv.d_enable_ld_path(["cart", "gurt", "client/api", "common", "client/dfs", "utils",
+                               "utils/self_test"])
         menv.Command(build_path, target, f'{gen_bin} manpage -o {build_path}')
         menv.Install('$PREFIX/share/man/man8', build_path)
 
@@ -151,9 +152,12 @@ def scons():
                   "-L$BUILD_DIR/src/cart "
                   "-L$BUILD_DIR/src/common "
                   "-L$BUILD_DIR/src/client/dfs "
-                  "-L$BUILD_DIR/src/utils $_RPATH")
+                  "-L$BUILD_DIR/src/utils "
+                  "-L$BUILD_DIR/src/utils/self_test "
+                  "$_RPATH")
        dbenv.AppendENVPath("CGO_LDFLAGS", dblibs, sep=" ")
-       install_go_bin(dbenv, 'daos', libs=['daos_cmd_hdlrs', 'dfs', 'duns', 'daos'],
+       install_go_bin(dbenv, 'daos', libs=['daos_cmd_hdlrs', 'dfs', 'duns', 'daos',
+                                           'daos_self_test'],
                       install_man=True)
 
     if not prereqs.server_requested():
diff --git a/src/control/cmd/daos/health.go b/src/control/cmd/daos/health.go
index cbc29c1e3ba..70e54213084 100644
--- a/src/control/cmd/daos/health.go
+++ b/src/control/cmd/daos/health.go
@@ -14,12 +14,16 @@ import (
     "github.com/daos-stack/daos/src/control/build"
     "github.com/daos-stack/daos/src/control/cmd/daos/pretty"
+    "github.com/daos-stack/daos/src/control/common/cmdutil"
     "github.com/daos-stack/daos/src/control/lib/daos"
+    "github.com/daos-stack/daos/src/control/lib/ranklist"
+    "github.com/daos-stack/daos/src/control/lib/ui"
     "github.com/daos-stack/daos/src/control/logging"
 )
 
 type healthCmds struct {
-    Check healthCheckCmd `command:"check" description:"Perform DAOS system health checks"`
+    Check   healthCheckCmd `command:"check" description:"Perform DAOS system health checks"`
+    NetTest netTestCmd     `command:"net-test" description:"Perform non-destructive DAOS networking tests"`
 }
 
 type healthCheckCmd struct {
@@ -68,7 +72,7 @@ func (cmd *healthCheckCmd) Execute([]string) error {
         return err
     }
 
-    sysInfo, err := cmd.apiProvider.GetSystemInfo()
+    sysInfo, err := cmd.apiProvider.GetSystemInfo(cmd.MustLogCtx())
     if err != nil {
         cmd.Errorf("failed to query system information: %v", err)
     }
@@ -166,3 +170,66 @@ func (cmd *healthCheckCmd) Execute([]string) error {
 
     return nil
 }
+
+type netTestCmd struct {
+    cmdutil.JSONOutputCmd
+    cmdutil.LogCmd
+    sysCmd
+    Ranks       ui.RankSetFlag  `short:"r" long:"ranks" description:"Use the specified ranks as test endpoints (default: all)"`
+    Tags        ui.RankSetFlag  `short:"t" long:"tags" description:"Use the specified tags on ranks" default:"0"`
+    XferSize    ui.ByteSizeFlag `short:"s" long:"size" description:"Per-RPC transfer size (send/reply)"`
+    MaxInflight uint            `short:"m" long:"max-inflight" description:"Maximum number of inflight RPCs"`
+    RepCount    uint            `short:"c" long:"rep-count" description:"Number of times to repeat the RPCs, per endpoint"`
+    TpsBytes    bool            `short:"y" long:"bytes" description:"Show throughput values in bytes per second"`
+    Verbose     bool            `short:"v" long:"verbose" description:"Display more detailed DAOS network testing information"`
+}
+
+func (cmd *netTestCmd) Execute(_ []string) error {
+    cfg := &daos.SelfTestConfig{
+        GroupName:       cmd.SysName,
+        EndpointRanks:   cmd.Ranks.Ranks(),
+        EndpointTags:    ranklist.RanksToUint32(cmd.Tags.Ranks()),
+        MaxInflightRPCs: cmd.MaxInflight,
+        Repetitions:     cmd.RepCount,
+    }
+    if cmd.XferSize.IsSet() {
+        // If set, use that size, otherwise use the zero value.
+        cfg.SendSizes = []uint64{cmd.XferSize.Bytes}
+        cfg.ReplySizes = cfg.SendSizes
+    }
+    if err := cfg.SetDefaults(); err != nil {
+        return err
+    }
+
+    if !cmd.JSONOutputEnabled() {
+        var cfgBuf strings.Builder
+        if err := pretty.PrintSelfTestConfig(&cfgBuf, cfg, cmd.Verbose); err != nil {
+            return err
+        }
+        cmd.Info(cfgBuf.String())
+        cmd.Info("Starting non-destructive network test (duration depends on performance)...\n\n")
+    }
+
+    res, err := RunSelfTest(cmd.MustLogCtx(), cfg)
+    if err != nil {
+        return err
+    }
+
+    if cmd.JSONOutputEnabled() {
+        return cmd.OutputJSON(struct {
+            Cfg *daos.SelfTestConfig   `json:"configuration"`
+            Res []*daos.SelfTestResult `json:"results"`
+        }{
+            Cfg: cfg,
+            Res: res,
+        }, nil)
+    }
+
+    var resBuf strings.Builder
+    if err := pretty.PrintSelfTestResults(&resBuf, res, cmd.Verbose, cmd.TpsBytes); err != nil {
+        return err
+    }
+    cmd.Info(resBuf.String())
+
+    return nil
+}
diff --git a/src/control/cmd/daos/health_test.go b/src/control/cmd/daos/health_test.go
new file mode 100644
index 00000000000..95565b36bb2
--- /dev/null
+++ b/src/control/cmd/daos/health_test.go
@@ -0,0 +1,64 @@
+//
+// (C) Copyright 2024 Intel Corporation.
+//
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+package main
+
+import (
+    "context"
+    "testing"
+
+    "github.com/dustin/go-humanize"
+    "github.com/google/go-cmp/cmp"
+    "github.com/google/go-cmp/cmp/cmpopts"
+
+    "github.com/daos-stack/daos/src/control/common/cmdutil"
+    "github.com/daos-stack/daos/src/control/common/test"
+    "github.com/daos-stack/daos/src/control/lib/daos"
+    "github.com/daos-stack/daos/src/control/lib/ranklist"
+    "github.com/daos-stack/daos/src/control/lib/ui"
+    "github.com/daos-stack/daos/src/control/logging"
+)
+
+func RunSelfTest(ctx context.Context, cfg *daos.SelfTestConfig) ([]*daos.SelfTestResult, error) {
+    return []*daos.SelfTestResult{}, nil
+}
+
+func TestDaos_netTestCmdExecute(t *testing.T) {
+    // Quickie smoke test for the UI -- will flesh out later.
+    var opts cliOptions
+    log, buf := logging.NewTestLogger(t.Name())
+    defer test.ShowBufferOnFailure(t, buf)
+    args := []string{
+        "health", "net-test",
+        "--ranks", "0-3",
+        "--tags", "4-9",
+        "--size", "20 MiB",
+        "--rep-count", "2222",
+        "--bytes", "--verbose",
+    }
+    expArgs := netTestCmd{}
+    expArgs.Ranks.Replace(ranklist.MustCreateRankSet("0-3"))
+    expArgs.Tags.Replace(ranklist.MustCreateRankSet("4-9"))
+    expArgs.XferSize.Bytes = 20 * humanize.MiByte
+    expArgs.RepCount = 2222
+    expArgs.Verbose = true
+    expArgs.TpsBytes = true
+
+    if err := parseOpts(args, &opts, log); err != nil {
+        t.Fatal(err)
+    }
+    cmpOpts := cmp.Options{
+        cmpopts.IgnoreUnexported(netTestCmd{}),
+        cmp.Comparer(func(a, b ranklist.RankSet) bool {
+            return a.String() == b.String()
+        }),
+        cmp.Comparer(func(a, b ui.ByteSizeFlag) bool {
+            return a.String() == b.String()
+        }),
+        cmpopts.IgnoreTypes(cmdutil.LogCmd{}, cmdutil.JSONOutputCmd{}),
+    }
+    test.CmpAny(t, "health net-test args", expArgs, opts.Health.NetTest, cmpOpts...)
+} diff --git a/src/control/cmd/daos/main.go b/src/control/cmd/daos/main.go index 5d6886cb2e5..95e13585340 100644 --- a/src/control/cmd/daos/main.go +++ b/src/control/cmd/daos/main.go @@ -20,6 +20,7 @@ import ( "github.com/daos-stack/daos/src/control/common/cmdutil" "github.com/daos-stack/daos/src/control/fault" "github.com/daos-stack/daos/src/control/lib/atm" + "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/logging" ) @@ -182,7 +183,7 @@ or query/manage an object inside a container.` // Initialize the daos debug system first so that // any allocations made as part of argument parsing // are logged when running under NLT. - debugFini, err := initDaosDebug() + debugFini, err := daos.InitLogging(daos.UnsetLogMask) if err != nil { exitWithError(log, err) } diff --git a/src/control/cmd/daos/pretty/selftest.go b/src/control/cmd/daos/pretty/selftest.go new file mode 100644 index 00000000000..9a18dc0cb44 --- /dev/null +++ b/src/control/cmd/daos/pretty/selftest.go @@ -0,0 +1,262 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package pretty + +import ( + "fmt" + "io" + "sort" + + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/hostlist" + "github.com/daos-stack/daos/src/control/lib/ranklist" + "github.com/daos-stack/daos/src/control/lib/txtfmt" + "github.com/daos-stack/daos/src/control/lib/ui" +) + +type timeUnit uint64 + +const ( + ns timeUnit = 0 + us timeUnit = 1000 + ms timeUnit = 1000 * 1000 + s timeUnit = 1000 * 1000 * 1000 +) + +func (u timeUnit) String() string { + switch u { + case ns: + return "ns" + case us: + return "μs" + case ms: + return "ms" + case s: + return "s" + default: + return "unknown" + } +} + +func printLatencyVal(val float64, u timeUnit) string { + return fmt.Sprintf("%.02f%s", val/float64(u), u) +} + +// PrintSelfTestResult generates a human-readable representation of the supplied +// daos.SelfTestResult struct and writes it to the supplied io.Writer. 
+func PrintSelfTestResult(out io.Writer, result *daos.SelfTestResult, verbose, showBytes bool) error { + if result == nil { + return errors.Errorf("nil %T", result) + } + + rpcThroughput := float64(result.MasterLatency.Succeeded()) / result.Duration.Seconds() + + epRanks := ranklist.NewRankSet() + epTgts := hostlist.NewNumericSet() + for _, ep := range result.TargetEndpoints { + epRanks.Add(ep.Rank) + epTgts.Add(uint(ep.Tag)) + } + srvEpTitle := "Server Endpoint" + if epRanks.Count() > 1 { + srvEpTitle += "s" + } + summary := []txtfmt.TableRow{ + {srvEpTitle: epRanks.RangedString() + ":" + epTgts.RangedString()}, + {"RPC Throughput": fmt.Sprintf("%.02f RPC/s", rpcThroughput)}, + } + if result.SendSize > 0 || result.ReplySize > 0 { + suffix := "B/s" + bw := rpcThroughput * (float64(result.SendSize) + float64(result.ReplySize)) + if !showBytes { + bw *= 8 + suffix = "bps" + } + summary = append(summary, txtfmt.TableRow{ + "RPC Bandwidth": ui.FmtHumanSize(bw, suffix, false), + }) + } + _, masterBuckets := result.MasterLatency.Percentiles() + summary = append(summary, txtfmt.TableRow{ + "Average Latency": printLatencyVal(float64(result.MasterLatency.Average()), ms), + }) + if l, found := masterBuckets[95]; found { + summary = append(summary, txtfmt.TableRow{ + "95% Latency": printLatencyVal(l.UpperBound, ms), + }) + } + if l, found := masterBuckets[99]; found { + summary = append(summary, txtfmt.TableRow{ + "99% Latency": printLatencyVal(l.UpperBound, ms), + }) + } + if verbose { + summary = append(summary, []txtfmt.TableRow{ + {"Client Endpoint": result.MasterEndpoint.String()}, + {"Duration": result.Duration.String()}, + {"Repetitions": fmt.Sprintf("%d", result.Repetitions)}, + {"Send Size": ui.FmtHumanSize(float64(result.SendSize), "B", true)}, + {"Reply Size": ui.FmtHumanSize(float64(result.ReplySize), "B", true)}, + }...) + } + if result.MasterLatency.FailCount > 0 { + failPct := (float64(result.MasterLatency.FailCount) / float64(result.Repetitions)) * 100 + summary = append(summary, txtfmt.TableRow{ + "Failed RPCs": fmt.Sprintf("%d (%.01f%%)", result.MasterLatency.FailCount, failPct), + }) + } + ef := txtfmt.NewEntityFormatter("Client/Server Network Test Summary", 2) + fmt.Fprintln(out, ef.Format(summary)) + + if !verbose { + return nil + } + + fmt.Fprintln(out, "Per-Target Latency Results") + iw := txtfmt.NewIndentWriter(out) + + var hasFailed bool + dispUnit := ms // TODO: Calculate based on average value? 
+ pctTitles := make(map[uint64]string) + var table []txtfmt.TableRow + for _, ep := range result.TargetEndpoints { + el, found := result.TargetLatencies[ep] + if !found { + continue + } + + if el.FailCount > 0 { + hasFailed = true + } + pcts, buckets := el.Percentiles() + + row := txtfmt.TableRow{ + "Target": ep.String(), + "Min": printLatencyVal(float64(el.Min), dispUnit), + "Max": printLatencyVal(float64(el.Max), dispUnit), + "Failed": fmt.Sprintf("%.01f%%", float64(el.FailCount)/float64(el.TotalRPCs)*100), + } + if verbose { + row["Average"] = printLatencyVal(float64(el.Average()), dispUnit) + row["StdDev"] = printLatencyVal(el.StdDev(), dispUnit) + } + + for _, pct := range pcts { + pctTitles[pct] = fmt.Sprintf("%d%%", pct) + row[pctTitles[pct]] = printLatencyVal(buckets[pct].UpperBound, dispUnit) + } + + table = append(table, row) + } + + var pctKeys []uint64 + for key := range pctTitles { + pctKeys = append(pctKeys, key) + } + sort.Slice(pctKeys, func(a, b int) bool { + return pctKeys[a] < pctKeys[b] + }) + titles := []string{"Target", "Min"} + for _, key := range pctKeys { + titles = append(titles, pctTitles[key]) + } + titles = append(titles, "Max") + if verbose { + titles = append(titles, "Average") + titles = append(titles, "StdDev") + } + if hasFailed { + titles = append(titles, "Failed") + } + tf := txtfmt.NewTableFormatter(titles...) + tf.InitWriter(iw) + tf.Format(table) + + return nil +} + +// PrintSelfTestResults generates a human-readable representation of the supplied +// slice of daos.SelfTestResult structs and writes it to the supplied io.Writer. +func PrintSelfTestResults(out io.Writer, results []*daos.SelfTestResult, verbose, showBytes bool) error { + if len(results) == 0 { + fmt.Fprintln(out, "No test results.") + } + if len(results) > 1 { + fmt.Fprintf(out, "Showing %d self test results:\n", len(results)) + out = txtfmt.NewIndentWriter(out) + } + for _, res := range results { + if err := PrintSelfTestResult(out, res, verbose, showBytes); err != nil { + return err + } + } + + return nil +} + +// PrintSelfTestConfig generates a human-readable representation of the self_test configuration. 
+func PrintSelfTestConfig(out io.Writer, cfg *daos.SelfTestConfig, verbose bool) error { + if cfg == nil { + return errors.Errorf("nil %T", cfg) + } + + srvRow := func(r []ranklist.Rank) txtfmt.TableRow { + srvTitle := "Server" + if len(r) == 1 { + return txtfmt.TableRow{srvTitle: fmt.Sprintf("%d", r[0])} + } + srvTitle += "s" + if len(r) == 0 { + return txtfmt.TableRow{srvTitle: "All"} + } + return txtfmt.TableRow{srvTitle: ranklist.RankSetFromRanks(r).RangedString()} + } + rpcSizeRow := func(dir string, sizes []uint64) txtfmt.TableRow { + title := fmt.Sprintf("%s RPC Size", dir) + if len(sizes) == 0 { + return txtfmt.TableRow{title: "None"} + } else if len(sizes) == 1 { + return txtfmt.TableRow{title: ui.FmtHumanSize(float64(sizes[0]), "B", true)} + } + sizeStrs := make([]string, len(sizes)) + for i, size := range sizes { + sizeStrs[i] = ui.FmtHumanSize(float64(size), "B", true) + } + return txtfmt.TableRow{title + "s": fmt.Sprintf("%v", sizeStrs)} + } + cfgRows := []txtfmt.TableRow{ + srvRow(cfg.EndpointRanks), + rpcSizeRow("Send", cfg.SendSizes), + rpcSizeRow("Reply", cfg.ReplySizes), + {"RPCs Per Server": fmt.Sprintf("%d", cfg.Repetitions)}, + } + if verbose { + tagRow := func(t []uint32) txtfmt.TableRow { + tagTitle := "Tag" + if len(t) == 1 { + return txtfmt.TableRow{tagTitle: fmt.Sprintf("%d", t[0])} + } + tagTitle += "s" + if len(t) == 0 { + return txtfmt.TableRow{tagTitle: "ERROR (0 tags)"} // Can't(?) happen... + } + return txtfmt.TableRow{tagTitle: ranklist.RankSetFromRanks(ranklist.RanksFromUint32(t)).RangedString()} + } + cfgRows = append(cfgRows, []txtfmt.TableRow{ + {"System Name": cfg.GroupName}, + tagRow(cfg.EndpointTags), + {"Max In-Flight RPCs": fmt.Sprintf("%d", cfg.MaxInflightRPCs)}, + }...) + } + + ef := txtfmt.NewEntityFormatter("Client/Server Network Test Parameters", 2) + fmt.Fprintln(out, ef.Format(cfgRows)) + + return nil +} diff --git a/src/control/cmd/daos/pretty/selftest_test.go b/src/control/cmd/daos/pretty/selftest_test.go new file mode 100644 index 00000000000..8ae57f4347a --- /dev/null +++ b/src/control/cmd/daos/pretty/selftest_test.go @@ -0,0 +1,371 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package pretty_test + +import ( + "errors" + "strings" + "testing" + "time" + + "github.com/daos-stack/daos/src/control/cmd/daos/pretty" + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/ranklist" +) + +func TestPretty_PrintSelfTestConfig(t *testing.T) { + genCfg := func(xfrm func(cfg *daos.SelfTestConfig)) *daos.SelfTestConfig { + cfg := &daos.SelfTestConfig{} + cfg.SetDefaults() + if xfrm != nil { + xfrm(cfg) + } + return cfg + } + for name, tc := range map[string]struct { + cfg *daos.SelfTestConfig + verbose bool + expStr string + expErr error + }{ + "nil": { + expErr: errors.New("nil"), + }, + "defaults": { + cfg: genCfg(nil), + expStr: ` +Client/Server Network Test Parameters +------------------------------------- + Servers : All + Send RPC Size : 1.00 MiB + Reply RPC Size : 1.00 MiB + RPCs Per Server: 10000 + +`, + }, + "single server": { + cfg: genCfg(func(cfg *daos.SelfTestConfig) { + cfg.EndpointRanks = []ranklist.Rank{1} + }), + expStr: ` +Client/Server Network Test Parameters +------------------------------------- + Server : 1 + Send RPC Size : 1.00 MiB + Reply RPC Size : 1.00 MiB + RPCs Per Server: 10000 + +`, + }, + "custom": { + cfg: genCfg(func(cfg *daos.SelfTestConfig) { + cfg.EndpointRanks = []ranklist.Rank{0, 1, 2} + cfg.SendSizes = []uint64{1024, 1024 * 1024} + cfg.ReplySizes = []uint64{2048 * 1024, 2048 * 1024 * 1024} + }), + expStr: ` +Client/Server Network Test Parameters +------------------------------------- + Servers : [0-2] + Send RPC Sizes : [1.00 KiB 1.00 MiB] + Reply RPC Sizes: [2.00 MiB 2.00 GiB] + RPCs Per Server: 10000 + +`, + }, + "defaults - verbose": { + cfg: genCfg(nil), + verbose: true, + expStr: ` +Client/Server Network Test Parameters +------------------------------------- + Servers : All + Send RPC Size : 1.00 MiB + Reply RPC Size : 1.00 MiB + RPCs Per Server : 10000 + System Name : daos_server + Tag : 0 + Max In-Flight RPCs: 16 + +`, + }, + "custom - verbose": { + cfg: genCfg(func(cfg *daos.SelfTestConfig) { + cfg.EndpointRanks = []ranklist.Rank{0, 1, 2} + cfg.EndpointTags = []uint32{0, 1, 2} + cfg.SendSizes = []uint64{1024, 1024 * 1024} + cfg.ReplySizes = []uint64{2048 * 1024, 2048 * 1024 * 1024} + }), + verbose: true, + expStr: ` +Client/Server Network Test Parameters +------------------------------------- + Servers : [0-2] + Send RPC Sizes : [1.00 KiB 1.00 MiB] + Reply RPC Sizes : [2.00 MiB 2.00 GiB] + RPCs Per Server : 10000 + System Name : daos_server + Tags : [0-2] + Max In-Flight RPCs: 16 + +`, + }, + "no sizes?": { + cfg: genCfg(func(cfg *daos.SelfTestConfig) { + cfg.SendSizes = []uint64{} + cfg.ReplySizes = []uint64{} + }), + verbose: true, + expStr: ` +Client/Server Network Test Parameters +------------------------------------- + Servers : All + Send RPC Size : None + Reply RPC Size : None + RPCs Per Server : 10000 + System Name : daos_server + Tag : 0 + Max In-Flight RPCs: 16 + +`, + }, + "no targets?": { + cfg: genCfg(func(cfg *daos.SelfTestConfig) { + cfg.EndpointTags = []uint32{} + }), + verbose: true, + expStr: ` +Client/Server Network Test Parameters +------------------------------------- + Servers : All + Send RPC Size : 1.00 MiB + Reply RPC Size : 1.00 MiB + RPCs Per Server : 10000 + System Name : daos_server + Tags : ERROR (0 tags) + Max In-Flight RPCs: 16 + +`, + }, + } { + t.Run(name, func(t *testing.T) { + var bld strings.Builder + gotErr := 
pretty.PrintSelfTestConfig(&bld, tc.cfg, tc.verbose) + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + test.CmpAny(t, "Config Output", strings.TrimLeft(tc.expStr, "\n"), bld.String()) + }) + } +} + +func genResult(xfrm func(result *daos.SelfTestResult)) *daos.SelfTestResult { + cfg := &daos.SelfTestConfig{} + cfg.SetDefaults() + result := &daos.SelfTestResult{ + MasterEndpoint: daos.SelfTestEndpoint{Rank: 3, Tag: 0}, + TargetEndpoints: []daos.SelfTestEndpoint{ + {Rank: 0, Tag: 0}, + {Rank: 1, Tag: 0}, + {Rank: 2, Tag: 0}, + }, + Repetitions: cfg.Repetitions * 3, + SendSize: cfg.SendSizes[0], + ReplySize: cfg.ReplySizes[0], + BufferAlignment: cfg.BufferAlignment, + Duration: 8500 * time.Millisecond, + MasterLatency: &daos.EndpointLatency{}, + } + for i := int64(1); i <= int64(result.Repetitions); i++ { + result.MasterLatency.AddValue(i * 1000) + result.AddTargetLatency(ranklist.Rank(i%3), 0, i*1000) + } + if xfrm != nil { + xfrm(result) + } + return result +} + +func TestPrettyPrintSelfTestResult(t *testing.T) { + for name, tc := range map[string]struct { + result *daos.SelfTestResult + verbose bool + showBytes bool + expStr string + expErr error + }{ + "nil": { + expErr: errors.New("nil"), + }, + "non-verbose, bps": { + result: genResult(nil), + expStr: ` +Client/Server Network Test Summary +---------------------------------- + Server Endpoints: [0-2]:0 + RPC Throughput : 3529.41 RPC/s + RPC Bandwidth : 59.21 Gbps + Average Latency : 15.00ms + 95% Latency : 28.50ms + 99% Latency : 29.70ms + +`, + }, + "non-verbose, bytes": { + result: genResult(nil), + showBytes: true, + expStr: ` +Client/Server Network Test Summary +---------------------------------- + Server Endpoints: [0-2]:0 + RPC Throughput : 3529.41 RPC/s + RPC Bandwidth : 7.40 GB/s + Average Latency : 15.00ms + 95% Latency : 28.50ms + 99% Latency : 29.70ms + +`, + }, + "verbose, bps": { + result: genResult(nil), + verbose: true, + expStr: ` +Client/Server Network Test Summary +---------------------------------- + Server Endpoints: [0-2]:0 + RPC Throughput : 3529.41 RPC/s + RPC Bandwidth : 59.21 Gbps + Average Latency : 15.00ms + 95% Latency : 28.50ms + 99% Latency : 29.70ms + Client Endpoint : 3:0 + Duration : 8.5s + Repetitions : 30000 + Send Size : 1.00 MiB + Reply Size : 1.00 MiB + +Per-Target Latency Results + Target Min 50% 75% 90% 95% 99% Max Average StdDev + ------ --- --- --- --- --- --- --- ------- ------ + 0:0 0.00ms 15.00ms 22.50ms 27.00ms 28.50ms 29.70ms 30.00ms 15.00ms 8.66ms + 1:0 0.00ms 15.00ms 22.50ms 27.00ms 28.50ms 29.70ms 30.00ms 15.00ms 8.66ms + 2:0 0.00ms 15.00ms 22.50ms 27.00ms 28.50ms 29.70ms 30.00ms 15.00ms 8.66ms +`, + }, + "verbose with failures, bytes": { + result: genResult(func(res *daos.SelfTestResult) { + for i := int64(1); i <= int64(res.Repetitions/4); i++ { + res.MasterLatency.AddValue(-1) + res.AddTargetLatency(ranklist.Rank(i%3), 0, -1) + } + }), + verbose: true, + showBytes: true, + expStr: ` +Client/Server Network Test Summary +---------------------------------- + Server Endpoints: [0-2]:0 + RPC Throughput : 3529.41 RPC/s + RPC Bandwidth : 7.40 GB/s + Average Latency : 15.00ms + 95% Latency : 28.50ms + 99% Latency : 29.70ms + Client Endpoint : 3:0 + Duration : 8.5s + Repetitions : 30000 + Send Size : 1.00 MiB + Reply Size : 1.00 MiB + Failed RPCs : 7500 (25.0%) + +Per-Target Latency Results + Target Min 50% 75% 90% 95% 99% Max Average StdDev Failed + ------ --- --- --- --- --- --- --- ------- ------ ------ + 0:0 0.00ms 15.00ms 22.50ms 27.00ms 28.50ms 29.70ms 30.00ms 
15.00ms 8.66ms 20.0% + 1:0 0.00ms 15.00ms 22.50ms 27.00ms 28.50ms 29.70ms 30.00ms 15.00ms 8.66ms 20.0% + 2:0 0.00ms 15.00ms 22.50ms 27.00ms 28.50ms 29.70ms 30.00ms 15.00ms 8.66ms 20.0% +`, + }, + } { + t.Run(name, func(t *testing.T) { + var bld strings.Builder + gotErr := pretty.PrintSelfTestResult(&bld, tc.result, tc.verbose, tc.showBytes) + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + test.CmpAny(t, "Self Test Result", strings.TrimLeft(tc.expStr, "\n"), bld.String()) + }) + } +} + +func TestPretty_PrintSelfTestResults(t *testing.T) { + for name, tc := range map[string]struct { + results []*daos.SelfTestResult + verbose bool + expStr string + expErr error + }{ + "zero results": { + expStr: ` +No test results. +`, + }, + "one result": { + results: []*daos.SelfTestResult{genResult(nil)}, + expStr: ` +Client/Server Network Test Summary +---------------------------------- + Server Endpoints: [0-2]:0 + RPC Throughput : 3529.41 RPC/s + RPC Bandwidth : 59.21 Gbps + Average Latency : 15.00ms + 95% Latency : 28.50ms + 99% Latency : 29.70ms + +`, + }, + "two results": { + results: []*daos.SelfTestResult{genResult(nil), genResult(nil)}, + expStr: ` +Showing 2 self test results: + Client/Server Network Test Summary + ---------------------------------- + Server Endpoints: [0-2]:0 + RPC Throughput : 3529.41 RPC/s + RPC Bandwidth : 59.21 Gbps + Average Latency : 15.00ms + 95% Latency : 28.50ms + 99% Latency : 29.70ms + + Client/Server Network Test Summary + ---------------------------------- + Server Endpoints: [0-2]:0 + RPC Throughput : 3529.41 RPC/s + RPC Bandwidth : 59.21 Gbps + Average Latency : 15.00ms + 95% Latency : 28.50ms + 99% Latency : 29.70ms + +`, + }, + } { + t.Run(name, func(t *testing.T) { + var bld strings.Builder + gotErr := pretty.PrintSelfTestResults(&bld, tc.results, tc.verbose, false) + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + test.CmpAny(t, "Config Output", strings.TrimLeft(tc.expStr, "\n"), bld.String()) + }) + } +} diff --git a/src/control/cmd/daos/stubbed.go b/src/control/cmd/daos/stubbed.go new file mode 100644 index 00000000000..4a08ad77255 --- /dev/null +++ b/src/control/cmd/daos/stubbed.go @@ -0,0 +1,15 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build !test_stubs +// +build !test_stubs + +package main + +import "github.com/daos-stack/daos/src/control/lib/daos/api" + +var ( + RunSelfTest = api.RunSelfTest +) diff --git a/src/control/cmd/daos/system.go b/src/control/cmd/daos/system.go index bdb189494f8..b0deb41f3a8 100644 --- a/src/control/cmd/daos/system.go +++ b/src/control/cmd/daos/system.go @@ -19,7 +19,7 @@ type systemQueryCmd struct { } func (cmd *systemQueryCmd) Execute(_ []string) error { - sysInfo, err := cmd.apiProvider.GetSystemInfo() + sysInfo, err := cmd.apiProvider.GetSystemInfo(cmd.MustLogCtx()) if err != nil { return errors.Wrap(err, "failed to query DAOS system") } diff --git a/src/control/cmd/daos/util.go b/src/control/cmd/daos/util.go index 1800a130e2a..d5b128bf9a4 100644 --- a/src/control/cmd/daos/util.go +++ b/src/control/cmd/daos/util.go @@ -275,20 +275,24 @@ type daosCaller interface { initDAOS() (func(), error) } +type sysCmd struct { + SysName string +} + +func (sc *sysCmd) setSysName(sysName string) { + sc.SysName = sysName +} + type daosCmd struct { cmdutil.NoArgsCmd cmdutil.JSONOutputCmd cmdutil.LogCmd + sysCmd apiProvider *api.Provider - SysName string -} - -func (dc *daosCmd) setSysName(sysName string) { - dc.SysName = sysName } func (dc *daosCmd) initDAOS() (func(), error) { - provider, err := api.NewProvider(dc.Logger) + provider, err := api.NewProvider(dc.Logger, false) if err != nil { return func() {}, err } @@ -297,16 +301,6 @@ func (dc *daosCmd) initDAOS() (func(), error) { return provider.Cleanup, nil } -func initDaosDebug() (func(), error) { - if rc := C.daos_debug_init(nil); rc != 0 { - return nil, errors.Wrap(daosError(rc), "daos_debug_init() failed") - } - - return func() { - C.daos_debug_fini() - }, nil -} - func resolveDunsPath(path string, ap *C.struct_cmd_args_s) error { if path == "" { return errors.New("empty path") diff --git a/src/control/cmd/dmg/pool.go b/src/control/cmd/dmg/pool.go index cbd1209a41b..d7d267fbbe3 100644 --- a/src/control/cmd/dmg/pool.go +++ b/src/control/cmd/dmg/pool.go @@ -120,33 +120,8 @@ func (trf *tierRatioFlag) UnmarshalFlag(fv string) error { return nil } -type sizeFlag struct { - bytes uint64 -} - -func (sf sizeFlag) IsSet() bool { - return sf.bytes > 0 -} - -func (sf sizeFlag) String() string { - return humanize.Bytes(sf.bytes) -} - -func (sf *sizeFlag) UnmarshalFlag(fv string) (err error) { - if fv == "" { - return errors.New("no size specified") - } - - sf.bytes, err = humanize.ParseBytes(fv) - if err != nil { - return errors.Errorf("invalid size %q", fv) - } - - return nil -} - type poolSizeFlag struct { - sizeFlag + ui.ByteSizeFlag availRatio uint64 } @@ -155,7 +130,7 @@ func (psf poolSizeFlag) IsRatio() bool { } func (psf poolSizeFlag) IsSet() bool { - return psf.sizeFlag.IsSet() || psf.IsRatio() + return psf.ByteSizeFlag.IsSet() || psf.IsRatio() } func (psf poolSizeFlag) String() string { @@ -163,7 +138,7 @@ func (psf poolSizeFlag) String() string { return fmt.Sprintf("%d%%", psf.availRatio) } - return psf.sizeFlag.String() + return psf.ByteSizeFlag.String() } func (psf *poolSizeFlag) UnmarshalFlag(fv string) error { @@ -182,7 +157,7 @@ func (psf *poolSizeFlag) UnmarshalFlag(fv string) error { return nil } - return psf.sizeFlag.UnmarshalFlag(fv) + return psf.ByteSizeFlag.UnmarshalFlag(fv) } // PoolCreateCmd is the struct representing the command to create a DAOS pool. 
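The hunks above and below replace dmg's private sizeFlag with the shared ui.ByteSizeFlag, the same flag type the new net-test command uses for --size. As a minimal sketch of the behavior being relied on here, mirroring the removed sizeFlag (humanize-based parsing); the actual lib/ui implementation may differ in detail:

package ui // sketch only, not the actual lib/ui source

import (
    "errors"
    "fmt"

    "github.com/dustin/go-humanize"
)

// ByteSizeFlag accepts human-readable sizes such as "20 MiB" and exposes
// the parsed byte count, as the removed sizeFlag did.
type ByteSizeFlag struct {
    Bytes uint64
}

func (f ByteSizeFlag) IsSet() bool    { return f.Bytes > 0 }
func (f ByteSizeFlag) String() string { return humanize.Bytes(f.Bytes) }

// UnmarshalFlag satisfies the go-flags flags.Unmarshaler interface.
func (f *ByteSizeFlag) UnmarshalFlag(fv string) (err error) {
    if fv == "" {
        return errors.New("no size specified")
    }
    if f.Bytes, err = humanize.ParseBytes(fv); err != nil {
        return fmt.Errorf("invalid size %q", fv)
    }
    return nil
}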
@@ -199,8 +174,8 @@ type PoolCreateCmd struct { TierRatio tierRatioFlag `short:"t" long:"tier-ratio" description:"Percentage of storage tiers for pool storage (auto; default: 6,94)"` NumRanks uint32 `short:"k" long:"nranks" description:"Number of ranks to use (auto)"` NumSvcReps uint32 `short:"v" long:"nsvc" description:"Number of pool service replicas"` - ScmSize sizeFlag `short:"s" long:"scm-size" description:"Per-engine SCM allocation for DAOS pool (manual)"` - NVMeSize sizeFlag `short:"n" long:"nvme-size" description:"Per-engine NVMe allocation for DAOS pool (manual)"` + ScmSize ui.ByteSizeFlag `short:"s" long:"scm-size" description:"Per-engine SCM allocation for DAOS pool (manual)"` + NVMeSize ui.ByteSizeFlag `short:"n" long:"nvme-size" description:"Per-engine NVMe allocation for DAOS pool (manual)"` RankList ui.RankSetFlag `short:"r" long:"ranks" description:"Storage engine unique identifiers (ranks) for DAOS pool"` Args struct { @@ -259,7 +234,7 @@ func (cmd *PoolCreateCmd) storageAutoTotal(req *control.PoolCreateReq) error { req.NumRanks = cmd.NumRanks req.TierRatio = cmd.TierRatio.Ratios() - req.TotalBytes = cmd.Size.bytes + req.TotalBytes = cmd.Size.Bytes scmPercentage := ratio2Percentage(cmd.Logger, req.TierRatio[0], req.TierRatio[1]) msg := fmt.Sprintf("Creating DAOS pool with automatic storage allocation: "+ @@ -280,8 +255,8 @@ func (cmd *PoolCreateCmd) storageManual(req *control.PoolCreateReq) error { return errIncompatFlags("tier-ratio", "scm-size") } - scmBytes := cmd.ScmSize.bytes - nvmeBytes := cmd.NVMeSize.bytes + scmBytes := cmd.ScmSize.Bytes + nvmeBytes := cmd.NVMeSize.Bytes req.TierBytes = []uint64{scmBytes, nvmeBytes} msg := fmt.Sprintf("Creating DAOS pool with manual per-engine storage allocation:"+ diff --git a/src/control/cmd/dmg/pretty/telemetry.go b/src/control/cmd/dmg/pretty/telemetry.go index fb234f048ae..70091c3f8ee 100644 --- a/src/control/cmd/dmg/pretty/telemetry.go +++ b/src/control/cmd/dmg/pretty/telemetry.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -13,6 +13,7 @@ import ( "strings" "github.com/daos-stack/daos/src/control/lib/control" + "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/lib/txtfmt" ) @@ -76,7 +77,7 @@ func PrintMetricsQueryResp(out io.Writer, resp *control.MetricsQueryResp) error return nil } -func printMetrics(out io.Writer, metrics []control.Metric, metricType control.MetricType) { +func printMetrics(out io.Writer, metrics []daos.Metric, metricType daos.MetricType) { if len(metrics) == 0 { fmt.Fprintf(out, "No metrics found\n") return @@ -92,7 +93,7 @@ func printMetrics(out io.Writer, metrics []control.Metric, metricType control.Me for _, m := range metrics { switch realM := m.(type) { - case *control.SimpleMetric: + case *daos.SimpleMetric: labels := metricLabelsToStr(realM.Labels) name := metricType.String() table = append(table, txtfmt.TableRow{ @@ -100,7 +101,7 @@ func printMetrics(out io.Writer, metrics []control.Metric, metricType control.Me labelTitle: labels, valTitle: fmt.Sprintf("%g", realM.Value), }) - case *control.SummaryMetric: + case *daos.SummaryMetric: labels := metricLabelsToStr(realM.Labels) table = append(table, txtfmt.TableRow{ nameTitle: "Sample Count", @@ -119,7 +120,7 @@ func printMetrics(out io.Writer, metrics []control.Metric, metricType control.Me valTitle: fmt.Sprintf("%g", realM.Quantiles[quant]), }) } - case *control.HistogramMetric: + case *daos.HistogramMetric: labels := metricLabelsToStr(realM.Labels) table = append(table, txtfmt.TableRow{ nameTitle: "Sample Count", @@ -150,7 +151,7 @@ func printMetrics(out io.Writer, metrics []control.Metric, metricType control.Me tablePrint.Format(table) } -func metricLabelsToStr(labels control.LabelMap) string { +func metricLabelsToStr(labels daos.MetricLabelMap) string { if len(labels) == 0 { return "N/A" } diff --git a/src/control/cmd/dmg/pretty/telemetry_test.go b/src/control/cmd/dmg/pretty/telemetry_test.go index 66881a091cd..09e82c6ce90 100644 --- a/src/control/cmd/dmg/pretty/telemetry_test.go +++ b/src/control/cmd/dmg/pretty/telemetry_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -13,6 +13,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/lib/control" + "github.com/daos-stack/daos/src/control/lib/daos" ) func TestPretty_PrintMetricsListResp(t *testing.T) { @@ -30,16 +31,16 @@ func TestPretty_PrintMetricsListResp(t *testing.T) { }, "empty list": { resp: &control.MetricsListResp{ - AvailableMetricSets: []*control.MetricSet{}, + AvailableMetricSets: []*daos.MetricSet{}, }, }, "one item": { resp: &control.MetricsListResp{ - AvailableMetricSets: []*control.MetricSet{ + AvailableMetricSets: []*daos.MetricSet{ { Name: "test_metric_1", Description: "Test Metric", - Type: control.MetricTypeGeneric, + Type: daos.MetricTypeGeneric, }, }, }, @@ -51,21 +52,21 @@ test_metric_1 Generic Test Metric }, "multi item": { resp: &control.MetricsListResp{ - AvailableMetricSets: []*control.MetricSet{ + AvailableMetricSets: []*daos.MetricSet{ { Name: "test_metric_1", Description: "Test metric", - Type: control.MetricTypeGauge, + Type: daos.MetricTypeGauge, }, { Name: "test_metric_2", Description: "Another test metric", - Type: control.MetricTypeSummary, + Type: daos.MetricTypeSummary, }, { Name: "funny_hats", Description: "Hilarious headwear", - Type: control.MetricTypeCounter, + Type: daos.MetricTypeCounter, }, }, }, @@ -79,7 +80,7 @@ funny_hats Counter Hilarious headwear }, "write failure": { resp: &control.MetricsListResp{ - AvailableMetricSets: []*control.MetricSet{ + AvailableMetricSets: []*daos.MetricSet{ { Name: "test_metric_1", Description: "Test Metric", @@ -117,12 +118,12 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { }, "empty list": { resp: &control.MetricsQueryResp{ - MetricSets: []*control.MetricSet{}, + MetricSets: []*daos.MetricSet{}, }, }, "set without values": { resp: &control.MetricsQueryResp{ - MetricSets: []*control.MetricSet{ + MetricSets: []*daos.MetricSet{ { Name: "test_metric_1", Description: "Test Metric", @@ -138,26 +139,26 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { }, "untyped": { resp: &control.MetricsQueryResp{ - MetricSets: []*control.MetricSet{ + MetricSets: []*daos.MetricSet{ { Name: "my_metric", Description: "A test metric", - Type: control.MetricTypeGeneric, - Metrics: []control.Metric{ - &control.SimpleMetric{ + Type: daos.MetricTypeGeneric, + Metrics: []daos.Metric{ + &daos.SimpleMetric{ Labels: map[string]string{ "foo": "bar", }, Value: 2.25, }, - &control.SimpleMetric{ + &daos.SimpleMetric{ Labels: map[string]string{ "ring": "one", "bearer": "frodo", }, Value: 5, }, - &control.SimpleMetric{ + &daos.SimpleMetric{ Labels: map[string]string{}, Value: 125, }, @@ -178,26 +179,26 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { }, "counter type": { resp: &control.MetricsQueryResp{ - MetricSets: []*control.MetricSet{ + MetricSets: []*daos.MetricSet{ { Name: "my_counter", Description: "A test metric", - Type: control.MetricTypeCounter, - Metrics: []control.Metric{ - &control.SimpleMetric{ + Type: daos.MetricTypeCounter, + Metrics: []daos.Metric{ + &daos.SimpleMetric{ Labels: map[string]string{ "foo": "bar", }, Value: 2.25, }, - &control.SimpleMetric{ + &daos.SimpleMetric{ Labels: map[string]string{ "ring": "one", "bearer": "frodo", }, Value: 5, }, - &control.SimpleMetric{ + &daos.SimpleMetric{ Labels: map[string]string{}, Value: 125, }, @@ -218,26 +219,26 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { }, "gauge type": { resp: &control.MetricsQueryResp{ - MetricSets: []*control.MetricSet{ + MetricSets: 
[]*daos.MetricSet{ { Name: "my_gauge", Description: "A test metric", - Type: control.MetricTypeGauge, - Metrics: []control.Metric{ - &control.SimpleMetric{ + Type: daos.MetricTypeGauge, + Metrics: []daos.Metric{ + &daos.SimpleMetric{ Labels: map[string]string{ "foo": "bar", }, Value: 2.25, }, - &control.SimpleMetric{ + &daos.SimpleMetric{ Labels: map[string]string{ "ring": "one", "bearer": "frodo", }, Value: 5, }, - &control.SimpleMetric{ + &daos.SimpleMetric{ Labels: map[string]string{}, Value: 125, }, @@ -258,14 +259,14 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { }, "summary type": { resp: &control.MetricsQueryResp{ - MetricSets: []*control.MetricSet{ + MetricSets: []*daos.MetricSet{ { Name: "my_summary", Description: "A test metric", - Type: control.MetricTypeSummary, - Metrics: []control.Metric{ - &control.SummaryMetric{ - Labels: control.LabelMap{ + Type: daos.MetricTypeSummary, + Metrics: []daos.Metric{ + &daos.SummaryMetric{ + Labels: daos.MetricLabelMap{ "foo": "bar", }, SampleCount: 55, @@ -275,8 +276,8 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { 0.5: 33.333, }, }, - &control.SummaryMetric{ - Labels: control.LabelMap{}, + &daos.SummaryMetric{ + Labels: daos.MetricLabelMap{}, SampleCount: 102, SampleSum: 19.84, Quantiles: map[float64]float64{ @@ -304,19 +305,19 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { }, "histogram type": { resp: &control.MetricsQueryResp{ - MetricSets: []*control.MetricSet{ + MetricSets: []*daos.MetricSet{ { Name: "my_histogram", Description: "A test metric", - Type: control.MetricTypeHistogram, - Metrics: []control.Metric{ - &control.HistogramMetric{ - Labels: control.LabelMap{ + Type: daos.MetricTypeHistogram, + Metrics: []daos.Metric{ + &daos.HistogramMetric{ + Labels: daos.MetricLabelMap{ "foo": "bar", }, SampleCount: 55, SampleSum: 6094.27, - Buckets: []*control.MetricBucket{ + Buckets: []*daos.MetricBucket{ { UpperBound: 500, CumulativeCount: 2, @@ -327,8 +328,8 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { }, }, }, - &control.HistogramMetric{ - Labels: control.LabelMap{}, + &daos.HistogramMetric{ + Labels: daos.MetricLabelMap{}, SampleCount: 22, SampleSum: 102, }, @@ -354,26 +355,26 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { }, "multiple sets": { resp: &control.MetricsQueryResp{ - MetricSets: []*control.MetricSet{ + MetricSets: []*daos.MetricSet{ { Name: "my_counter", Description: "A test metric", - Type: control.MetricTypeCounter, - Metrics: []control.Metric{ - &control.SimpleMetric{ + Type: daos.MetricTypeCounter, + Metrics: []daos.Metric{ + &daos.SimpleMetric{ Labels: map[string]string{ "foo": "bar", }, Value: 2.25, }, - &control.SimpleMetric{ + &daos.SimpleMetric{ Labels: map[string]string{ "ring": "one", "bearer": "frodo", }, Value: 5, }, - &control.SimpleMetric{ + &daos.SimpleMetric{ Labels: map[string]string{}, Value: 125, }, @@ -382,9 +383,9 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { { Name: "my_summary", Description: "Another test metric", - Type: control.MetricTypeSummary, - Metrics: []control.Metric{ - &control.SummaryMetric{ + Type: daos.MetricTypeSummary, + Metrics: []daos.Metric{ + &daos.SummaryMetric{ SampleCount: 55, SampleSum: 6094.27, Quantiles: map[float64]float64{ @@ -418,7 +419,7 @@ func TestPretty_PrintMetricsQueryResp(t *testing.T) { }, "write failure": { resp: &control.MetricsQueryResp{ - MetricSets: []*control.MetricSet{ + MetricSets: []*daos.MetricSet{ { Name: "test_metric_1", Description: "Test Metric", diff --git a/src/control/common/test/utils.go 
b/src/control/common/test/utils.go index f7cc72ef72a..2c7610a82d4 100644 --- a/src/control/common/test/utils.go +++ b/src/control/common/test/utils.go @@ -140,6 +140,15 @@ func CmpErr(t *testing.T, want, got error) { } } +// CmpAny compares two values and fails the test if they are not equal. +func CmpAny(t *testing.T, desc string, want, got any, cmpOpts ...cmp.Option) { + t.Helper() + + if diff := cmp.Diff(want, got, cmpOpts...); diff != "" { + t.Fatalf("unexpected %s (-want, +got):\n%s\n", desc, diff) + } +} + // SplitFile separates file content into contiguous sections separated by // a blank line. func SplitFile(path string) (sections [][]string, err error) { diff --git a/src/control/lib/control/telemetry.go b/src/control/lib/control/telemetry.go index 62854438df4..919e54ff284 100644 --- a/src/control/lib/control/telemetry.go +++ b/src/control/lib/control/telemetry.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -8,13 +8,12 @@ package control import ( "context" - "encoding/json" "fmt" "net/url" "sort" - "strconv" "strings" + "github.com/daos-stack/daos/src/control/lib/daos" "github.com/pkg/errors" pclient "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" @@ -58,299 +57,21 @@ func scrapeMetrics(ctx context.Context, req httpGetter) (pbMetricMap, error) { return pbMetricMap(result), nil } -// MetricType defines the different types of metrics. -type MetricType uint32 - -const ( - MetricTypeUnknown MetricType = iota - MetricTypeGeneric - MetricTypeCounter - MetricTypeGauge - MetricTypeSummary - MetricTypeHistogram - - metricTypeUnknownStr = "Unknown" - metricTypeGenericStr = "Generic" - metricTypeCounterStr = "Counter" - metricTypeGaugeStr = "Gauge" - metricTypeSummaryStr = "Summary" - metricTypeHistogramStr = "Histogram" -) - -func (t MetricType) String() string { - switch t { - case MetricTypeGeneric: - return metricTypeGenericStr - case MetricTypeCounter: - return metricTypeCounterStr - case MetricTypeGauge: - return metricTypeGaugeStr - case MetricTypeSummary: - return metricTypeSummaryStr - case MetricTypeHistogram: - return metricTypeHistogramStr - } - - return metricTypeUnknownStr -} - -func metricTypeFromPrometheus(pType pclient.MetricType) MetricType { +func metricTypeFromPrometheus(pType pclient.MetricType) daos.MetricType { switch pType { case pclient.MetricType_COUNTER: - return MetricTypeCounter + return daos.MetricTypeCounter case pclient.MetricType_GAUGE: - return MetricTypeGauge + return daos.MetricTypeGauge case pclient.MetricType_SUMMARY: - return MetricTypeSummary + return daos.MetricTypeSummary case pclient.MetricType_HISTOGRAM: - return MetricTypeHistogram + return daos.MetricTypeHistogram case pclient.MetricType_UNTYPED: - return MetricTypeGeneric - } - - return MetricTypeUnknown -} - -func metricTypeFromString(typeStr string) MetricType { - // normalize the strings for comparison - switch strings.ToLower(typeStr) { - case strings.ToLower(metricTypeCounterStr): - return MetricTypeCounter - case strings.ToLower(metricTypeGaugeStr): - return MetricTypeGauge - case strings.ToLower(metricTypeSummaryStr): - return MetricTypeSummary - case strings.ToLower(metricTypeHistogramStr): - return MetricTypeHistogram - case strings.ToLower(metricTypeGenericStr): - return MetricTypeGeneric - } - return MetricTypeUnknown -} - -type ( - // Metric is an interface implemented by all metric types. 
- Metric interface { - IsMetric() - } - - // LabelMap is the set of key-value label pairs. - LabelMap map[string]string - - // SimpleMetric is a specific metric with a value. - SimpleMetric struct { - Labels LabelMap `json:"labels"` - Value float64 `json:"value"` - } - - // QuantileMap is the set of quantile measurements. - QuantileMap map[float64]float64 - - // SummaryMetric represents a group of observations. - SummaryMetric struct { - Labels LabelMap `json:"labels"` - SampleCount uint64 `json:"sample_count"` - SampleSum float64 `json:"sample_sum"` - Quantiles QuantileMap `json:"quantiles"` - } - - // MetricBucket represents a bucket for observations to be sorted into. - MetricBucket struct { - CumulativeCount uint64 `json:"cumulative_count"` - UpperBound float64 `json:"upper_bound"` - } - - // HistogramMetric represents a group of observations sorted into - // buckets. - HistogramMetric struct { - Labels LabelMap `json:"labels"` - SampleCount uint64 `json:"sample_count"` - SampleSum float64 `json:"sample_sum"` - Buckets []*MetricBucket `json:"buckets"` - } - - // MetricSet is a group of related metrics. - MetricSet struct { - Name string `json:"name"` - Description string `json:"description"` - Type MetricType `json:"type"` - Metrics []Metric `json:"metrics"` - } -) - -// IsMetric identifies SimpleMetric as a Metric. -func (*SimpleMetric) IsMetric() {} - -// IsMetric identifies SummaryMetric as a Metric. -func (*SummaryMetric) IsMetric() {} - -// UnmarshalJSON unmarshals a SummaryMetric from JSON. -func (m *SummaryMetric) UnmarshalJSON(data []byte) error { - if m == nil { - return errors.New("nil SummaryMetric") - } - - if m.Quantiles == nil { - m.Quantiles = make(QuantileMap) - } - - type Alias SummaryMetric - aux := (*Alias)(m) - if err := json.Unmarshal(data, &aux); err != nil { - return err - } - - return nil -} - -// IsMetric identifies HistogramMetric as a Metric. -func (*HistogramMetric) IsMetric() {} - -// Keys gets the sorted list of label keys. -func (m LabelMap) Keys() []string { - result := make([]string, 0, len(m)) - for label := range m { - result = append(result, label) - } - sort.Strings(result) - return result -} - -// Keys gets the sorted list of quantile keys. -func (m QuantileMap) Keys() []float64 { - result := make([]float64, 0, len(m)) - for q := range m { - result = append(result, q) - } - sort.Float64s(result) - return result -} - -// MarshalJSON marshals the QuantileMap into JSON. -func (m QuantileMap) MarshalJSON() ([]byte, error) { - strMap := make(map[string]string) - - fmtFloat := func(f float64) string { - return strconv.FormatFloat(f, 'g', -1, 64) - } - - for key, val := range m { - strMap[fmtFloat(key)] = fmtFloat(val) - } - - return json.Marshal(&strMap) -} - -// UnmarshalJSON unmarshals the QuantileMap from JSON. -func (m QuantileMap) UnmarshalJSON(data []byte) error { - if m == nil { - return errors.New("QuantileMap is nil") - } - - fromJSON := make(map[string]string) - - if err := json.Unmarshal(data, &fromJSON); err != nil { - return nil - } - - for key, val := range fromJSON { - floatKey, err := strconv.ParseFloat(key, 64) - if err != nil { - return errors.Wrapf(err, "QuantileMap key %q", key) - } - - floatVal, err := strconv.ParseFloat(val, 64) - if err != nil { - return errors.Wrapf(err, "QuantileMap value %q for key %q", val, key) - } - - m[floatKey] = floatVal + return daos.MetricTypeGeneric } - return nil -} - -// MarshalJSON marshals the MetricSet to JSON. 
-func (ms *MetricSet) MarshalJSON() ([]byte, error) { - type toJSON MetricSet - return json.Marshal(&struct { - Type string `json:"type"` - *toJSON - }{ - Type: strings.ToLower(ms.Type.String()), - toJSON: (*toJSON)(ms), - }) -} - -// jsonMetric serves as a universal metric representation for unmarshaling from -// JSON. It covers all possible fields of Metric types. -type jsonMetric struct { - Labels LabelMap `json:"labels"` - Value float64 `json:"value"` - SampleCount uint64 `json:"sample_count"` - SampleSum float64 `json:"sample_sum"` - Quantiles QuantileMap `json:"quantiles"` - Buckets []*MetricBucket `json:"buckets"` -} -// UnmarshalJSON unmarshals a Metric into the jsonMetric type. -func (jm *jsonMetric) UnmarshalJSON(data []byte) error { - if jm == nil { - return errors.New("nil jsonMetric") - } - - if jm.Quantiles == nil { - jm.Quantiles = make(QuantileMap) - } - - type Alias jsonMetric - aux := (*Alias)(jm) - if err := json.Unmarshal(data, &aux); err != nil { - return err - } - - return nil -} - -// UnmarshalJSON unmarshals the MetricSet from JSON. -func (ms *MetricSet) UnmarshalJSON(data []byte) error { - if ms == nil { - return errors.New("nil MetricSet") - } - - type fromJSON MetricSet - from := &struct { - Type string `json:"type"` - Metrics []*jsonMetric `json:"metrics"` - *fromJSON - }{ - fromJSON: (*fromJSON)(ms), - } - if err := json.Unmarshal(data, from); err != nil { - return err - } - - ms.Type = metricTypeFromString(from.Type) - for _, m := range from.Metrics { - switch ms.Type { - case MetricTypeSummary: - ms.Metrics = append(ms.Metrics, &SummaryMetric{ - Labels: m.Labels, - SampleCount: m.SampleCount, - SampleSum: m.SampleSum, - Quantiles: m.Quantiles, - }) - case MetricTypeHistogram: - ms.Metrics = append(ms.Metrics, &HistogramMetric{ - Labels: m.Labels, - SampleCount: m.SampleCount, - SampleSum: m.SampleSum, - Buckets: m.Buckets, - }) - default: - ms.Metrics = append(ms.Metrics, newSimpleMetric(m.Labels, m.Value)) - } - } - return nil + return daos.MetricTypeUnknown } type ( @@ -363,7 +84,7 @@ type ( // MetricsListResp contains the list of available metrics. MetricsListResp struct { - AvailableMetricSets []*MetricSet `json:"available_metric_sets"` + AvailableMetricSets []*daos.MetricSet `json:"available_metric_sets"` } ) @@ -390,10 +111,10 @@ func MetricsList(ctx context.Context, req *MetricsListReq) (*MetricsListResp, er resp := new(MetricsListResp) - list := make([]*MetricSet, 0, len(scraped)) + list := make([]*daos.MetricSet, 0, len(scraped)) for _, name := range scraped.Keys() { mf := scraped[name] - newMetric := &MetricSet{ + newMetric := &daos.MetricSet{ Name: name, Description: mf.GetHelp(), Type: metricTypeFromPrometheus(mf.GetType()), @@ -416,7 +137,7 @@ type ( // MetricsQueryResp contains the list of telemetry values per host. 
MetricsQueryResp struct { - MetricSets []*MetricSet `json:"metric_sets"` + MetricSets []*daos.MetricSet `json:"metric_sets"` } ) @@ -453,14 +174,14 @@ func MetricsQuery(ctx context.Context, req *MetricsQueryReq) (*MetricsQueryResp, func newMetricsQueryResp(scraped pbMetricMap, metricNames []string) (*MetricsQueryResp, error) { resp := new(MetricsQueryResp) - list := make([]*MetricSet, 0, len(metricNames)) + list := make([]*daos.MetricSet, 0, len(metricNames)) for _, name := range metricNames { mf, found := scraped[name] if !found { return nil, errors.Errorf("metric %q not found on host", name) } - newSet := &MetricSet{ + newSet := &daos.MetricSet{ Name: name, Description: mf.GetHelp(), Type: metricTypeFromPrometheus(mf.GetType()), @@ -482,7 +203,7 @@ func newMetricsQueryResp(scraped pbMetricMap, metricNames []string) (*MetricsQue return resp, nil } -func getMetricFromPrometheus(pMetric *pclient.Metric, metricType pclient.MetricType) (Metric, error) { +func getMetricFromPrometheus(pMetric *pclient.Metric, metricType pclient.MetricType) (daos.Metric, error) { labels := metricsLabelsToMap(pMetric) switch metricType { case pclient.MetricType_COUNTER: @@ -491,11 +212,11 @@ func getMetricFromPrometheus(pMetric *pclient.Metric, metricType pclient.MetricT return newSimpleMetric(labels, pMetric.GetGauge().GetValue()), nil case pclient.MetricType_SUMMARY: summary := pMetric.GetSummary() - newMetric := &SummaryMetric{ + newMetric := &daos.SummaryMetric{ Labels: labels, SampleSum: summary.GetSampleSum(), SampleCount: summary.GetSampleCount(), - Quantiles: QuantileMap{}, + Quantiles: daos.QuantileMap{}, } for _, q := range summary.Quantile { newMetric.Quantiles[q.GetQuantile()] = q.GetValue() @@ -503,14 +224,14 @@ func getMetricFromPrometheus(pMetric *pclient.Metric, metricType pclient.MetricT return newMetric, nil case pclient.MetricType_HISTOGRAM: histogram := pMetric.GetHistogram() - newMetric := &HistogramMetric{ + newMetric := &daos.HistogramMetric{ Labels: labels, SampleSum: histogram.GetSampleSum(), SampleCount: histogram.GetSampleCount(), } for _, b := range histogram.Bucket { newMetric.Buckets = append(newMetric.Buckets, - &MetricBucket{ + &daos.MetricBucket{ UpperBound: b.GetUpperBound(), CumulativeCount: b.GetCumulativeCount(), }) @@ -523,8 +244,8 @@ func getMetricFromPrometheus(pMetric *pclient.Metric, metricType pclient.MetricT return nil, errors.New("unknown metric type") } -func newSimpleMetric(labels map[string]string, value float64) *SimpleMetric { - return &SimpleMetric{ +func newSimpleMetric(labels map[string]string, value float64) *daos.SimpleMetric { + return &daos.SimpleMetric{ Labels: labels, Value: value, } diff --git a/src/control/lib/control/telemetry_test.go b/src/control/lib/control/telemetry_test.go index f5d8cb701ec..5887283852d 100644 --- a/src/control/lib/control/telemetry_test.go +++ b/src/control/lib/control/telemetry_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -23,6 +23,7 @@ import ( "google.golang.org/protobuf/proto" "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/lib/daos" ) func newTestMetricFamily(name string, help string, mType pclient.MetricType) *pclient.MetricFamily { @@ -253,7 +254,7 @@ func TestControl_MetricsList(t *testing.T) { return []byte{}, nil }, expResp: &MetricsListResp{ - AvailableMetricSets: []*MetricSet{}, + AvailableMetricSets: []*daos.MetricSet{}, }, }, "success": { @@ -263,16 +264,16 @@ func TestControl_MetricsList(t *testing.T) { }, scrapeFn: mockScrapeFnSuccess(t, testMetricFam...), expResp: &MetricsListResp{ - AvailableMetricSets: []*MetricSet{ + AvailableMetricSets: []*daos.MetricSet{ { Name: "counter", Description: "this is the counter help", - Type: MetricTypeCounter, + Type: daos.MetricTypeCounter, }, { Name: "gauge", Description: "this is the gauge help", - Type: MetricTypeGauge, + Type: daos.MetricTypeGauge, }, }, }, @@ -299,7 +300,7 @@ func TestControl_MetricsList(t *testing.T) { } func TestControl_getMetricFromPrometheus(t *testing.T) { - testLabels := LabelMap{ + testLabels := daos.MetricLabelMap{ "foo": "bar", "baz": "snafu", } @@ -341,7 +342,7 @@ func TestControl_getMetricFromPrometheus(t *testing.T) { for name, tc := range map[string]struct { input *pclient.Metric inputType pclient.MetricType - expResult Metric + expResult daos.Metric expErr error }{ "counter": { @@ -362,11 +363,11 @@ func TestControl_getMetricFromPrometheus(t *testing.T) { "summary": { input: testSummary, inputType: pclient.MetricType_SUMMARY, - expResult: &SummaryMetric{ + expResult: &daos.SummaryMetric{ Labels: testLabels, SampleSum: testSummary.Summary.GetSampleSum(), SampleCount: testSummary.Summary.GetSampleCount(), - Quantiles: QuantileMap{ + Quantiles: daos.QuantileMap{ 0: 1, 1: 2, 2: 3, @@ -377,11 +378,11 @@ func TestControl_getMetricFromPrometheus(t *testing.T) { "histogram": { input: testHistogram, inputType: pclient.MetricType_HISTOGRAM, - expResult: &HistogramMetric{ + expResult: &daos.HistogramMetric{ Labels: testLabels, SampleSum: testHistogram.Histogram.GetSampleSum(), SampleCount: testHistogram.Histogram.GetSampleCount(), - Buckets: []*MetricBucket{ + Buckets: []*daos.MetricBucket{ { UpperBound: 100, CumulativeCount: 1, @@ -465,7 +466,7 @@ func TestControl_MetricsQuery(t *testing.T) { return []byte{}, nil }, expResp: &MetricsQueryResp{ - MetricSets: []*MetricSet{}, + MetricSets: []*daos.MetricSet{}, }, }, "all metrics": { @@ -475,39 +476,39 @@ func TestControl_MetricsQuery(t *testing.T) { }, scrapeFn: mockScrapeFnSuccess(t, testMetricFam...), expResp: &MetricsQueryResp{ - MetricSets: []*MetricSet{ + MetricSets: []*daos.MetricSet{ { Name: "my_counter", Description: "this is the counter help", - Type: MetricTypeCounter, - Metrics: []Metric{ + Type: daos.MetricTypeCounter, + Metrics: []daos.Metric{ newSimpleMetric(map[string]string{}, 0), }, }, { Name: "my_gauge", Description: "this is the gauge help", - Type: MetricTypeGauge, - Metrics: []Metric{ + Type: daos.MetricTypeGauge, + Metrics: []daos.Metric{ newSimpleMetric(map[string]string{}, 0), }, }, { Name: "my_generic", Description: "this is the generic help", - Type: MetricTypeGeneric, - Metrics: []Metric{ + Type: daos.MetricTypeGeneric, + Metrics: []daos.Metric{ newSimpleMetric(map[string]string{}, 0), }, }, { Name: "my_histogram", Description: "this is the histogram help", - Type: MetricTypeHistogram, - Metrics: []Metric{ - &HistogramMetric{ - Labels: LabelMap{}, - 
Buckets: []*MetricBucket{ + Type: daos.MetricTypeHistogram, + Metrics: []daos.Metric{ + &daos.HistogramMetric{ + Labels: daos.MetricLabelMap{}, + Buckets: []*daos.MetricBucket{ // Prometheus library parsing // includes inf bucket at minimum {UpperBound: math.Inf(0)}, @@ -518,11 +519,11 @@ func TestControl_MetricsQuery(t *testing.T) { { Name: "my_summary", Description: "this is the summary help", - Type: MetricTypeSummary, - Metrics: []Metric{ - &SummaryMetric{ - Labels: LabelMap{}, - Quantiles: QuantileMap{0: 0}, + Type: daos.MetricTypeSummary, + Metrics: []daos.Metric{ + &daos.SummaryMetric{ + Labels: daos.MetricLabelMap{}, + Quantiles: daos.QuantileMap{0: 0}, }, }, }, @@ -537,20 +538,20 @@ func TestControl_MetricsQuery(t *testing.T) { }, scrapeFn: mockScrapeFnSuccess(t, testMetricFam...), expResp: &MetricsQueryResp{ - MetricSets: []*MetricSet{ + MetricSets: []*daos.MetricSet{ { Name: "my_generic", Description: "this is the generic help", - Type: MetricTypeGeneric, - Metrics: []Metric{ + Type: daos.MetricTypeGeneric, + Metrics: []daos.Metric{ newSimpleMetric(map[string]string{}, 0), }, }, { Name: "my_counter", Description: "this is the counter help", - Type: MetricTypeCounter, - Metrics: []Metric{ + Type: daos.MetricTypeCounter, + Metrics: []daos.Metric{ newSimpleMetric(map[string]string{}, 0), }, }, @@ -589,29 +590,29 @@ func TestControl_Metric_JSON(t *testing.T) { } for name, tc := range map[string]struct { - metric Metric + metric daos.Metric }{ "nil": {}, "simple": { metric: newSimpleMetric(testLabelMap, 123), }, "summary": { - metric: &SummaryMetric{ + metric: &daos.SummaryMetric{ Labels: testLabelMap, SampleSum: 5678.9, SampleCount: 42, - Quantiles: QuantileMap{ + Quantiles: daos.QuantileMap{ 0.25: 50, 0.5: 42, }, }, }, "histogram": { - metric: &HistogramMetric{ + metric: &daos.HistogramMetric{ Labels: testLabelMap, SampleSum: 9876, SampleCount: 120, - Buckets: []*MetricBucket{ + Buckets: []*daos.MetricBucket{ { CumulativeCount: 55, UpperBound: 500, @@ -626,16 +627,16 @@ func TestControl_Metric_JSON(t *testing.T) { t.Fatalf("expected to marshal, got %q", err) } - var unmarshaled Metric + var unmarshaled daos.Metric switch tc.metric.(type) { - case *SimpleMetric: - unmarshaled = new(SimpleMetric) - case *SummaryMetric: - unmarshaled = new(SummaryMetric) - case *HistogramMetric: - unmarshaled = new(HistogramMetric) + case *daos.SimpleMetric: + unmarshaled = new(daos.SimpleMetric) + case *daos.SummaryMetric: + unmarshaled = new(daos.SummaryMetric) + case *daos.HistogramMetric: + unmarshaled = new(daos.HistogramMetric) default: - unmarshaled = new(SimpleMetric) + unmarshaled = new(daos.SimpleMetric) } err = json.Unmarshal(marshaled, unmarshaled) @@ -645,7 +646,7 @@ func TestControl_Metric_JSON(t *testing.T) { expResult := tc.metric if tc.metric == nil { - expResult = &SimpleMetric{} + expResult = &daos.SimpleMetric{} } if diff := cmp.Diff(expResult, unmarshaled); diff != "" { @@ -655,62 +656,17 @@ func TestControl_Metric_JSON(t *testing.T) { } } -func TestControl_metricTypeFromString(t *testing.T) { - for name, tc := range map[string]struct { - input string - expType MetricType - }{ - "empty": { - expType: MetricTypeUnknown, - }, - "counter": { - input: "counter", - expType: MetricTypeCounter, - }, - "gauge": { - input: "gauge", - expType: MetricTypeGauge, - }, - "summary": { - input: "summary", - expType: MetricTypeSummary, - }, - "histogram": { - input: "histogram", - expType: MetricTypeHistogram, - }, - "generic": { - input: "generic", - expType: MetricTypeGeneric, - }, - "invalid": 
{ - input: "some garbage text", - expType: MetricTypeUnknown, - }, - "weird capitalization": { - input: "CoUnTeR", - expType: MetricTypeCounter, - }, - } { - t.Run(name, func(t *testing.T) { - gotType := metricTypeFromString(tc.input) - - test.AssertEqual(t, tc.expType, gotType, "") - }) - } -} - func TestControl_MetricSet_JSON(t *testing.T) { for name, tc := range map[string]struct { - set *MetricSet + set *daos.MetricSet }{ "nil": {}, "generic type": { - set: &MetricSet{ + set: &daos.MetricSet{ Name: "timespan", Description: "It's been a while", - Type: MetricTypeGeneric, - Metrics: []Metric{ + Type: daos.MetricTypeGeneric, + Metrics: []daos.Metric{ newSimpleMetric(map[string]string{ "units": "nanoseconds", }, float64(time.Second)), @@ -718,11 +674,11 @@ func TestControl_MetricSet_JSON(t *testing.T) { }, }, "counter type": { - set: &MetricSet{ + set: &daos.MetricSet{ Name: "one_ring", Description: "Precious...", - Type: MetricTypeCounter, - Metrics: []Metric{ + Type: daos.MetricTypeCounter, + Metrics: []daos.Metric{ newSimpleMetric(map[string]string{ "owner": "frodo", }, 1), @@ -730,11 +686,11 @@ func TestControl_MetricSet_JSON(t *testing.T) { }, }, "gauge type": { - set: &MetricSet{ + set: &daos.MetricSet{ Name: "funny_hats", Description: "Hilarious headgear in inventory", - Type: MetricTypeGauge, - Metrics: []Metric{ + Type: daos.MetricTypeGauge, + Metrics: []daos.Metric{ newSimpleMetric(map[string]string{ "type": "tophat", }, 1), @@ -748,12 +704,12 @@ func TestControl_MetricSet_JSON(t *testing.T) { }, }, "summary type": { - set: &MetricSet{ + set: &daos.MetricSet{ Name: "alpha", Description: "The first letter! Everybody's favorite!", - Type: MetricTypeSummary, - Metrics: []Metric{ - &SummaryMetric{ + Type: daos.MetricTypeSummary, + Metrics: []daos.Metric{ + &daos.SummaryMetric{ Labels: map[string]string{"beta": "b"}, SampleCount: 3, SampleSum: 42, @@ -763,16 +719,16 @@ func TestControl_MetricSet_JSON(t *testing.T) { }, }, "histogram type": { - set: &MetricSet{ + set: &daos.MetricSet{ Name: "my_histogram", Description: "This is a histogram", - Type: MetricTypeHistogram, - Metrics: []Metric{ - &HistogramMetric{ + Type: daos.MetricTypeHistogram, + Metrics: []daos.Metric{ + &daos.HistogramMetric{ Labels: map[string]string{"owner": "me"}, SampleCount: 1024, SampleSum: 12344, - Buckets: []*MetricBucket{ + Buckets: []*daos.MetricBucket{ { CumulativeCount: 789, UpperBound: 500, @@ -793,7 +749,7 @@ func TestControl_MetricSet_JSON(t *testing.T) { t.Fatalf("expected to marshal, got %q", err) } - unmarshaled := new(MetricSet) + unmarshaled := new(daos.MetricSet) err = json.Unmarshal(marshaled, unmarshaled) if err != nil { t.Fatalf("expected to unmarshal, got %q", err) @@ -801,7 +757,7 @@ func TestControl_MetricSet_JSON(t *testing.T) { expResult := tc.set if tc.set == nil { - expResult = &MetricSet{} + expResult = &daos.MetricSet{} } if diff := cmp.Diff(expResult, unmarshaled); diff != "" { diff --git a/src/control/lib/daos/api/api.go b/src/control/lib/daos/api/api.go index dbd6d2725a8..51ef20e669a 100644 --- a/src/control/lib/daos/api/api.go +++ b/src/control/lib/daos/api/api.go @@ -38,7 +38,7 @@ func (api *api) isInitialized() bool { // Init performs DAOS API initialization steps and returns a closure // to be called before application exit. 
-func (api *api) Init() (func(), error) { +func (api *api) Init(initLogging bool) (func(), error) { api.Lock() defer api.Unlock() @@ -47,12 +47,21 @@ func (api *api) Init() (func(), error) { return stubFini, daos.Already } - if err := daosError(C.daos_init()); err != nil { + logFini := stubFini + if initLogging { + fini, err := daos.InitLogging(daos.DefaultErrorMask) + if err != nil { + return stubFini, err + } + logFini = fini + } + + if err := daosError(daos_init()); err != nil { return stubFini, err } api.initialized = true - return api.Fini, nil + return func() { api.Fini(); logFini() }, nil } // Fini releases resources obtained during DAOS API initialization. @@ -64,6 +73,6 @@ func (api *api) Fini() { return } - C.daos_fini() + daos_fini() api.initialized = false } diff --git a/src/control/lib/daos/api/errors.go b/src/control/lib/daos/api/errors.go new file mode 100644 index 00000000000..6d1b4b665e3 --- /dev/null +++ b/src/control/lib/daos/api/errors.go @@ -0,0 +1,13 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package api + +import "github.com/pkg/errors" + +var ( + ErrNoSystemRanks = errors.New("no ranks in system") +) diff --git a/src/control/lib/daos/api/libdaos.go b/src/control/lib/daos/api/libdaos.go new file mode 100644 index 00000000000..d7c6bfed82d --- /dev/null +++ b/src/control/lib/daos/api/libdaos.go @@ -0,0 +1,38 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build !test_stubs +// +build !test_stubs + +package api + +/* +#include +#include +#include + +#cgo LDFLAGS: -lcart -lgurt -ldaos -ldaos_common +*/ +import "C" + +func daos_init() C.int { + return C.daos_init() +} + +func daos_fini() { + C.daos_fini() +} + +func dc_agent_fini() { + C.dc_agent_fini() +} + +func daos_mgmt_get_sys_info(sys *C.char, sys_info **C.struct_daos_sys_info) C.int { + return C.daos_mgmt_get_sys_info(sys, sys_info) +} + +func daos_mgmt_put_sys_info(sys_info *C.struct_daos_sys_info) { + C.daos_mgmt_put_sys_info(sys_info) +} diff --git a/src/control/lib/daos/api/libdaos_selftest.go b/src/control/lib/daos/api/libdaos_selftest.go new file mode 100644 index 00000000000..a466ba54b74 --- /dev/null +++ b/src/control/lib/daos/api/libdaos_selftest.go @@ -0,0 +1,31 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build !test_stubs +// +build !test_stubs + +package api + +/* +#cgo CFLAGS: -I${SRCDIR}/../../../../cart/utils -I${SRCDIR}/../../../../utils/self_test +#cgo LDFLAGS: -lgurt -lcart -ldaos_self_test + +#include "self_test_lib.h" +*/ +import "C" + +func run_self_test(sizes *C.struct_st_size_params, numSizes C.int, repCount C.int, maxInflight C.int, groupName *C.char, + optMsEndpoints *C.struct_st_endpoint, numOptMsEndpoints C.uint32_t, + tgtEndpoints *C.struct_st_endpoint, numTgtEndpoints C.uint32_t, + msEndpoints **C.struct_st_master_endpt, numMsEndpoints *C.uint32_t, + sizeLatencies ****C.struct_st_latency, bufAlignment C.int16_t) C.int { + return C.run_self_test(sizes, numSizes, repCount, maxInflight, groupName, + optMsEndpoints, numOptMsEndpoints, tgtEndpoints, numTgtEndpoints, + msEndpoints, numMsEndpoints, sizeLatencies, bufAlignment, nil, true, true) +} + +func self_test_fini(agent_used C.bool) { + C.self_test_fini(agent_used) +} diff --git a/src/control/lib/daos/api/libdaos_selftest_stubs.go b/src/control/lib/daos/api/libdaos_selftest_stubs.go new file mode 100644 index 00000000000..9c9e1eb4603 --- /dev/null +++ b/src/control/lib/daos/api/libdaos_selftest_stubs.go @@ -0,0 +1,170 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build test_stubs +// +build test_stubs + +package api + +import ( + "unsafe" + + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/hostlist" + "github.com/daos-stack/daos/src/control/lib/ranklist" +) + +/* +#cgo CFLAGS: -I${SRCDIR}/../../../../cart/utils -I${SRCDIR}/../../../../utils/self_test +#cgo LDFLAGS: -lgurt -lcart -ldaos_self_test + +#include +#include + +#include "self_test_lib.h" + +struct st_latency *** +alloc_latency_arrays(size_t num_sizes, size_t num_endpoints, size_t num_latencies) +{ + struct st_latency ***latencies = NULL; + int i, j; + + D_ALLOC_ARRAY(latencies, num_sizes); + if (latencies == NULL) + return NULL; + + for (i = 0; i < num_sizes; i++) { + D_ALLOC_ARRAY(latencies[i], num_endpoints); + if (latencies[i] == NULL) + return NULL; + + for (j = 0; j < num_endpoints; j++) { + D_ALLOC_ARRAY(latencies[i][j], num_latencies); + if (latencies[i][j] == NULL) + return NULL; + } + } + + return latencies; +} +*/ +import "C" + +type run_self_test_EndpointLatency struct { + val C.int64_t + rank C.uint32_t + tag C.uint32_t + cci_rc C.int +} + +var ( + run_self_test_RunConfig *daos.SelfTestConfig + run_self_test_RC C.int = 0 + run_self_test_MsEndpoints []daos.SelfTestEndpoint + run_self_test_EndpointLatencies []run_self_test_EndpointLatency +) + +func run_self_test(sizes *C.struct_st_size_params, numSizes C.int, repCount C.int, maxInflight C.int, groupName *C.char, + optMsEndpoints *C.struct_st_endpoint, numOptMsEndpoints C.uint32_t, + tgtEndpoints *C.struct_st_endpoint, numTgtEndpoints C.uint32_t, + msEndpoints **C.struct_st_master_endpt, numMsEndpoints *C.uint32_t, + sizeLatencies ****C.struct_st_latency, bufAlignment C.int16_t) C.int { + + cfg := &daos.SelfTestConfig{ + GroupName: C.GoString(groupName), + Repetitions: uint(repCount), + BufferAlignment: int16(bufAlignment), + MaxInflightRPCs: uint(maxInflight), + } + + if numSizes > 0 { + cfg.SendSizes = make([]uint64, int(numSizes)) + cfg.ReplySizes = make([]uint64, int(numSizes)) + testSizesSlice := unsafe.Slice(sizes, int(numSizes)) + for i := 0; i < int(numSizes); i++ { + cfg.SendSizes[i] = uint64(testSizesSlice[i].send_size) + 
cfg.ReplySizes[i] = uint64(testSizesSlice[i].reply_size) + } + } + + if numOptMsEndpoints > 0 { + cfg.MasterEndpoints = make([]daos.SelfTestEndpoint, int(numOptMsEndpoints)) + msEndpointsSlice := unsafe.Slice(optMsEndpoints, int(numOptMsEndpoints)) + for i := 0; i < int(numOptMsEndpoints); i++ { + cfg.MasterEndpoints[i].Rank = ranklist.Rank(msEndpointsSlice[i].rank) + cfg.MasterEndpoints[i].Tag = uint32(msEndpointsSlice[i].tag) + } + run_self_test_MsEndpoints = cfg.MasterEndpoints + } + + if numTgtEndpoints > 0 { + rankSet := ranklist.NewRankSet() + tagSet := hostlist.NewNumericSet() + tgtEndpointsSlice := unsafe.Slice(tgtEndpoints, int(numTgtEndpoints)) + for i := 0; i < int(numTgtEndpoints); i++ { + rankSet.Add(ranklist.Rank(tgtEndpointsSlice[i].rank)) + tagSet.Add(uint(tgtEndpointsSlice[i].tag)) + } + cfg.EndpointRanks = rankSet.Ranks() + for _, tag := range tagSet.Slice() { + cfg.EndpointTags = append(cfg.EndpointTags, uint32(tag)) + } + + // If the configuration doesn't specify master endpoints, + // create one similarly to how the library does it. + if len(run_self_test_MsEndpoints) == 0 { + run_self_test_MsEndpoints = []daos.SelfTestEndpoint{ + { + Rank: cfg.EndpointRanks[len(cfg.EndpointRanks)-1] + 1, + Tag: cfg.EndpointTags[len(cfg.EndpointTags)-1], + }, + } + } + } + + run_self_test_RunConfig = cfg + if run_self_test_RC != 0 { + return run_self_test_RC + } + + // Construct the C array of master endpoints for the out parameter. + // Must be freed by the caller. + *numMsEndpoints = C.uint32_t(len(run_self_test_MsEndpoints)) + ptr, err := C.calloc(C.size_t(len(run_self_test_MsEndpoints)), C.sizeof_struct_st_master_endpt) + if err != nil { + panic("calloc() failed for master endpoints") + } + *msEndpoints = (*C.struct_st_master_endpt)(ptr) + msEpSlice := unsafe.Slice(*msEndpoints, int(*numMsEndpoints)) + for i := 0; i < int(*numMsEndpoints); i++ { + msEpSlice[i].endpt.ep_rank = C.uint32_t(run_self_test_MsEndpoints[i].Rank) + msEpSlice[i].endpt.ep_tag = C.uint32_t(run_self_test_MsEndpoints[i].Tag) + } + + // Construct the multi-dimensional C array of test latencies for the out parameter. + // Must be freed by the caller. + *sizeLatencies = C.alloc_latency_arrays(C.size_t(numSizes), C.size_t(*numMsEndpoints), C.size_t(len(run_self_test_EndpointLatencies))) + if *sizeLatencies == nil { + panic("calloc() failed for latency arrays") + } + + sizesSlice := unsafe.Slice(*sizeLatencies, int(numSizes)) + for i := 0; i < int(numSizes); i++ { + msSessSlice := unsafe.Slice(sizesSlice[i], int(*numMsEndpoints)) + for j := 0; j < int(*numMsEndpoints); j++ { + epLatSlice := unsafe.Slice(msSessSlice[j], len(run_self_test_EndpointLatencies)) + + for k := 0; k < len(run_self_test_EndpointLatencies); k++ { + epLatSlice[k].val = run_self_test_EndpointLatencies[k].val + epLatSlice[k].rank = run_self_test_EndpointLatencies[k].rank + epLatSlice[k].tag = run_self_test_EndpointLatencies[k].tag + } + } + } + + return run_self_test_RC +} + +func self_test_fini(agent_used C.bool) {} diff --git a/src/control/lib/daos/api/libdaos_stubs.go b/src/control/lib/daos/api/libdaos_stubs.go new file mode 100644 index 00000000000..341b90bdd34 --- /dev/null +++ b/src/control/lib/daos/api/libdaos_stubs.go @@ -0,0 +1,116 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build test_stubs +// +build test_stubs + +package api + +import ( + "unsafe" + + "github.com/daos-stack/daos/src/control/build" + "github.com/daos-stack/daos/src/control/lib/daos" +) + +/* +#include +#include +*/ +import "C" + +var ( + daos_init_RC C.int = 0 +) + +func daos_init() C.int { + return daos_init_RC +} + +func daos_fini() {} + +func dc_agent_fini() {} + +var ( + defaultSystemInfo *daos.SystemInfo = &daos.SystemInfo{ + Name: build.DefaultSystemName, + Provider: "ofi+tcp", + AgentPath: "/does/not/exist", + RankURIs: []*daos.RankURI{ + {Rank: 0, URI: "/does/not/exist"}, + {Rank: 1, URI: "/does/not/exist"}, + {Rank: 2, URI: "/does/not/exist"}, + }, + AccessPointRankURIs: []*daos.RankURI{ + {Rank: 0, URI: "/does/not/exist"}, + {Rank: 1, URI: "/does/not/exist"}, + {Rank: 2, URI: "/does/not/exist"}, + }, + } + daos_mgmt_get_sys_info_SystemInfo *daos.SystemInfo = defaultSystemInfo + daos_mgmt_get_sys_info_RC C.int = 0 +) + +func daos_mgmt_get_sys_info(group *C.char, sys_info_out **C.struct_daos_sys_info) C.int { + if daos_mgmt_get_sys_info_RC != 0 { + return daos_mgmt_get_sys_info_RC + } + + si := &C.struct_daos_sys_info{} + for i, c := range daos_mgmt_get_sys_info_SystemInfo.Name { + si.dsi_system_name[i] = C.char(c) + } + if group != nil && C.GoString(group) != daos_mgmt_get_sys_info_SystemInfo.Name { + panic("invalid group") + } + for i, c := range daos_mgmt_get_sys_info_SystemInfo.Provider { + si.dsi_fabric_provider[i] = C.char(c) + } + for i, c := range daos_mgmt_get_sys_info_SystemInfo.AgentPath { + si.dsi_agent_path[i] = C.char(c) + } + + si.dsi_nr_ranks = C.uint32_t(len(daos_mgmt_get_sys_info_SystemInfo.RankURIs)) + si.dsi_ranks = (*C.struct_daos_rank_uri)(C.calloc(C.size_t(si.dsi_nr_ranks), C.sizeof_struct_daos_rank_uri)) + if si.dsi_ranks == nil { + panic("calloc() failed for system ranks") + } + rankSlice := unsafe.Slice(si.dsi_ranks, int(si.dsi_nr_ranks)) + for i, rankURI := range daos_mgmt_get_sys_info_SystemInfo.RankURIs { + rankSlice[i].dru_rank = C.uint32_t(rankURI.Rank) + rankSlice[i].dru_uri = C.CString(rankURI.URI) + } + + si.dsi_nr_ms_ranks = C.uint32_t(len(daos_mgmt_get_sys_info_SystemInfo.AccessPointRankURIs)) + si.dsi_ms_ranks = (*C.uint32_t)(C.calloc(C.size_t(si.dsi_nr_ms_ranks), C.sizeof_uint32_t)) + if si.dsi_ms_ranks == nil { + panic("calloc() failed for ms ranks") + } + msRankSlice := unsafe.Slice(si.dsi_ms_ranks, int(si.dsi_nr_ms_ranks)) + for i, rankURI := range daos_mgmt_get_sys_info_SystemInfo.AccessPointRankURIs { + msRankSlice[i] = C.uint32_t(rankURI.Rank) + } + + *sys_info_out = si + return 0 +} + +func daos_mgmt_put_sys_info(sys_info *C.struct_daos_sys_info) { + if sys_info == nil { + return + } + + if sys_info.dsi_ranks != nil { + rankSlice := unsafe.Slice(sys_info.dsi_ranks, int(sys_info.dsi_nr_ranks)) + for _, rankURI := range rankSlice { + C.free(unsafe.Pointer(rankURI.dru_uri)) + } + C.free(unsafe.Pointer(sys_info.dsi_ranks)) + } + + if sys_info.dsi_ms_ranks != nil { + C.free(unsafe.Pointer(sys_info.dsi_ms_ranks)) + } +} diff --git a/src/control/lib/daos/api/provider.go b/src/control/lib/daos/api/provider.go index 5b7c74be35c..443cfbd1441 100644 --- a/src/control/lib/daos/api/provider.go +++ b/src/control/lib/daos/api/provider.go @@ -28,9 +28,9 @@ type ( ) // NewProvider returns an initialized DAOS API provider. 
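// For illustration (assumed caller code): the new second argument controls
// whether client-side debug logging is initialized along with the API, so a
// CLI that has already configured logging can pass false.
//
//	p, err := NewProvider(log, false)
//	if err != nil {
//		return nil, err
//	}
//	defer p.Cleanup()
//	sysInfo, err := p.GetSystemInfo(ctx)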
-func NewProvider(log debugTraceLogger) (*Provider, error) { +func NewProvider(log debugTraceLogger, initLogging bool) (*Provider, error) { api := &api{} - cleanup, err := api.Init() + cleanup, err := api.Init(initLogging) if err != nil { return nil, errors.Wrap(err, "failed to initialize DAOS API") } diff --git a/src/control/lib/daos/api/selftest.go b/src/control/lib/daos/api/selftest.go new file mode 100644 index 00000000000..49e62aa6638 --- /dev/null +++ b/src/control/lib/daos/api/selftest.go @@ -0,0 +1,227 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package api + +import ( + "context" + "time" + "unsafe" + + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/ranklist" + "github.com/daos-stack/daos/src/control/logging" +) + +/* +#cgo CFLAGS: -I${SRCDIR}/../../../../cart/utils -I${SRCDIR}/../../../../utils/self_test + +#include "self_test_lib.h" + +void +set_size_params(struct st_size_params *params, int send_size, int reply_size) +{ + params->send_size = send_size; + if (send_size == 0) + params->send_type = CRT_SELF_TEST_MSG_TYPE_EMPTY; + else if (send_size < CRT_SELF_TEST_AUTO_BULK_THRESH) + params->send_type = CRT_SELF_TEST_MSG_TYPE_IOV; + else + params->send_type = CRT_SELF_TEST_MSG_TYPE_BULK_GET; + + params->reply_size = reply_size; + if (reply_size == 0) + params->reply_type = CRT_SELF_TEST_MSG_TYPE_EMPTY; + else if (reply_size < CRT_SELF_TEST_AUTO_BULK_THRESH) + params->reply_type = CRT_SELF_TEST_MSG_TYPE_IOV; + else + params->reply_type = CRT_SELF_TEST_MSG_TYPE_BULK_PUT; +} +*/ +import "C" + +type tgtEndpointSlice []daos.SelfTestEndpoint + +// toC returns a pointer to a C array of target endpoints. +// NB: Caller must free the array when finished. +func (tes tgtEndpointSlice) toC() (*C.struct_st_endpoint, error) { + if len(tes) == 0 { + return nil, errors.New("empty tgt endpoint slice") + } + + ptr, err := C.calloc(C.size_t(len(tes)), C.sizeof_struct_st_endpoint) + if err != nil { + return nil, err + } + cEndpoints := (*C.struct_st_endpoint)(ptr) + endPoints := unsafe.Slice(cEndpoints, len(tes)) + for i, ep := range tes { + endPoints[i].rank = C.uint32_t(ep.Rank) + endPoints[i].tag = C.uint32_t(ep.Tag) + } + + return cEndpoints, nil +} + +// getAllSystemRanks returns the set of system ranks available to use +// for a self_test run. If no ranks are available, a sentinel error +// is returned. +func getAllSystemRanks(ctx context.Context) ([]ranklist.Rank, error) { + log := logging.FromContext(ctx) + p, err := NewProvider(log, false) + if err != nil { + return nil, err + } + defer p.Cleanup() + + si, err := p.GetSystemInfo(ctx) + if err != nil { + return nil, err + } + + if len(si.RankURIs) == 0 { + return nil, ErrNoSystemRanks + } + + var systemRanks []ranklist.Rank + for _, rankURI := range si.RankURIs { + systemRanks = append(systemRanks, ranklist.Rank(rankURI.Rank)) + } + + return systemRanks, nil +} + +// RunSelfTest uses the provided configuration to run the logic +// behind the self_test tool. Per-size structured test results +// are returned as a slice. 
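// For illustration, a hedged caller-side sketch (names and values here are
// assumptions, not taken from this patch): build a config, let SetDefaults()
// fill in anything left unset, then run the test and walk the per-size results.
//
//	cfg := &daos.SelfTestConfig{
//		EndpointRanks: []ranklist.Rank{0, 1, 2},
//	}
//	if err := cfg.SetDefaults(); err != nil {
//		return err
//	}
//	results, err := RunSelfTest(ctx, cfg)
//	if err != nil {
//		return err
//	}
//	for _, res := range results {
//		fmt.Printf("master %s: %d RPCs, avg latency %.2f\n",
//			res.MasterEndpoint, res.MasterLatency.TotalRPCs, res.MasterLatency.Average())
//	}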
+func RunSelfTest(ctx context.Context, cfg *daos.SelfTestConfig) ([]*daos.SelfTestResult, error) { + if err := cfg.Validate(); err != nil { + return nil, errors.Wrap(err, "invalid self_test configuration") + } + + ptr, err := C.calloc(C.size_t(len(cfg.SendSizes)), C.sizeof_struct_st_size_params) + if err != nil { + return nil, err + } + cSizes := (*C.struct_st_size_params)(ptr) + defer C.free(unsafe.Pointer(cSizes)) + testSizes := unsafe.Slice(cSizes, len(cfg.SendSizes)) + for i := 0; i < len(testSizes); i++ { + C.set_size_params(&testSizes[i], C.int(cfg.SendSizes[i]), C.int(cfg.ReplySizes[i])) + } + + if len(cfg.EndpointRanks) == 0 { + cfg.EndpointRanks, err = getAllSystemRanks(ctx) + if err != nil { + return nil, err + } + } + + tgtEndpoints := make(tgtEndpointSlice, 0, len(cfg.EndpointRanks)*len(cfg.EndpointTags)) + for _, r := range cfg.EndpointRanks { + for _, t := range cfg.EndpointTags { + tgtEndpoints = append(tgtEndpoints, daos.SelfTestEndpoint{Rank: r, Tag: t}) + } + } + cTgtEndpoints, err := tgtEndpoints.toC() + defer C.free(unsafe.Pointer(cTgtEndpoints)) + + repCount := C.int(int(cfg.Repetitions) * len(tgtEndpoints)) + maxInflight := C.int(cfg.MaxInflightRPCs) + var cOptMasterEndpoints *C.struct_st_endpoint + var numOptMsEndpoints C.uint + var cMasterEndpoints *C.struct_st_master_endpt + var numMsEndpoints C.uint32_t + var cSizeLatencies ***C.struct_st_latency + var bufAlignment = C.int16_t(cfg.BufferAlignment) + + cGroupName := C.CString(cfg.GroupName) + defer C.free(unsafe.Pointer(cGroupName)) + + if len(cfg.MasterEndpoints) > 0 { + numOptMsEndpoints = C.uint(len(cfg.MasterEndpoints)) + ptr, err := C.calloc(C.size_t(numOptMsEndpoints), C.sizeof_struct_st_endpoint) + if err != nil { + return nil, err + } + cOptMasterEndpoints = (*C.struct_st_endpoint)(ptr) + defer C.free(unsafe.Pointer(cOptMasterEndpoints)) + + masterEndpoints := unsafe.Slice(cOptMasterEndpoints, int(numOptMsEndpoints)) + for i, ep := range cfg.MasterEndpoints { + masterEndpoints[i].rank = C.uint(ep.Rank) + masterEndpoints[i].tag = C.uint(ep.Tag) + } + } + + defer func() { + if cMasterEndpoints != nil { + C.free(unsafe.Pointer(cMasterEndpoints)) + } + if cSizeLatencies != nil { + C.free_size_latencies(cSizeLatencies, C.uint32_t(len(testSizes)), numMsEndpoints) + } + self_test_fini(true) + }() + + rc := run_self_test(cSizes, C.int(len(testSizes)), repCount, maxInflight, cGroupName, + cOptMasterEndpoints, numOptMsEndpoints, + cTgtEndpoints, C.uint32_t(len(tgtEndpoints)), + &cMasterEndpoints, &numMsEndpoints, + &cSizeLatencies, bufAlignment) + if err := daos.ErrorFromRC(int(rc)); err != nil { + return nil, errors.Wrap(err, "self_test failed") + } + + if numMsEndpoints == 0 || cMasterEndpoints == nil { + return nil, errors.New("no master endpoints defined") + } + if cSizeLatencies == nil { + return nil, errors.New("no test latencies recorded") + } + + masterEndpoints := unsafe.Slice(cMasterEndpoints, int(numMsEndpoints)) + var results []*daos.SelfTestResult + perSizeList := unsafe.Slice(cSizeLatencies, len(testSizes)) + for i := 0; i < len(testSizes); i++ { + params := testSizes[i] + msSessions := unsafe.Slice(perSizeList[i], int(numMsEndpoints)) + for j := 0; j < int(numMsEndpoints); j++ { + msEp := masterEndpoints[j] + res := &daos.SelfTestResult{ + MasterEndpoint: daos.SelfTestEndpoint{ + Rank: ranklist.Rank(msEp.endpt.ep_rank), + Tag: uint32(msEp.endpt.ep_tag), + }, + TargetEndpoints: tgtEndpoints, + Repetitions: uint(repCount), + SendSize: uint64(params.send_size), + ReplySize: uint64(params.reply_size), + 
BufferAlignment: int16(bufAlignment), + Duration: time.Duration(msEp.reply.test_duration_ns), + MasterLatency: new(daos.EndpointLatency), + TargetLatencies: make(map[daos.SelfTestEndpoint]*daos.EndpointLatency), + } + repLatencies := unsafe.Slice(msSessions[j], int(repCount)) + + for _, latency := range repLatencies { + if latency.cci_rc < 0 { + res.MasterLatency.AddValue(-1) + res.AddTargetLatency(ranklist.Rank(latency.rank), uint32(latency.tag), -1) + continue + } + res.MasterLatency.AddValue(int64(latency.val)) + res.AddTargetLatency(ranklist.Rank(latency.rank), uint32(latency.tag), int64(latency.val)) + } + + results = append(results, res) + } + } + + return results, nil +} diff --git a/src/control/lib/daos/api/selftest_test.go b/src/control/lib/daos/api/selftest_test.go new file mode 100644 index 00000000000..7413c938f56 --- /dev/null +++ b/src/control/lib/daos/api/selftest_test.go @@ -0,0 +1,206 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package api + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/ranklist" + "github.com/daos-stack/daos/src/control/logging" +) + +func TestAPI_RunSelfTest(t *testing.T) { + genCfg := func(xfrm func(cfg *daos.SelfTestConfig)) *daos.SelfTestConfig { + cfg := &daos.SelfTestConfig{} + cfg.SetDefaults() + if xfrm != nil { + xfrm(cfg) + } + return cfg + } + genEndPoints := func(tags []uint32, ranks ...ranklist.Rank) (eps []daos.SelfTestEndpoint) { + if len(tags) == 0 { + tags = []uint32{0} + } + if len(ranks) == 0 { + ranks = []ranklist.Rank{0} + } + for i := 0; i < len(ranks); i++ { + for j := 0; j < len(tags); j++ { + eps = append(eps, daos.SelfTestEndpoint{Rank: ranks[i], Tag: tags[j]}) + } + } + return + } + genEpLatencies := func(totalReps uint, eps ...daos.SelfTestEndpoint) (lats []run_self_test_EndpointLatency) { + if totalReps == 0 { + totalReps = genCfg(nil).Repetitions + } + if len(eps) == 0 { + eps = genEndPoints(nil, 0, 1, 2) + } + lats = make([]run_self_test_EndpointLatency, totalReps) + latCount := 0 + for i := 0; i < int(totalReps); i++ { + lats[i] = run_self_test_EndpointLatency{ + val: _Ctype_int64_t(i + 1), + rank: _Ctype___uint32_t(eps[i%len(eps)].Rank), + tag: _Ctype___uint32_t(eps[i%len(eps)].Tag), + cci_rc: 0, + } + latCount++ + } + return + } + genExpResults := func(cfg *daos.SelfTestConfig) (results []*daos.SelfTestResult) { + for i := range cfg.SendSizes { + var tgtEndpoints []daos.SelfTestEndpoint + if len(cfg.EndpointRanks) > 0 { + tgtEndpoints = genEndPoints(cfg.EndpointTags, cfg.EndpointRanks...) + } + + masterEndPoints := cfg.MasterEndpoints + if len(masterEndPoints) == 0 { + masterEndPoints = []daos.SelfTestEndpoint{ + { + Rank: cfg.EndpointRanks[len(cfg.EndpointRanks)-1] + 1, + Tag: cfg.EndpointTags[len(cfg.EndpointTags)-1], + }, + } + } + + for _, mep := range masterEndPoints { + tgtEps := genEndPoints(cfg.EndpointTags, cfg.EndpointRanks...) 
+ res := &daos.SelfTestResult{ + MasterEndpoint: mep, + TargetEndpoints: tgtEps, + Repetitions: cfg.Repetitions * uint(len(tgtEps)), + SendSize: cfg.SendSizes[i], + ReplySize: cfg.ReplySizes[i], + BufferAlignment: cfg.BufferAlignment, + MasterLatency: &daos.EndpointLatency{}, + TargetLatencies: make(map[daos.SelfTestEndpoint]*daos.EndpointLatency), + } + for _, ep := range tgtEndpoints { + res.TargetLatencies[ep] = &daos.EndpointLatency{} + } + + results = append(results, res) + } + } + return + } + + for name, tc := range map[string]struct { + cfg *daos.SelfTestConfig + self_test_RC int + testSysInfo *daos.SystemInfo + get_sys_info_RC int + expMsEps []daos.SelfTestEndpoint + expRunCfg *daos.SelfTestConfig + expRunResults []*daos.SelfTestResult + expErr error + }{ + "empty config": { + cfg: &daos.SelfTestConfig{}, + expErr: errors.New("invalid self_test configuration"), + }, + "library alloc fails": { + cfg: genCfg(nil), + self_test_RC: int(daos.NoMemory), + expErr: daos.NoMemory, + }, + "GetSystemInfo fails": { + cfg: genCfg(nil), + get_sys_info_RC: int(daos.AgentCommFailed), + expErr: daos.AgentCommFailed, + }, + "custom config -- 1 rank": { + cfg: genCfg(func(cfg *daos.SelfTestConfig) { + cfg.EndpointRanks = []ranklist.Rank{1} + cfg.EndpointTags = []uint32{1} + cfg.Repetitions = 10 + cfg.SendSizes = []uint64{1024} + cfg.ReplySizes = []uint64{1024} + }), + }, + "custom config -- defined master endpoints": { + cfg: genCfg(func(cfg *daos.SelfTestConfig) { + cfg.EndpointRanks = []ranklist.Rank{0, 1, 2} + cfg.EndpointTags = []uint32{1} + cfg.MasterEndpoints = []daos.SelfTestEndpoint{ + {Rank: 0, Tag: 1}, + {Rank: 1, Tag: 1}, + } + }), + }, + "default config -- all ranks": { + cfg: genCfg(nil), + }, + } { + t.Run(name, func(t *testing.T) { + if tc.testSysInfo == nil { + tc.testSysInfo = defaultSystemInfo + } + daos_mgmt_get_sys_info_SystemInfo = tc.testSysInfo + daos_mgmt_get_sys_info_RC = _Ctype_int(tc.get_sys_info_RC) + defer func() { + daos_mgmt_get_sys_info_SystemInfo = defaultSystemInfo + daos_mgmt_get_sys_info_RC = 0 + }() + + run_self_test_RunConfig = nil + run_self_test_MsEndpoints = nil + run_self_test_EndpointLatencies = nil + run_self_test_RC = _Ctype_int(tc.self_test_RC) + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + var sysRanks []ranklist.Rank + if len(tc.cfg.EndpointRanks) == 0 { + sysRanks = make([]ranklist.Rank, len(tc.testSysInfo.RankURIs)) + for i, rankURI := range tc.testSysInfo.RankURIs { + sysRanks[i] = ranklist.Rank(rankURI.Rank) + } + } else { + sysRanks = tc.cfg.EndpointRanks + } + if tc.expRunCfg == nil { + tc.expRunCfg = tc.cfg.Copy() + tc.expRunCfg.EndpointRanks = sysRanks + tc.expRunCfg.Repetitions = tc.cfg.Repetitions * uint(len(sysRanks)) + } + if tc.expRunResults == nil { + expCfg := tc.cfg.Copy() + expCfg.EndpointRanks = sysRanks + tc.expRunResults = genExpResults(expCfg) + } + tgtEps := genEndPoints(tc.cfg.EndpointTags, sysRanks...) + run_self_test_EndpointLatencies = genEpLatencies(tc.cfg.Repetitions*uint(len(tgtEps)), tgtEps...) + + ctx := test.MustLogContext(t, log) + res, err := RunSelfTest(ctx, tc.cfg) + test.CmpErr(t, tc.expErr, err) + if tc.expErr != nil { + return + } + test.CmpAny(t, "SelfTestConfig", tc.expRunCfg, run_self_test_RunConfig) + cmpOpts := cmp.Options{ + // Don't need to test all of this again. Just verify that + // we get the expected number of latency results here. + cmpopts.IgnoreTypes(daos.EndpointLatency{}), + } + test.CmpAny(t, "SelfTestResults", tc.expRunResults, res, cmpOpts...) 
+ }) + } +} diff --git a/src/control/lib/daos/api/system.go b/src/control/lib/daos/api/system.go index da874f69ddf..73001363a25 100644 --- a/src/control/lib/daos/api/system.go +++ b/src/control/lib/daos/api/system.go @@ -7,6 +7,8 @@ package api import ( + "context" + "sort" "unsafe" "github.com/pkg/errors" @@ -22,13 +24,13 @@ import ( import "C" // GetSystemInfo queries for the connected system information. -func (p *Provider) GetSystemInfo() (*daos.SystemInfo, error) { +func (p *Provider) GetSystemInfo(ctx context.Context) (*daos.SystemInfo, error) { var cSysInfo *C.struct_daos_sys_info - rc := C.daos_mgmt_get_sys_info(nil, &cSysInfo) + rc := daos_mgmt_get_sys_info(nil, &cSysInfo) if err := daos.ErrorFromRC(int(rc)); err != nil { return nil, errors.Wrap(err, "querying DAOS system information") } - defer C.daos_mgmt_put_sys_info(cSysInfo) + defer daos_mgmt_put_sys_info(cSysInfo) sysInfo := &daos.SystemInfo{ Name: C.GoString(&cSysInfo.dsi_system_name[0]), @@ -46,10 +48,16 @@ func (p *Provider) GetSystemInfo() (*daos.SystemInfo, error) { sysInfo.RankURIs = append(sysInfo.RankURIs, rankURI) rankURIs[rankURI.Rank] = rankURI } + sort.Slice(sysInfo.RankURIs, func(i, j int) bool { + return sysInfo.RankURIs[i].Rank < sysInfo.RankURIs[j].Rank + }) for _, cMSRank := range unsafe.Slice(cSysInfo.dsi_ms_ranks, int(cSysInfo.dsi_nr_ms_ranks)) { sysInfo.AccessPointRankURIs = append(sysInfo.AccessPointRankURIs, rankURIs[uint32(cMSRank)]) } + sort.Slice(sysInfo.AccessPointRankURIs, func(i, j int) bool { + return sysInfo.AccessPointRankURIs[i].Rank < sysInfo.AccessPointRankURIs[j].Rank + }) return sysInfo, nil } diff --git a/src/control/lib/daos/libgurt.go b/src/control/lib/daos/libgurt.go new file mode 100644 index 00000000000..3e1485b968f --- /dev/null +++ b/src/control/lib/daos/libgurt.go @@ -0,0 +1,22 @@ +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build !test_stubs +// +build !test_stubs + +package daos + +/* +#cgo LDFLAGS: -lgurt + +#include +*/ +import "C" + +func daos_debug_init(log_file *C.char) C.int { + return C.daos_debug_init(log_file) +} + +func daos_debug_fini() { + C.daos_debug_fini() +} diff --git a/src/control/lib/daos/libgurt_stubs.go b/src/control/lib/daos/libgurt_stubs.go new file mode 100644 index 00000000000..af8b3dc896a --- /dev/null +++ b/src/control/lib/daos/libgurt_stubs.go @@ -0,0 +1,21 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build test_stubs +// +build test_stubs + +package daos + +import "C" + +var ( + daos_debug_init_RC C.int = 0 +) + +func daos_debug_init(log_file *C.char) C.int { + return daos_debug_init_RC +} + +func daos_debug_fini() {} diff --git a/src/control/lib/daos/logging.go b/src/control/lib/daos/logging.go index 9891adba0be..8c9f26a3e3f 100644 --- a/src/control/lib/daos/logging.go +++ b/src/control/lib/daos/logging.go @@ -13,14 +13,9 @@ import ( "github.com/pkg/errors" ) -/* -#cgo LDFLAGS: -lgurt - -#include -*/ -import "C" - const ( + // UnsetLogMask defines an explicitly-unset log mask. + UnsetLogMask = "UNSET" // DefaultDebugMask defines the basic debug mask. DefaultDebugMask = "DEBUG,MEM=ERR,OBJECT=ERR,PLACEMENT=ERR" // DefaultInfoMask defines the basic info mask. 
@@ -35,13 +30,27 @@ func InitLogging(masks ...string) (func(), error) { if mask == "" { mask = DefaultInfoMask } - os.Setenv("D_LOG_MASK", mask) + if mask != UnsetLogMask { + if err := SetLogMask(mask); err != nil { + return func() {}, errors.Wrap(err, "failed to set DAOS logging mask") + } + } - if rc := C.daos_debug_init(nil); rc != 0 { + if rc := daos_debug_init(nil); rc != 0 { return func() {}, errors.Wrap(Status(rc), "daos_debug_init() failed") } return func() { - C.daos_debug_fini() + daos_debug_fini() }, nil } + +// SetLogMask sets the DAOS logging mask. +func SetLogMask(mask string) error { + return os.Setenv("D_LOG_MASK", mask) +} + +// GetLogMask returns the DAOS logging mask, if set. +func GetLogMask() string { + return os.Getenv("D_LOG_MASK") +} diff --git a/src/control/lib/daos/selftest.go b/src/control/lib/daos/selftest.go new file mode 100644 index 00000000000..703685b5dbe --- /dev/null +++ b/src/control/lib/daos/selftest.go @@ -0,0 +1,354 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package daos + +import ( + "encoding/json" + "fmt" + "math" + "sort" + "time" + + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/build" + "github.com/daos-stack/daos/src/control/lib/ranklist" +) + +/* +#cgo CFLAGS: -I${SRCDIR}/../../../cart/utils -I${SRCDIR}/../../../utils/self_test + +#include "self_test_lib.h" +*/ +import "C" + +type ( + // EndpointLatency represents the results of running latency tests against + // a single rank:target endpoint. + EndpointLatency struct { + rawValues []uint64 + sorted bool + TotalRPCs uint64 `json:"total_rpcs"` + Min uint64 `json:"min"` + Max uint64 `json:"max"` + Sum uint64 `json:"-"` + SumSquares float64 `json:"-"` + FailCount uint64 `json:"fail_count"` + } + + // SelfTestEndpoint represents a rank:target test endpoint. + SelfTestEndpoint struct { + Rank ranklist.Rank + Tag uint32 + } + + // SelfTestConfig defines the parameters for a set of self_test runs. + SelfTestConfig struct { + GroupName string `json:"group_name"` + MasterEndpoints []SelfTestEndpoint `json:"master_endpoints,omitempty"` + EndpointRanks []ranklist.Rank `json:"endpoint_ranks"` + EndpointTags []uint32 `json:"endpoint_tags"` + Repetitions uint `json:"repetitions"` + SendSizes []uint64 `json:"send_sizes"` + ReplySizes []uint64 `json:"reply_sizes"` + BufferAlignment int16 `json:"buffer_alignment"` + MaxInflightRPCs uint `json:"max_inflight_rpcs"` + } + + // SelfTestResult represents the results of a single self_test run. + SelfTestResult struct { + MasterEndpoint SelfTestEndpoint `json:"-"` + TargetEndpoints []SelfTestEndpoint `json:"-"` + Repetitions uint `json:"repetitions"` + SendSize uint64 `json:"send_size"` + ReplySize uint64 `json:"reply_size"` + BufferAlignment int16 `json:"buffer_alignment"` + Duration time.Duration `json:"duration"` + MasterLatency *EndpointLatency `json:"master_latency"` + TargetLatencies map[SelfTestEndpoint]*EndpointLatency `json:"-"` + } +) + +var defaultLatencyPercentiles []uint64 = []uint64{50, 75, 90, 95, 99} + +const ( + defaultSendSize = 1 << 20 // 1MiB + defaultReplySize = defaultSendSize + defaultRepCount = 10000 + defaultMaxInflight = 16 + defaultBufAlignment = C.CRT_ST_BUF_ALIGN_DEFAULT +) + +// SetDefaults replaces unset parameters with default values. 
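// For illustration, the defaults applied to a zero-value config (values taken
// from the constants above; BufferAlignment comes from the CART default):
//
//	cfg := &SelfTestConfig{}
//	_ = cfg.SetDefaults()
//	// cfg.GroupName       == build.DefaultSystemName
//	// cfg.EndpointTags    == []uint32{0}
//	// cfg.SendSizes       == []uint64{1 << 20} // 1MiB
//	// cfg.ReplySizes      == []uint64{1 << 20}
//	// cfg.Repetitions     == 10000
//	// cfg.MaxInflightRPCs == 16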
+func (cfg *SelfTestConfig) SetDefaults() error {
+	if cfg == nil {
+		return errors.New("nil config")
+	}
+
+	if cfg.GroupName == "" {
+		cfg.GroupName = build.DefaultSystemName
+	}
+	if len(cfg.EndpointTags) == 0 {
+		cfg.EndpointTags = []uint32{0}
+	}
+	if len(cfg.SendSizes) == 0 {
+		cfg.SendSizes = []uint64{defaultSendSize}
+	}
+	if len(cfg.ReplySizes) == 0 {
+		cfg.ReplySizes = []uint64{defaultReplySize}
+	}
+	if cfg.Repetitions == 0 {
+		cfg.Repetitions = defaultRepCount
+	}
+	if cfg.MaxInflightRPCs == 0 {
+		cfg.MaxInflightRPCs = defaultMaxInflight
+	}
+	if cfg.BufferAlignment == 0 {
+		cfg.BufferAlignment = defaultBufAlignment
+	}
+
+	return cfg.Validate()
+}
+
+// Validate checks the configuration for validity.
+func (cfg *SelfTestConfig) Validate() error {
+	if cfg == nil {
+		return errors.New("nil config")
+	}
+
+	if cfg.GroupName == "" {
+		return errors.New("group name is required")
+	}
+	if len(cfg.EndpointTags) == 0 {
+		return errors.New("endpoint tag list is required")
+	}
+	if len(cfg.SendSizes) == 0 {
+		return errors.New("send size list is required")
+	}
+	if len(cfg.ReplySizes) == 0 {
+		return errors.New("reply size list is required")
+	}
+	if cfg.Repetitions == 0 {
+		return errors.New("repetitions is required")
+	}
+	if cfg.MaxInflightRPCs == 0 {
+		return errors.New("max inflight RPCs is required")
+	}
+	if cfg.BufferAlignment == 0 {
+		return errors.New("buffer alignment is required")
+	}
+	if len(cfg.SendSizes) != len(cfg.ReplySizes) {
+		return errors.New("send/reply size list mismatch")
+	}
+
+	return nil
+}
+
+// Copy returns a deep copy of the configuration.
+func (cfg *SelfTestConfig) Copy() *SelfTestConfig {
+	if cfg == nil {
+		return nil
+	}
+
+	cp := &SelfTestConfig{}
+	*cp = *cfg
+	cp.MasterEndpoints = append([]SelfTestEndpoint(nil), cfg.MasterEndpoints...)
+	cp.EndpointRanks = append([]ranklist.Rank(nil), cfg.EndpointRanks...)
+	cp.EndpointTags = append([]uint32(nil), cfg.EndpointTags...)
+	cp.SendSizes = append([]uint64(nil), cfg.SendSizes...)
+	cp.ReplySizes = append([]uint64(nil), cfg.ReplySizes...)
+
+	return cp
+}
+
+// Succeeded returns the number of RPCs that succeeded.
+func (epl *EndpointLatency) Succeeded() uint64 {
+	return epl.TotalRPCs - epl.FailCount
+}
+
+// AddValue adds a sampled latency value (or -1 to increment the failure count).
+func (epl *EndpointLatency) AddValue(value int64) {
+	epl.TotalRPCs++
+	if value < 0 {
+		epl.FailCount++
+		return
+	}
+
+	// TODO: Figure out if there's a more clever way to do this... Seems
+	// like with histograms we need to pre-bucket the values.
+	epl.rawValues = append(epl.rawValues, uint64(value))
+	epl.sorted = false
+	if epl.TotalRPCs == 1 || value < int64(epl.Min) {
+		epl.Min = uint64(value)
+	}
+	if value > int64(epl.Max) {
+		epl.Max = uint64(value)
+	}
+
+	epl.SumSquares += float64(value) * float64(value)
+	epl.Sum += uint64(value)
+}
+
+func (epl *EndpointLatency) sortValues() {
+	if epl.sorted {
+		return
+	}
+
+	sort.Slice(epl.rawValues, func(a, b int) bool {
+		return epl.rawValues[a] < epl.rawValues[b]
+	})
+	epl.sorted = true
+}
+
+// Percentiles returns a sorted slice of bucket keys and a map of buckets
+// holding percentile values.
+func (epl *EndpointLatency) Percentiles(percentiles ...uint64) ([]uint64, map[uint64]*MetricBucket) { + epl.sortValues() + + if len(percentiles) == 0 { + percentiles = defaultLatencyPercentiles + } + sort.Slice(percentiles, func(a, b int) bool { + return percentiles[a] < percentiles[b] + }) + buckets := make(map[uint64]*MetricBucket) + + for _, p := range percentiles { + valIdx := epl.Succeeded() * p / 100 + if uint64(len(epl.rawValues)) <= valIdx { + continue + } + buckets[p] = &MetricBucket{ + Label: fmt.Sprintf("%d", p), + CumulativeCount: valIdx, + UpperBound: float64(epl.rawValues[valIdx]), + } + } + + return percentiles, buckets +} + +// PercentileBuckets returns a sorted slice of buckets holding percentile values. +func (epl *EndpointLatency) PercentileBuckets(percentiles ...uint64) []*MetricBucket { + keys, bucketMap := epl.Percentiles(percentiles...) + buckets := make([]*MetricBucket, 0, len(bucketMap)) + for _, key := range keys { + buckets = append(buckets, bucketMap[key]) + } + + return buckets +} + +// Average returns the average latency value of successful RPCs. +func (epl *EndpointLatency) Average() float64 { + if epl.Succeeded() == 0 { + return 0 + } + return float64(epl.Sum) / float64(epl.Succeeded()) +} + +// StdDev returns the standard deviation of the latency values of successful RPCs. +func (epl *EndpointLatency) StdDev() float64 { + if epl.Succeeded() < 2 { + return 0 + } + avg := epl.Average() + return math.Sqrt((epl.SumSquares - (float64(epl.Succeeded()) * avg * avg)) / float64(epl.Succeeded()-1)) +} + +func roundFloat(val float64, places int) float64 { + return math.Round(val*math.Pow10(places)) / math.Pow10(places) +} + +func (epl *EndpointLatency) MarshalJSON() ([]byte, error) { + type toJSON EndpointLatency + return json.Marshal(struct { + Average float64 `json:"avg"` + StdDev float64 `json:"std_dev"` + Percentiles []*MetricBucket `json:"percentiles"` + *toJSON + }{ + Average: roundFloat(epl.Average(), 4), + StdDev: roundFloat(epl.StdDev(), 4), + Percentiles: epl.PercentileBuckets(), + toJSON: (*toJSON)(epl), + }) +} + +func (ste SelfTestEndpoint) String() string { + return fmt.Sprintf("%d:%d", ste.Rank, ste.Tag) +} + +func (str *SelfTestResult) MarshalJSON() ([]byte, error) { + epLatencies := make(map[string]*EndpointLatency) + for ep, lr := range str.TargetLatencies { + epLatencies[ep.String()] = lr + } + + type toJSON SelfTestResult + return json.Marshal(struct { + MasterEndpoint string `json:"master_endpoint"` + TargetEndpoints []string `json:"target_endpoints"` + EndpointLatencies map[string]*EndpointLatency `json:"target_latencies,omitempty"` + *toJSON + }{ + MasterEndpoint: str.MasterEndpoint.String(), + TargetEndpoints: func() []string { + eps := make([]string, len(str.TargetEndpoints)) + for i, ep := range str.TargetEndpoints { + eps[i] = ep.String() + } + return eps + }(), + EndpointLatencies: epLatencies, + toJSON: (*toJSON)(str), + }) +} + +// AddTargetLatency adds a latency value for a target endpoint. 
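// For illustration (hedged sketch of how the api package feeds this type):
// values are only recorded for endpoints that appear in TargetEndpoints, and
// a negative value is counted as a failed RPC rather than a sample.
//
//	str.TargetEndpoints = []SelfTestEndpoint{{Rank: 0, Tag: 0}}
//	str.AddTargetLatency(0, 0, 1500) // recorded sample
//	str.AddTargetLatency(0, 0, -1)   // failure; no sample recorded
//	// str.TargetLatencies[SelfTestEndpoint{Rank: 0, Tag: 0}].TotalRPCs == 2
//	// str.TargetLatencies[SelfTestEndpoint{Rank: 0, Tag: 0}].FailCount == 1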
+func (str *SelfTestResult) AddTargetLatency(rank ranklist.Rank, tag uint32, value int64) { + var found bool + for _, ep := range str.TargetEndpoints { + if ep.Rank == rank && ep.Tag == tag { + found = true + break + } + } + if !found { + return + } + + if str.TargetLatencies == nil { + str.TargetLatencies = make(map[SelfTestEndpoint]*EndpointLatency) + } + + ep := SelfTestEndpoint{ + Rank: rank, + Tag: tag, + } + epl, found := str.TargetLatencies[ep] + if !found { + epl = &EndpointLatency{ + rawValues: make([]uint64, 0, str.Repetitions/uint(len(str.TargetEndpoints))), + } + str.TargetLatencies[ep] = epl + } + + epl.AddValue(value) +} + +// TargetRanks returns a slice of target ranks in the same order +// as the configured target endpoints. +func (str *SelfTestResult) TargetRanks() (ranks []ranklist.Rank) { + for _, ep := range str.TargetEndpoints { + ranks = append(ranks, ep.Rank) + } + return +} diff --git a/src/control/lib/daos/selftest_test.go b/src/control/lib/daos/selftest_test.go new file mode 100644 index 00000000000..8029a176349 --- /dev/null +++ b/src/control/lib/daos/selftest_test.go @@ -0,0 +1,295 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package daos_test + +import ( + "encoding/json" + "testing" + "time" + + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/ranklist" +) + +func TestDaos_SelfTestConfig(t *testing.T) { + for name, tc := range map[string]struct { + cfg *daos.SelfTestConfig + expErr error + }{ + "nil config fails validation": { + expErr: errors.New("nil"), + }, + "imbalanced send/reply lists": { + cfg: func() *daos.SelfTestConfig { + cfg := new(daos.SelfTestConfig) + cfg.SendSizes = []uint64{0, 1} + cfg.ReplySizes = []uint64{1} + cfg.SetDefaults() + return cfg + }(), + expErr: errors.New("mismatch"), + }, + "defaults should pass": { + cfg: func() *daos.SelfTestConfig { + cfg := new(daos.SelfTestConfig) + cfg.SetDefaults() + return cfg + }(), + }, + } { + t.Run(name, func(t *testing.T) { + gotErr := tc.cfg.Validate() + test.CmpErr(t, tc.expErr, gotErr) + }) + } +} + +func TestDaos_EndpointLatency(t *testing.T) { + epl := new(daos.EndpointLatency) + for i := int64(1); i <= 100; i++ { + epl.AddValue(i) + } + epl.AddValue(-1) + + test.CmpAny(t, "TotalRPCs", uint64(101), epl.TotalRPCs) + test.CmpAny(t, "Succeeded()", uint64(100), epl.Succeeded()) + test.CmpAny(t, "FailCount", uint64(1), epl.FailCount) + test.CmpAny(t, "Min", uint64(1), epl.Min) + test.CmpAny(t, "Max", uint64(100), epl.Max) + test.CmpAny(t, "Sum", uint64(5050), epl.Sum) + test.CmpAny(t, "SumSquares", float64(338350), epl.SumSquares) + test.CmpAny(t, "Average()", float64(50.5), epl.Average()) + test.CmpAny(t, "StdDev()", float64(29.0115), epl.StdDev(), cmpopts.EquateApprox(0, 0.0001)) + + keys, buckets := epl.Percentiles() + sorted := make([]*daos.MetricBucket, len(keys)) + for i, key := range keys { + sorted[i] = buckets[key] + + switch key { + case 50: + test.CmpAny(t, "50th", float64(51), buckets[key].UpperBound) + case 75: + test.CmpAny(t, "75th", float64(76), buckets[key].UpperBound) + case 90: + test.CmpAny(t, "90th", float64(91), buckets[key].UpperBound) + case 95: + test.CmpAny(t, "95th", float64(96), buckets[key].UpperBound) + case 99: + test.CmpAny(t, "99th", float64(100), buckets[key].UpperBound) + } + } + test.CmpAny(t, "PercentileBuckets()", sorted, 
epl.PercentileBuckets()) +} + +func TestDaos_SelfTestResult(t *testing.T) { + str := new(daos.SelfTestResult) + + testRank := ranklist.Rank(1) + testTarget := uint32(0) + testEndpoint := daos.SelfTestEndpoint{Rank: testRank, Tag: testTarget} + str.AddTargetLatency(testRank, testTarget, 1) + if _, found := str.TargetLatencies[testEndpoint]; found { + t.Fatal("expected no latency for unknown endpoint") + } + + str.TargetEndpoints = append(str.TargetEndpoints, testEndpoint) + str.AddTargetLatency(testRank, testTarget, 1) + if _, found := str.TargetLatencies[testEndpoint]; !found { + t.Fatal("expected latency for known endpoint") + } + + test.CmpAny(t, "TargetRanks()", []ranklist.Rank{testRank}, str.TargetRanks()) +} + +func TestDaos_SelfTestResult_MarshalJSON(t *testing.T) { + str := &daos.SelfTestResult{ + MasterEndpoint: daos.SelfTestEndpoint{Rank: 3, Tag: 0}, + TargetEndpoints: []daos.SelfTestEndpoint{ + {Rank: 0, Tag: 0}, + {Rank: 1, Tag: 0}, + {Rank: 2, Tag: 0}, + }, + Repetitions: 3000, + SendSize: 1024, + ReplySize: 1024, + BufferAlignment: -1, + Duration: 2 * time.Second, + MasterLatency: &daos.EndpointLatency{}, + } + + for i := int64(1); i <= int64(str.Repetitions); i++ { + str.MasterLatency.AddValue(i) + str.AddTargetLatency(ranklist.Rank(i%3), 0, i) + } + + gotBytes, err := json.MarshalIndent(str, "", " ") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + expStr := `{ + "master_endpoint": "3:0", + "target_endpoints": [ + "0:0", + "1:0", + "2:0" + ], + "target_latencies": { + "0:0": { + "avg": 1501.5, + "std_dev": 866.4583, + "percentiles": [ + { + "label": "50", + "cumulative_count": 500, + "upper_bound": 1503 + }, + { + "label": "75", + "cumulative_count": 750, + "upper_bound": 2253 + }, + { + "label": "90", + "cumulative_count": 900, + "upper_bound": 2703 + }, + { + "label": "95", + "cumulative_count": 950, + "upper_bound": 2853 + }, + { + "label": "99", + "cumulative_count": 990, + "upper_bound": 2973 + } + ], + "total_rpcs": 1000, + "min": 3, + "max": 3000, + "fail_count": 0 + }, + "1:0": { + "avg": 1499.5, + "std_dev": 866.4583, + "percentiles": [ + { + "label": "50", + "cumulative_count": 500, + "upper_bound": 1501 + }, + { + "label": "75", + "cumulative_count": 750, + "upper_bound": 2251 + }, + { + "label": "90", + "cumulative_count": 900, + "upper_bound": 2701 + }, + { + "label": "95", + "cumulative_count": 950, + "upper_bound": 2851 + }, + { + "label": "99", + "cumulative_count": 990, + "upper_bound": 2971 + } + ], + "total_rpcs": 1000, + "min": 1, + "max": 2998, + "fail_count": 0 + }, + "2:0": { + "avg": 1500.5, + "std_dev": 866.4583, + "percentiles": [ + { + "label": "50", + "cumulative_count": 500, + "upper_bound": 1502 + }, + { + "label": "75", + "cumulative_count": 750, + "upper_bound": 2252 + }, + { + "label": "90", + "cumulative_count": 900, + "upper_bound": 2702 + }, + { + "label": "95", + "cumulative_count": 950, + "upper_bound": 2852 + }, + { + "label": "99", + "cumulative_count": 990, + "upper_bound": 2972 + } + ], + "total_rpcs": 1000, + "min": 2, + "max": 2999, + "fail_count": 0 + } + }, + "repetitions": 3000, + "send_size": 1024, + "reply_size": 1024, + "buffer_alignment": -1, + "duration": 2000000000, + "master_latency": { + "avg": 1500.5, + "std_dev": 866.1697, + "percentiles": [ + { + "label": "50", + "cumulative_count": 1500, + "upper_bound": 1501 + }, + { + "label": "75", + "cumulative_count": 2250, + "upper_bound": 2251 + }, + { + "label": "90", + "cumulative_count": 2700, + "upper_bound": 2701 + }, + { + "label": "95", + 
"cumulative_count": 2850, + "upper_bound": 2851 + }, + { + "label": "99", + "cumulative_count": 2970, + "upper_bound": 2971 + } + ], + "total_rpcs": 3000, + "min": 1, + "max": 3000, + "fail_count": 0 + } +}` + test.CmpAny(t, "JSON output", expStr, string(gotBytes)) +} diff --git a/src/control/lib/daos/status.go b/src/control/lib/daos/status.go index 577c0c01d2b..54099f31a2f 100644 --- a/src/control/lib/daos/status.go +++ b/src/control/lib/daos/status.go @@ -172,5 +172,7 @@ const ( // BadCert indicates that an invalid certificate was detected. BadCert Status = -C.DER_BAD_CERT // RedundancyFactorExceeded indicates that the maximum number of failed components was exceeded. - RedundancyFactorExceeded = -C.DER_RF + RedundancyFactorExceeded Status = -C.DER_RF + // AgentCommFailed indicates that client/agent communication failed. + AgentCommFailed Status = -C.DER_AGENT_COMM ) diff --git a/src/control/lib/daos/telemetry.go b/src/control/lib/daos/telemetry.go new file mode 100644 index 00000000000..e56377ddb1e --- /dev/null +++ b/src/control/lib/daos/telemetry.go @@ -0,0 +1,302 @@ +// +// (C) Copyright 2021-2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package daos + +import ( + "encoding/json" + "sort" + "strconv" + "strings" + + "github.com/pkg/errors" +) + +// MetricType defines the different types of metrics. +type MetricType uint32 + +const ( + MetricTypeUnknown MetricType = iota + MetricTypeGeneric + MetricTypeCounter + MetricTypeGauge + MetricTypeSummary + MetricTypeHistogram + + metricTypeUnknownStr = "Unknown" + metricTypeGenericStr = "Generic" + metricTypeCounterStr = "Counter" + metricTypeGaugeStr = "Gauge" + metricTypeSummaryStr = "Summary" + metricTypeHistogramStr = "Histogram" +) + +func (t MetricType) String() string { + switch t { + case MetricTypeGeneric: + return metricTypeGenericStr + case MetricTypeCounter: + return metricTypeCounterStr + case MetricTypeGauge: + return metricTypeGaugeStr + case MetricTypeSummary: + return metricTypeSummaryStr + case MetricTypeHistogram: + return metricTypeHistogramStr + } + + return metricTypeUnknownStr +} + +func metricTypeFromString(typeStr string) MetricType { + // normalize the strings for comparison + switch strings.ToLower(typeStr) { + case strings.ToLower(metricTypeCounterStr): + return MetricTypeCounter + case strings.ToLower(metricTypeGaugeStr): + return MetricTypeGauge + case strings.ToLower(metricTypeSummaryStr): + return MetricTypeSummary + case strings.ToLower(metricTypeHistogramStr): + return MetricTypeHistogram + case strings.ToLower(metricTypeGenericStr): + return MetricTypeGeneric + } + return MetricTypeUnknown +} + +type ( + // Metric is an interface implemented by all metric types. + Metric interface { + IsMetric() + } + + // MetricLabelMap is the set of key-value label pairs. + MetricLabelMap map[string]string + + // SimpleMetric is a specific metric with a value. + SimpleMetric struct { + Labels MetricLabelMap `json:"labels"` + Value float64 `json:"value"` + } + + // QuantileMap is the set of quantile measurements. + QuantileMap map[float64]float64 + + // SummaryMetric represents a group of observations. + SummaryMetric struct { + Labels MetricLabelMap `json:"labels"` + SampleCount uint64 `json:"sample_count"` + SampleSum float64 `json:"sample_sum"` + Quantiles QuantileMap `json:"quantiles"` + } + + // MetricBucket represents a bucket for observations to be sorted into. 
+ MetricBucket struct { + Label string `json:"label"` + CumulativeCount uint64 `json:"cumulative_count"` + UpperBound float64 `json:"upper_bound"` + } + + // HistogramMetric represents a group of observations sorted into + // buckets. + HistogramMetric struct { + Labels MetricLabelMap `json:"labels"` + SampleCount uint64 `json:"sample_count"` + SampleSum float64 `json:"sample_sum"` + Buckets []*MetricBucket `json:"buckets"` + } + + // MetricSet is a group of related metrics. + MetricSet struct { + Name string `json:"name"` + Description string `json:"description"` + Type MetricType `json:"type"` + Metrics []Metric `json:"metrics"` + } +) + +// IsMetric identifies SimpleMetric as a Metric. +func (*SimpleMetric) IsMetric() {} + +// IsMetric identifies SummaryMetric as a Metric. +func (*SummaryMetric) IsMetric() {} + +// UnmarshalJSON unmarshals a SummaryMetric from JSON. +func (m *SummaryMetric) UnmarshalJSON(data []byte) error { + if m == nil { + return errors.New("nil SummaryMetric") + } + + if m.Quantiles == nil { + m.Quantiles = make(QuantileMap) + } + + type Alias SummaryMetric + aux := (*Alias)(m) + if err := json.Unmarshal(data, &aux); err != nil { + return err + } + + return nil +} + +// IsMetric identifies HistogramMetric as a Metric. +func (*HistogramMetric) IsMetric() {} + +// Keys gets the sorted list of label keys. +func (m MetricLabelMap) Keys() []string { + result := make([]string, 0, len(m)) + for label := range m { + result = append(result, label) + } + sort.Strings(result) + return result +} + +// Keys gets the sorted list of quantile keys. +func (m QuantileMap) Keys() []float64 { + result := make([]float64, 0, len(m)) + for q := range m { + result = append(result, q) + } + sort.Float64s(result) + return result +} + +// MarshalJSON marshals the QuantileMap into JSON. +func (m QuantileMap) MarshalJSON() ([]byte, error) { + strMap := make(map[string]string) + + fmtFloat := func(f float64) string { + return strconv.FormatFloat(f, 'g', -1, 64) + } + + for key, val := range m { + strMap[fmtFloat(key)] = fmtFloat(val) + } + + return json.Marshal(&strMap) +} + +// UnmarshalJSON unmarshals the QuantileMap from JSON. +func (m QuantileMap) UnmarshalJSON(data []byte) error { + if m == nil { + return errors.New("QuantileMap is nil") + } + + fromJSON := make(map[string]string) + + if err := json.Unmarshal(data, &fromJSON); err != nil { + return nil + } + + for key, val := range fromJSON { + floatKey, err := strconv.ParseFloat(key, 64) + if err != nil { + return errors.Wrapf(err, "QuantileMap key %q", key) + } + + floatVal, err := strconv.ParseFloat(val, 64) + if err != nil { + return errors.Wrapf(err, "QuantileMap value %q for key %q", val, key) + } + + m[floatKey] = floatVal + } + return nil +} + +// MarshalJSON marshals the MetricSet to JSON. +func (ms *MetricSet) MarshalJSON() ([]byte, error) { + type toJSON MetricSet + return json.Marshal(&struct { + Type string `json:"type"` + *toJSON + }{ + Type: strings.ToLower(ms.Type.String()), + toJSON: (*toJSON)(ms), + }) +} + +// jsonMetric serves as a universal metric representation for unmarshaling from +// JSON. It covers all possible fields of Metric types. +type jsonMetric struct { + Labels MetricLabelMap `json:"labels"` + Value float64 `json:"value"` + SampleCount uint64 `json:"sample_count"` + SampleSum float64 `json:"sample_sum"` + Quantiles QuantileMap `json:"quantiles"` + Buckets []*MetricBucket `json:"buckets"` +} + +// UnmarshalJSON unmarshals a Metric into the jsonMetric type. 
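// For illustration (assumed input, not from this patch): the concrete Metric
// type is recovered from the enclosing set's "type" field rather than from the
// metric objects themselves, so a summary round-trips like this:
//
//	data := []byte(`{"name":"m","type":"summary","metrics":[` +
//		`{"labels":{},"sample_count":3,"sample_sum":42,"quantiles":{"0.5":"2.2"}}]}`)
//	var ms MetricSet
//	if err := json.Unmarshal(data, &ms); err != nil {
//		return err
//	}
//	// ms.Metrics[0].(*SummaryMetric).SampleCount == 3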
+func (jm *jsonMetric) UnmarshalJSON(data []byte) error { + if jm == nil { + return errors.New("nil jsonMetric") + } + + if jm.Quantiles == nil { + jm.Quantiles = make(QuantileMap) + } + + type Alias jsonMetric + aux := (*Alias)(jm) + if err := json.Unmarshal(data, &aux); err != nil { + return err + } + + return nil +} + +// UnmarshalJSON unmarshals the MetricSet from JSON. +func (ms *MetricSet) UnmarshalJSON(data []byte) error { + if ms == nil { + return errors.New("nil MetricSet") + } + + type fromJSON MetricSet + from := &struct { + Type string `json:"type"` + Metrics []*jsonMetric `json:"metrics"` + *fromJSON + }{ + fromJSON: (*fromJSON)(ms), + } + if err := json.Unmarshal(data, from); err != nil { + return err + } + + ms.Type = metricTypeFromString(from.Type) + for _, m := range from.Metrics { + switch ms.Type { + case MetricTypeSummary: + ms.Metrics = append(ms.Metrics, &SummaryMetric{ + Labels: m.Labels, + SampleCount: m.SampleCount, + SampleSum: m.SampleSum, + Quantiles: m.Quantiles, + }) + case MetricTypeHistogram: + ms.Metrics = append(ms.Metrics, &HistogramMetric{ + Labels: m.Labels, + SampleCount: m.SampleCount, + SampleSum: m.SampleSum, + Buckets: m.Buckets, + }) + default: + ms.Metrics = append(ms.Metrics, newSimpleMetric(m.Labels, m.Value)) + } + } + return nil +} + +func newSimpleMetric(labels map[string]string, value float64) *SimpleMetric { + return &SimpleMetric{ + Labels: labels, + Value: value, + } +} diff --git a/src/control/lib/daos/telemetry_test.go b/src/control/lib/daos/telemetry_test.go new file mode 100644 index 00000000000..9a1ee77fcf1 --- /dev/null +++ b/src/control/lib/daos/telemetry_test.go @@ -0,0 +1,246 @@ +// +// (C) Copyright 2021-2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package daos + +import ( + "encoding/json" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + + "github.com/daos-stack/daos/src/control/common/test" +) + +func TestDaos_Metric_JSON(t *testing.T) { + testLabelMap := map[string]string{ + "label1": "val1", + "label2": "val2", + } + + for name, tc := range map[string]struct { + metric Metric + }{ + "nil": {}, + "simple": { + metric: newSimpleMetric(testLabelMap, 123), + }, + "summary": { + metric: &SummaryMetric{ + Labels: testLabelMap, + SampleSum: 5678.9, + SampleCount: 42, + Quantiles: QuantileMap{ + 0.25: 50, + 0.5: 42, + }, + }, + }, + "histogram": { + metric: &HistogramMetric{ + Labels: testLabelMap, + SampleSum: 9876, + SampleCount: 120, + Buckets: []*MetricBucket{ + { + CumulativeCount: 55, + UpperBound: 500, + }, + }, + }, + }, + } { + t.Run(name, func(t *testing.T) { + marshaled, err := json.Marshal(tc.metric) + if err != nil { + t.Fatalf("expected to marshal, got %q", err) + } + + var unmarshaled Metric + switch tc.metric.(type) { + case *SimpleMetric: + unmarshaled = new(SimpleMetric) + case *SummaryMetric: + unmarshaled = new(SummaryMetric) + case *HistogramMetric: + unmarshaled = new(HistogramMetric) + default: + unmarshaled = new(SimpleMetric) + } + + err = json.Unmarshal(marshaled, unmarshaled) + if err != nil { + t.Fatalf("expected to unmarshal, got %q", err) + } + + expResult := tc.metric + if tc.metric == nil { + expResult = &SimpleMetric{} + } + + if diff := cmp.Diff(expResult, unmarshaled); diff != "" { + t.Fatalf("unmarshaled different from original (-want, +got):\n%s\n", diff) + } + }) + } +} + +func TestDaos_metricTypeFromString(t *testing.T) { + for name, tc := range map[string]struct { + input string + expType MetricType + }{ + "empty": { + expType: 
+
+func TestDaos_metricTypeFromString(t *testing.T) {
+    for name, tc := range map[string]struct {
+        input   string
+        expType MetricType
+    }{
+        "empty": {
+            expType: MetricTypeUnknown,
+        },
+        "counter": {
+            input:   "counter",
+            expType: MetricTypeCounter,
+        },
+        "gauge": {
+            input:   "gauge",
+            expType: MetricTypeGauge,
+        },
+        "summary": {
+            input:   "summary",
+            expType: MetricTypeSummary,
+        },
+        "histogram": {
+            input:   "histogram",
+            expType: MetricTypeHistogram,
+        },
+        "generic": {
+            input:   "generic",
+            expType: MetricTypeGeneric,
+        },
+        "invalid": {
+            input:   "some garbage text",
+            expType: MetricTypeUnknown,
+        },
+        "weird capitalization": {
+            input:   "CoUnTeR",
+            expType: MetricTypeCounter,
+        },
+    } {
+        t.Run(name, func(t *testing.T) {
+            gotType := metricTypeFromString(tc.input)
+
+            test.AssertEqual(t, tc.expType, gotType, "")
+        })
+    }
+}
+
+func TestDaos_MetricSet_JSON(t *testing.T) {
+    for name, tc := range map[string]struct {
+        set *MetricSet
+    }{
+        "nil": {},
+        "generic type": {
+            set: &MetricSet{
+                Name:        "timespan",
+                Description: "It's been a while",
+                Type:        MetricTypeGeneric,
+                Metrics: []Metric{
+                    newSimpleMetric(map[string]string{
+                        "units": "nanoseconds",
+                    }, float64(time.Second)),
+                },
+            },
+        },
+        "counter type": {
+            set: &MetricSet{
+                Name:        "one_ring",
+                Description: "Precious...",
+                Type:        MetricTypeCounter,
+                Metrics: []Metric{
+                    newSimpleMetric(map[string]string{
+                        "owner": "frodo",
+                    }, 1),
+                },
+            },
+        },
+        "gauge type": {
+            set: &MetricSet{
+                Name:        "funny_hats",
+                Description: "Hilarious headgear in inventory",
+                Type:        MetricTypeGauge,
+                Metrics: []Metric{
+                    newSimpleMetric(map[string]string{
+                        "type": "tophat",
+                    }, 1),
+                    newSimpleMetric(map[string]string{
+                        "type": "cowboy",
+                    }, 6),
+                    newSimpleMetric(map[string]string{
+                        "type": "jester",
+                    }, 0),
+                },
+            },
+        },
+        "summary type": {
+            set: &MetricSet{
+                Name:        "alpha",
+                Description: "The first letter! Everybody's favorite!",
+                Type:        MetricTypeSummary,
+                Metrics: []Metric{
+                    &SummaryMetric{
+                        Labels:      map[string]string{"beta": "b"},
+                        SampleCount: 3,
+                        SampleSum:   42,
+                        Quantiles:   map[float64]float64{0.5: 2.2},
+                    },
+                },
+            },
+        },
+        "histogram type": {
+            set: &MetricSet{
+                Name:        "my_histogram",
+                Description: "This is a histogram",
+                Type:        MetricTypeHistogram,
+                Metrics: []Metric{
+                    &HistogramMetric{
+                        Labels:      map[string]string{"owner": "me"},
+                        SampleCount: 1024,
+                        SampleSum:   12344,
+                        Buckets: []*MetricBucket{
+                            {
+                                CumulativeCount: 789,
+                                UpperBound:      500,
+                            },
+                            {
+                                CumulativeCount: 456,
+                                UpperBound:      1000,
+                            },
+                        },
+                    },
+                },
+            },
+        },
+    } {
+        t.Run(name, func(t *testing.T) {
+            marshaled, err := json.Marshal(tc.set)
+            if err != nil {
+                t.Fatalf("expected to marshal, got %q", err)
+            }
+
+            unmarshaled := new(MetricSet)
+            err = json.Unmarshal(marshaled, unmarshaled)
+            if err != nil {
+                t.Fatalf("expected to unmarshal, got %q", err)
+            }
+
+            expResult := tc.set
+            if tc.set == nil {
+                expResult = &MetricSet{}
+            }
+
+            if diff := cmp.Diff(expResult, unmarshaled); diff != "" {
+                t.Fatalf("unmarshaled different from original (-want, +got):\n%s\n", diff)
+            }
+        })
+    }
+}
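Aside, not in the patch: the MetricBucket field names suggest Prometheus-style cumulative counts, so a consumer wanting per-bucket counts has to difference adjacent buckets. A minimal helper sketch under that assumption (the helper name is invented):

// perBucketCounts converts cumulative bucket counts (e.g. 3, 7, 10) into
// per-bucket counts (3, 4, 3), assuming buckets are ordered by UpperBound.
func perBucketCounts(buckets []*MetricBucket) []uint64 {
	counts := make([]uint64, len(buckets))
	var prev uint64
	for i, b := range buckets {
		counts[i] = b.CumulativeCount - prev
		prev = b.CumulativeCount
	}
	return counts
}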
diff --git a/src/control/lib/ui/num_flags.go b/src/control/lib/ui/num_flags.go
new file mode 100644
index 00000000000..22d124c43c8
--- /dev/null
+++ b/src/control/lib/ui/num_flags.go
@@ -0,0 +1,75 @@
+//
+// (C) Copyright 2022-2024 Intel Corporation.
+//
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+package ui
+
+import (
+    "fmt"
+    "math"
+
+    "github.com/dustin/go-humanize"
+    "github.com/pkg/errors"
+
+    "github.com/daos-stack/daos/src/control/lib/atm"
+)
+
+// FmtHumanSize formats the supplied size in a human-readable format.
+func FmtHumanSize(size float64, suffix string, binary bool) string {
+    if size == 0 {
+        return "0 " + suffix
+    }
+    val := size
+
+    base := float64(1000)
+    if binary {
+        base = 1024
+        if suffix != "" {
+            suffix = "i" + suffix
+        }
+    }
+
+    for _, unit := range []string{"", " K", " M", " G", " T", " P", " E", " Z", " Y"} {
+        if math.Abs(val) < base {
+            if unit == "" && suffix != "" {
+                unit = " "
+            }
+            return fmt.Sprintf("%.02f%s%s", val, unit, suffix)
+        }
+        val /= base
+    }
+
+    // Fallback to scientific notation for unexpectedly huge numbers.
+    return fmt.Sprintf("%E %s", size, suffix)
+}
+
+// ByteSizeFlag is a go-flags compatible flag type for converting
+// string input into a byte size.
+type ByteSizeFlag struct {
+    set   atm.Bool
+    Bytes uint64
+}
+
+func (sf ByteSizeFlag) IsSet() bool {
+    return sf.set.IsTrue()
+}
+
+func (sf ByteSizeFlag) String() string {
+    return humanize.Bytes(sf.Bytes)
+}
+
+func (sf *ByteSizeFlag) UnmarshalFlag(fv string) (err error) {
+    if fv == "" {
+        return errors.New("no size specified")
+    }
+
+    sf.Bytes, err = humanize.ParseBytes(fv)
+    if err != nil {
+        return errors.Errorf("invalid size %q", fv)
+    }
+    sf.set.SetTrue()
+
+    return nil
+}
diff --git a/src/control/lib/ui/num_flags_test.go b/src/control/lib/ui/num_flags_test.go
new file mode 100644
index 00000000000..6d46f4c1025
--- /dev/null
+++ b/src/control/lib/ui/num_flags_test.go
@@ -0,0 +1,139 @@
+//
+// (C) Copyright 2021-2024 Intel Corporation.
+//
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+package ui_test
+
+import (
+    "testing"
+
+    "github.com/pkg/errors"
+
+    "github.com/daos-stack/daos/src/control/common/test"
+    "github.com/daos-stack/daos/src/control/lib/ui"
+)
+
+func TestUI_FmtHumanSize(t *testing.T) {
+    for name, tc := range map[string]struct {
+        input   float64
+        suffix  string
+        binary  bool
+        expSize string
+    }{
+        "0": {
+            input:   0,
+            suffix:  "B",
+            expSize: "0 B",
+        },
+        "-0": {
+            input:   -0,
+            suffix:  "B",
+            expSize: "0 B",
+        },
+        "-1": {
+            input:   -1,
+            suffix:  "B",
+            expSize: "-1.00 B",
+        },
+        "1 no suffix": {
+            input:   1,
+            expSize: "1.00",
+        },
+        "1 binary no suffix": {
+            input:   1,
+            binary:  true,
+            expSize: "1.00",
+        },
+        "1000 no suffix": {
+            input:   1000,
+            expSize: "1.00 K",
+        },
+        "1000 binary no suffix": {
+            input:   1000,
+            binary:  true,
+            expSize: "1000.00",
+        },
+        "1024 binary no suffix": {
+            input:   1024,
+            binary:  true,
+            expSize: "1.00 K",
+        },
+        "4.5PB": {
+            input:   1 << 52,
+            suffix:  "B",
+            expSize: "4.50 PB",
+        },
+        "4PiB binary": {
+            input:   1 << 52,
+            suffix:  "B",
+            binary:  true,
+            expSize: "4.00 PiB",
+        },
+        "trouble": {
+            input:   1 << 90,
+            suffix:  "tribbles",
+            expSize: "1.237940E+27 tribbles",
+        },
+    } {
+        t.Run(name, func(t *testing.T) {
+            gotSize := ui.FmtHumanSize(tc.input, tc.suffix, tc.binary)
+            test.AssertEqual(t, tc.expSize, gotSize, "unexpected size")
+        })
+    }
+}
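Quick illustration, not from the patch, of the decimal/binary split exercised above; it assumes an extra "fmt" import in this ui_test package.

func ExampleFmtHumanSize() {
	fmt.Println(ui.FmtHumanSize(2500, "B", false)) // decimal units with a suffix
	fmt.Println(ui.FmtHumanSize(2500, "B", true))  // binary units get an "i" infix
	fmt.Println(ui.FmtHumanSize(2500, "", false))  // no suffix: bare unit prefix
	// Output:
	// 2.50 KB
	// 2.44 KiB
	// 2.50 K
}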
+
+func TestUI_ByteSizeFlag(t *testing.T) {
+    for name, tc := range map[string]struct {
+        input   string
+        expSize uint64
+        expStr  string
+        expErr  error
+    }{
+        "empty": {
+            expErr: errors.New("no size specified"),
+        },
+        "invalid size": {
+            input:  "horse",
+            expErr: errors.New("invalid size"),
+        },
+        "negative size invalid": {
+            input:  "-438 TB",
+            expErr: errors.New("invalid size"),
+        },
+        "0": {
+            input:   "0",
+            expSize: 0,
+            expStr:  "0 B",
+        },
+        "weird but valid": {
+            input:   "0 EiB",
+            expSize: 0,
+            expStr:  "0 B",
+        },
+        "valid MB": {
+            input:   "10MB",
+            expSize: 10 * 1000 * 1000,
+            expStr:  "10 MB",
+        },
+        "valid raw number": {
+            input:   "1058577",
+            expSize: 1058577,
+            expStr:  "1.1 MB",
+        },
+    } {
+        t.Run(name, func(t *testing.T) {
+            f := ui.ByteSizeFlag{}
+            gotErr := f.UnmarshalFlag(tc.input)
+            test.CmpErr(t, tc.expErr, gotErr)
+            if tc.expErr != nil {
+                test.AssertFalse(t, f.IsSet(), "shouldn't be set on error")
+                return
+            }
+            test.AssertTrue(t, f.IsSet(), "should be set on success")
+            test.AssertEqual(t, tc.expSize, f.Bytes, "unexpected size")
+            test.AssertEqual(t, tc.expStr, f.String(), "unexpected string")
+        })
+    }
+}
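Aside, not in the patch: ByteSizeFlag is consumed by the jessevdk/go-flags parser through its UnmarshalFlag method. A standalone sketch with an invented option struct (the printed values reflect humanize's decimal formatting and may differ slightly by version):

package main

import (
	"fmt"
	"log"

	flags "github.com/jessevdk/go-flags"

	"github.com/daos-stack/daos/src/control/lib/ui"
)

// testOpts is an invented option struct; only the flag type comes from the patch.
type testOpts struct {
	XferSize ui.ByteSizeFlag `short:"s" long:"size" description:"transfer size"`
}

func main() {
	var opts testOpts
	// go-flags calls ByteSizeFlag.UnmarshalFlag with the raw string value.
	if _, err := flags.ParseArgs(&opts, []string{"--size", "20 MiB"}); err != nil {
		log.Fatal(err)
	}
	// Prints something like: true 20971520 21 MB
	fmt.Println(opts.XferSize.IsSet(), opts.XferSize.Bytes, opts.XferSize.String())
}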
diff --git a/src/control/run_go_tests.sh b/src/control/run_go_tests.sh
index ec8204dd308..c976b1595f8 100755
--- a/src/control/run_go_tests.sh
+++ b/src/control/run_go_tests.sh
@@ -100,15 +100,18 @@ function setup_environment()
     LD_LIBRARY_PATH=${SL_PREFIX+${SL_PREFIX}/lib}
     LD_LIBRARY_PATH+="${SL_PREFIX+:${SL_PREFIX}/lib64}"
     LD_LIBRARY_PATH+="${SL_PREFIX+:${SL_PREFIX}/lib64/daos_srv}"
+    LD_LIBRARY_PATH+="${SL_MERCURY_PREFIX+:${SL_MERCURY_PREFIX}/lib}"
     LD_LIBRARY_PATH+="${SL_SPDK_PREFIX+:${SL_SPDK_PREFIX}/lib}"
     LD_LIBRARY_PATH+="${SL_OFI_PREFIX+:${SL_OFI_PREFIX}/lib}"
     CGO_LDFLAGS=${SL_PREFIX+-L${SL_PREFIX}/lib}
     CGO_LDFLAGS+="${SL_PREFIX+ -L${SL_PREFIX}/lib64}"
     CGO_LDFLAGS+="${SL_PREFIX+ -L${SL_PREFIX}/lib64/daos_srv}"
     CGO_LDFLAGS+="${SL_BUILD_DIR+ -L${SL_BUILD_DIR}/src/control/lib/spdk}"
+    CGO_LDFLAGS+="${SL_MERCURY_PREFIX+ -L${SL_MERCURY_PREFIX}/lib}"
     CGO_LDFLAGS+="${SL_SPDK_PREFIX+ -L${SL_SPDK_PREFIX}/lib}"
     CGO_LDFLAGS+="${SL_OFI_PREFIX+ -L${SL_OFI_PREFIX}/lib}"
     CGO_CFLAGS=${SL_PREFIX+-I${SL_PREFIX}/include}
+    CGO_CFLAGS+="${SL_MERCURY_PREFIX+ -I${SL_MERCURY_PREFIX}/include}"
     CGO_CFLAGS+="${SL_SPDK_PREFIX+ -I${SL_SPDK_PREFIX}/include}"
     CGO_CFLAGS+="${SL_OFI_PREFIX+ -I${SL_OFI_PREFIX}/include}"
     CGO_CFLAGS+="${SL_ARGOBOTS_PREFIX+ -I${SL_ARGOBOTS_PREFIX}/include}"
@@ -167,7 +170,7 @@ $output
 
 function get_test_runner()
 {
-    test_args="-mod vendor -race -cover -v ./... -tags firmware,fault_injection"
+    test_args="-mod vendor -race -cover -v ./... -tags firmware,fault_injection,test_stubs"
     test_runner="go test"
 
     if which gotestsum >/dev/null; then
diff --git a/src/mgmt/cli_mgmt.c b/src/mgmt/cli_mgmt.c
index c24f4802171..46a5dc0d0c6 100644
--- a/src/mgmt/cli_mgmt.c
+++ b/src/mgmt/cli_mgmt.c
@@ -514,6 +514,7 @@ dc_mgmt_put_sys_info(struct daos_sys_info *info)
 	if (info == NULL)
 		return;
 	free_rank_uris(info->dsi_ranks, info->dsi_nr_ranks);
+	D_FREE(info->dsi_ms_ranks);
 	D_FREE(info);
 }
 
diff --git a/src/utils/self_test/self_test.c b/src/utils/self_test/self_test.c
index 3cdedbee8c5..fb8118fc88e 100644
--- a/src/utils/self_test/self_test.c
+++ b/src/utils/self_test/self_test.c
@@ -1240,9 +1240,7 @@ int main(int argc, char *argv[])
 	D_FREE(tgt_endpts);
 	D_FREE(all_params);
 
-	if (use_agent)
-		dc_mgmt_fini();
-
+	self_test_fini(use_agent);
 	d_log_fini();
 
 	return ret;
diff --git a/src/utils/self_test/self_test_lib.c b/src/utils/self_test/self_test_lib.c
index beb8e54cef2..c8c94feddf5 100644
--- a/src/utils/self_test/self_test_lib.c
+++ b/src/utils/self_test/self_test_lib.c
@@ -187,6 +187,16 @@ self_test_init(char *dest_name, crt_context_t *crt_ctx, crt_group_t **srv_grp, p
 	return 0;
 }
 
+void
+self_test_fini(bool agent_used)
+{
+	if (!agent_used)
+		return;
+
+	dc_mgmt_fini();
+	dc_agent_fini();
+}
+
 int
 st_compare_endpts(const void *a_in, const void *b_in)
 {
diff --git a/src/utils/self_test/self_test_lib.h b/src/utils/self_test/self_test_lib.h
index 38dcc57f5c1..52843986716 100644
--- a/src/utils/self_test/self_test_lib.h
+++ b/src/utils/self_test/self_test_lib.h
@@ -57,5 +57,7 @@ int st_compare_latencies_by_ranks(const void *a_in, const void *b_in);
 void
 free_size_latencies(struct st_latency ***latencies, uint32_t num_msg_sizes,
 		    uint32_t num_ms_endpts);
+void
+self_test_fini(bool agent_used);
 
 #endif /* __SELF_TEST_LIB_H__ */
\ No newline at end of file