Skip to content

Commit

Permalink
roachtest: add run-operation command to run singular operation
Browse files Browse the repository at this point in the history
This change adds a new roachtest command, run-operation,
that runs a single operation defined by an OperationSpec
(see #119796) in a simplified harness, against existing
clusters only. The existing cluster is not modified (e.g.
stopped or wiped) unless the operation itself explicitly
performs such an action.

An implementation of operation.Operation is also bundled in this
change to make these operations runnable.

Epic: none

Release note: None
  • Loading branch information
itsbilal committed Mar 18, 2024
1 parent 4733c20 commit 01cf12b
Show file tree
Hide file tree
Showing 10 changed files with 363 additions and 6 deletions.
1 change: 1 addition & 0 deletions pkg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -1162,6 +1162,7 @@ GO_TARGETS = [
"//pkg/cmd/roachtest/clusterstats:clusterstats_test",
"//pkg/cmd/roachtest/grafana:grafana",
"//pkg/cmd/roachtest/operation:operation",
"//pkg/cmd/roachtest/operations:operations",
"//pkg/cmd/roachtest/option:option",
"//pkg/cmd/roachtest/option:option_test",
"//pkg/cmd/roachtest/registry:registry",
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ go_library(
"github.go",
"main.go",
"monitor.go",
"operation_impl.go",
"run.go",
"slack.go",
"test_filter.go",
Expand All @@ -24,6 +25,8 @@ go_library(
"//pkg/cmd/bazci/githubpost/issues",
"//pkg/cmd/roachprod/grafana",
"//pkg/cmd/roachtest/cluster",
"//pkg/cmd/roachtest/operation",
"//pkg/cmd/roachtest/operations",
"//pkg/cmd/roachtest/option",
"//pkg/cmd/roachtest/registry",
"//pkg/cmd/roachtest/roachtestflags",
Expand Down
38 changes: 38 additions & 0 deletions pkg/cmd/roachtest/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@ import (
"math/rand"
"os"
"os/user"
"regexp"
"sort"
"strings"

"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/operations"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestflags"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/tests"
Expand Down Expand Up @@ -179,9 +181,28 @@ the cluster nodes on start.
}
roachtestflags.AddRunFlags(benchCmd.Flags())

// runOperationCmd implements `roachtest run-operation`. It takes exactly two
// positional arguments: the name of an existing roachprod cluster, followed by
// a regex used to select the operation to run.
var runOperationCmd = &cobra.Command{
	// Don't display usage when the command fails.
	SilenceUsage: true,
	Use:          "run-operation [clusterName] [regex...]",
	Short:        "run one operation on an existing cluster",
	Long: `Run an automated operation on an existing roachprod cluster.
If multiple operations are matched by the passed-in regex filter, one operation
is chosen at random and run. The provided cluster name must already exist in roachprod;
this command does no setup/teardown of clusters.`,
	Args: cobra.ExactArgs(2),
	RunE: func(cmd *cobra.Command, args []string) error {
		// Positional order matches the Use string: cluster name first, then
		// the operation filter.
		clusterName, filter := args[0], args[1]
		// Report the operation filter (not the cluster name) as the operation
		// being run, and name the target cluster separately.
		fmt.Printf("\nRunning operation %s on cluster %s.\n\n", filter, clusterName)
		cmd.SilenceUsage = true
		return runOperation(operations.RegisterOperations, filter, clusterName)
	},
}
roachtestflags.AddRunOpsFlags(runOperationCmd.Flags())

rootCmd.AddCommand(listCmd)
rootCmd.AddCommand(runCmd)
rootCmd.AddCommand(benchCmd)
rootCmd.AddCommand(runOperationCmd)

var err error
config.OSUser, err = user.Current()
Expand Down Expand Up @@ -263,6 +284,23 @@ func testsToRun(
return selectSpecs(notSkipped, selectProbability, true, print), nil
}

// opsToRun returns every operation in the registry whose name matches the
// regular expression given in filter. It returns an error if filter does not
// compile as a regular expression, or if no registered operation matches it.
func opsToRun(r testRegistryImpl, filter string) ([]registry.OperationSpec, error) {
	pattern, compileErr := regexp.Compile(filter)
	if compileErr != nil {
		return nil, compileErr
	}

	var matched []registry.OperationSpec
	for _, candidate := range r.AllOperations() {
		if !pattern.MatchString(candidate.Name) {
			continue
		}
		matched = append(matched, candidate)
	}

	if len(matched) > 0 {
		return matched, nil
	}
	return nil, errors.New("no matching operations to run")
}

// selectSpecs returns a random sample of the given test specs.
// If atLeastOnePerPrefix is true, it guarantees that at least one test is
// selected for each prefix (e.g. kv0/, acceptance/).
Expand Down
138 changes: 138 additions & 0 deletions pkg/cmd/roachtest/operation_impl.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
"context"
"fmt"
"strings"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/operation"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/errors"
)

// errOperationFatal is the value that Fatal, Fatalf and FailNow panic with
// after the failure has been recorded.
var errOperationFatal = errors.New("o.Fatal() was called")

// operationImpl implements operation.Operation. It holds the spec of the
// operation, the logger the operation writes to, and mutex-protected
// bookkeeping for the operation's status message and recorded failures.
type operationImpl struct {
// spec is the registered definition of the operation; Name() reads from it.
spec *registry.OperationSpec
cockroach string // path to main cockroach binary on the cluster.

// l is the logger that the operation will use for its output.
l *logger.Logger

// mu guards the mutable state of the operation.
mu struct {
syncutil.RWMutex
// NOTE(review): done is not read or written by any method visible here;
// presumably it is maintained by the harness running the operation — confirm.
done bool

// cancel, if set, is called from the o.Fatal() and o.Error() families of
// functions when the op is being marked as failed (i.e. right after the
// failures slice has been appended to). This is used to cancel the context
// passed to o.spec.Run(), so async goroutines can be notified.
cancel func()

// failures added via addFailure, in order. An operation may have multiple
// calls to o.Fatal()/Error(), with each call adding to this slice once.
failures []error

// status is the most recent message set via Status().
status string
}
}

// ClusterCockroach returns the path to the main cockroach binary on the
// cluster, as stored in the cockroach field.
func (o *operationImpl) ClusterCockroach() string {
return o.cockroach
}

// Name returns the name of the operation, taken from its spec.
func (o *operationImpl) Name() string {
return o.spec.Name
}

// L returns the logger that the operation uses for its output.
func (o *operationImpl) L() *logger.Logger {
return o.l
}

// Status records its arguments, formatted with fmt.Sprint, as the operation's
// current status message. The message is also written to o.L(), unless that
// logger has already been closed.
func (o *operationImpl) Status(args ...interface{}) {
	o.mu.Lock()
	defer o.mu.Unlock()

	o.mu.status = fmt.Sprint(args...)

	// Nothing more to do if the logger can no longer accept output.
	if o.L().Closed() {
		return
	}
	o.L().PrintfCtxDepth(context.TODO(), 3, "operation status: %s", o.mu.status)
}

// Fatal marks the operation as failed, prints the args to o.L(), and invokes
// the operation's cancel function. It then panics with errOperationFatal, so
// it never returns normally to its caller.
func (o *operationImpl) Fatal(args ...interface{}) {
// Depth 1 attributes the recorded failure to the caller of Fatal rather
// than to Fatal itself. An empty format makes addFailure print the args
// space-separated.
o.addFailureAndCancel(1, "", args...)
panic(errOperationFatal)
}

// Fatalf is like Fatal, but takes a format string: it records the formatted
// failure, invokes the cancel function, and then panics with
// errOperationFatal.
func (o *operationImpl) Fatalf(format string, args ...interface{}) {
// Depth 1 attributes the recorded failure to the caller of Fatalf.
o.addFailureAndCancel(1, format, args...)
panic(errOperationFatal)
}

// FailNow implements the Operation interface. It records a failure with the
// fixed message "FailNow called", invokes the cancel function, and then panics
// with errOperationFatal.
func (o *operationImpl) FailNow() {
// Depth 1 attributes the recorded failure to the caller of FailNow.
o.addFailureAndCancel(1, "FailNow called")
panic(errOperationFatal)
}

// Error implements the Operation interface. It records a failure built from
// args and invokes the cancel function. Unlike Fatal, it does not panic and
// returns to its caller.
func (o *operationImpl) Error(args ...interface{}) {
// Depth 1 attributes the recorded failure to the caller of Error. An empty
// format makes addFailure print the args space-separated.
o.addFailureAndCancel(1, "", args...)
}

// Errorf implements the Operation interface. It is like Error, but takes a
// format string. It does not panic and returns to its caller.
func (o *operationImpl) Errorf(format string, args ...interface{}) {
// Depth 1 attributes the recorded failure to the caller of Errorf.
o.addFailureAndCancel(1, format, args...)
}

// addFailureAndCancel records a failure via addFailure and then invokes the
// cancel function stored in o.mu.cancel, if one has been set.
//
// depth indicates how many stack frames to skip when reporting the site of
// the failure: `0` reports the caller of addFailureAndCancel, `1` the caller
// of that caller, etc. format and args are passed through to addFailure.
func (o *operationImpl) addFailureAndCancel(depth int, format string, args ...interface{}) {
	o.addFailure(depth+1, format, args...)

	// cancel is optional, and calling a nil func would panic. Read it under
	// the mutex, but call it only after releasing the mutex so that cancel
	// never runs with o.mu held.
	o.mu.RLock()
	cancel := o.mu.cancel
	o.mu.RUnlock()
	if cancel != nil {
		cancel()
	}
}

// addFailure records a new failure for the operation and logs it to o.L().
//
// depth indicates how many stack frames to skip when reporting the site of
// the failure in logs. `0` will report the caller of addFailure, `1` the
// caller of the caller of addFailure, etc.
//
// If format is empty, args are formatted with "%v" verbs separated by single
// spaces. An empty format with no args yields a failure with an empty message.
func (o *operationImpl) addFailure(depth int, format string, args ...interface{}) {
	if format == "" {
		// Build "%v %v ... %v". TrimPrefix drops the leading space and, unlike
		// slicing off the first byte, does not panic when args is empty.
		format = strings.TrimPrefix(strings.Repeat(" %v", len(args)), " ")
	}
	// The extra +1 skips addFailure's own frame.
	reportFailure := errors.NewWithDepthf(depth+1, format, args...)

	o.mu.Lock()
	defer o.mu.Unlock()
	o.mu.failures = append(o.mu.failures, reportFailure)

	// Failures are numbered from 1 in the order they were recorded.
	failureNum := len(o.mu.failures)
	o.L().Printf("operation failure #%d: %s", failureNum, reportFailure.Error())
}

// Failed reports whether at least one failure has been recorded for the
// operation.
func (o *operationImpl) Failed() bool {
	o.mu.RLock()
	failureCount := len(o.mu.failures)
	o.mu.RUnlock()

	return failureCount != 0
}

// Compile-time check that *operationImpl satisfies operation.Operation.
var _ operation.Operation = &operationImpl{}
9 changes: 9 additions & 0 deletions pkg/cmd/roachtest/operations/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")

go_library(
name = "operations",
srcs = ["register.go"],
importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/operations",
visibility = ["//visibility:public"],
deps = ["//pkg/cmd/roachtest/registry"],
)
17 changes: 17 additions & 0 deletions pkg/cmd/roachtest/operations/register.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package operations

import "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"

// RegisterOperations registers all operations to the Registry. This powers
// `roachtest run-operation`. The body is currently empty, so no operations are
// registered.
func RegisterOperations(r registry.Registry) {
}
51 changes: 45 additions & 6 deletions pkg/cmd/roachtest/roachtestflags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,31 @@ var (
Usage: `Absolute path to cockroach binary to use`,
})

CockroachBinaryPath string = "cockroach"
_ = registerRunOpsFlag(&CockroachBinaryPath, FlagInfo{
Name: "cockroach-binary",
Usage: `Relative path to cockroach binary to use, on the cluster specified in --cluster`,
})

CertsDir string
_ = registerRunOpsFlag(&CertsDir, FlagInfo{
Name: "certs-dir",
Usage: `Absolute path to certificates directory, if the cluster specified in --cluster is secure`,
})

VirtualCluster string
_ = registerRunOpsFlag(&VirtualCluster, FlagInfo{
Name: "virtual-cluster",
Usage: `Specifies virtual cluster to connect to, within the specified --cluster.`,
})

WaitBeforeCleanup time.Duration = 5 * time.Minute
_ = registerRunOpsFlag(&WaitBeforeCleanup, FlagInfo{
Name: "wait-before-cleanup",
Usage: `Specifies the amount of time to wait before running any cleanup work defined
by the operation. Note that this time does not count towards the timeout.`,
})

CockroachEAPath string
_ = registerRunFlag(&CockroachEAPath, FlagInfo{
Name: "cockroach-ea",
Expand Down Expand Up @@ -141,11 +166,13 @@ var (

// ArtifactsDir is a path to a local dir where the test logs and artifacts
// collected from cluster will be placed.
ArtifactsDir string = "artifacts"
_ = registerRunFlag(&ArtifactsDir, FlagInfo{
ArtifactsDir string = "artifacts"
ArtifactsFlag = FlagInfo{
Name: "artifacts",
Usage: `Path to artifacts directory`,
})
}
_ = registerRunFlag(&ArtifactsDir, ArtifactsFlag)
_ = registerRunOpsFlag(&ArtifactsDir, ArtifactsFlag)

// LiteralArtifactsDir is a path to the literal on-agent directory where
// artifacts are stored. May be different from `artifacts`. Only used for
Expand Down Expand Up @@ -242,11 +269,12 @@ var (
https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes)`,
})

CPUQuota int = 300
_ = registerRunFlag(&CPUQuota, FlagInfo{
CPUQuota int = 300
cpuQuotaFlagInfo = FlagInfo{
Name: "cpu-quota",
Usage: `The number of cloud CPUs roachtest is allowed to use at any one time.`,
})
}
_ = registerRunFlag(&CPUQuota, cpuQuotaFlagInfo)

HTTPPort int = 0
_ = registerRunFlag(&HTTPPort, FlagInfo{
Expand Down Expand Up @@ -411,6 +439,12 @@ func AddRunFlags(cmdFlags *pflag.FlagSet) {
globalMan.AddFlagsToCommand(runCmdID, cmdFlags)
}

// AddRunOpsFlags adds all flags registered for the run-operation command
// (i.e. those registered under runOpsCmdID) to the given command flag set.
func AddRunOpsFlags(cmdFlags *pflag.FlagSet) {
globalMan.AddFlagsToCommand(runOpsCmdID, cmdFlags)
}

// Changed returns true if a flag associated with a given value was present.
//
// For example: roachtestflags.Changed(&roachtestflags.Cloud) returns true if
Expand All @@ -431,3 +465,8 @@ func registerRunFlag(valPtr interface{}, info FlagInfo) struct{} {
globalMan.RegisterFlag(runCmdID, valPtr, info)
return struct{}{}
}

// registerRunOpsFlag registers a flag for the run-operation command
// (runOpsCmdID) with the global flag manager. valPtr is a pointer to the
// variable backing the flag and info describes the flag. The empty struct
// return value lets this be called from package-level
// `_ = registerRunOpsFlag(...)` declarations.
func registerRunOpsFlag(valPtr interface{}, info FlagInfo) struct{} {
globalMan.RegisterFlag(runOpsCmdID, valPtr, info)
return struct{}{}
}
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/roachtestflags/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ type cmdID int
// Identifiers for the commands that flags can be registered against.
const (
// NOTE(review): listCmdID and runCmdID presumably identify the `list` and
// `run` commands — confirm against their registration helpers.
listCmdID cmdID = iota
runCmdID
// runOpsCmdID identifies the run-operation command.
runOpsCmdID
// numCmdIDs is the number of command IDs defined above. It must remain the
// last entry so that iota assigns it that value.
numCmdIDs
)

Expand Down
Loading

0 comments on commit 01cf12b

Please sign in to comment.