Skip to content

Commit

Permalink
Merge #123506
Browse files Browse the repository at this point in the history
123506: roachtestutil: dynamically determine block device to stall r=RaduBerinde a=itsbilal

Previously, we hardcoded the block device on which to run the disk-stalled* roachtests and the disk-stall operation. This was a flaky approach as sometimes we'd use a local ssd as a block device which had very different numbers than a Google persistent disk.

This change updates the cgroup disk staller to programmatically determine the major/minor device numbers for the block device to stall (the one mounted at /mnt/data1). It also updates the dmsetup disk staller to dynamically determine the block device name mounted at /mnt/data1.

Fixes #123080, #123054.

Epic: none

Release note: None

Co-authored-by: Bilal Akhtar <bilal@cockroachlabs.com>
  • Loading branch information
craig[bot] and itsbilal committed May 2, 2024
2 parents 6951ba0 + 8e81063 commit 5b1a3c1
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 32 deletions.
1 change: 0 additions & 1 deletion pkg/cmd/roachtest/roachtestutil/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ go_library(
deps = [
"//pkg/cmd/roachtest/cluster",
"//pkg/cmd/roachtest/option",
"//pkg/cmd/roachtest/spec",
"//pkg/cmd/roachtest/test",
"//pkg/kv/kvpb",
"//pkg/roachprod/config",
Expand Down
57 changes: 28 additions & 29 deletions pkg/cmd/roachtest/roachtestutil/disk_stall.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ import (
"fmt"
"math/rand"
"path/filepath"
"strconv"
"strings"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
)

Expand Down Expand Up @@ -92,20 +93,32 @@ func (s *cgroupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOp
}
}

func (s *cgroupDiskStaller) device() (major, minor int) {
func (s *cgroupDiskStaller) device(nodes option.NodeListOption) (major, minor int) {
// TODO(jackson): Programmatically determine the device major,minor numbers.
// eg,:
// deviceName := getDevice(s.t, s.c)
// `cat /proc/partitions` and find `deviceName`
switch s.c.Cloud() {
case spec.GCE:
// ls -l /dev/sdb
// brw-rw---- 1 root disk 8, 16 Mar 27 22:08 /dev/sdb
return 8, 16
default:
s.f.Fatalf("unsupported cloud %q", s.c.Cloud())
res, err := s.c.RunWithDetailsSingleNode(context.TODO(), s.f.L(), option.WithNodes(nodes[:1]), "lsblk | grep /mnt/data1 | awk '{print $2}'")
if err != nil {
s.f.Fatalf("error when determining block device: %s", err)
return 0, 0
}
parts := strings.Split(strings.TrimSpace(res.Stdout), ":")
if len(parts) != 2 {
s.f.Fatalf("unexpected output from lsblk: %s", res.Stdout)
return 0, 0
}
major, err = strconv.Atoi(parts[0])
if err != nil {
s.f.Fatalf("error when determining block device: %s", err)
return 0, 0
}
minor, err = strconv.Atoi(parts[1])
if err != nil {
s.f.Fatalf("error when determining block device: %s", err)
return 0, 0
}
return major, minor
}

type throughput struct {
Expand Down Expand Up @@ -134,7 +147,7 @@ func (rw bandwidthReadWrite) cgroupV2BandwidthProp() string {
func (s *cgroupDiskStaller) setThroughput(
ctx context.Context, nodes option.NodeListOption, rw bandwidthReadWrite, bw throughput,
) error {
maj, min := s.device()
maj, min := s.device(nodes)
cockroachIOController := filepath.Join("/sys/fs/cgroup/system.slice", SystemInterfaceSystemdUnitName()+".service", "io.max")

bytesPerSecondStr := "max"
Expand All @@ -151,25 +164,11 @@ func (s *cgroupDiskStaller) setThroughput(
))
}

func GetDiskDevice(f Fataler, c cluster.Cluster) string {
s := c.Spec()
switch c.Cloud() {
case spec.GCE:
switch s.LocalSSD {
case spec.LocalSSDDisable:
return "/dev/sdb"
case spec.LocalSSDPreferOn, spec.LocalSSDDefault:
// TODO(jackson): These spec values don't guarantee that we are actually
// using local SSDs, just that we might've.
return "/dev/nvme0n1"
default:
f.Fatalf("unsupported LocalSSD enum %v", s.LocalSSD)
return ""
}
case spec.AWS:
return "/dev/nvme1n1"
default:
f.Fatalf("unsupported cloud %q", c.Cloud())
func GetDiskDevice(f Fataler, c cluster.Cluster, nodes option.NodeListOption) string {
res, err := c.RunWithDetailsSingleNode(context.TODO(), f.L(), option.WithNodes(nodes[:1]), "lsblk | grep /mnt/data1 | awk '{print $1}'")
if err != nil {
f.Fatalf("error when determining block device: %s", err)
return ""
}
return "/dev/" + strings.TrimSpace(res.Stdout)
}
6 changes: 4 additions & 2 deletions pkg/cmd/roachtest/tests/disk_stall.go
Original file line number Diff line number Diff line change
Expand Up @@ -448,10 +448,12 @@ type dmsetupDiskStaller struct {

var _ diskStaller = (*dmsetupDiskStaller)(nil)

func (s *dmsetupDiskStaller) device() string { return roachtestutil.GetDiskDevice(s.t, s.c) }
func (s *dmsetupDiskStaller) device(nodes option.NodeListOption) string {
return roachtestutil.GetDiskDevice(s.t, s.c, nodes)
}

func (s *dmsetupDiskStaller) Setup(ctx context.Context) {
dev := s.device()
dev := s.device(s.c.All())
// snapd will run "snapd auto-import /dev/dm-0" via udev triggers when
// /dev/dm-0 is created. This possibly interferes with the dmsetup create
// reload, so uninstall snapd.
Expand Down

0 comments on commit 5b1a3c1

Please sign in to comment.