Skip to content

Commit

Permalink
roachtestutil: dynamically determine block device to stall
Browse files Browse the repository at this point in the history
Previously, we hardcoded the block device on which to run
the disk-stalled* roachtests and the disk-stall operation.
This was a flaky approach as sometimes we'd use a local
ssd as a block device which had very different numbers
than a Google persistent disk.

This change updates the cgroup disk staller to programmatically
determine the major/minor device numbers for the block device
to stall (the one mounted at /mnt/data1). It also updates the
dmsetup disk staller to dynamically determine the block device
name mounted at /mnt/data1.

Fixes cockroachdb#123080, cockroachdb#123054.

Epic: none

Release note: None
  • Loading branch information
itsbilal committed May 7, 2024
1 parent 9e29de7 commit a8eec94
Showing 1 changed file with 30 additions and 34 deletions.
64 changes: 30 additions & 34 deletions pkg/cmd/roachtest/tests/disk_stall.go
Original file line number Diff line number Diff line change
Expand Up @@ -456,10 +456,12 @@ type dmsetupDiskStaller struct {

var _ diskStaller = (*dmsetupDiskStaller)(nil)

func (s *dmsetupDiskStaller) device() string { return getDevice(s.t, s.c) }
func (s *dmsetupDiskStaller) device(nodes option.NodeListOption) string {
return getDevice(s.t, s.c, nodes)
}

func (s *dmsetupDiskStaller) Setup(ctx context.Context) {
dev := s.device()
dev := s.device(s.c.All())
// snapd will run "snapd auto-import /dev/dm-0" via udev triggers when
// /dev/dm-0 is created. This possibly interferes with the dmsetup create
// reload, so uninstall snapd.
Expand Down Expand Up @@ -547,20 +549,28 @@ func (s *cgroupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOp
}
}

func (s *cgroupDiskStaller) device() (major, minor int) {
// TODO(jackson): Programmatically determine the device major,minor numbers.
// eg,:
// deviceName := getDevice(s.t, s.c)
// `cat /proc/partitions` and find `deviceName`
switch s.c.Cloud() {
case spec.GCE:
// ls -l /dev/sdb
// brw-rw---- 1 root disk 8, 16 Mar 27 22:08 /dev/sdb
return 8, 16
default:
s.t.Fatalf("unsupported cloud %q", s.c.Cloud())
func (s *cgroupDiskStaller) device(nodes option.NodeListOption) (major, minor int) {
res, err := s.c.RunWithDetailsSingleNode(context.TODO(), s.t.L(), option.WithNodes(nodes[:1]), "lsblk | grep /mnt/data1 | awk '{print $2}'")
if err != nil {
s.t.Fatalf("error when determining block device: %s", err)
return 0, 0
}
parts := strings.Split(strings.TrimSpace(res.Stdout), ":")
if len(parts) != 2 {
s.t.Fatalf("unexpected output from lsblk: %s", res.Stdout)
return 0, 0
}
major, err = strconv.Atoi(parts[0])
if err != nil {
s.t.Fatalf("error when determining block device: %s", err)
return 0, 0
}
minor, err = strconv.Atoi(parts[1])
if err != nil {
s.t.Fatalf("error when determining block device: %s", err)
return 0, 0
}
return major, minor
}

type throughput struct {
Expand Down Expand Up @@ -589,7 +599,7 @@ func (rw bandwidthReadWrite) cgroupV2BandwidthProp() string {
func (s *cgroupDiskStaller) setThroughput(
ctx context.Context, nodes option.NodeListOption, rw bandwidthReadWrite, bw throughput,
) error {
maj, min := s.device()
maj, min := s.device(nodes)
cockroachIOController := filepath.Join("/sys/fs/cgroup/system.slice", roachtestutil.SystemInterfaceSystemdUnitName()+".service", "io.max")

bytesPerSecondStr := "max"
Expand All @@ -606,25 +616,11 @@ func (s *cgroupDiskStaller) setThroughput(
))
}

func getDevice(t test.Test, c cluster.Cluster) string {
s := c.Spec()
switch c.Cloud() {
case spec.GCE:
switch s.LocalSSD {
case spec.LocalSSDDisable:
return "/dev/sdb"
case spec.LocalSSDPreferOn, spec.LocalSSDDefault:
// TODO(jackson): These spec values don't guarantee that we are actually
// using local SSDs, just that we might've.
return "/dev/nvme0n1"
default:
t.Fatalf("unsupported LocalSSD enum %v", s.LocalSSD)
return ""
}
case spec.AWS:
return "/dev/nvme1n1"
default:
t.Fatalf("unsupported cloud %q", c.Cloud())
func getDevice(t test.Test, c cluster.Cluster, nodes option.NodeListOption) string {
res, err := c.RunWithDetailsSingleNode(context.TODO(), t.L(), option.WithNodes(nodes[:1]), "lsblk | grep /mnt/data1 | awk '{print $1}'")
if err != nil {
t.Fatalf("error when determining block device: %s", err)
return ""
}
return "/dev/" + strings.TrimSpace(res.Stdout)
}

0 comments on commit a8eec94

Please sign in to comment.