Skip to content

Commit

Permalink
Backport of client/fingerprint: correctly fingerprint E/P cores of Ap…
Browse files Browse the repository at this point in the history
…ple Silicon chips into release/1.5.x (#16690)

This pull request was automerged via backport-assistant
  • Loading branch information
hc-github-team-nomad-core authored Mar 28, 2023
1 parent 90ba690 commit d76453a
Show file tree
Hide file tree
Showing 15 changed files with 296 additions and 145 deletions.
3 changes: 3 additions & 0 deletions .changelog/16672.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
fingerprint/cpu: correctly fingerprint P/E cores of Apple Silicon chips
```
2 changes: 1 addition & 1 deletion api/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ require (
github.com/hashicorp/go-rootcerts v1.0.2
github.com/mitchellh/go-testing-interface v1.14.1
github.com/mitchellh/mapstructure v1.5.0
github.com/shoenig/test v0.6.2
github.com/shoenig/test v0.6.3
golang.org/x/exp v0.0.0-20230108222341-4b8118a2686a
)

Expand Down
4 changes: 2 additions & 2 deletions api/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/shoenig/test v0.6.2 h1:tdq+WGnznwE5xcOMXkqqXuudK75RkSGBazBGcP1lX6w=
github.com/shoenig/test v0.6.2/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
github.com/shoenig/test v0.6.3 h1:GVXWJFk9PiOjN0KoJ7VrJGH6uLPnqxR7/fe3HUPfE0c=
github.com/shoenig/test v0.6.3/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
Expand Down
155 changes: 99 additions & 56 deletions client/fingerprint/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (

"github.com/hashicorp/nomad/lib/cpuset"

log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/nomad/structs"
)
Expand All @@ -22,85 +22,128 @@ const (
// CPUFingerprint is used to fingerprint the CPU
type CPUFingerprint struct {
StaticFingerprinter
logger log.Logger
logger hclog.Logger

// accumulates result in these resource structs
resources *structs.Resources
nodeResources *structs.NodeResources
}

// NewCPUFingerprint is used to create a CPU fingerprint
func NewCPUFingerprint(logger log.Logger) Fingerprint {
f := &CPUFingerprint{logger: logger.Named("cpu")}
return f
func NewCPUFingerprint(logger hclog.Logger) Fingerprint {
	// Scope the logger under "cpu" and allocate the structs that the
	// individual set* steps fill in while fingerprinting.
	f := new(CPUFingerprint)
	f.logger = logger.Named("cpu")
	f.resources = &structs.Resources{} // COMPAT (to be removed after 0.10)
	f.nodeResources = &structs.NodeResources{}
	return f
}

func (f *CPUFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error {
cfg := req.Config
setResourcesCPU := func(totalCompute int, totalCores uint16, reservableCores []uint16) {
// COMPAT(0.10): Remove in 0.10
resp.Resources = &structs.Resources{
CPU: totalCompute,
}
func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *FingerprintResponse) error {
f.initialize()

resp.NodeResources = &structs.NodeResources{
Cpu: structs.NodeCpuResources{
CpuShares: int64(totalCompute),
TotalCpuCores: totalCores,
ReservableCpuCores: reservableCores,
},
}
}
f.setModelName(response)

f.setFrequency(response)

f.setCoreCount(response)

f.setReservableCores(request, response)

f.setTotalCompute(request, response)

f.setResponseResources(response)

response.Detected = true

return nil
}

// initialize prepares the stats collector used by the other fingerprinting
// steps. A failure is logged as a warning and is not returned to the caller.
func (f *CPUFingerprint) initialize() {
	err := stats.Init()
	if err == nil {
		return
	}
	f.logger.Warn("failed initializing stats collector", "error", err)
}

func (f *CPUFingerprint) setModelName(response *FingerprintResponse) {
if modelName := stats.CPUModelName(); modelName != "" {
resp.AddAttribute("cpu.modelname", modelName)
response.AddAttribute("cpu.modelname", modelName)
f.logger.Debug("detected CPU model", "name", modelName)
}
}

// frequency renders a clock speed in MHz as a base-10 string for use as a
// node attribute value.
//
// The value is formatted directly as an unsigned integer; going through
// float64 is unnecessary and would round values above 2^53.
func (*CPUFingerprint) frequency(mhz uint64) string {
	return strconv.FormatUint(mhz, 10)
}

if mhz := stats.CPUMHzPerCore(); mhz > 0 {
resp.AddAttribute("cpu.frequency", fmt.Sprintf("%.0f", mhz))
f.logger.Debug("detected cpu frequency", "MHz", log.Fmt("%.0f", mhz))
// setFrequency records the detected CPU clock speed on the response.
//
// When an efficiency-core speed is reported, the per-core-type attributes
// cpu.frequency.efficiency and cpu.frequency.power are set and the plain
// cpu.frequency attribute is left out. Otherwise a single cpu.frequency
// attribute is set if a speed was detected at all; if nothing was detected no
// attribute is added.
func (f *CPUFingerprint) setFrequency(response *FingerprintResponse) {
	pMHz, eMHz := stats.CPUMHzPerCore()

	if eMHz > 0 {
		response.AddAttribute("cpu.frequency.efficiency", f.frequency(eMHz))
		response.AddAttribute("cpu.frequency.power", f.frequency(pMHz))
		f.logger.Debug("detected CPU efficiency core speed", "mhz", eMHz)
		f.logger.Debug("detected CPU power core speed", "mhz", pMHz)
		return
	}

	if pMHz > 0 {
		response.AddAttribute("cpu.frequency", f.frequency(pMHz))
		f.logger.Debug("detected CPU frequency", "mhz", pMHz)
	}
}

// cores renders a core count as a base-10 string for use as a node attribute
// value.
func (*CPUFingerprint) cores(count int) string {
	return strconv.FormatInt(int64(count), 10)
}

var numCores int
if numCores = stats.CPUNumCores(); numCores > 0 {
resp.AddAttribute("cpu.numcores", strconv.Itoa(numCores))
f.logger.Debug("detected core count", "cores", numCores)
// setCoreCount records the detected CPU core count on the response and stores
// the total on the accumulated node resources.
//
// When efficiency cores are reported, the per-core-type attributes
// cpu.numcores.efficiency and cpu.numcores.power are set and the plain
// cpu.numcores attribute is left out. Otherwise a single cpu.numcores
// attribute is set if any cores were detected. TotalCpuCores is always
// assigned the sum of both counts, which is zero when nothing was detected.
func (f *CPUFingerprint) setCoreCount(response *FingerprintResponse) {
	power, efficiency := stats.CPUNumCores()
	switch {
	case efficiency > 0:
		response.AddAttribute("cpu.numcores.efficiency", f.cores(efficiency))
		response.AddAttribute("cpu.numcores.power", f.cores(power))
		f.logger.Debug("detected CPU efficiency core count", "cores", efficiency)
		f.logger.Debug("detected CPU power core count", "cores", power)
	case power > 0:
		response.AddAttribute("cpu.numcores", f.cores(power))
		// Log arguments are key/value pairs; the count needs its "cores" key
		// like the sibling messages above.
		f.logger.Debug("detected CPU core count", "cores", power)
	}
	f.nodeResources.Cpu.TotalCpuCores = uint16(power + efficiency)
}

var reservableCores []uint16
if req.Config.ReservableCores != nil {
reservableCores = req.Config.ReservableCores
f.logger.Debug("reservable cores set by config", "cpuset", reservableCores)
func (f *CPUFingerprint) setReservableCores(request *FingerprintRequest, response *FingerprintResponse) {
reservable := request.Config.ReservableCores
if len(reservable) > 0 {
f.logger.Debug("reservable cores set by config", "cpuset", reservable)
} else {
if cores, err := f.deriveReservableCores(req); err != nil {
f.logger.Warn("failed to detect set of reservable cores", "error", err)
} else {
if req.Node.ReservedResources != nil {
reservableCores = cpuset.New(cores...).Difference(cpuset.New(req.Node.ReservedResources.Cpu.ReservedCpuCores...)).ToSlice()
cgroupParent := request.Config.CgroupParent
if reservable = f.deriveReservableCores(cgroupParent); reservable != nil {
if request.Node.ReservedResources != nil {
forNode := request.Node.ReservedResources.Cpu.ReservedCpuCores
reservable = cpuset.New(reservable...).Difference(cpuset.New(forNode...)).ToSlice()
f.logger.Debug("client configuration reserves these cores for node", "cores", forNode)
}
f.logger.Debug("detected reservable cores", "cpuset", reservableCores)
f.logger.Debug("set of reservable cores available for tasks", "cores", reservable)
}
}
resp.AddAttribute("cpu.reservablecores", strconv.Itoa(len(reservableCores)))

tt := int(stats.TotalTicksAvailable())
if cfg.CpuCompute > 0 {
f.logger.Debug("using user specified cpu compute", "cpu_compute", cfg.CpuCompute)
tt = cfg.CpuCompute
}
response.AddAttribute("cpu.reservablecores", strconv.Itoa(len(reservable)))
f.nodeResources.Cpu.ReservableCpuCores = reservable
}

// If we cannot detect the cpu total compute, fallback to a very low default
// value and log a message about configuring cpu_total_compute. This happens
// on Graviton instances where CPU information is unavailable. In that case,
// the env_aws fingerprinter updates the value with correct information.
if tt == 0 {
f.logger.Info("fallback to default cpu total compute, set client config option cpu_total_compute to override")
tt = defaultCPUTicks
// setTotalCompute determines the node's total CPU compute and publishes it as
// the cpu.totalcompute attribute and on both accumulated resource structs.
//
// The value is chosen in order of preference: the CpuCompute value from the
// client config when positive, then the detected total ticks when positive,
// and finally defaultCPUTicks.
func (f *CPUFingerprint) setTotalCompute(request *FingerprintRequest, response *FingerprintResponse) {
	var compute uint64
	if configured := request.Config.CpuCompute; configured > 0 {
		compute = uint64(configured)
	} else if stats.TotalTicksAvailable() > 0 {
		compute = stats.TotalTicksAvailable()
	} else {
		compute = defaultCPUTicks
	}

	response.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", compute))
	f.nodeResources.Cpu.CpuShares = int64(compute)
	f.resources.CPU = int(compute)
}

resp.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", tt))
setResourcesCPU(tt, uint16(numCores), reservableCores)
resp.Detected = true

return nil
// setResponseResources hands the accumulated resource structs to the response.
// The response receives the fingerprinter's own pointers, not copies.
func (f *CPUFingerprint) setResponseResources(response *FingerprintResponse) {
	response.NodeResources = f.nodeResources
	response.Resources = f.resources
}
45 changes: 45 additions & 0 deletions client/fingerprint/cpu_darwin_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//go:build darwin && arm64 && cgo

package fingerprint

import (
"testing"

"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test/must"
)

// TestCPUFingerprint_AppleSilicon checks that fingerprinting reports separate
// power and efficiency core attributes and omits the combined ones.
func TestCPUFingerprint_AppleSilicon(t *testing.T) {
	ci.Parallel(t)

	fp := NewCPUFingerprint(testlog.HCLogger(t))

	req := &FingerprintRequest{
		Config: new(config.Config),
		Node:   &structs.Node{Attributes: make(map[string]string)},
	}
	resp := new(FingerprintResponse)

	must.NoError(t, fp.Fingerprint(req, resp))
	must.True(t, resp.Detected)

	attrs := resp.Attributes
	must.NotNil(t, attrs)

	// attributes that must be present when both core types are detected
	expected := []string{
		"cpu.modelname",
		"cpu.numcores.power",
		"cpu.numcores.efficiency",
		"cpu.frequency.power",
		"cpu.frequency.efficiency",
		"cpu.totalcompute",
	}
	for _, key := range expected {
		must.MapContainsKey(t, attrs, key)
	}

	cpu := resp.NodeResources.Cpu
	must.Positive(t, resp.Resources.CPU)
	must.Positive(t, cpu.CpuShares)
	must.Positive(t, cpu.SharesPerCore())
	must.SliceEmpty(t, cpu.ReservableCpuCores)

	// not included for mixed core types (that we can detect)
	for _, key := range []string{"cpu.numcores", "cpu.frequency"} {
		must.MapNotContainsKey(t, attrs, key)
	}
}
5 changes: 2 additions & 3 deletions client/fingerprint/cpu_default.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
//go:build !linux
// +build !linux

package fingerprint

func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest) ([]uint16, error) {
return nil, nil
// deriveReservableCores is the implementation used on platforms other than
// linux. It ignores its argument and always returns nil, meaning no reservable
// cores are derived.
func (*CPUFingerprint) deriveReservableCores(string) []uint16 {
	return nil
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build !darwin || !arm64 || !cgo

package fingerprint

import (
Expand All @@ -8,54 +10,36 @@ import (
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test/must"
)

func TestCPUFingerprint(t *testing.T) {
func TestCPUFingerprint_Classic(t *testing.T) {
ci.Parallel(t)

f := NewCPUFingerprint(testlog.HCLogger(t))
node := &structs.Node{
Attributes: make(map[string]string),
}
node := &structs.Node{Attributes: make(map[string]string)}

request := &FingerprintRequest{Config: &config.Config{}, Node: node}
var response FingerprintResponse
err := f.Fingerprint(request, &response)
if err != nil {
t.Fatalf("err: %v", err)
}

if !response.Detected {
t.Fatalf("expected response to be applicable")
}
err := f.Fingerprint(request, &response)
must.NoError(t, err)

// CPU info
must.True(t, response.Detected)
attributes := response.Attributes
if attributes == nil {
t.Fatalf("expected attributes to be initialized")
}
if attributes["cpu.numcores"] == "" {
t.Fatalf("Missing Num Cores")
}
if attributes["cpu.modelname"] == "" {
t.Fatalf("Missing Model Name")
}

if attributes["cpu.frequency"] == "" {
t.Fatalf("Missing CPU Frequency")
}
if attributes["cpu.totalcompute"] == "" {
t.Fatalf("Missing CPU Total Compute")
}

// COMPAT(0.10): Remove in 0.10
if response.Resources == nil || response.Resources.CPU == 0 {
t.Fatalf("Expected to find CPU Resources")
}

if response.NodeResources == nil || response.NodeResources.Cpu.CpuShares == 0 {
t.Fatalf("Expected to find CPU Resources")
}
must.NotNil(t, attributes)
must.MapContainsKey(t, attributes, "cpu.numcores")
must.MapContainsKey(t, attributes, "cpu.modelname")
must.MapContainsKey(t, attributes, "cpu.frequency")
must.MapContainsKey(t, attributes, "cpu.totalcompute")
must.Positive(t, response.Resources.CPU)
must.Positive(t, response.NodeResources.Cpu.CpuShares)
must.Positive(t, response.NodeResources.Cpu.SharesPerCore())
must.SliceNotEmpty(t, response.NodeResources.Cpu.ReservableCpuCores)

// asymmetric core detection currently only works with apple silicon
must.MapNotContainsKey(t, attributes, "cpu.numcores.power")
must.MapNotContainsKey(t, attributes, "cpu.numcores.efficiency")
}

// TestCPUFingerprint_OverrideCompute asserts that setting cpu_total_compute in
Expand Down
9 changes: 7 additions & 2 deletions client/fingerprint/cpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@ import (
"github.com/hashicorp/nomad/client/lib/cgutil"
)

func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest) ([]uint16, error) {
func (f *CPUFingerprint) deriveReservableCores(cgroupParent string) []uint16 {
// The cpuset cgroup manager is initialized (on linux), but not accessible
// from the finger-printer. So we reach in and grab the information manually.
// We may assume the hierarchy is already setup.
return cgutil.GetCPUsFromCgroup(req.Config.CgroupParent)
cpuset, err := cgutil.GetCPUsFromCgroup(cgroupParent)
if err != nil {
f.logger.Warn("failed to detect set of reservable cores", "error", err)
return nil
}
return cpuset
}
4 changes: 2 additions & 2 deletions client/stats/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func (c *CpuStats) Percent(cpuTime float64) float64 {
// TicksConsumed calculates the total ticks consumes by the process across all
// cpu cores
func (c *CpuStats) TicksConsumed(percent float64) float64 {
return (percent / 100) * shelpers.TotalTicksAvailable() / float64(c.totalCpus)
return (percent / 100) * float64(shelpers.TotalTicksAvailable()) / float64(c.totalCpus)
}

func (c *CpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 {
Expand Down Expand Up @@ -83,7 +83,7 @@ func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks flo
Idle: idle,
Total: total,
}
ticksConsumed += (total / 100.0) * (shelpers.TotalTicksAvailable() / float64(len(cpuStats)))
ticksConsumed += (total / 100.0) * (float64(shelpers.TotalTicksAvailable()) / float64(len(cpuStats)))
}

return cs, ticksConsumed, nil
Expand Down
2 changes: 1 addition & 1 deletion drivers/docker/util/stats_posix.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func DockerStatsToTaskResourceUsage(s *docker.Stats) *cstructs.TaskResourceUsage
cs.UserMode = CalculateCPUPercent(
s.CPUStats.CPUUsage.UsageInUsermode, s.PreCPUStats.CPUUsage.UsageInUsermode,
s.CPUStats.CPUUsage.TotalUsage, s.PreCPUStats.CPUUsage.TotalUsage, runtime.NumCPU())
cs.TotalTicks = (cs.Percent / 100) * stats.TotalTicksAvailable() / float64(runtime.NumCPU())
cs.TotalTicks = (cs.Percent / 100) * float64(stats.TotalTicksAvailable()) / float64(runtime.NumCPU())

return &cstructs.TaskResourceUsage{
ResourceUsage: &cstructs.ResourceUsage{
Expand Down
Loading

0 comments on commit d76453a

Please sign in to comment.