Skip to content

Commit

Permalink
client/fingerprint: correctly fingerprint E/P cores of Apple Silicon …
Browse files Browse the repository at this point in the history
…chips

This PR adds detection of asymmetric core types (Power & Efficiency) (P/E)
when running on M1/M2 Apple Silicon CPUs. This functionality is provided
by shoenig/go-m1cpu which makes use of the Apple IOKit framework to read
undocumented registers containing CPU performance data. Currently working
on getting that functionality merged upstream into gopsutil, but gopsutil
would still not support detecting P vs E cores like this PR does.

Also refactors the CPUFingerprinter code to handle the mixed core
types, now setting power vs efficiency cpu attributes.

For now the scheduler is still unaware of mixed core types - on Apple
platforms tasks cannot reserve cores anyway so it doesn't matter, but
at least now the total CPU shares available will be correct.

Future work should include adding support for detecting P/E cores on
the latest and upcoming Intel chips, where computation of total cpu shares
is currently incorrect. For that, we should also include updating the
scheduler to be core-type aware, so that tasks of resources.cores on Linux
platforms can be assigned the correct number of CPU shares for the core
type(s) they have been assigned.

node attributes before

cpu.arch                  = arm64
cpu.modelname             = Apple M2 Pro
cpu.numcores              = 12
cpu.reservablecores       = 0
cpu.totalcompute          = 1000

node attributes after

cpu.arch                  = arm64
cpu.frequency.efficiency  = 2424
cpu.frequency.power       = 3504
cpu.modelname             = Apple M2 Pro
cpu.numcores.efficiency   = 4
cpu.numcores.power        = 8
cpu.reservablecores       = 0
cpu.totalcompute          = 37728
  • Loading branch information
shoenig committed Mar 27, 2023
1 parent e4963b9 commit bac2ed4
Show file tree
Hide file tree
Showing 13 changed files with 293 additions and 145 deletions.
2 changes: 1 addition & 1 deletion api/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ require (
github.com/hashicorp/go-rootcerts v1.0.2
github.com/mitchellh/go-testing-interface v1.14.1
github.com/mitchellh/mapstructure v1.5.0
github.com/shoenig/test v0.6.2
github.com/shoenig/test v0.6.3
golang.org/x/exp v0.0.0-20230108222341-4b8118a2686a
)

Expand Down
4 changes: 2 additions & 2 deletions api/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/shoenig/test v0.6.2 h1:tdq+WGnznwE5xcOMXkqqXuudK75RkSGBazBGcP1lX6w=
github.com/shoenig/test v0.6.2/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
github.com/shoenig/test v0.6.3 h1:GVXWJFk9PiOjN0KoJ7VrJGH6uLPnqxR7/fe3HUPfE0c=
github.com/shoenig/test v0.6.3/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
Expand Down
158 changes: 100 additions & 58 deletions client/fingerprint/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (

"github.com/hashicorp/nomad/lib/cpuset"

log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/nomad/structs"
)
Expand All @@ -22,85 +22,127 @@ const (
// CPUFingerprint is used to fingerprint the CPU. It embeds
// StaticFingerprinter and collects its findings in the two resource structs
// below before they are attached to the fingerprint response.
type CPUFingerprint struct {
	StaticFingerprinter

	// logger is the single logger used by every fingerprinting step; it is
	// declared once, with the hclog type the constructor accepts.
	logger hclog.Logger

	// accumulates result in these resource structs
	resources     *structs.Resources
	nodeResources *structs.NodeResources
}

// NewCPUFingerprint is used to create a CPU fingerprint
func NewCPUFingerprint(logger log.Logger) Fingerprint {
f := &CPUFingerprint{logger: logger.Named("cpu")}
return f
// NewCPUFingerprint builds a CPU fingerprinter. Its logger is the supplied
// logger scoped under the name "cpu", and the resource structs that later
// accumulate the fingerprint results are allocated empty.
func NewCPUFingerprint(logger hclog.Logger) Fingerprint {
	fp := new(CPUFingerprint)
	fp.logger = logger.Named("cpu")
	fp.resources = &structs.Resources{}
	fp.nodeResources = &structs.NodeResources{}
	return fp
}

func (f *CPUFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error {
cfg := req.Config
setResourcesCPU := func(totalCompute int, totalCores uint16, reservableCores []uint16) {
// COMPAT(0.10): Remove in 0.10
resp.Resources = &structs.Resources{
CPU: totalCompute,
}
func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *FingerprintResponse) error {
f.initialize()

resp.NodeResources = &structs.NodeResources{
Cpu: structs.NodeCpuResources{
CpuShares: int64(totalCompute),
TotalCpuCores: totalCores,
ReservableCpuCores: reservableCores,
},
}
}
f.setModelName(response)

f.setFrequency(response)

f.setCoreCount(response)

f.setReservableCores(request, response)

f.setTotalCompute(request, response)

f.setResponseResources(response)

response.Detected = true

return nil
}

// initialize prepares the stats helper used by the detection steps. A failure
// is logged as a warning and otherwise ignored, so fingerprinting continues.
func (f *CPUFingerprint) initialize() {
	err := stats.Init()
	if err == nil {
		return
	}
	f.logger.Warn("failed initializing stats collector", "error", err)
}

// setModelName records the CPU model name reported by the stats helper as the
// "cpu.modelname" attribute. Nothing is recorded when the reported name is
// empty.
func (f *CPUFingerprint) setModelName(response *FingerprintResponse) {
	modelName := stats.CPUModelName()
	if modelName == "" {
		return
	}
	// The attribute is written exactly once, through the response parameter.
	response.AddAttribute("cpu.modelname", modelName)
	f.logger.Debug("detected CPU model", "name", modelName)
}

if mhz := stats.CPUMHzPerCore(); mhz > 0 {
resp.AddAttribute("cpu.frequency", fmt.Sprintf("%.0f", mhz))
f.logger.Debug("detected cpu frequency", "MHz", log.Fmt("%.0f", mhz))
}
// frequency renders a clock speed in MHz as the base-10 string stored in node
// attributes. The integer is formatted directly rather than via float64, so
// values above 2^53 keep every digit.
func (*CPUFingerprint) frequency(mhz uint64) string {
	return strconv.FormatUint(mhz, 10)
}

var numCores int
if numCores = stats.CPUNumCores(); numCores > 0 {
resp.AddAttribute("cpu.numcores", strconv.Itoa(numCores))
f.logger.Debug("detected core count", "cores", numCores)
// setFrequency records the per-core clock speed (MHz) reported by the stats
// helper.
//
// When an efficiency-core speed is reported, the split attributes
// "cpu.frequency.efficiency" and "cpu.frequency.power" are set. Otherwise, if
// a power-core speed is reported, the single "cpu.frequency" attribute is
// set. When neither is reported no attribute is written.
func (f *CPUFingerprint) setFrequency(response *FingerprintResponse) {
	powerMHz, efficiencyMHz := stats.CPUMHzPerCore()

	if efficiencyMHz > 0 {
		response.AddAttribute("cpu.frequency.efficiency", f.frequency(efficiencyMHz))
		response.AddAttribute("cpu.frequency.power", f.frequency(powerMHz))
		f.logger.Debug("detected CPU efficiency core speed", "mhz", efficiencyMHz)
		f.logger.Debug("detected CPU power core speed", "mhz", powerMHz)
		return
	}

	if powerMHz > 0 {
		response.AddAttribute("cpu.frequency", f.frequency(powerMHz))
		f.logger.Debug("detected CPU frequency", "mhz", powerMHz)
	}
}

var reservableCores []uint16
if req.Config.ReservableCores != nil {
reservableCores = req.Config.ReservableCores
f.logger.Debug("reservable cores set by config", "cpuset", reservableCores)
} else {
if cores, err := f.deriveReservableCores(req); err != nil {
f.logger.Warn("failed to detect set of reservable cores", "error", err)
} else {
if req.Node.ReservedResources != nil {
reservableCores = cpuset.New(cores...).Difference(cpuset.New(req.Node.ReservedResources.Cpu.ReservedCpuCores...)).ToSlice()
}
f.logger.Debug("detected reservable cores", "cpuset", reservableCores)
}
// cores renders a core count as the base-10 string stored in node attributes.
func (*CPUFingerprint) cores(count int) string {
	return strconv.FormatInt(int64(count), 10)
}

// setCoreCount records the number of CPU cores reported by the stats helper.
//
// When efficiency cores are reported, the split attributes
// "cpu.numcores.efficiency" and "cpu.numcores.power" are set. Otherwise, if
// power cores are reported, the single "cpu.numcores" attribute is set. In
// every case the sum of both counts is stored as the node's total core count.
func (f *CPUFingerprint) setCoreCount(response *FingerprintResponse) {
	power, efficiency := stats.CPUNumCores()
	switch {
	case efficiency > 0:
		response.AddAttribute("cpu.numcores.efficiency", f.cores(efficiency))
		response.AddAttribute("cpu.numcores.power", f.cores(power))
		f.logger.Debug("detected CPU efficiency core count", "cores", efficiency)
		f.logger.Debug("detected CPU power core count", "cores", power)
	case power > 0:
		response.AddAttribute("cpu.numcores", f.cores(power))
		// Log arguments are key/value pairs; the count needs its "cores" key
		// like the two calls above.
		f.logger.Debug("detected CPU core count", "cores", power)
	}
	// The "cpu.reservablecores" attribute is not written here; that is the job
	// of setReservableCores.
	f.nodeResources.Cpu.TotalCpuCores = uint16(power + efficiency)
}

tt := int(stats.TotalTicksAvailable())
if cfg.CpuCompute > 0 {
f.logger.Debug("using user specified cpu compute", "cpu_compute", cfg.CpuCompute)
tt = cfg.CpuCompute
func (f *CPUFingerprint) setReservableCores(request *FingerprintRequest, response *FingerprintResponse) {
reservable := request.Config.ReservableCores
if len(reservable) > 0 {
f.logger.Debug("reservable cores set by config", "cpuset", reservable)
}

// If we cannot detect the cpu total compute, fallback to a very low default
// value and log a message about configuring cpu_total_compute. This happens
// on Graviton instances where CPU information is unavailable. In that case,
// the env_aws fingerprinter updates the value with correct information.
if tt == 0 {
f.logger.Info("fallback to default cpu total compute, set client config option cpu_total_compute to override")
tt = defaultCPUTicks
cgroupParent := request.Config.CgroupParent
if reservable = f.deriveReservableCores(cgroupParent); reservable != nil {
if request.Node.ReservedResources != nil {
forNode := request.Node.ReservedResources.Cpu.ReservedCpuCores
reservable = cpuset.New(reservable...).Difference(cpuset.New(forNode...)).ToSlice()
f.logger.Debug("client configuration reserves these cores for node", "cores", forNode)
}
f.logger.Debug("set of reservable cores available for tasks", "cores", reservable)
}
response.AddAttribute("cpu.reservablecores", strconv.Itoa(len(reservable)))
f.nodeResources.Cpu.ReservableCpuCores = reservable
}

resp.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", tt))
setResourcesCPU(tt, uint16(numCores), reservableCores)
resp.Detected = true
// setTotalCompute determines the node's total CPU compute, records it as the
// "cpu.totalcompute" attribute, and stores it on both resource structs.
//
// The value is chosen in this order: the client configuration's CpuCompute
// when positive, else the total reported by the stats helper when positive,
// else defaultCPUTicks.
func (f *CPUFingerprint) setTotalCompute(request *FingerprintRequest, response *FingerprintResponse) {
	var ticks uint64
	if configured := request.Config.CpuCompute; configured > 0 {
		f.logger.Debug("using user specified cpu compute", "cpu_compute", configured)
		ticks = uint64(configured)
	} else if detected := stats.TotalTicksAvailable(); detected > 0 {
		// Query the stats helper once and reuse the result.
		ticks = detected
	} else {
		// If we cannot detect the cpu total compute, fallback to a very low
		// default value and log a message about configuring
		// cpu_total_compute. This happens on Graviton instances where CPU
		// information is unavailable. In that case, the env_aws fingerprinter
		// updates the value with correct information.
		f.logger.Info("fallback to default cpu total compute, set client config option cpu_total_compute to override")
		ticks = defaultCPUTicks
	}

	response.AddAttribute("cpu.totalcompute", strconv.FormatUint(ticks, 10))
	f.resources.CPU = int(ticks)
	f.nodeResources.Cpu.CpuShares = int64(ticks)
}

return nil
// setResponseResources attaches the resource structs accumulated on f to the
// response. The response receives the same pointers held by f, not copies.
func (f *CPUFingerprint) setResponseResources(response *FingerprintResponse) {
	response.Resources, response.NodeResources = f.resources, f.nodeResources
}
45 changes: 45 additions & 0 deletions client/fingerprint/cpu_darwin_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//go:build darwin && arm64 && cgo

package fingerprint

import (
"testing"

"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test/must"
)

// TestCPUFingerprint_AppleSilicon fingerprints the host CPU and asserts that
// the split power/efficiency attributes are reported in place of the single
// combined ones.
func TestCPUFingerprint_AppleSilicon(t *testing.T) {
	ci.Parallel(t)

	fingerprinter := NewCPUFingerprint(testlog.HCLogger(t))
	request := &FingerprintRequest{
		Config: new(config.Config),
		Node:   &structs.Node{Attributes: make(map[string]string)},
	}
	response := new(FingerprintResponse)

	must.NoError(t, fingerprinter.Fingerprint(request, response))
	must.True(t, response.Detected)

	attributes := response.Attributes
	must.NotNil(t, attributes)

	// attributes expected when both core types are detected
	for _, key := range []string{
		"cpu.modelname",
		"cpu.numcores.power",
		"cpu.numcores.efficiency",
		"cpu.frequency.power",
		"cpu.frequency.efficiency",
		"cpu.totalcompute",
	} {
		must.MapContainsKey(t, attributes, key)
	}

	cpu := response.NodeResources.Cpu
	must.Positive(t, response.Resources.CPU)
	must.Positive(t, cpu.CpuShares)
	must.Positive(t, cpu.SharesPerCore())
	must.SliceEmpty(t, cpu.ReservableCpuCores)

	// not included for mixed core types (that we can detect)
	for _, key := range []string{"cpu.numcores", "cpu.frequency"} {
		must.MapNotContainsKey(t, attributes, key)
	}
}
4 changes: 2 additions & 2 deletions client/fingerprint/cpu_default.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@

package fingerprint

func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest) ([]uint16, error) {
return nil, nil
// deriveReservableCores is the variant that performs no detection: the cgroup
// parent argument is ignored and nil is always returned.
func (*CPUFingerprint) deriveReservableCores(_ string) []uint16 {
	var none []uint16
	return none
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build !darwin || !arm64 || !cgo

package fingerprint

import (
Expand All @@ -10,52 +12,33 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
)

func TestCPUFingerprint(t *testing.T) {
// TestCPUFingerprint_Classic fingerprints the host CPU and asserts that the
// single combined core-count and frequency attributes are reported, and that
// the split power/efficiency core counts are not.
func TestCPUFingerprint_Classic(t *testing.T) {
	ci.Parallel(t)

	f := NewCPUFingerprint(testlog.HCLogger(t))
	node := &structs.Node{Attributes: make(map[string]string)}

	request := &FingerprintRequest{Config: &config.Config{}, Node: node}
	var response FingerprintResponse

	err := f.Fingerprint(request, &response)
	must.NoError(t, err)

	must.True(t, response.Detected)
	attributes := response.Attributes
	must.NotNil(t, attributes)

	// The map under inspection is passed explicitly to every map assertion.
	must.MapContainsKey(t, attributes, "cpu.numcores")
	must.MapContainsKey(t, attributes, "cpu.modelname")
	must.MapContainsKey(t, attributes, "cpu.frequency")
	must.MapContainsKey(t, attributes, "cpu.totalcompute")
	must.Positive(t, response.Resources.CPU)
	must.Positive(t, response.NodeResources.Cpu.CpuShares)
	must.Positive(t, response.NodeResources.Cpu.SharesPerCore())
	must.SliceNotEmpty(t, response.NodeResources.Cpu.ReservableCpuCores)

	// asymmetric core detection currently only works with apple silicon
	must.MapNotContainsKey(t, attributes, "cpu.numcores.power")
	must.MapNotContainsKey(t, attributes, "cpu.numcores.efficiency")
}

// TestCPUFingerprint_OverrideCompute asserts that setting cpu_total_compute in
Expand Down
9 changes: 7 additions & 2 deletions client/fingerprint/cpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@ import (
"github.com/hashicorp/nomad/client/lib/cgutil"
)

func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest) ([]uint16, error) {
// deriveReservableCores returns the CPU cores read from the cgroup identified
// by cgroupParent. A lookup failure is logged as a warning and reported as
// nil rather than returned as an error.
func (f *CPUFingerprint) deriveReservableCores(cgroupParent string) []uint16 {
	// The cpuset cgroup manager is initialized (on linux), but not accessible
	// from the finger-printer. So we reach in and grab the information manually.
	// We may assume the hierarchy is already setup.
	cores, err := cgutil.GetCPUsFromCgroup(cgroupParent)
	if err != nil {
		f.logger.Warn("failed to detect set of reservable cores", "error", err)
		return nil
	}
	return cores
}
4 changes: 2 additions & 2 deletions client/stats/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func (c *CpuStats) Percent(cpuTime float64) float64 {
// TicksConsumed calculates the total ticks consumed by the process across all
// cpu cores: the given percentage of the total available ticks, divided by
// the number of CPUs tracked by c. The tick total is converted to float64
// explicitly before the floating-point arithmetic.
func (c *CpuStats) TicksConsumed(percent float64) float64 {
	return (percent / 100) * float64(shelpers.TotalTicksAvailable()) / float64(c.totalCpus)
}

func (c *CpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 {
Expand Down Expand Up @@ -83,7 +83,7 @@ func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks flo
Idle: idle,
Total: total,
}
ticksConsumed += (total / 100.0) * (shelpers.TotalTicksAvailable() / float64(len(cpuStats)))
ticksConsumed += (total / 100.0) * (float64(shelpers.TotalTicksAvailable()) / float64(len(cpuStats)))
}

return cs, ticksConsumed, nil
Expand Down
2 changes: 1 addition & 1 deletion drivers/docker/util/stats_posix.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func DockerStatsToTaskResourceUsage(s *docker.Stats) *cstructs.TaskResourceUsage
cs.UserMode = CalculateCPUPercent(
s.CPUStats.CPUUsage.UsageInUsermode, s.PreCPUStats.CPUUsage.UsageInUsermode,
s.CPUStats.CPUUsage.TotalUsage, s.PreCPUStats.CPUUsage.TotalUsage, runtime.NumCPU())
cs.TotalTicks = (cs.Percent / 100) * stats.TotalTicksAvailable() / float64(runtime.NumCPU())
cs.TotalTicks = (cs.Percent / 100) * float64(stats.TotalTicksAvailable()) / float64(runtime.NumCPU())

return &cstructs.TaskResourceUsage{
ResourceUsage: &cstructs.ResourceUsage{
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ require (
github.com/ryanuber/go-glob v1.0.0
github.com/shirou/gopsutil/v3 v3.23.1
github.com/shoenig/go-landlock v0.1.5
github.com/shoenig/test v0.6.2
github.com/shoenig/go-m1cpu v0.1.3
github.com/shoenig/test v0.6.3
github.com/skratchdot/open-golang v0.0.0-20160302144031-75fb7ed4208c
github.com/stretchr/testify v1.8.1
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
Expand Down
Loading

0 comments on commit bac2ed4

Please sign in to comment.