Skip to content

Commit

Permalink
DAOS-16352 control: Handle cases with static ifaces (#14953)
Browse files Browse the repository at this point in the history
Fabric interfaces defined statically in the daos_agent config
file are fundamentally different from those detected via hardware
scanning. They don't include information derived from the hardware
such as their true device class or fabric provider(s).

This patch adds some rigor to what is ignored regarding these
manually-defined interfaces.

- Ignore provider for statically-defined fabric interfaces, as we
  do not bother detecting it. They are assumed to be compatible with
  whatever provider the agent is using.
- Silence confusing "no interfaces requested" error from
  WaitFabricReady by not calling it if there are no interfaces to
  check.
- Remove some defunct logic related to detecting the provider in
  the multi-provider case. The agent may only use a single provider.

Signed-off-by: Kris Jacque <kris.jacque@intel.com>
  • Loading branch information
kjacque authored Aug 21, 2024
1 parent c2ce882 commit 6107475
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 36 deletions.
4 changes: 2 additions & 2 deletions src/control/cmd/daos_agent/fabric.go
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ func (n *NUMAFabric) Find(name string) ([]*FabricInterface, error) {
}

// FindDevice looks up a fabric device with a given name, domain, and provider.
// NB: The domain and provider are optional. All other parameters are required. If there is more
// NB: The name is required. All other parameters are optional. If there is more
// than one match, all of them are returned.
func (n *NUMAFabric) FindDevice(params *FabricIfaceParams) ([]*FabricInterface, error) {
if params == nil {
Expand Down Expand Up @@ -406,7 +406,7 @@ func filterDomain(domain string, fiList []*FabricInterface) []*FabricInterface {
func filterProvider(provider string, fiList []*FabricInterface) []*FabricInterface {
result := make([]*FabricInterface, 0, len(fiList))
for _, fi := range fiList {
if fi.HasProvider(provider) {
if fi.HasProvider(provider) || fi.NetDevClass == FabricDevClassManual {
result = append(result, fi)
}
}
Expand Down
30 changes: 30 additions & 0 deletions src/control/cmd/daos_agent/fabric_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1100,6 +1100,36 @@ func TestAgent_NUMAFabric_FindDevice(t *testing.T) {
},
},
},
"success with manual interfaces": {
nf: &NUMAFabric{
numaMap: map[int][]*FabricInterface{
0: {
{
Name: "t1",
Domain: "t1",
NetDevClass: FabricDevClassManual,
},
{
Name: "t2",
Domain: "t2",
NetDevClass: FabricDevClassManual,
},
},
},
},
params: &FabricIfaceParams{
Interface: "t2",
Domain: "t2",
Provider: "p2",
},
expResult: []*FabricInterface{
{
Name: "t2",
Domain: "t2",
NetDevClass: FabricDevClassManual,
},
},
},
"success with no domain": {
nf: &NUMAFabric{
numaMap: map[int][]*FabricInterface{
Expand Down
5 changes: 5 additions & 0 deletions src/control/cmd/daos_agent/infocache.go
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,11 @@ func (c *InfoCache) waitFabricReady(ctx context.Context, netDevClass hardware.Ne
}
}

if len(needIfaces) == 0 {
c.log.Debugf("no interfaces with device class %s to wait for", netDevClass)
return nil
}

return hardware.WaitFabricReady(ctx, c.log, hardware.WaitFabricReadyParams{
StateProvider: c.devStateGetter,
FabricIfaces: needIfaces,
Expand Down
1 change: 1 addition & 0 deletions src/control/cmd/daos_agent/infocache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1581,6 +1581,7 @@ func TestAgent_InfoCache_waitFabricReady(t *testing.T) {
netDevClass: hardware.Infiniband,
expChecked: []string{"t0", "t1"},
},
"nothing to wait for": {},
} {
t.Run(name, func(t *testing.T) {
log, buf := logging.NewTestLogger(t.Name())
Expand Down
42 changes: 15 additions & 27 deletions src/control/cmd/daos_agent/mgmt_rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
package main

import (
"fmt"
"net"
"strings"

Expand Down Expand Up @@ -240,42 +241,28 @@ func (mod *mgmtModule) getAttachInfoResp(ctx context.Context, sys string) (*mgmt
}

func (mod *mgmtModule) selectAttachInfo(ctx context.Context, srvResp *mgmtpb.GetAttachInfoResp, iface, domain string) (*mgmtpb.GetAttachInfoResp, error) {
reqProviders := mod.getIfaceProviders(ctx, iface, domain)

resp := srvResp
if mod.providerIdx > 0 {
mod.log.Debugf("using secondary provider idx %d", mod.providerIdx)

var err error
// Secondary provider indices begin at 1
resp, err := mod.selectSecondaryAttachInfo(srvResp, mod.providerIdx)
resp, err = mod.selectSecondaryAttachInfo(srvResp, mod.providerIdx)
if err != nil {
return nil, err
}

if len(reqProviders) != 0 && !reqProviders.Has(resp.ClientNetHint.Provider) {
mod.log.Errorf("requested fabric interface %q (domain: %q) does not report support for configured provider %q (idx %d)",
iface, domain, resp.ClientNetHint.Provider, mod.providerIdx)
}

return resp, nil
}

if len(reqProviders) == 0 || reqProviders.Has(srvResp.ClientNetHint.Provider) {
return srvResp, nil
}

mod.log.Debugf("primary provider is not supported by requested interface %q domain %q (supports: %s)", iface, domain, strings.Join(reqProviders.ToSlice(), ", "))

// We can try to be smart about choosing a provider if the client requested a specific interface
for _, hint := range srvResp.SecondaryClientNetHints {
if reqProviders.Has(hint.Provider) {
mod.log.Tracef("found secondary provider supported by requested interface: %q (idx %d)", hint.Provider, hint.ProviderIdx)
return mod.selectSecondaryAttachInfo(srvResp, uint(hint.ProviderIdx))
}
reqProviders := mod.getIfaceProviders(ctx, iface, domain, hardware.NetDevClass(resp.ClientNetHint.NetDevClass))
if len(reqProviders) == 0 || reqProviders.Has(resp.ClientNetHint.Provider) {
return resp, nil
}

mod.log.Errorf("no supported provider for requested interface %q domain %q, using primary by default", iface, domain)
return srvResp, nil
return nil, fmt.Errorf("provider %s is not supported by requested interface %q domain %q (supports: %s)",
resp.ClientNetHint.Provider, iface, domain, strings.Join(reqProviders.ToSlice(), ", "))
}

func (mod *mgmtModule) getIfaceProviders(ctx context.Context, iface, domain string) common.StringSet {
func (mod *mgmtModule) getIfaceProviders(ctx context.Context, iface, domain string, ndc hardware.NetDevClass) common.StringSet {
providers := common.NewStringSet()
if iface == "" {
return providers
Expand All @@ -288,9 +275,10 @@ func (mod *mgmtModule) getIfaceProviders(ctx context.Context, iface, domain stri
if fis, err := mod.getFabricInterface(ctx, &FabricIfaceParams{
Interface: iface,
Domain: domain,
DevClass: ndc,
}); err != nil {
mod.log.Errorf("requested fabric interface %q (domain %q) may not function as desired: %s", iface, domain, err)
} else {
} else if fis.NetDevClass != FabricDevClassManual {
providers.Add(fis.Providers()...)
}

Expand Down Expand Up @@ -362,7 +350,7 @@ func (mod *mgmtModule) populateNUMAFabricMap(ctx context.Context, resp *mgmtpb.G
if exists {
pbFIs.Ifaces = make([]*mgmtpb.FabricInterface, 0, len(fis))
for _, fi := range fis {
if fi.HasProvider(resp.ClientNetHint.Provider) {
if fi.HasProvider(resp.ClientNetHint.Provider) || fi.NetDevClass == FabricDevClassManual {
pbFIs.Ifaces = append(pbFIs.Ifaces, &mgmtpb.FabricInterface{
NumaNode: uint32(numaNode),
Interface: fi.Name,
Expand Down
55 changes: 48 additions & 7 deletions src/control/cmd/daos_agent/mgmt_rpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ func TestAgent_mgmtModule_getAttachInfo(t *testing.T) {
mockFabricScan fabricScanFn
mockGetNetIfaces func() ([]net.Interface, error)
numaGetter *mockNUMAProvider
fabricCfg []*NUMAFabricConfig
reqBytes []byte
expResp *mgmtpb.GetAttachInfoResp
expErr error
Expand Down Expand Up @@ -266,6 +267,34 @@ func TestAgent_mgmtModule_getAttachInfo(t *testing.T) {
},
}),
},
"req interface with cfg ifaces": {
reqBytes: reqBytes(&mgmtpb.GetAttachInfoReq{
Sys: testSys,
Interface: "test0",
}),
fabricCfg: []*NUMAFabricConfig{
{
NUMANode: 0,
Interfaces: []*FabricInterfaceConfig{
{
Interface: "test0",
Domain: "test0",
},
},
},
},
expResp: respWith(testResp, "test0", "test0", []*mgmtpb.FabricInterfaces{
{
Ifaces: []*mgmtpb.FabricInterface{
{
Interface: "test0",
Domain: "test0",
Provider: "ofi+tcp", // automatically set to the same as server requested
},
},
},
}),
},
"incompatible error": {
reqBytes: reqBytes(&mgmtpb.GetAttachInfoReq{}),
mockGetAttachInfo: func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) {
Expand Down Expand Up @@ -320,14 +349,26 @@ func TestAgent_mgmtModule_getAttachInfo(t *testing.T) {
}
}

ic := newTestInfoCache(t, log, testInfoCacheParams{
mockGetAttachInfo: tc.mockGetAttachInfo,
mockScanFabric: tc.mockFabricScan,
mockNetIfaces: tc.mockGetNetIfaces,
mockNetDevClassGetter: &hardware.MockNetDevClassProvider{
GetNetDevClassReturn: []hardware.MockGetNetDevClassResult{
{
NDC: hardware.Ether,
},
},
},
})
if tc.fabricCfg != nil {
nf := NUMAFabricFromConfig(log, tc.fabricCfg)
ic.EnableStaticFabricCache(test.Context(t), nf)
}
mod := &mgmtModule{
log: log,
sys: testSys,
cache: newTestInfoCache(t, log, testInfoCacheParams{
mockGetAttachInfo: tc.mockGetAttachInfo,
mockScanFabric: tc.mockFabricScan,
mockNetIfaces: tc.mockGetNetIfaces,
}),
log: log,
sys: testSys,
cache: ic,
numaGetter: tc.numaGetter,
}

Expand Down

0 comments on commit 6107475

Please sign in to comment.