From 8f0e605213284ab6450948b2d40bd4d8c79e0b41 Mon Sep 17 00:00:00 2001 From: qinguangrui <283713406@qq.com> Date: Tue, 22 Oct 2024 17:19:17 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=A4=9Agpu=E5=A4=9A?= =?UTF-8?q?=E8=8A=82=E7=82=B9=E6=97=B6=E8=A7=A3=E9=99=A4=E8=8A=82=E7=82=B9?= =?UTF-8?q?=E5=90=8D=E5=BC=82=E5=B8=B8=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/config/config.go | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/services/config/config.go b/services/config/config.go index ccc0820..cef7134 100644 --- a/services/config/config.go +++ b/services/config/config.go @@ -671,7 +671,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva res := strings.Contains(nodeArray[0], "[") if res { - getNodeNameCmd := fmt.Sprintf("echo %s | awk -F'[' '{print $1,$2}' | awk -F'-' '{print $1}'", nodeArray[0]) + getNodeNameCmd := fmt.Sprintf("echo %s | awk -F'[' '{print $1,$2}'", nodeArray[0]) nodeNameOutput, err := utils.RunCommand(getNodeNameCmd) if err != nil || utils.CheckSlurmStatus(nodeNameOutput) { errInfo := &errdetails.ErrorInfo{ @@ -682,7 +682,29 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } - nodeName := strings.Join(strings.Split(nodeNameOutput, " "), "") + nodeNameTmp := strings.Split(nodeNameOutput, " ") + if len(nodeNameTmp) != 2 { + errInfo := &errdetails.ErrorInfo{ + Reason: "INVALID_NODE_NAME", + } + st := status.New(codes.Internal, "node name is invalid") + st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) + return nil, st.Err() + } + nodeNamePrefix := nodeNameTmp[0] + nodeNameSuffixCmd := fmt.Sprintf("echo %s | awk -F'-' '{print $1}'", nodeNameTmp[1]) + nodeNameSuffix, err := utils.RunCommand(nodeNameSuffixCmd) + if err != nil || utils.CheckSlurmStatus(nodeNameOutput) { + errInfo := &errdetails.ErrorInfo{ + Reason: "COMMAND_EXEC_FAILED", + } + st := status.New(codes.Internal, "Exec command failed or slurmctld down.") + st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) + return nil, st.Err() + } + nodeName := nodeNamePrefix + nodeNameSuffix gpusCmd := fmt.Sprintf("scontrol show node=%s| grep ' Gres=' | awk -F':' '{print $NF}'", nodeName) gpusOutput, err := utils.RunCommand(gpusCmd) if err != nil || utils.CheckSlurmStatus(gpusOutput) { @@ -970,7 +992,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo noAvailableNodes int ) getPartitionStatusCmd := fmt.Sprintf("sinfo -p %s --noheader", v) - fullCmd := getPartitionStatusCmd + " --format='%P %c %C %G %a %D %F'| tr '\n' ','" + fullCmd := getPartitionStatusCmd + " --format='%P %c %C %G %a %D %F'" result, err := utils.RunCommand(fullCmd) // 状态 if err != nil || utils.CheckSlurmStatus(result) { errInfo := &errdetails.ErrorInfo{ @@ -982,7 +1004,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo return nil, st.Err() } - partitionElements := strings.Split(result, ",") + partitionElements := strings.Split(result, "\n") for _, partitionElement := range partitionElements { // 移除可能存在的前导空格 partitionElement = strings.TrimSpace(partitionElement) @@ -990,8 +1012,16 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo continue } resultList := strings.Split(partitionElement, " ") + if len(resultList) != 7 { + caller.Logger.Infof("Invalid partitionElement: %s", partitionElement) + continue + } state = resultList[4] nodeInfo := strings.Split(resultList[6], "/") + if len(nodeInfo) != 4 { + caller.Logger.Infof("Invalid nodeInfo: %s", resultList[6]) + continue + } runningNodesTmp, _ := strconv.Atoi(nodeInfo[0]) runningNodes = runningNodes + runningNodesTmp idleNodesTmp, _ := strconv.Atoi(nodeInfo[1])