Skip to content

Commit

Permalink
Merge pull request #23 from PKUHPC/fix_mult
Browse files Browse the repository at this point in the history
修复多gpu多节点时解除节点名异常的问题
  • Loading branch information
283713406 authored Oct 23, 2024
2 parents bb7b0ea + 8f0e605 commit 954f678
Showing 1 changed file with 34 additions and 4 deletions.
38 changes: 34 additions & 4 deletions services/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva

res := strings.Contains(nodeArray[0], "[")
if res {
getNodeNameCmd := fmt.Sprintf("echo %s | awk -F'[' '{print $1,$2}' | awk -F'-' '{print $1}'", nodeArray[0])
getNodeNameCmd := fmt.Sprintf("echo %s | awk -F'[' '{print $1,$2}'", nodeArray[0])
nodeNameOutput, err := utils.RunCommand(getNodeNameCmd)
if err != nil || utils.CheckSlurmStatus(nodeNameOutput) {
errInfo := &errdetails.ErrorInfo{
Expand All @@ -689,7 +689,29 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva
caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err())
return nil, st.Err()
}
nodeName := strings.Join(strings.Split(nodeNameOutput, " "), "")
nodeNameTmp := strings.Split(nodeNameOutput, " ")
if len(nodeNameTmp) != 2 {
errInfo := &errdetails.ErrorInfo{
Reason: "INVALID_NODE_NAME",
}
st := status.New(codes.Internal, "node name is invalid")
st, _ = st.WithDetails(errInfo)
caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err())
return nil, st.Err()
}
nodeNamePrefix := nodeNameTmp[0]
nodeNameSuffixCmd := fmt.Sprintf("echo %s | awk -F'-' '{print $1}'", nodeNameTmp[1])
nodeNameSuffix, err := utils.RunCommand(nodeNameSuffixCmd)
if err != nil || utils.CheckSlurmStatus(nodeNameOutput) {
errInfo := &errdetails.ErrorInfo{
Reason: "COMMAND_EXEC_FAILED",
}
st := status.New(codes.Internal, "Exec command failed or slurmctld down.")
st, _ = st.WithDetails(errInfo)
caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err())
return nil, st.Err()
}
nodeName := nodeNamePrefix + nodeNameSuffix
gpusCmd := fmt.Sprintf("scontrol show node=%s| grep ' Gres=' | awk -F':' '{print $NF}'", nodeName)
gpusOutput, err := utils.RunCommand(gpusCmd)
if err != nil || utils.CheckSlurmStatus(gpusOutput) {
Expand Down Expand Up @@ -977,7 +999,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo
noAvailableNodes int
)
getPartitionStatusCmd := fmt.Sprintf("sinfo -p %s --noheader", v)
fullCmd := getPartitionStatusCmd + " --format='%P %c %C %G %a %D %F'| tr '\n' ','"
fullCmd := getPartitionStatusCmd + " --format='%P %c %C %G %a %D %F'"
result, err := utils.RunCommand(fullCmd) // 状态
if err != nil || utils.CheckSlurmStatus(result) {
errInfo := &errdetails.ErrorInfo{
Expand All @@ -989,16 +1011,24 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo
return nil, st.Err()
}

partitionElements := strings.Split(result, ",")
partitionElements := strings.Split(result, "\n")
for _, partitionElement := range partitionElements {
// 移除可能存在的前导空格
partitionElement = strings.TrimSpace(partitionElement)
if partitionElement == "" {
continue
}
resultList := strings.Split(partitionElement, " ")
if len(resultList) != 7 {
caller.Logger.Infof("Invalid partitionElement: %s", partitionElement)
continue
}
state = resultList[4]
nodeInfo := strings.Split(resultList[6], "/")
if len(nodeInfo) != 4 {
caller.Logger.Infof("Invalid nodeInfo: %s", resultList[6])
continue
}
runningNodesTmp, _ := strconv.Atoi(nodeInfo[0])
runningNodes = runningNodes + runningNodesTmp
idleNodesTmp, _ := strconv.Atoi(nodeInfo[1])
Expand Down

0 comments on commit 954f678

Please sign in to comment.