Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
[Rest Server] Update hived cell number calculation (#4757)
Browse files (browse the repository at this point in the history)
Update hived cell number calculation and refine error messages.
  • Loading branch information
abuccts authored Aug 6, 2020
1 parent 30aa629 commit 41afdc7
Showing 1 changed file with 26 additions and 11 deletions.
37 changes: 26 additions & 11 deletions src/rest-server/src/middlewares/v2/hived.js
Original file line number | Diff line number | Diff line change
Expand Up @@ -175,10 +175,6 @@ const hivedValidate = async (protocolObj, username) => {
let requestCellNumber = 0;
const {cellQuota, cellUnits} = await getCellStatus(virtualCluster);
for (let taskRole of Object.keys(protocolObj.taskRoles)) {
const {gpu = 0, cpu, memoryMB} = protocolObj.taskRoles[taskRole].resourcePerInstance;
const cellNumber = gpu === 0 ? cpu : gpu;
requestCellNumber += protocolObj.taskRoles[taskRole].instances * cellNumber;

const resourcePerCell = {};
for (const t of ['gpu', 'cpu', 'memory']) {
resourcePerCell[t] = Math.min(
Expand All @@ -190,7 +186,7 @@ const hivedValidate = async (protocolObj, username) => {
priority: convertPriority(hivedConfig ? hivedConfig.jobPriorityClass : undefined),
gpuType: null,
pinnedCellId: null,
gpuNumber: cellNumber,
gpuNumber: 0,
affinityGroup: null,
};
if (hivedConfig && hivedConfig.taskRoles && taskRole in hivedConfig.taskRoles) {
Expand All @@ -216,25 +212,44 @@ const hivedValidate = async (protocolObj, username) => {
};
}

if (gpu > resourcePerCell.gpu * cellNumber ||
cpu > resourcePerCell.cpu * cellNumber ||
memoryMB > resourcePerCell.memory * cellNumber) {
const {gpu = 0, cpu, memoryMB} = protocolObj.taskRoles[taskRole].resourcePerInstance;
let requestedResource = '';
let emptyResource = '';
if (resourcePerCell.gpu === 0 && gpu > 0) {
requestedResource = resourcePerCell.gpu;
emptyResource = 'GPU';
} else if (resourcePerCell.cpu === 0 && cpu > 0) {
requestedResource = resourcePerCell.cpu;
emptyResource = 'CPU';
} else if (resourcePerCell.memory === 0 && memoryMB > 0) {
requestedResource = resourcePerCell.memory;
emptyResource = 'memory';
}
if (emptyResource !== '') {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Taskrole ${taskRole} requests ${gpu} GPU, ${cpu} CPU, ${memoryMB}MB memory; ` +
`sku allows ${resourcePerCell.gpu} GPU, ${resourcePerCell.cpu} CPU, ${resourcePerCell.memory}MB memory per cell.`
`Taskrole ${taskRole} requests ${requestedResource} ${emptyResource}, but SKU does not ` +
`configure ${emptyResource}. Please contact admin if the taskrole needs ${emptyResource} resources.`
);
}

const cellNumber = Math.max(
Math.ceil(gpu / resourcePerCell.gpu),
Math.ceil(cpu / resourcePerCell.cpu),
Math.ceil(memoryMB / resourcePerCell.memory),
);
podSpec.gpuNumber = cellNumber;
requestCellNumber += protocolObj.taskRoles[taskRole].instances * cellNumber;

protocolObj.taskRoles[taskRole].hivedPodSpec = podSpec;
}

if (requestCellNumber > cellQuota && gangAllocation && !opportunistic) {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Exceed ${cellQuota} GPU quota in ${virtualCluster} VC.`
`Job requests ${requestCellNumber} SKUs, exceeds maximum ${cellQuota} SKUs in VC ${virtualCluster}.`
);
}

Expand Down

0 comments on commit 41afdc7

Please sign in to comment.