Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Support distributed job for frameworkcontroller #612

Merged
Merged
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
d77a99c
fix remote bug
Dec 25, 2018
695d866
Merge pull request #106 from Microsoft/master
SparkSnail Dec 25, 2018
b7e9799
Merge pull request #107 from Microsoft/master
SparkSnail Dec 27, 2018
7cb03f9
add document
Dec 27, 2018
44d1565
add document
Dec 27, 2018
7ab7386
update
Dec 27, 2018
d9e1ea8
update
Dec 27, 2018
2c225a8
update
Dec 27, 2018
be23f55
update
Dec 29, 2018
6f760ab
Merge pull request #108 from Microsoft/master
SparkSnail Jan 2, 2019
9161209
fix remote issue
Jan 3, 2019
e661c55
fix forEach
Jan 3, 2019
4e5d836
Merge pull request #109 from Microsoft/master
SparkSnail Jan 3, 2019
f80e737
fix conflict
Jan 4, 2019
aefc219
Merge branch 'Microsoft-master'
Jan 4, 2019
4fec2cc
update doc according to comments
Jan 7, 2019
dc45661
Merge pull request #111 from Microsoft/master
SparkSnail Jan 7, 2019
11fec6f
update
Jan 7, 2019
a03a191
update
Jan 7, 2019
7c7832c
update
Jan 7, 2019
2c862dc
Merge pull request #112 from Microsoft/master
SparkSnail Jan 8, 2019
85c015d
remove 'any more'
Jan 8, 2019
85cb472
Merge branch 'master' of https://github.com/SparkSnail/nni
Jan 8, 2019
3784355
Merge pull request #113 from Microsoft/master
SparkSnail Jan 9, 2019
296c928
debug
Jan 11, 2019
d91c980
Merge pull request #114 from Microsoft/master
SparkSnail Jan 14, 2019
0b06014
Merge branch 'master' of https://github.com/SparkSnail/nni into dev-f…
Jan 14, 2019
d36e4c1
first version
Jan 15, 2019
a4d3873
update
Jan 15, 2019
63230bb
update
Jan 15, 2019
05b7710
update
Jan 15, 2019
036c259
fix bug
Jan 16, 2019
61cc84f
update code according to comments
Jan 16, 2019
fa4ee49
remove unused import
Jan 16, 2019
dc6c319
remove blank lines
Jan 16, 2019
2329bfa
fix code
Jan 16, 2019
d9a71db
fix code format
Jan 16, 2019
77d0859
update code according to comments
Jan 16, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,13 @@ import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInf
*/
@component.Singleton
class FrameworkControllerTrainingService extends KubernetesTrainingService implements KubernetesTrainingService {
private frameworkcontrollerTrialConfig?: FrameworkControllerTrialConfig;
private frameworkcontrollerJobInfoCollector: FrameworkControllerJobInfoCollector;
private fcTrialConfig?: FrameworkControllerTrialConfig; // frameworkcontroller trial configuration
private fcJobInfoCollector: FrameworkControllerJobInfoCollector; // frameworkcontroller job info collector
private fcContainerPortMap = new Map<string, number>(); // store frameworkcontroller container port

constructor() {
super();
this.frameworkcontrollerJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap);
this.fcJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap);
this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1;
}
Expand All @@ -67,7 +68,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
while (!this.stopping) {
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await delay(3000);
await this.frameworkcontrollerJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
await this.fcJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
}
}

Expand All @@ -90,7 +91,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
const frameworkcontrollerJobName = `nniexp${this.experimentId}trial${trialJobId}`.toLowerCase();

//Generate the port used for taskRole
this.generateContainerPort();
await this.prepareRunScript(trialLocalTempFolder, curTrialSequenceId, trialJobId, trialWorkingFolder, form);

//upload code files
Expand Down Expand Up @@ -157,22 +159,38 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return Promise.resolve(trialJobOutputUrl);
}

/**
 * Generate a trial's command for frameworkcontroller.
 *
 * The returned string prefixes the user's command with one
 * `<taskRoleName>_port=<port>` assignment per configured task role (ports are
 * read from fcContainerPortMap), then sources
 * /mnt/frameworkbarrier/injector.sh before executing the user's command.
 *
 * @param command the user's command for the task role
 * @returns the full command string to place in the task role's run script
 * @throws Error if the trial config is not initialized, or if a task role has
 *         no port recorded in fcContainerPortMap
 */
private generateCommandScript(command: string): string {
    if (!this.fcTrialConfig) {
        throw new Error('frameworkcontroller trial config is not initialized');
    }

    let portScript = '';
    for (const taskRole of this.fcTrialConfig.taskRoles) {
        const containerPort = this.fcContainerPortMap.get(taskRole.name);
        // Fail fast instead of writing "<name>_port=undefined" into the script.
        if (containerPort === undefined) {
            throw new Error('Container port is not initialized');
        }
        portScript += `${taskRole.name}_port=${containerPort} `;
    }

    return `${portScript} . /mnt/frameworkbarrier/injector.sh && ${command}`;
}

private async prepareRunScript(trialLocalTempFolder: string, curTrialSequenceId: number, trialJobId: string, trialWorkingFolder: string, form: JobApplicationForm): Promise<void> {
if(!this.frameworkcontrollerTrialConfig) {
if(!this.fcTrialConfig) {
throw new Error('frameworkcontroller trial config is not initialized');
}

await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.frameworkcontrollerTrialConfig.codeDir} ${trialLocalTempFolder}`);
await cpp.exec(`cp -r ${this.fcTrialConfig.codeDir} ${trialLocalTempFolder}`);
const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
// Create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);

for(let taskRole of this.frameworkcontrollerTrialConfig.taskRoles) {
for(let taskRole of this.fcTrialConfig.taskRoles) {
const runScriptContent: string = this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder,
taskRole.command, curTrialSequenceId.toString(), taskRole.name, taskRole.gpuNum);
this.generateCommandScript(taskRole.command), curTrialSequenceId.toString(), taskRole.name, taskRole.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, { encoding: 'utf8' });
}

Expand All @@ -186,12 +204,12 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple

private async prepareFrameworkControllerConfig(trialJobId: string, trialWorkingFolder: string, frameworkcontrollerJobName: string): Promise<any> {

if(!this.frameworkcontrollerTrialConfig) {
if(!this.fcTrialConfig) {
throw new Error('frameworkcontroller trial config is not initialized');
}

const podResources : any = [];
for(let taskRole of this.frameworkcontrollerTrialConfig.taskRoles) {
for(let taskRole of this.fcTrialConfig.taskRoles) {
let resource: any = {};
resource.requests = this.generatePodResource(taskRole.memoryMB, taskRole.cpuNum, taskRole.gpuNum);
resource.limits = Object.assign({}, resource.requests);
Expand Down Expand Up @@ -234,14 +252,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
case TrialConfigMetadataKey.TRIAL_CONFIG:
let frameworkcontrollerTrialJsonObjsect = JSON.parse(value);

this.frameworkcontrollerTrialConfig = new FrameworkControllerTrialConfig(
this.fcTrialConfig = new FrameworkControllerTrialConfig(
frameworkcontrollerTrialJsonObjsect.codeDir,
frameworkcontrollerTrialJsonObjsect.taskRoles
);

// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(this.frameworkcontrollerTrialConfig.codeDir);
await validateCodeDir(this.fcTrialConfig.codeDir);
} catch(error) {
this.log.error(error);
return Promise.reject(new Error(error));
Expand All @@ -253,6 +271,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple

return Promise.resolve();
}

/**
 * Assign a container port to every task role of the trial config and record
 * it in fcContainerPortMap, keyed by task role name. Ports are handed out
 * sequentially in task-role order, starting at 4000.
 *
 * @throws Error if the frameworkcontroller trial config is not initialized
 */
private generateContainerPort(): void {
    if (!this.fcTrialConfig) {
        throw new Error('frameworkcontroller trial config is not initialized');
    }

    let port = 4000; //The default port used in container
    // for...of visits only the elements; for...in would yield string keys and
    // any extra enumerable properties on the array.
    for (const taskRole of this.fcTrialConfig.taskRoles) {
        this.fcContainerPortMap.set(taskRole.name, port);
        port += 1;
    }
}

/**
* Generate frameworkcontroller resource config file
Expand All @@ -266,24 +296,29 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('frameworkcontroller Cluster config is not initialized');
}

if(!this.frameworkcontrollerTrialConfig) {
if(!this.fcTrialConfig) {
throw new Error('frameworkcontroller trial config is not initialized');
}

let taskRoles = [];
for(let index in this.frameworkcontrollerTrialConfig.taskRoles) {
for(let index in this.fcTrialConfig.taskRoles) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same concern as above for "for...in..."

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'for ... in ...' is used here to get the index of the array, because the index is also needed for other purposes.

let containerPort = this.fcContainerPortMap.get(this.fcTrialConfig.taskRoles[index].name);
if(!containerPort) {
throw new Error('Container port is not initialized');
}
let taskRole = this.generateTaskRoleConfig(
trialWorkingFolder,
this.frameworkcontrollerTrialConfig.taskRoles[index].image,
`run_${this.frameworkcontrollerTrialConfig.taskRoles[index].name}.sh`,
podResources[index]
this.fcTrialConfig.taskRoles[index].image,
`run_${this.fcTrialConfig.taskRoles[index].name}.sh`,
podResources[index],
containerPort
);
taskRoles.push({
name: this.frameworkcontrollerTrialConfig.taskRoles[index].name,
taskNumber: this.frameworkcontrollerTrialConfig.taskRoles[index].taskNum,
name: this.fcTrialConfig.taskRoles[index].name,
taskNumber: this.fcTrialConfig.taskRoles[index].taskNum,
frameworkAttemptCompletionPolicy: {
minFailedTaskCount: this.frameworkcontrollerTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minFailedTaskCount,
minSucceededTaskCount: this.frameworkcontrollerTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minSucceededTaskCount
minFailedTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minFailedTaskCount,
minSucceededTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minSucceededTaskCount
},
task: taskRole
});
Expand All @@ -308,12 +343,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
};
}

private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, podResources: any): any {


private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, podResources: any, containerPort: number): any {
if(!this.kubernetesClusterConfig) {
throw new Error('frameworkcontroller Cluster config is not initialized');
}

if(!this.frameworkcontrollerTrialConfig) {
if(!this.fcTrialConfig) {
throw new Error('frameworkcontroller trial config is not initialized');
}

Expand All @@ -327,6 +364,9 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
shareName: `${this.azureStorageShare}`,
readonly: false
}
}, {
name: 'frameworkbarrier-volume',
emptyDir: {}
}])
}else {
let frameworkcontrollerClusterConfigNFS: KubernetesClusterConfigNFS = <KubernetesClusterConfigNFS> this.kubernetesClusterConfig;
Expand All @@ -337,26 +377,45 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
server: `${frameworkcontrollerClusterConfigNFS.nfs.server}`,
path: `${frameworkcontrollerClusterConfigNFS.nfs.path}`
}
}, {
name: 'frameworkbarrier-volume',
emptyDir: {}
}])
}

let taskRole = {
pod: {
spec: {
containers: [
{
name: 'framework',
image: replicaImage,
args: ["sh", `${path.join(trialWorkingFolder, runScriptFile)}`],
command: ["sh", `${path.join(trialWorkingFolder, runScriptFile)}`],
volumeMounts: [
{
name: 'nni-vol',
mountPath: this.CONTAINER_MOUNT_PATH
},{
name: 'frameworkbarrier-volume',
mountPath: '/mnt/frameworkbarrier'
}],
resources: podResources
resources: podResources,
ports: [{
containerPort: containerPort
}]
}],
initContainers: [
{
name: 'frameworkbarrier',
image: 'frameworkcontroller/frameworkbarrier',
volumeMounts: [
{
name: 'frameworkbarrier-volume',
mountPath: '/mnt/frameworkbarrier'
}]
}],
restartPolicy: 'OnFailure',
volumes: volumeSpecMap.get('nniVolumes')
volumes: volumeSpecMap.get('nniVolumes'),
hostNetwork: false
}
}
}
Expand Down