From 6e50d2905a129f3867c1e360fd0d827a9ba56f56 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 19 Aug 2019 21:25:28 +0800 Subject: [PATCH 1/9] init --- .../kubernetes/azureStorageClientUtils.ts | 69 +++++--- .../frameworkcontrollerTrainingService.ts | 30 +++- .../kubeflow/kubeflowTrainingService.ts | 159 ++++++++++-------- 3 files changed, 147 insertions(+), 111 deletions(-) diff --git a/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts b/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts index f687136fbe..25b121b4ea 100644 --- a/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts +++ b/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts @@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility { * @param fileServerClient * @param azureShare */ - export async function createShare(fileServerClient: any, azureShare: any): Promise { - const deferred: Deferred = new Deferred(); + export async function createShare(fileServerClient: any, azureShare: any): Promise { + const deferred: Deferred = new Deferred(); fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => { if (error) { getLogger() .error(`Create share failed:, ${error}`); - deferred.reject(error); + deferred.resolve(false); } else { - deferred.resolve(); + deferred.resolve(true); } }); @@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility { * @param azureFoler * @param azureShare */ - export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise { - const deferred: Deferred = new Deferred(); + export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise { + const deferred: Deferred = new Deferred(); fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => { if (error) { getLogger() .error(`Create directory failed:, ${error}`); - deferred.reject(error); + deferred.resolve(false); } else { - deferred.resolve(); + deferred.resolve(true); } }); - return deferred.promise; } @@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility { * @param azureDirectory */ export async function createDirectoryRecursive(fileServerClient: azureStorage.FileService, azureDirectory: string, - azureShare: any): Promise { - const deferred: Deferred = new Deferred(); + azureShare: any): Promise { + const deferred: Deferred = new Deferred(); const directories: string[] = azureDirectory.split('/'); let rootDirectory: string = ''; for (const directory of directories) { rootDirectory += directory; - await createDirectory(fileServerClient, rootDirectory, azureShare); + let result:boolean = await createDirectory(fileServerClient, rootDirectory, azureShare); + if (!result) { + deferred.resolve(false); + return deferred.promise; + } rootDirectory += '/'; } - deferred.resolve(); + deferred.resolve(true); return deferred.promise; } @@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility { * @param localFilePath */ async function uploadFileToAzure(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any, - localFilePath: string): Promise { - const deferred: Deferred = new Deferred(); + localFilePath: string): Promise { + const deferred: Deferred = new Deferred(); await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath, (error: any, result: any, response: any) => { if (error) { getLogger() .error(`Upload file failed:, ${error}`); - deferred.reject(error); + deferred.resolve(false); } else { - deferred.resolve(); + deferred.resolve(true); } }); @@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility { * @param localFilePath */ async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any, - localFilePath: string): Promise { - const deferred: Deferred = new Deferred(); + localFilePath: string): Promise { + const deferred: Deferred = new Deferred(); // tslint:disable-next-line:non-literal-fs-path await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath), (error: any, result: any, response: any) => { if (error) { getLogger() .error(`Download file failed:, ${error}`); - deferred.reject(error); + deferred.resolve(false); } else { - deferred.resolve(); + deferred.resolve(true); } }); @@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility { */ // tslint:disable:non-literal-fs-path export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any, - localDirectory: string): Promise { - const deferred: Deferred = new Deferred(); + localDirectory: string): Promise { + const deferred: Deferred = new Deferred(); const fileNameArray: string[] = fs.readdirSync(localDirectory); - await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare); + let result: boolean = await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare); + if (!result) { + deferred.resolve(false); + return deferred.promise; + } for (const fileName of fileNameArray) { const fullFilePath: string = path.join(localDirectory, fileName); try { + let resultUploadFile: boolean = true; + let resultUploadDir: boolean = true; if (fs.lstatSync(fullFilePath) .isFile()) { - await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath); + resultUploadFile = await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath); } else { // If filePath is a directory, recuisively copy it to azure - await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath); + resultUploadDir = await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath); + } + if (!(resultUploadFile && resultUploadDir)) { + deferred.resolve(false); + return deferred.promise; } } catch (error) { - deferred.reject(error); + deferred.resolve(false); return deferred.promise; } } // All files/directories are copied successfully, resolve - deferred.resolve(); + deferred.resolve(true); return deferred.promise; } diff --git a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts index d38c656aaa..d65e34a008 100644 --- a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts @@ -211,16 +211,28 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple if (this.azureStorageClient === undefined) { throw new Error('azureStorageClient is not initialized'); } + let retryCount: number = 1; try { - //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage - await AzureStorageClientUtility.uploadDirectory( - this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`); - //upload code files to azure storage - await AzureStorageClientUtility.uploadDirectory( - this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${this.fcTrialConfig.codeDir}`); - - trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/` + - `${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; + while (retryCount >= 0) { + //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage + let resultUploadDir1: boolean = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, + `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, + `${trialLocalTempFolder}`); + //upload code files to azure storage + let resultUploadDir2: boolean = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, + `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, + `${this.fcTrialConfig.codeDir}`); + if (resultUploadDir1 && resultUploadDir2) { + trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` + + `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; + break; + } else { + //wait for 5 seconds to re-upload files + await delay(5000); + this.log.info('Re-upload files to azure-storage'); + retryCount -= 1; + } + } } catch (error) { this.log.error(error); diff --git a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts index b65a548fb6..1ba9675748 100644 --- a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts @@ -101,22 +101,36 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber //prepare the runscript await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form); //upload files to sotrage - const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); - const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( - trialJobId, - 'WAITING', - Date.now(), - trialWorkingFolder, - form, - kubeflowJobName, - curTrialSequenceId, - trialJobOutputUrl - ); - - // Generate kubeflow job resource config object - const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName); - // Create kubeflow job based on generated kubeflow job resource config - await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig); + const trialJobOutputUrl: string | undefined = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); + let trialJobDetail: KubernetesTrialJobDetail; + if (trialJobOutputUrl) { + trialJobDetail = new KubernetesTrialJobDetail( + trialJobId, + 'WAITING', + Date.now(), + trialWorkingFolder, + form, + kubeflowJobName, + curTrialSequenceId, + trialJobOutputUrl + ); + // Generate kubeflow job resource config object + const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName); + + // Create kubeflow job based on generated kubeflow job resource config + await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig); + } else { + trialJobDetail = new KubernetesTrialJobDetail( + trialJobId, + 'FAILED', + Date.now(), + trialWorkingFolder, + form, + kubeflowJobName, + curTrialSequenceId, + 'submit failed' + ); + } // Set trial job detail until create Kubeflow job successfully this.trialJobsMap.set(trialJobId, trialJobDetail); @@ -196,7 +210,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber * @param trialLocalTempFolder * return: trialJobOutputUrl */ - private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise { + private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise { if (this.kubeflowClusterConfig === undefined) { throw new Error('Kubeflow Cluster config is not initialized'); } @@ -205,7 +219,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber throw new Error('Kubeflow Trial config is not initialized'); } - let trialJobOutputUrl: string = ''; + let trialJobOutputUrl: string | undefined = undefined; assert(this.kubeflowClusterConfig.storage === undefined || this.kubeflowClusterConfig.storage === 'azureStorage' @@ -215,18 +229,28 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber if (this.azureStorageClient === undefined) { throw new Error('azureStorageClient is not initialized'); } + let retryCount: number = 1; try { - //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage - await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${trialLocalTempFolder}`); - //upload code files to azure storage - await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${this.kubeflowTrialConfig.codeDir}`); - - trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` + - `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; + while (retryCount >= 0) { + //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage + let resultUploadDir1: boolean = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, + `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, + `${trialLocalTempFolder}`); + //upload code files to azure storage + let resultUploadDir2: boolean = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, + `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, + `${this.kubeflowTrialConfig.codeDir}`); + if (resultUploadDir1 && resultUploadDir2) { + trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` + + `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; + break; + } else { + //wait for 5 seconds to re-upload files + await delay(5000); + this.log.info('Re-upload files to azure-storage'); + retryCount -= 1; + } + } } catch (error) { this.log.error(error); @@ -347,7 +371,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber } // Generate kubeflow job resource config object - const kubeflowJobConfig: any = await this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, + const kubeflowJobConfig: any = this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, nonWorkerResources); return Promise.resolve(kubeflowJobConfig); @@ -361,8 +385,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber * @param workerPodResources worker pod template * @param nonWorkerPodResources non-worker pod template, like ps or master */ - private async generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any, - nonWorkerPodResources?: any) : Promise { + private generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any, + nonWorkerPodResources?: any) : any { if (this.kubeflowClusterConfig === undefined) { throw new Error('Kubeflow Cluster config is not initialized'); } @@ -377,32 +401,29 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber const replicaSpecsObj: any = {}; const replicaSpecsObjMap: Map = new Map(); + if (this.kubeflowTrialConfig.operatorType === 'tf-operator') { const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = this.kubeflowTrialConfig; - let privateRegistrySecretName = await this.createRegistrySecret(tensorflowTrialConfig.worker.privateRegistryAuthPath); replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas, - tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName); + tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources); if (tensorflowTrialConfig.ps !== undefined) { - let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(tensorflowTrialConfig.ps.privateRegistryAuthPath); replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas, - tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources, privateRegistrySecretName); + tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources); } replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj}); } else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') { const pytorchTrialConfig: KubeflowTrialConfigPytorch = this.kubeflowTrialConfig; if (pytorchTrialConfig.worker !== undefined) { - let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.worker.privateRegistryAuthPath); replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas, - pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName); + pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources); } - let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.master.privateRegistryAuthPath); replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas, - pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources, privateRegistrySecretName); + pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj}); } - return Promise.resolve({ + return { apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`, kind: this.kubernetesCRDClient.jobKind, metadata: { @@ -415,7 +436,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber } }, spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind) - }); + }; } /** @@ -427,7 +448,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber * @param podResources pod resource config section */ private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string, - podResources: any, privateRegistrySecretName: string | undefined): any { + podResources: any): any { if (this.kubeflowClusterConfig === undefined) { throw new Error('Kubeflow Cluster config is not initialized'); } @@ -439,7 +460,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber if (this.kubernetesCRDClient === undefined) { throw new Error('Kubeflow operator client is not initialized'); } - // The config spec for volume field + const volumeSpecMap: Map = new Map(); if (this.kubeflowClusterConfig.storageType === 'azureStorage') { volumeSpecMap.set('nniVolumes', [ @@ -462,34 +483,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber } }]); } - // The config spec for container field - const containersSpecMap: Map = new Map(); - containersSpecMap.set('containers', [ - { - // Kubeflow tensorflow operator requires that containers' name must be tensorflow - // TODO: change the name based on operator's type - name: this.kubernetesCRDClient.containerName, - image: replicaImage, - args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`], - volumeMounts: [ - { - name: 'nni-vol', - mountPath: this.CONTAINER_MOUNT_PATH - }], - resources: podResources - } - ]); - let spec: any = { - containers: containersSpecMap.get('containers'), - restartPolicy: 'ExitCode', - volumes: volumeSpecMap.get('nniVolumes') - } - if (privateRegistrySecretName) { - spec.imagePullSecrets = [ - { - name: privateRegistrySecretName - }] - } + return { replicas: replicaNumber, template: { @@ -497,9 +491,26 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber // tslint:disable-next-line:no-null-keyword creationTimestamp: null }, - spec: spec + spec: { + containers: [ + { + // Kubeflow tensorflow operator requires that containers' name must be tensorflow + // TODO: change the name based on operator's type + name: this.kubernetesCRDClient.containerName, + image: replicaImage, + args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`], + volumeMounts: [ + { + name: 'nni-vol', + mountPath: this.CONTAINER_MOUNT_PATH + }], + resources: podResources + }], + restartPolicy: 'ExitCode', + volumes: volumeSpecMap.get('nniVolumes') + } } - } + }; } } // tslint:enable: no-unsafe-any no-any From fc6ab54433e7823f546010b19bf0e823840020a6 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 19 Aug 2019 21:43:01 +0800 Subject: [PATCH 2/9] fix error --- .../kubeflow/kubeflowTrainingService.ts | 129 +++++++++--------- 1 file changed, 64 insertions(+), 65 deletions(-) diff --git a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts index 1ba9675748..ae19ff0af7 100644 --- a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts @@ -101,36 +101,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber //prepare the runscript await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form); //upload files to sotrage - const trialJobOutputUrl: string | undefined = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); - let trialJobDetail: KubernetesTrialJobDetail; - if (trialJobOutputUrl) { - trialJobDetail = new KubernetesTrialJobDetail( - trialJobId, - 'WAITING', - Date.now(), - trialWorkingFolder, - form, - kubeflowJobName, - curTrialSequenceId, - trialJobOutputUrl - ); - // Generate kubeflow job resource config object - const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName); - - // Create kubeflow job based on generated kubeflow job resource config - await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig); - } else { - trialJobDetail = new KubernetesTrialJobDetail( - trialJobId, - 'FAILED', - Date.now(), - trialWorkingFolder, - form, - kubeflowJobName, - curTrialSequenceId, - 'submit failed' - ); - } + const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); + const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( + trialJobId, + 'WAITING', + Date.now(), + trialWorkingFolder, + form, + kubeflowJobName, + curTrialSequenceId, + trialJobOutputUrl + ); + + // Generate kubeflow job resource config object + const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName); + // Create kubeflow job based on generated kubeflow job resource config + await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig); // Set trial job detail until create Kubeflow job successfully this.trialJobsMap.set(trialJobId, trialJobDetail); @@ -210,7 +196,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber * @param trialLocalTempFolder * return: trialJobOutputUrl */ - private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise { + private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise { if (this.kubeflowClusterConfig === undefined) { throw new Error('Kubeflow Cluster config is not initialized'); } @@ -219,7 +205,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber throw new Error('Kubeflow Trial config is not initialized'); } - let trialJobOutputUrl: string | undefined = undefined; + let trialJobOutputUrl: string = ''; assert(this.kubeflowClusterConfig.storage === undefined || this.kubeflowClusterConfig.storage === 'azureStorage' @@ -250,7 +236,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber this.log.info('Re-upload files to azure-storage'); retryCount -= 1; } - } + } } catch (error) { this.log.error(error); @@ -371,7 +357,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber } // Generate kubeflow job resource config object - const kubeflowJobConfig: any = this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, + const kubeflowJobConfig: any = await this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, nonWorkerResources); return Promise.resolve(kubeflowJobConfig); @@ -385,8 +371,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber * @param workerPodResources worker pod template * @param nonWorkerPodResources non-worker pod template, like ps or master */ - private generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any, - nonWorkerPodResources?: any) : any { + private async generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any, + nonWorkerPodResources?: any) : Promise { if (this.kubeflowClusterConfig === undefined) { throw new Error('Kubeflow Cluster config is not initialized'); } @@ -401,29 +387,32 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber const replicaSpecsObj: any = {}; const replicaSpecsObjMap: Map = new Map(); - if (this.kubeflowTrialConfig.operatorType === 'tf-operator') { const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = this.kubeflowTrialConfig; + let privateRegistrySecretName = await this.createRegistrySecret(tensorflowTrialConfig.worker.privateRegistryAuthPath); replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas, - tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources); + tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName); if (tensorflowTrialConfig.ps !== undefined) { + let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(tensorflowTrialConfig.ps.privateRegistryAuthPath); replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas, - tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources); + tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources, privateRegistrySecretName); } replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj}); } else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') { const pytorchTrialConfig: KubeflowTrialConfigPytorch = this.kubeflowTrialConfig; if (pytorchTrialConfig.worker !== undefined) { + let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.worker.privateRegistryAuthPath); replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas, - pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources); + pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName); } + let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.master.privateRegistryAuthPath); replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas, - pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources); + pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources, privateRegistrySecretName); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj}); } - return { + return Promise.resolve({ apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`, kind: this.kubernetesCRDClient.jobKind, metadata: { @@ -436,7 +425,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber } }, spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind) - }; + }); } /** @@ -448,7 +437,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber * @param podResources pod resource config section */ private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string, - podResources: any): any { + podResources: any, privateRegistrySecretName: string | undefined): any { if (this.kubeflowClusterConfig === undefined) { throw new Error('Kubeflow Cluster config is not initialized'); } @@ -460,7 +449,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber if (this.kubernetesCRDClient === undefined) { throw new Error('Kubeflow operator client is not initialized'); } - + // The config spec for volume field const volumeSpecMap: Map = new Map(); if (this.kubeflowClusterConfig.storageType === 'azureStorage') { volumeSpecMap.set('nniVolumes', [ @@ -483,7 +472,34 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber } }]); } - + // The config spec for container field + const containersSpecMap: Map = new Map(); + containersSpecMap.set('containers', [ + { + // Kubeflow tensorflow operator requires that containers' name must be tensorflow + // TODO: change the name based on operator's type + name: this.kubernetesCRDClient.containerName, + image: replicaImage, + args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`], + volumeMounts: [ + { + name: 'nni-vol', + mountPath: this.CONTAINER_MOUNT_PATH + }], + resources: podResources + } + ]); + let spec: any = { + containers: containersSpecMap.get('containers'), + restartPolicy: 'ExitCode', + volumes: volumeSpecMap.get('nniVolumes') + } + if (privateRegistrySecretName) { + spec.imagePullSecrets = [ + { + name: privateRegistrySecretName + }] + } return { replicas: replicaNumber, template: { @@ -491,26 +507,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber // tslint:disable-next-line:no-null-keyword creationTimestamp: null }, - spec: { - containers: [ - { - // Kubeflow tensorflow operator requires that containers' name must be tensorflow - // TODO: change the name based on operator's type - name: this.kubernetesCRDClient.containerName, - image: replicaImage, - args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`], - volumeMounts: [ - { - name: 'nni-vol', - mountPath: this.CONTAINER_MOUNT_PATH - }], - resources: podResources - }], - restartPolicy: 'ExitCode', - volumes: volumeSpecMap.get('nniVolumes') - } + spec: spec } - }; + } } } // tslint:enable: no-unsafe-any no-any From 5bacd96e17f6cfd454a0ee90969800a992f37129 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 22 Aug 2019 22:22:56 +0800 Subject: [PATCH 3/9] fix comments --- .../kubernetes/azureStorageClientUtils.ts | 4 +- .../frameworkcontrollerTrainingService.ts | 40 ++++-------------- .../kubeflow/kubeflowTrainingService.ts | 36 ++++------------ .../kubernetes/kubernetesTrainingService.ts | 42 ++++++++++++++++++- 4 files changed, 57 insertions(+), 65 deletions(-) diff --git a/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts b/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts index 25b121b4ea..00eeb3e557 100644 --- a/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts +++ b/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts @@ -62,7 +62,7 @@ export namespace AzureStorageClientUtility { if (error) { getLogger() .error(`Create directory failed:, ${error}`); - deferred.resolve(false); + deferred.resolve(false); } else { deferred.resolve(true); } @@ -110,7 +110,7 @@ export namespace AzureStorageClientUtility { if (error) { getLogger() .error(`Upload file failed:, ${error}`); - deferred.resolve(false); + deferred.resolve(false); } else { deferred.resolve(true); } diff --git a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts index d65e34a008..9bafc43c07 100644 --- a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts @@ -25,7 +25,7 @@ import * as path from 'path'; import * as component from '../../../common/component'; import { getExperimentId } from '../../../common/experimentStartupInfo'; import { - JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail + JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; @@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple //upload code files const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); - + let initStatus: TrialJobStatus = 'WAITING'; + if (!trialJobOutputUrl) { + initStatus = 'FAILED'; + } const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( trialJobId, - 'WAITING', + initStatus, Date.now(), trialWorkingFolder, form, @@ -208,36 +211,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple let trialJobOutputUrl: string = ''; if (this.fcClusterConfig.storageType === 'azureStorage') { - if (this.azureStorageClient === undefined) { - throw new Error('azureStorageClient is not initialized'); - } - let retryCount: number = 1; - try { - while (retryCount >= 0) { - //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage - let resultUploadDir1: boolean = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${trialLocalTempFolder}`); - //upload code files to azure storage - let resultUploadDir2: boolean = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${this.fcTrialConfig.codeDir}`); - if (resultUploadDir1 && resultUploadDir2) { - trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` + - `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; - break; - } else { - //wait for 5 seconds to re-upload files - await delay(5000); - this.log.info('Re-upload files to azure-storage'); - retryCount -= 1; - } - } - } catch (error) { - this.log.error(error); - - return Promise.reject(error); - } + trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.fcTrialConfig.codeDir); } else if (this.fcClusterConfig.storageType === 'nfs') { const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = this.fcClusterConfig; diff --git a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts index ae19ff0af7..aa60c7b9e2 100644 --- a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts @@ -27,7 +27,7 @@ import * as component from '../../../common/component'; import { getExperimentId } from '../../../common/experimentStartupInfo'; import { - JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail + JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; @@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form); //upload files to sotrage const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); + let initStatus: TrialJobStatus = 'WAITING'; + if (!trialJobOutputUrl) { + initStatus = 'FAILED'; + } const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( trialJobId, - 'WAITING', + initStatus, Date.now(), trialWorkingFolder, form, @@ -215,33 +219,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber if (this.azureStorageClient === undefined) { throw new Error('azureStorageClient is not initialized'); } - let retryCount: number = 1; - try { - while (retryCount >= 0) { - //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage - let resultUploadDir1: boolean = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${trialLocalTempFolder}`); - //upload code files to azure storage - let resultUploadDir2: boolean = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${this.kubeflowTrialConfig.codeDir}`); - if (resultUploadDir1 && resultUploadDir2) { - trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` + - `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; - break; - } else { - //wait for 5 seconds to re-upload files - await delay(5000); - this.log.info('Re-upload files to azure-storage'); - retryCount -= 1; - } - } - } catch (error) { - this.log.error(error); - - return Promise.reject(error); - } + trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.kubeflowTrialConfig.codeDir); } else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) { const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = this.kubeflowClusterConfig; // Creat work dir for current trial in NFS directory diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 0327cd553f..65a5b757ca 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -31,7 +31,7 @@ import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; -import { getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; +import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; import { AzureStorageClientUtility } from './azureStorageClientUtils'; import { GeneralK8sClient, KubernetesCRDClient } from './kubernetesApiClient'; import { KubernetesClusterConfig } from './kubernetesConfig'; @@ -357,6 +357,46 @@ abstract class KubernetesTrainingService { ); return registrySecretName; } + + protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: String, codeDir: String): Promise { + if (this.azureStorageClient === undefined) { + throw new Error('azureStorageClient is not initialized'); + } + let trialJobOutputUrl: string = ''; + let retryCount: number = 1; + let resultUploadNNIScript: boolean = false; + let resultUploadCodeFile: boolean = false; + try { + do { + //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage + if(!resultUploadNNIScript) { + resultUploadNNIScript = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, + `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, + `${trialLocalTempFolder}`); + } + //upload code files to azure storage + if(!resultUploadCodeFile) { + resultUploadCodeFile = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, + `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, + `${codeDir}`); + } + if (resultUploadNNIScript && resultUploadCodeFile) { + trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` + + `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; + break; + } else { + //wait for 5 seconds to re-upload files + await delay(5000); + this.log.info('Upload failed, Retry: upload files to azure-storage'); + } + } while (retryCount-- >= 0) + } catch (error) { + this.log.error(error); + + return Promise.reject(error); + } + return Promise.resolve(trialJobOutputUrl); + } } export { KubernetesTrainingService }; From 7ae02193790ea7864df9ae7c59083d1af1c15f89 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 22 Aug 2019 23:04:12 +0800 Subject: [PATCH 4/9] add config file for expriments --- src/nni_manager/config/config.yaml | 1 + .../training_service/kubernetes/kubernetesTrainingService.ts | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 src/nni_manager/config/config.yaml diff --git a/src/nni_manager/config/config.yaml b/src/nni_manager/config/config.yaml new file mode 100644 index 0000000000..c356cf3338 --- /dev/null +++ b/src/nni_manager/config/config.yaml @@ -0,0 +1 @@ +azureStorageUploadRetryCount: 1 #If upload files to azure storage failed, NNI will retry the process of upload, this field will specify the number of attempts to re-upload files. diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 65a5b757ca..3e7123a308 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -38,6 +38,7 @@ import { KubernetesClusterConfig } from './kubernetesConfig'; import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData'; import { KubernetesJobRestServer } from './kubernetesJobRestServer'; +var yaml = require('js-yaml'); var fs = require('fs'); /** @@ -364,6 +365,10 @@ abstract class KubernetesTrainingService { } let trialJobOutputUrl: string = ''; let retryCount: number = 1; + let config = yaml.safeLoad(fs.readFileSync('./config/config.yaml', 'utf8')); + if(config && config.azureStorageUploadRetryCount instanceof Number) { + retryCount = config.azureStorageUploadRetryCount; + } let resultUploadNNIScript: boolean = false; let resultUploadCodeFile: boolean = false; try { From 17be3ba226de36e8d14f98d78b3886672315968b Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 22 Aug 2019 23:05:53 +0800 Subject: [PATCH 5/9] fix english words --- src/nni_manager/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/config/config.yaml b/src/nni_manager/config/config.yaml index c356cf3338..715b4ee1ed 100644 --- a/src/nni_manager/config/config.yaml +++ b/src/nni_manager/config/config.yaml @@ -1 +1 @@ -azureStorageUploadRetryCount: 1 #If upload files to azure storage failed, NNI will retry the process of upload, this field will specify the number of attempts to re-upload files. +azureStorageUploadRetryCount: 1 #If upload files to azure storage failed, NNI will retry the process of uploading, this field will specify the number of attempts to re-upload files. From 36b3cc3a0f1281efa3f70670954d5b1e5837c23b Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 23 Aug 2019 10:23:16 +0800 Subject: [PATCH 6/9] fix exception --- .../training_service/kubernetes/kubernetesTrainingService.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 3e7123a308..3f0265508d 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -397,8 +397,8 @@ abstract class KubernetesTrainingService { } while (retryCount-- >= 0) } catch (error) { this.log.error(error); - - return Promise.reject(error); + //return a empty url when got error + return Promise.resolve(""); } return Promise.resolve(trialJobOutputUrl); } From 26d2b9e31dde2d019efe7001719613f13f5234c2 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 27 Aug 2019 14:03:41 +0800 Subject: [PATCH 7/9] add log for fail retry --- .../training_service/kubernetes/kubernetesTrainingService.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 3f0265508d..9697175e8b 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -400,6 +400,9 @@ abstract class KubernetesTrainingService { //return a empty url when got error return Promise.resolve(""); } + if(!trialJobOutputUrl) { + this.log.info(`Retry-count is used up, upload files to azureStorage for trial ${trialJobId} failed!`); + } return Promise.resolve(trialJobOutputUrl); } From cfeac611fdafbf26d1747a7e5155dcf77f5c6bca Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 29 Aug 2019 10:56:01 +0800 Subject: [PATCH 8/9] fix comments --- docs/en_US/Tutorial/ExperimentConfig.md | 4 + src/nni_manager/config/config.yaml | 1 - .../rest_server/restValidationSchemas.ts | 370 +++++++++--------- .../frameworkcontrollerTrainingService.ts | 5 +- .../kubeflow/kubeflowTrainingService.ts | 3 +- .../kubernetes/kubernetesConfig.ts | 8 +- .../kubernetes/kubernetesTrainingService.ts | 7 +- tools/nni_cmd/config_schema.py | 6 +- 8 files changed, 209 insertions(+), 195 deletions(-) delete mode 100644 src/nni_manager/config/config.yaml diff --git a/docs/en_US/Tutorial/ExperimentConfig.md b/docs/en_US/Tutorial/ExperimentConfig.md index 97effd2b83..0f8970bd3d 100644 --- a/docs/en_US/Tutorial/ExperimentConfig.md +++ b/docs/en_US/Tutorial/ExperimentConfig.md @@ -519,6 +519,10 @@ machineList: __azureShare__ is the share of the azure file storage. + * __uploadRetryCount__ + + If upload files to azure storage failed, NNI will retry the process of uploading, this field will specify the number of attempts to re-upload files. + * __paiConfig__ * __userName__ diff --git a/src/nni_manager/config/config.yaml b/src/nni_manager/config/config.yaml deleted file mode 100644 index 715b4ee1ed..0000000000 --- a/src/nni_manager/config/config.yaml +++ /dev/null @@ -1 +0,0 @@ -azureStorageUploadRetryCount: 1 #If upload files to azure storage failed, NNI will retry the process of uploading, this field will specify the number of attempts to re-upload files. diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 19f88f11af..59795243f7 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -22,188 +22,190 @@ const joi = require('joi'); export namespace ValidationSchemas { - export const SETCLUSTERMETADATA = { - body: { - machine_list: joi.array().items(joi.object({ - username: joi.string().required(), - ip: joi.string().ip().required(), - port: joi.number().min(1).max(65535).required(), - passwd: joi.string(), - sshKeyPath: joi.string(), - passphrase: joi.string(), - gpuIndices: joi.string(), - maxTrialNumPerGpu: joi.number(), - useActiveGpu: joi.boolean() - })), - local_config: joi.object({ - gpuIndices: joi.string(), - maxTrialNumPerGpu: joi.number(), - useActiveGpu: joi.boolean() - }), - trial_config: joi.object({ - image: joi.string().min(1), - codeDir: joi.string().min(1).required(), - dataDir: joi.string(), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - gpuNum: joi.number().min(0), - command: joi.string().min(1), - virtualCluster: joi.string(), - shmMB: joi.number(), - authFile: joi.string(), - nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), - worker: joi.object({ - replicas: joi.number().min(1).required(), - image: joi.string().min(1), - privateRegistryAuthPath: joi.string().min(1), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - gpuNum: joi.number().min(0).required(), - command: joi.string().min(1).required() - }), - ps: joi.object({ - replicas: joi.number().min(1).required(), - image: joi.string().min(1), - privateRegistryAuthPath: joi.string().min(1), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - gpuNum: joi.number().min(0).required(), - command: joi.string().min(1).required() - }), - master: joi.object({ - replicas: joi.number().min(1).required(), - image: joi.string().min(1), - privateRegistryAuthPath: joi.string().min(1), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - gpuNum: joi.number().min(0).required(), - command: joi.string().min(1).required() - }), - taskRoles: joi.array({ - name: joi.string().min(1), - taskNum: joi.number().min(1).required(), - image: joi.string().min(1), - privateRegistryAuthPath: joi.string().min(1), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - shmMB: joi.number(), - gpuNum: joi.number().min(0).required(), - command: joi.string().min(1).required(), - frameworkAttemptCompletionPolicy: joi.object({ - minFailedTaskCount: joi.number(), - minSucceededTaskCount: joi.number() - }) - }) - }), - pai_config: joi.object({ - userName: joi.string().min(1).required(), - passWord: joi.string().min(1).required(), - host: joi.string().min(1).required() - }), - kubeflow_config: joi.object({ - operator: joi.string().min(1).required(), - storage: joi.string().min(1), - apiVersion: joi.string().min(1), - nfs: joi.object({ - server: joi.string().min(1).required(), - path: joi.string().min(1).required() - }), - keyVault: joi.object({ - vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/), - name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/) - }), - azureStorage: joi.object({ - accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), - azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) - }) - }), - frameworkcontroller_config: joi.object({ - storage: joi.string().min(1), - serviceAccountName: joi.string().min(1), - nfs: joi.object({ - server: joi.string().min(1).required(), - path: joi.string().min(1).required() - }), - keyVault: joi.object({ - vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/), - name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/) - }), - azureStorage: joi.object({ - accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), - azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) - }) - }), - nni_manager_ip: joi.object({ - nniManagerIp: joi.string().min(1) - }) - } - }; - export const STARTEXPERIMENT = { - body: { - experimentName: joi.string().required(), - description: joi.string(), - authorName: joi.string(), - maxTrialNum: joi.number().min(0).required(), - trialConcurrency: joi.number().min(0).required(), - trainingServicePlatform: joi.string(), - searchSpace: joi.string().required(), - maxExecDuration: joi.number().min(0).required(), - multiPhase: joi.boolean(), - multiThread: joi.boolean(), - versionCheck: joi.boolean(), - logCollection: joi.string(), - advisor: joi.object({ - builtinAdvisorName: joi.string().valid('Hyperband', 'BOHB'), - codeDir: joi.string(), - classFileName: joi.string(), - className: joi.string(), - classArgs: joi.any(), - gpuNum: joi.number().min(0), - checkpointDir: joi.string().allow('') - }), - tuner: joi.object({ - builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution', 'SMAC', 'BatchTuner', 'GridSearch', 'NetworkMorphism', 'MetisTuner', 'GPTuner'), - codeDir: joi.string(), - classFileName: joi.string(), - className: joi.string(), - classArgs: joi.any(), - gpuNum: joi.number().min(0), - checkpointDir: joi.string().allow(''), - includeIntermediateResults: joi.boolean() - }), - assessor: joi.object({ - builtinAssessorName: joi.string().valid('Medianstop', 'Curvefitting'), - codeDir: joi.string(), - classFileName: joi.string(), - className: joi.string(), - classArgs: joi.any(), - gpuNum: joi.number().min(0), - checkpointDir: joi.string().allow('') - }), - clusterMetaData: joi.array().items(joi.object({ - key: joi.string(), - value: joi.any() - })) - } - }; - export const UPDATEEXPERIMENT = { - query: { - update_type: joi.string().required().valid('TRIAL_CONCURRENCY', 'MAX_EXEC_DURATION', 'SEARCH_SPACE', 'MAX_TRIAL_NUM') - }, - body: { - id: joi.string().required(), - revision: joi.number().min(0).required(), - params: joi.object(STARTEXPERIMENT.body), - execDuration: joi.number().required(), - startTime: joi.number(), - endTime: joi.number(), - logDir: joi.string(), - maxSequenceId: joi.number() - } - }; + export const SETCLUSTERMETADATA = { + body: { + machine_list: joi.array().items(joi.object({ + username: joi.string().required(), + ip: joi.string().ip().required(), + port: joi.number().min(1).max(65535).required(), + passwd: joi.string(), + sshKeyPath: joi.string(), + passphrase: joi.string(), + gpuIndices: joi.string(), + maxTrialNumPerGpu: joi.number(), + useActiveGpu: joi.boolean() + })), + local_config: joi.object({ + gpuIndices: joi.string(), + maxTrialNumPerGpu: joi.number(), + useActiveGpu: joi.boolean() + }), + trial_config: joi.object({ + image: joi.string().min(1), + codeDir: joi.string().min(1).required(), + dataDir: joi.string(), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0), + command: joi.string().min(1), + virtualCluster: joi.string(), + shmMB: joi.number(), + authFile: joi.string(), + nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), + worker: joi.object({ + replicas: joi.number().min(1).required(), + image: joi.string().min(1), + privateRegistryAuthPath: joi.string().min(1), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required() + }), + ps: joi.object({ + replicas: joi.number().min(1).required(), + image: joi.string().min(1), + privateRegistryAuthPath: joi.string().min(1), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required() + }), + master: joi.object({ + replicas: joi.number().min(1).required(), + image: joi.string().min(1), + privateRegistryAuthPath: joi.string().min(1), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required() + }), + taskRoles: joi.array({ + name: joi.string().min(1), + taskNum: joi.number().min(1).required(), + image: joi.string().min(1), + privateRegistryAuthPath: joi.string().min(1), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + shmMB: joi.number(), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required(), + frameworkAttemptCompletionPolicy: joi.object({ + minFailedTaskCount: joi.number(), + minSucceededTaskCount: joi.number() + }) + }) + }), + pai_config: joi.object({ + userName: joi.string().min(1).required(), + passWord: joi.string().min(1).required(), + host: joi.string().min(1).required() + }), + kubeflow_config: joi.object({ + operator: joi.string().min(1).required(), + storage: joi.string().min(1), + apiVersion: joi.string().min(1), + nfs: joi.object({ + server: joi.string().min(1).required(), + path: joi.string().min(1).required() + }), + keyVault: joi.object({ + vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/), + name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/) + }), + azureStorage: joi.object({ + accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), + azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) + }), + uploadRetryCount: joi.number().min(1) + }), + frameworkcontroller_config: joi.object({ + storage: joi.string().min(1), + serviceAccountName: joi.string().min(1), + nfs: joi.object({ + server: joi.string().min(1).required(), + path: joi.string().min(1).required() + }), + keyVault: joi.object({ + vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/), + name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/) + }), + azureStorage: joi.object({ + accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), + azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) + }), + uploadRetryCount: joi.number().min(1) + }), + nni_manager_ip: joi.object({ + nniManagerIp: joi.string().min(1) + }) + } + }; + export const STARTEXPERIMENT = { + body: { + experimentName: joi.string().required(), + description: joi.string(), + authorName: joi.string(), + maxTrialNum: joi.number().min(0).required(), + trialConcurrency: joi.number().min(0).required(), + trainingServicePlatform: joi.string(), + searchSpace: joi.string().required(), + maxExecDuration: joi.number().min(0).required(), + multiPhase: joi.boolean(), + multiThread: joi.boolean(), + versionCheck: joi.boolean(), + logCollection: joi.string(), + advisor: joi.object({ + builtinAdvisorName: joi.string().valid('Hyperband', 'BOHB'), + codeDir: joi.string(), + classFileName: joi.string(), + className: joi.string(), + classArgs: joi.any(), + gpuNum: joi.number().min(0), + checkpointDir: joi.string().allow('') + }), + tuner: joi.object({ + builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution', 'SMAC', 'BatchTuner', 'GridSearch', 'NetworkMorphism', 'MetisTuner', 'GPTuner'), + codeDir: joi.string(), + classFileName: joi.string(), + className: joi.string(), + classArgs: joi.any(), + gpuNum: joi.number().min(0), + checkpointDir: joi.string().allow(''), + includeIntermediateResults: joi.boolean() + }), + assessor: joi.object({ + builtinAssessorName: joi.string().valid('Medianstop', 'Curvefitting'), + codeDir: joi.string(), + classFileName: joi.string(), + className: joi.string(), + classArgs: joi.any(), + gpuNum: joi.number().min(0), + checkpointDir: joi.string().allow('') + }), + clusterMetaData: joi.array().items(joi.object({ + key: joi.string(), + value: joi.any() + })) + } + }; + export const UPDATEEXPERIMENT = { + query: { + update_type: joi.string().required().valid('TRIAL_CONCURRENCY', 'MAX_EXEC_DURATION', 'SEARCH_SPACE', 'MAX_TRIAL_NUM') + }, + body: { + id: joi.string().required(), + revision: joi.number().min(0).required(), + params: joi.object(STARTEXPERIMENT.body), + execDuration: joi.number().required(), + startTime: joi.number(), + endTime: joi.number(), + logDir: joi.string(), + maxSequenceId: joi.number() + } + }; } diff --git a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts index 9bafc43c07..51d56d5b7c 100644 --- a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts @@ -211,7 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple let trialJobOutputUrl: string = ''; if (this.fcClusterConfig.storageType === 'azureStorage') { - trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.fcTrialConfig.codeDir); + const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure = + this.fcClusterConfig; + trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.fcTrialConfig.codeDir, + azureFrameworkControllerClusterConfig.uploadRetryCount); } else if (this.fcClusterConfig.storageType === 'nfs') { const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = this.fcClusterConfig; diff --git a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts index aa60c7b9e2..e70246176a 100644 --- a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts @@ -219,7 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber if (this.azureStorageClient === undefined) { throw new Error('azureStorageClient is not initialized'); } - trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.kubeflowTrialConfig.codeDir); + const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = this.kubeflowClusterConfig; + trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.kubeflowTrialConfig.codeDir, azureKubeflowClusterConfig.uploadRetryCount); } else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) { const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = this.kubeflowClusterConfig; // Creat work dir for current trial in NFS directory diff --git a/src/nni_manager/training_service/kubernetes/kubernetesConfig.ts b/src/nni_manager/training_service/kubernetes/kubernetesConfig.ts index 334eb122ed..c0d6a4c6dd 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesConfig.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesConfig.ts @@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { public readonly keyVault: KeyVaultConfig; public readonly azureStorage: AzureStorage; + public readonly uploadRetryCount: number | undefined; constructor( apiVersion: string, keyVault: KeyVaultConfig, azureStorage: AzureStorage, - storage?: KubernetesStorageKind + storage?: KubernetesStorageKind, + uploadRetryCount?: number ) { super(apiVersion, storage); this.keyVault = keyVault; this.azureStorage = azureStorage; + this.uploadRetryCount = uploadRetryCount; } public get storageType(): KubernetesStorageKind { @@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { kubernetesClusterConfigObjectAzure.apiVersion, kubernetesClusterConfigObjectAzure.keyVault, kubernetesClusterConfigObjectAzure.azureStorage, - kubernetesClusterConfigObjectAzure.storage + kubernetesClusterConfigObjectAzure.storage, + kubernetesClusterConfigObjectAzure.uploadRetryCount ); } } diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 9697175e8b..6a1df6e0f2 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -359,15 +359,14 @@ abstract class KubernetesTrainingService { return registrySecretName; } - protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: String, codeDir: String): Promise { + protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: String, codeDir: String, uploadRetryCount: number | undefined): Promise { if (this.azureStorageClient === undefined) { throw new Error('azureStorageClient is not initialized'); } let trialJobOutputUrl: string = ''; let retryCount: number = 1; - let config = yaml.safeLoad(fs.readFileSync('./config/config.yaml', 'utf8')); - if(config && config.azureStorageUploadRetryCount instanceof Number) { - retryCount = config.azureStorageUploadRetryCount; + if(uploadRetryCount) { + retryCount = uploadRetryCount; } let resultUploadNNIScript: boolean = false; let resultUploadCodeFile: boolean = false; diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index f09786664b..ff38d3c267 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -310,7 +310,8 @@ def setPathCheck(key): error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'), 'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\ error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)') - } + }, + Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999) }) } @@ -356,7 +357,8 @@ def setPathCheck(key): error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'), 'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\ error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)') - } + }, + Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999) }) } From 9c153c48421714d93e27fa838293fc6c5e5a075e Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 29 Aug 2019 10:59:31 +0800 Subject: [PATCH 9/9] fix error lines --- .../rest_server/restValidationSchemas.ts | 372 +++++++++--------- 1 file changed, 186 insertions(+), 186 deletions(-) diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 59795243f7..1937d82744 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -22,190 +22,190 @@ const joi = require('joi'); export namespace ValidationSchemas { - export const SETCLUSTERMETADATA = { - body: { - machine_list: joi.array().items(joi.object({ - username: joi.string().required(), - ip: joi.string().ip().required(), - port: joi.number().min(1).max(65535).required(), - passwd: joi.string(), - sshKeyPath: joi.string(), - passphrase: joi.string(), - gpuIndices: joi.string(), - maxTrialNumPerGpu: joi.number(), - useActiveGpu: joi.boolean() - })), - local_config: joi.object({ - gpuIndices: joi.string(), - maxTrialNumPerGpu: joi.number(), - useActiveGpu: joi.boolean() - }), - trial_config: joi.object({ - image: joi.string().min(1), - codeDir: joi.string().min(1).required(), - dataDir: joi.string(), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - gpuNum: joi.number().min(0), - command: joi.string().min(1), - virtualCluster: joi.string(), - shmMB: joi.number(), - authFile: joi.string(), - nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), - worker: joi.object({ - replicas: joi.number().min(1).required(), - image: joi.string().min(1), - privateRegistryAuthPath: joi.string().min(1), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - gpuNum: joi.number().min(0).required(), - command: joi.string().min(1).required() - }), - ps: joi.object({ - replicas: joi.number().min(1).required(), - image: joi.string().min(1), - privateRegistryAuthPath: joi.string().min(1), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - gpuNum: joi.number().min(0).required(), - command: joi.string().min(1).required() - }), - master: joi.object({ - replicas: joi.number().min(1).required(), - image: joi.string().min(1), - privateRegistryAuthPath: joi.string().min(1), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - gpuNum: joi.number().min(0).required(), - command: joi.string().min(1).required() - }), - taskRoles: joi.array({ - name: joi.string().min(1), - taskNum: joi.number().min(1).required(), - image: joi.string().min(1), - privateRegistryAuthPath: joi.string().min(1), - outputDir: joi.string(), - cpuNum: joi.number().min(1), - memoryMB: joi.number().min(100), - shmMB: joi.number(), - gpuNum: joi.number().min(0).required(), - command: joi.string().min(1).required(), - frameworkAttemptCompletionPolicy: joi.object({ - minFailedTaskCount: joi.number(), - minSucceededTaskCount: joi.number() - }) - }) - }), - pai_config: joi.object({ - userName: joi.string().min(1).required(), - passWord: joi.string().min(1).required(), - host: joi.string().min(1).required() - }), - kubeflow_config: joi.object({ - operator: joi.string().min(1).required(), - storage: joi.string().min(1), - apiVersion: joi.string().min(1), - nfs: joi.object({ - server: joi.string().min(1).required(), - path: joi.string().min(1).required() - }), - keyVault: joi.object({ - vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/), - name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/) - }), - azureStorage: joi.object({ - accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), - azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) - }), - uploadRetryCount: joi.number().min(1) - }), - frameworkcontroller_config: joi.object({ - storage: joi.string().min(1), - serviceAccountName: joi.string().min(1), - nfs: joi.object({ - server: joi.string().min(1).required(), - path: joi.string().min(1).required() - }), - keyVault: joi.object({ - vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/), - name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/) - }), - azureStorage: joi.object({ - accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), - azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) - }), - uploadRetryCount: joi.number().min(1) - }), - nni_manager_ip: joi.object({ - nniManagerIp: joi.string().min(1) - }) - } - }; - export const STARTEXPERIMENT = { - body: { - experimentName: joi.string().required(), - description: joi.string(), - authorName: joi.string(), - maxTrialNum: joi.number().min(0).required(), - trialConcurrency: joi.number().min(0).required(), - trainingServicePlatform: joi.string(), - searchSpace: joi.string().required(), - maxExecDuration: joi.number().min(0).required(), - multiPhase: joi.boolean(), - multiThread: joi.boolean(), - versionCheck: joi.boolean(), - logCollection: joi.string(), - advisor: joi.object({ - builtinAdvisorName: joi.string().valid('Hyperband', 'BOHB'), - codeDir: joi.string(), - classFileName: joi.string(), - className: joi.string(), - classArgs: joi.any(), - gpuNum: joi.number().min(0), - checkpointDir: joi.string().allow('') - }), - tuner: joi.object({ - builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution', 'SMAC', 'BatchTuner', 'GridSearch', 'NetworkMorphism', 'MetisTuner', 'GPTuner'), - codeDir: joi.string(), - classFileName: joi.string(), - className: joi.string(), - classArgs: joi.any(), - gpuNum: joi.number().min(0), - checkpointDir: joi.string().allow(''), - includeIntermediateResults: joi.boolean() - }), - assessor: joi.object({ - builtinAssessorName: joi.string().valid('Medianstop', 'Curvefitting'), - codeDir: joi.string(), - classFileName: joi.string(), - className: joi.string(), - classArgs: joi.any(), - gpuNum: joi.number().min(0), - checkpointDir: joi.string().allow('') - }), - clusterMetaData: joi.array().items(joi.object({ - key: joi.string(), - value: joi.any() - })) - } - }; - export const UPDATEEXPERIMENT = { - query: { - update_type: joi.string().required().valid('TRIAL_CONCURRENCY', 'MAX_EXEC_DURATION', 'SEARCH_SPACE', 'MAX_TRIAL_NUM') - }, - body: { - id: joi.string().required(), - revision: joi.number().min(0).required(), - params: joi.object(STARTEXPERIMENT.body), - execDuration: joi.number().required(), - startTime: joi.number(), - endTime: joi.number(), - logDir: joi.string(), - maxSequenceId: joi.number() - } - }; + export const SETCLUSTERMETADATA = { + body: { + machine_list: joi.array().items(joi.object({ + username: joi.string().required(), + ip: joi.string().ip().required(), + port: joi.number().min(1).max(65535).required(), + passwd: joi.string(), + sshKeyPath: joi.string(), + passphrase: joi.string(), + gpuIndices: joi.string(), + maxTrialNumPerGpu: joi.number(), + useActiveGpu: joi.boolean() + })), + local_config: joi.object({ + gpuIndices: joi.string(), + maxTrialNumPerGpu: joi.number(), + useActiveGpu: joi.boolean() + }), + trial_config: joi.object({ + image: joi.string().min(1), + codeDir: joi.string().min(1).required(), + dataDir: joi.string(), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0), + command: joi.string().min(1), + virtualCluster: joi.string(), + shmMB: joi.number(), + authFile: joi.string(), + nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), + worker: joi.object({ + replicas: joi.number().min(1).required(), + image: joi.string().min(1), + privateRegistryAuthPath: joi.string().min(1), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required() + }), + ps: joi.object({ + replicas: joi.number().min(1).required(), + image: joi.string().min(1), + privateRegistryAuthPath: joi.string().min(1), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required() + }), + master: joi.object({ + replicas: joi.number().min(1).required(), + image: joi.string().min(1), + privateRegistryAuthPath: joi.string().min(1), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required() + }), + taskRoles: joi.array({ + name: joi.string().min(1), + taskNum: joi.number().min(1).required(), + image: joi.string().min(1), + privateRegistryAuthPath: joi.string().min(1), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + shmMB: joi.number(), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required(), + frameworkAttemptCompletionPolicy: joi.object({ + minFailedTaskCount: joi.number(), + minSucceededTaskCount: joi.number() + }) + }) + }), + pai_config: joi.object({ + userName: joi.string().min(1).required(), + passWord: joi.string().min(1).required(), + host: joi.string().min(1).required() + }), + kubeflow_config: joi.object({ + operator: joi.string().min(1).required(), + storage: joi.string().min(1), + apiVersion: joi.string().min(1), + nfs: joi.object({ + server: joi.string().min(1).required(), + path: joi.string().min(1).required() + }), + keyVault: joi.object({ + vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/), + name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/) + }), + azureStorage: joi.object({ + accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), + azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) + }), + uploadRetryCount: joi.number().min(1) + }), + frameworkcontroller_config: joi.object({ + storage: joi.string().min(1), + serviceAccountName: joi.string().min(1), + nfs: joi.object({ + server: joi.string().min(1).required(), + path: joi.string().min(1).required() + }), + keyVault: joi.object({ + vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/), + name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/) + }), + azureStorage: joi.object({ + accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), + azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) + }), + uploadRetryCount: joi.number().min(1) + }), + nni_manager_ip: joi.object({ + nniManagerIp: joi.string().min(1) + }) + } + }; + export const STARTEXPERIMENT = { + body: { + experimentName: joi.string().required(), + description: joi.string(), + authorName: joi.string(), + maxTrialNum: joi.number().min(0).required(), + trialConcurrency: joi.number().min(0).required(), + trainingServicePlatform: joi.string(), + searchSpace: joi.string().required(), + maxExecDuration: joi.number().min(0).required(), + multiPhase: joi.boolean(), + multiThread: joi.boolean(), + versionCheck: joi.boolean(), + logCollection: joi.string(), + advisor: joi.object({ + builtinAdvisorName: joi.string().valid('Hyperband', 'BOHB'), + codeDir: joi.string(), + classFileName: joi.string(), + className: joi.string(), + classArgs: joi.any(), + gpuNum: joi.number().min(0), + checkpointDir: joi.string().allow('') + }), + tuner: joi.object({ + builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution', 'SMAC', 'BatchTuner', 'GridSearch', 'NetworkMorphism', 'MetisTuner', 'GPTuner'), + codeDir: joi.string(), + classFileName: joi.string(), + className: joi.string(), + classArgs: joi.any(), + gpuNum: joi.number().min(0), + checkpointDir: joi.string().allow(''), + includeIntermediateResults: joi.boolean() + }), + assessor: joi.object({ + builtinAssessorName: joi.string().valid('Medianstop', 'Curvefitting'), + codeDir: joi.string(), + classFileName: joi.string(), + className: joi.string(), + classArgs: joi.any(), + gpuNum: joi.number().min(0), + checkpointDir: joi.string().allow('') + }), + clusterMetaData: joi.array().items(joi.object({ + key: joi.string(), + value: joi.any() + })) + } + }; + export const UPDATEEXPERIMENT = { + query: { + update_type: joi.string().required().valid('TRIAL_CONCURRENCY', 'MAX_EXEC_DURATION', 'SEARCH_SPACE', 'MAX_TRIAL_NUM') + }, + body: { + id: joi.string().required(), + revision: joi.number().min(0).required(), + params: joi.object(STARTEXPERIMENT.body), + execDuration: joi.number().required(), + startTime: joi.number(), + endTime: joi.number(), + logDir: joi.string(), + maxSequenceId: joi.number() + } + }; }