diff --git a/docs/en_US/Tutorial/ExperimentConfig.md b/docs/en_US/Tutorial/ExperimentConfig.md index 97effd2b83..0f8970bd3d 100644 --- a/docs/en_US/Tutorial/ExperimentConfig.md +++ b/docs/en_US/Tutorial/ExperimentConfig.md @@ -519,6 +519,10 @@ machineList: __azureShare__ is the share of the azure file storage. + * __uploadRetryCount__ + + If uploading files to Azure storage fails, NNI retries the upload; this field specifies the number of re-upload attempts. + * __paiConfig__ * __userName__ diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 19f88f11af..1937d82744 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -120,7 +120,8 @@ export namespace ValidationSchemas { azureStorage: joi.object({ accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) - }) + }), + uploadRetryCount: joi.number().min(1) }), frameworkcontroller_config: joi.object({ storage: joi.string().min(1), @@ -136,7 +137,8 @@ export namespace ValidationSchemas { azureStorage: joi.object({ accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) - }) + }), + uploadRetryCount: joi.number().min(1) }), nni_manager_ip: joi.object({ nniManagerIp: joi.string().min(1) diff --git a/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts b/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts index f687136fbe..00eeb3e557 100644 --- a/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts +++ b/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts @@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility { * @param fileServerClient * @param azureShare */ - export async function createShare(fileServerClient: any,
azureShare: any): Promise { - const deferred: Deferred = new Deferred(); + export async function createShare(fileServerClient: any, azureShare: any): Promise { + const deferred: Deferred = new Deferred(); fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => { if (error) { getLogger() .error(`Create share failed:, ${error}`); - deferred.reject(error); + deferred.resolve(false); } else { - deferred.resolve(); + deferred.resolve(true); } }); @@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility { * @param azureFoler * @param azureShare */ - export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise { - const deferred: Deferred = new Deferred(); + export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise { + const deferred: Deferred = new Deferred(); fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => { if (error) { getLogger() .error(`Create directory failed:, ${error}`); - deferred.reject(error); + deferred.resolve(false); } else { - deferred.resolve(); + deferred.resolve(true); } }); - return deferred.promise; } @@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility { * @param azureDirectory */ export async function createDirectoryRecursive(fileServerClient: azureStorage.FileService, azureDirectory: string, - azureShare: any): Promise { - const deferred: Deferred = new Deferred(); + azureShare: any): Promise { + const deferred: Deferred = new Deferred(); const directories: string[] = azureDirectory.split('/'); let rootDirectory: string = ''; for (const directory of directories) { rootDirectory += directory; - await createDirectory(fileServerClient, rootDirectory, azureShare); + let result:boolean = await createDirectory(fileServerClient, rootDirectory, azureShare); + if (!result) { + deferred.resolve(false); + return 
deferred.promise; + } rootDirectory += '/'; } - deferred.resolve(); + deferred.resolve(true); return deferred.promise; } @@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility { * @param localFilePath */ async function uploadFileToAzure(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any, - localFilePath: string): Promise { - const deferred: Deferred = new Deferred(); + localFilePath: string): Promise { + const deferred: Deferred = new Deferred(); await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath, (error: any, result: any, response: any) => { if (error) { getLogger() .error(`Upload file failed:, ${error}`); - deferred.reject(error); + deferred.resolve(false); } else { - deferred.resolve(); + deferred.resolve(true); } }); @@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility { * @param localFilePath */ async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any, - localFilePath: string): Promise { - const deferred: Deferred = new Deferred(); + localFilePath: string): Promise { + const deferred: Deferred = new Deferred(); // tslint:disable-next-line:non-literal-fs-path await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath), (error: any, result: any, response: any) => { if (error) { getLogger() .error(`Download file failed:, ${error}`); - deferred.reject(error); + deferred.resolve(false); } else { - deferred.resolve(); + deferred.resolve(true); } }); @@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility { */ // tslint:disable:non-literal-fs-path export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any, - localDirectory: string): Promise { - const deferred: Deferred = new Deferred(); + localDirectory: string): Promise { + const deferred: Deferred = new Deferred(); const 
fileNameArray: string[] = fs.readdirSync(localDirectory); - await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare); + let result: boolean = await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare); + if (!result) { + deferred.resolve(false); + return deferred.promise; + } for (const fileName of fileNameArray) { const fullFilePath: string = path.join(localDirectory, fileName); try { + let resultUploadFile: boolean = true; + let resultUploadDir: boolean = true; if (fs.lstatSync(fullFilePath) .isFile()) { - await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath); + resultUploadFile = await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath); } else { // If filePath is a directory, recuisively copy it to azure - await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath); + resultUploadDir = await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath); + } + if (!(resultUploadFile && resultUploadDir)) { + deferred.resolve(false); + return deferred.promise; } } catch (error) { - deferred.reject(error); + deferred.resolve(false); return deferred.promise; } } // All files/directories are copied successfully, resolve - deferred.resolve(); + deferred.resolve(true); return deferred.promise; } diff --git a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts index d38c656aaa..51d56d5b7c 100644 --- a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts @@ -25,7 +25,7 @@ import * as path from 'path'; import * as component from 
'../../../common/component'; import { getExperimentId } from '../../../common/experimentStartupInfo'; import { - JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail + JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; @@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple //upload code files const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); - + let initStatus: TrialJobStatus = 'WAITING'; + if (!trialJobOutputUrl) { + initStatus = 'FAILED'; + } const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( trialJobId, - 'WAITING', + initStatus, Date.now(), trialWorkingFolder, form, @@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple let trialJobOutputUrl: string = ''; if (this.fcClusterConfig.storageType === 'azureStorage') { - if (this.azureStorageClient === undefined) { - throw new Error('azureStorageClient is not initialized'); - } - try { - //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage - await AzureStorageClientUtility.uploadDirectory( - this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`); - //upload code files to azure storage - await AzureStorageClientUtility.uploadDirectory( - this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${this.fcTrialConfig.codeDir}`); - - trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/` + - `${this.azureStorageShare}/${path.join('nni', 
getExperimentId(), trialJobId, 'output')}`; - } catch (error) { - this.log.error(error); - - return Promise.reject(error); - } + const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure = + this.fcClusterConfig; + trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.fcTrialConfig.codeDir, + azureFrameworkControllerClusterConfig.uploadRetryCount); } else if (this.fcClusterConfig.storageType === 'nfs') { const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = this.fcClusterConfig; diff --git a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts index b65a548fb6..e70246176a 100644 --- a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts @@ -27,7 +27,7 @@ import * as component from '../../../common/component'; import { getExperimentId } from '../../../common/experimentStartupInfo'; import { - JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail + JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; @@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form); //upload files to sotrage const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); + let initStatus: TrialJobStatus = 'WAITING'; + if (!trialJobOutputUrl) { + initStatus = 'FAILED'; + } const trialJobDetail: 
KubernetesTrialJobDetail = new KubernetesTrialJobDetail( trialJobId, - 'WAITING', + initStatus, Date.now(), trialWorkingFolder, form, @@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber if (this.azureStorageClient === undefined) { throw new Error('azureStorageClient is not initialized'); } - try { - //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage - await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${trialLocalTempFolder}`); - //upload code files to azure storage - await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${this.kubeflowTrialConfig.codeDir}`); - - trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` + - `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; - } catch (error) { - this.log.error(error); - - return Promise.reject(error); - } + const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = this.kubeflowClusterConfig; + trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.kubeflowTrialConfig.codeDir, azureKubeflowClusterConfig.uploadRetryCount); } else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) { const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = this.kubeflowClusterConfig; // Creat work dir for current trial in NFS directory diff --git a/src/nni_manager/training_service/kubernetes/kubernetesConfig.ts b/src/nni_manager/training_service/kubernetes/kubernetesConfig.ts index 334eb122ed..c0d6a4c6dd 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesConfig.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesConfig.ts @@ -75,16 
+75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { public readonly keyVault: KeyVaultConfig; public readonly azureStorage: AzureStorage; + public readonly uploadRetryCount: number | undefined; constructor( apiVersion: string, keyVault: KeyVaultConfig, azureStorage: AzureStorage, - storage?: KubernetesStorageKind + storage?: KubernetesStorageKind, + uploadRetryCount?: number ) { super(apiVersion, storage); this.keyVault = keyVault; this.azureStorage = azureStorage; + this.uploadRetryCount = uploadRetryCount; } public get storageType(): KubernetesStorageKind { @@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { kubernetesClusterConfigObjectAzure.apiVersion, kubernetesClusterConfigObjectAzure.keyVault, kubernetesClusterConfigObjectAzure.azureStorage, - kubernetesClusterConfigObjectAzure.storage + kubernetesClusterConfigObjectAzure.storage, + kubernetesClusterConfigObjectAzure.uploadRetryCount ); } } diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 0327cd553f..6a1df6e0f2 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; -import { getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; +import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; import { AzureStorageClientUtility } from './azureStorageClientUtils'; import { GeneralK8sClient, KubernetesCRDClient } from './kubernetesApiClient'; import { 
KubernetesClusterConfig } from './kubernetesConfig'; import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData'; import { KubernetesJobRestServer } from './kubernetesJobRestServer'; +var yaml = require('js-yaml'); var fs = require('fs'); /** @@ -357,6 +358,52 @@ abstract class KubernetesTrainingService { ); return registrySecretName; } + + protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: String, codeDir: String, uploadRetryCount: number | undefined): Promise { + if (this.azureStorageClient === undefined) { + throw new Error('azureStorageClient is not initialized'); + } + let trialJobOutputUrl: string = ''; + let retryCount: number = 1; + if(uploadRetryCount) { + retryCount = uploadRetryCount; + } + let resultUploadNNIScript: boolean = false; + let resultUploadCodeFile: boolean = false; + try { + do { + //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage + if(!resultUploadNNIScript) { + resultUploadNNIScript = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, + `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, + `${trialLocalTempFolder}`); + } + //upload code files to azure storage + if(!resultUploadCodeFile) { + resultUploadCodeFile = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, + `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, + `${codeDir}`); + } + if (resultUploadNNIScript && resultUploadCodeFile) { + trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` + + `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; + break; + } else { + //wait for 5 seconds to re-upload files + await delay(5000); + this.log.info('Upload failed, Retry: upload files to azure-storage'); + } + } while (retryCount-- >= 0) + } catch (error) { + this.log.error(error); + //return a empty url when got 
error + return Promise.resolve(""); + } + if(!trialJobOutputUrl) { + this.log.info(`Retry-count is used up, upload files to azureStorage for trial ${trialJobId} failed!`); + } + return Promise.resolve(trialJobOutputUrl); + } } export { KubernetesTrainingService }; diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index f09786664b..ff38d3c267 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -310,7 +310,8 @@ def setPathCheck(key): error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'), 'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\ error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)') - } + }, + Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999) }) } @@ -356,7 +357,8 @@ def setPathCheck(key): error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'), 'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\ error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)') - } + }, + Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999) }) }