From 664a149d866183ea20e1b964e5e709369ec96a11 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Tue, 30 Jul 2019 22:32:16 +0800 Subject: [PATCH] Remove outputDir and dataDir in config file (#1361) --- docs/en_US/TrainingService/PaiMode.md | 13 +-- docs/zh_CN/PaiMode.md | 6 -- examples/trials/auto-gbdt/config_pai.yml | 4 - .../trials/cifar10_pytorch/config_pai.yml | 4 - examples/trials/ga_squad/config_pai.yml | 4 - examples/trials/mnist-advisor/config_pai.yml | 4 - .../trials/mnist-annotation/config_pai.yml | 4 - .../mnist-batch-tune-keras/config_pai.yml | 4 - examples/trials/mnist-keras/config_pai.yml | 4 - examples/trials/mnist/config_pai.yml | 4 - .../FashionMNIST/config_pai.yml | 4 - .../network_morphism/cifar10/config_pai.yml | 4 - .../sklearn/classification/config_pai.yml | 4 - .../trials/sklearn/regression/config_pai.yml | 4 - .../training_service/pai/paiConfig.ts | 14 +-- .../training_service/pai/paiData.ts | 3 - .../pai/paiTrainingService.ts | 92 +++++-------------- tools/nni_cmd/launcher_utils.py | 8 ++ 18 files changed, 35 insertions(+), 149 deletions(-) diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index ddb5a2697c..ef9fc093cb 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -32,8 +32,6 @@ trial: cpuNum: 1 memoryMB: 8196 image: msranni/nni:latest - dataDir: hdfs://10.1.1.1:9000/nni - outputDir: hdfs://10.1.1.1:9000/nni # Configuration to access OpenPAI Cluster paiConfig: userName: your_pai_nni_user @@ -51,10 +49,6 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod * image * Required key. In pai mode, your trial program will be scheduled by OpenPAI to run in a [Docker container](https://www.docker.com/). This key is used to specify the Docker image used to create the container in which your trial will run. * We have already built a Docker image [msranni/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI Python packages, Node modules and JavaScript artifact files required to start an experiment, and all of NNI's dependencies. The Dockerfile used to build this image can be found [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it. -* dataDir - * Optional key. It specifies the HDFS data direcotry for trial to download data. The format should be something like hdfs://{your HDFS host}:9000/{your data directory} -* outputDir - * Optional key. It specifies the HDFS output directory for trial. Once the trial is completed (either succeed or fail), trial's stdout, stderr will be copied to this directory by NNI sdk automatically. The format should be something like hdfs://{your HDFS host}:9000/{your output directory} * virtualCluster * Optional key. Set the virtualCluster of OpenPAI. If omitted, the job will run on default virtual cluster. * shmMB @@ -80,9 +74,10 @@ And you will be redirected to HDFS web portal to browse the output files of that You can see there are three files in the output folder: stderr, stdout, and trial.log -If you also want to save trial's other output into HDFS, like model files, you can use environment variable `NNI_OUTPUT_DIR` in your trial code to save your own output files, and NNI SDK will copy all the files in `NNI_OUTPUT_DIR` from trial's container to HDFS.
+## data management +If your training data is not too large, it can be put into codeDir and NNI will upload it to HDFS; alternatively, you can build your own Docker image that already contains the data. If you have a large dataset, it is not appropriate to put the data in codeDir; instead, follow the [guidance](https://github.com/microsoft/pai/blob/master/docs/user/storage.md) to mount the data folder into the container. -Any problems when using NNI in pai mode, please create issues on [NNI github repo](https://github.com/Microsoft/nni). +If you also want to save the trial's other output, such as model files, into HDFS, you can use the environment variable `NNI_OUTPUT_DIR` in your trial code to save your own output files, and the NNI SDK will copy all the files in `NNI_OUTPUT_DIR` from the trial's container to HDFS; the target path is `hdfs://host:port/{username}/nni/{experiments}/{experimentId}/trials/{trialId}/nnioutput`. ## version check NNI has supported the version check feature since version 0.6. It is a policy to ensure that the version of NNIManager is consistent with trialKeeper, and to avoid errors caused by version incompatibility. Check policy: 3. Note that the version check feature only checks the first two digits of the version. For example, NNIManager v0.6.1 could use trialKeeper v0.6 or trialKeeper v0.6.2, but could not use trialKeeper v0.5.1 or trialKeeper v0.7. If you could not run your experiment and want to know if it is caused by the version check, you can check your webUI, and there will be an error message about the version check. -![](../../img/version_check.png) \ No newline at end of file +![](../../img/version_check.png) diff --git a/docs/zh_CN/PaiMode.md b/docs/zh_CN/PaiMode.md index f2700bc615..ac375069d1 100644 --- a/docs/zh_CN/PaiMode.md +++ b/docs/zh_CN/PaiMode.md @@ -34,8 +34,6 @@ trial: cpuNum: 1 memoryMB: 8196 image: msranni/nni:latest - dataDir: hdfs://10.1.1.1:9000/nni - outputDir: hdfs://10.1.1.1:9000/nni # 配置访问的 OpenPAI 集群 paiConfig: userName: your_pai_nni_user @@ -54,10 +52,6 @@ paiConfig: * image * 必填。 在 pai 模式中,Trial 程序由 OpenPAI 在 [Docker 容器](https://www.docker.com/)中安排运行。 此键用来指定 Trial 程序的容器使用的 Docker 映像。 * [Docker Hub](https://hub.docker.com/) 上有预制的 NNI Docker 映像 [nnimsra/nni](https://hub.docker.com/r/msranni/nni/)。 它包含了用来启动 NNI Experiment 所依赖的所有 Python 包,Node 模块和 JavaScript。 生成此 Docker 映像的文件在[这里](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile)。 可以直接使用此映像,或参考它来生成自己的映像。 -* dataDir - * 可选。 指定了 Trial 用于下载数据的 HDFS 数据目录。 格式应为 hdfs://{your HDFS host}:9000/{数据目录} -* outputDir - * 可选。 指定了 Trial 的 HDFS 输出目录。 Trial 在完成(成功或失败)后,Trial 的 stdout, stderr 会被 NNI 自动复制到此目录中。 格式应为 hdfs://{your HDFS host}:9000/{输出目录} * virtualCluster * 可选。 设置 OpenPAI 的 virtualCluster,即虚拟集群。 如果未设置此参数,将使用默认的虚拟集群。 * shmMB diff --git a/examples/trials/auto-gbdt/config_pai.yml b/examples/trials/auto-gbdt/config_pai.yml index 25eafe1b11..7393a080a2 100644 --- a/examples/trials/auto-gbdt/config_pai.yml +++ b/examples/trials/auto-gbdt/config_pai.yml @@ -23,10 +23,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/cifar10_pytorch/config_pai.yml b/examples/trials/cifar10_pytorch/config_pai.yml index 3494967aa3..87d82ff097
100644 --- a/examples/trials/cifar10_pytorch/config_pai.yml +++ b/examples/trials/cifar10_pytorch/config_pai.yml @@ -23,10 +23,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/ga_squad/config_pai.yml b/examples/trials/ga_squad/config_pai.yml index 73bf0154b5..a2cfb8f381 100644 --- a/examples/trials/ga_squad/config_pai.yml +++ b/examples/trials/ga_squad/config_pai.yml @@ -23,10 +23,6 @@ trial: memoryMB: 32869 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/mnist-advisor/config_pai.yml b/examples/trials/mnist-advisor/config_pai.yml index 56d5aa6c60..b26b758f79 100644 --- a/examples/trials/mnist-advisor/config_pai.yml +++ b/examples/trials/mnist-advisor/config_pai.yml @@ -27,10 +27,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/mnist-annotation/config_pai.yml b/examples/trials/mnist-annotation/config_pai.yml index 89b84d8732..f8a825defd 100644 --- a/examples/trials/mnist-annotation/config_pai.yml +++ b/examples/trials/mnist-annotation/config_pai.yml @@ -22,10 +22,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/mnist-batch-tune-keras/config_pai.yml b/examples/trials/mnist-batch-tune-keras/config_pai.yml index 40d97cd364..69c6dd5f61 100644 --- a/examples/trials/mnist-batch-tune-keras/config_pai.yml +++ b/examples/trials/mnist-batch-tune-keras/config_pai.yml @@ -20,10 +20,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/mnist-keras/config_pai.yml b/examples/trials/mnist-keras/config_pai.yml index 775d6f0363..aa08d0ee1c 100644 --- a/examples/trials/mnist-keras/config_pai.yml +++ b/examples/trials/mnist-keras/config_pai.yml @@ -23,10 
+23,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/mnist/config_pai.yml b/examples/trials/mnist/config_pai.yml index 0b1aca733f..c0bb710294 100644 --- a/examples/trials/mnist/config_pai.yml +++ b/examples/trials/mnist/config_pai.yml @@ -23,10 +23,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/network_morphism/FashionMNIST/config_pai.yml b/examples/trials/network_morphism/FashionMNIST/config_pai.yml index f0c8612c3b..3562d8dc82 100644 --- a/examples/trials/network_morphism/FashionMNIST/config_pai.yml +++ b/examples/trials/network_morphism/FashionMNIST/config_pai.yml @@ -30,10 +30,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/network_morphism/cifar10/config_pai.yml b/examples/trials/network_morphism/cifar10/config_pai.yml index d32cea15a5..e14caab934 100644 --- a/examples/trials/network_morphism/cifar10/config_pai.yml +++ b/examples/trials/network_morphism/cifar10/config_pai.yml @@ -30,10 +30,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/sklearn/classification/config_pai.yml b/examples/trials/sklearn/classification/config_pai.yml index 3672af3f61..d3ffdc8d74 100644 --- a/examples/trials/sklearn/classification/config_pai.yml +++ b/examples/trials/sklearn/classification/config_pai.yml @@ -23,10 +23,6 @@ trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/examples/trials/sklearn/regression/config_pai.yml b/examples/trials/sklearn/regression/config_pai.yml index ba70af2fe2..b6d84f2f6d 100644 --- a/examples/trials/sklearn/regression/config_pai.yml +++ b/examples/trials/sklearn/regression/config_pai.yml @@ -23,10 +23,6 @@ 
trial: memoryMB: 8196 #The docker image to run nni job on pai image: msranni/nni:latest - #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' - dataDir: hdfs://10.10.10.10:9000/username/nni - #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' - outputDir: hdfs://10.10.10.10:9000/username/nni paiConfig: #The username to login pai userName: username diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts index 74f74fa7a4..c1bb7fb664 100644 --- a/src/nni_manager/training_service/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -69,10 +69,6 @@ export class PAIJobConfig { public readonly jobName: string; // URL pointing to the Docker image for all tasks in the job public readonly image: string; - // Data directory existing on HDFS - public readonly dataDir: string; - // Output directory on HDFS - public readonly outputDir: string; // Code directory on HDFS public readonly codeDir: string; @@ -90,12 +86,10 @@ export class PAIJobConfig { * @param outputDir Output directory on HDFS * @param taskRoles List of taskRole, one task role at least */ - constructor(jobName: string, image : string, dataDir : string, outputDir : string, codeDir : string, + constructor(jobName: string, image : string, codeDir : string, taskRoles : PAITaskRole[], virtualCluster: string) { this.jobName = jobName; this.image = image; - this.dataDir = dataDir; - this.outputDir = outputDir; this.codeDir = codeDir; this.taskRoles = taskRoles; this.virtualCluster = virtualCluster; @@ -130,8 +124,6 @@ export class NNIPAITrialConfig extends TrialConfig { public readonly cpuNum: number; public readonly memoryMB: number; public readonly image: string; - public readonly dataDir: string; - public outputDir: string; //The virtual cluster job runs on. 
If omitted, the job will run on default virtual cluster public virtualCluster?: string; @@ -139,13 +131,11 @@ export class NNIPAITrialConfig extends TrialConfig { public shmMB?: number; constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, - image: string, dataDir: string, outputDir: string, virtualCluster?: string, shmMB?: number) { + image: string, virtualCluster?: string, shmMB?: number) { super(command, codeDir, gpuNum); this.cpuNum = cpuNum; this.memoryMB = memoryMB; this.image = image; - this.dataDir = dataDir; - this.outputDir = outputDir; this.virtualCluster = virtualCluster; this.shmMB = shmMB; } diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 8820f55cbd..8ac4b77ed1 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -70,9 +70,6 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = --pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \ --nni_manager_version '{13}' --log_collection '{14}'`; -export const PAI_OUTPUT_DIR_FORMAT: string = -`hdfs://{0}:9000/`; - // tslint:disable:no-http-string export const PAI_LOG_PATH_FORMAT: string = `http://{0}/webhdfs/explorer.html#{1}`; diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index ad11cbfc98..ce5ca61905 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -43,7 +43,7 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { execMkdir, validateCodeDir } from '../common/util'; import { HDFSClientUtility } from './hdfsClientUtility'; import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; -import { PAI_LOG_PATH_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiData'; +import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiData'; import { PAIJobInfoCollector } from './paiJobInfoCollector'; import { PAIJobRestServer, ParameterFileMeta } from './paiJobRestServer'; @@ -70,9 +70,6 @@ class PAITrainingService implements TrainingService { private readonly paiTokenUpdateInterval: number; private readonly experimentId! 
: string; private readonly paiJobCollector : PAIJobInfoCollector; - private readonly hdfsDirPattern: string; - private hdfsBaseDir: string | undefined; - private hdfsOutputHost: string | undefined; private nextTrialSequenceId: number; private paiRestServerPort?: number; private nniManagerIpConfig?: NNIManagerIpConfig; @@ -80,6 +77,8 @@ class PAITrainingService implements TrainingService { private versionCheck: boolean = true; private logCollection: string; private isMultiPhase: boolean = false; + private hdfsCodeDir?: string; + private hdfsOutputDir?: string; constructor() { this.log = getLogger(); @@ -90,7 +89,6 @@ class PAITrainingService implements TrainingService { this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); this.experimentId = getExperimentId(); this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); - this.hdfsDirPattern = 'hdfs://(?([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?/.*)?'; this.nextTrialSequenceId = -1; this.paiTokenUpdateInterval = 7200000; //2hours this.logCollection = 'none'; @@ -144,10 +142,10 @@ class PAITrainingService implements TrainingService { } public async submitTrialJob(form: JobApplicationForm): Promise { - const deferred : Deferred = new Deferred(); - if (this.hdfsBaseDir === undefined) { - throw new Error('hdfsBaseDir is not initialized'); + if (this.paiClusterConfig === undefined) { + throw new Error(`paiClusterConfig not initialized!`); } + const deferred : Deferred = new Deferred(); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); @@ -156,12 +154,14 @@ class PAITrainingService implements TrainingService { //TODO: use HDFS working folder instead const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; + this.hdfsCodeDir = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); + this.hdfsOutputDir = unixPathJoin(this.hdfsCodeDir, 'nnioutput'); - const hdfsOutputDir : string = path.join(this.hdfsBaseDir, this.experimentId, trialJobId); const hdfsLogPath : string = String.Format( PAI_LOG_PATH_FORMAT, - this.hdfsOutputHost, - hdfsOutputDir); + this.paiClusterConfig.host, + this.hdfsOutputDir + ); const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, @@ -278,14 +278,6 @@ class PAITrainingService implements TrainingService { break; } this.paiTrialConfig = JSON.parse(value); - //paiTrialConfig.outputDir could be null if it is not set in nnictl - if (this.paiTrialConfig.outputDir === undefined || this.paiTrialConfig.outputDir === null) { - this.paiTrialConfig.outputDir = String.Format( - PAI_OUTPUT_DIR_FORMAT, - this.paiClusterConfig.host - ) - .replace(/\r\n|\n|\r/gm, ''); - } // Validate to make sure codeDir doesn't have too many files try { @@ -295,43 +287,7 @@ class PAITrainingService implements TrainingService { deferred.reject(new Error(error)); break; } - - const hdfsDirContent: any = this.paiTrialConfig.outputDir.match(this.hdfsDirPattern); - - if (hdfsDirContent === null) { - throw new Error('Trial outputDir format Error'); - } - const groups: any = hdfsDirContent.groups; - if (groups === undefined) { - throw new Error('Trial outputDir format Error'); - } - this.hdfsOutputHost = groups.host; - //TODO: choose to use /${username} as baseDir - this.hdfsBaseDir = groups.baseDir; - if (this.hdfsBaseDir === undefined) { - this.hdfsBaseDir = '/'; - } - - let dataOutputHdfsClient: any; - if (this.paiClusterConfig.host === this.hdfsOutputHost && 
this.hdfsClient) { - dataOutputHdfsClient = this.hdfsClient; - } else { - dataOutputHdfsClient = WebHDFS.createClient({ - user: this.paiClusterConfig.userName, - port: 50070, - host: this.hdfsOutputHost - }); - } - - try { - const exist : boolean = await HDFSClientUtility.pathExists('/', dataOutputHdfsClient); - if (!exist) { - deferred.reject(new Error(`Please check hdfsOutputDir host!`)); - } - } catch (error) { - deferred.reject(new Error(`HDFS encounters problem, error is ${error}. Please check hdfsOutputDir host!`)); - } - + // Copy experiment files from local folder to HDFS this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs( this.paiTrialConfig.codeDir, @@ -409,12 +365,12 @@ class PAITrainingService implements TrainingService { throw new Error('PAI token is not initialized'); } - if (this.hdfsBaseDir === undefined) { - throw new Error('hdfsBaseDir is not initialized'); + if (this.hdfsCodeDir === undefined) { + throw new Error('hdfsCodeDir is not initialized'); } - if (this.hdfsOutputHost === undefined) { - throw new Error('hdfsOutputHost is not initialized'); + if (this.hdfsOutputDir === undefined) { + throw new Error('hdfsOutputDir is not initialized'); } if (this.paiRestServerPort === undefined) { @@ -428,8 +384,6 @@ class PAITrainingService implements TrainingService { } // Step 1. Prepare PAI job configuration - const hdfsOutputDir : string = unixPathJoin(this.hdfsBaseDir, this.experimentId, trialJobId); - const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); //create tmp trial working folder locally. @@ -463,8 +417,8 @@ class PAITrainingService implements TrainingService { this.paiTrialConfig.command, nniManagerIp, this.paiRestServerPort, - hdfsOutputDir, - this.hdfsOutputHost, + this.hdfsOutputDir, + this.paiClusterConfig.host, this.paiClusterConfig.userName, HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName), version, @@ -497,12 +451,8 @@ class PAITrainingService implements TrainingService { trialJobDetail.paiJobName, // Docker image this.paiTrialConfig.image, - // dataDir - this.paiTrialConfig.dataDir, - // outputDir - this.paiTrialConfig.outputDir, // codeDir - `$PAI_DEFAULT_FS_URI${hdfsCodeDir}`, + `$PAI_DEFAULT_FS_URI${this.hdfsCodeDir}`, // PAI Task roles paiTaskRoles, // Add Virutal Cluster @@ -511,9 +461,9 @@ class PAITrainingService implements TrainingService { // Step 2. 
Upload code files in codeDir onto HDFS try { - await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, hdfsCodeDir, this.hdfsClient); + await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, this.hdfsCodeDir, this.hdfsClient); } catch (error) { - this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`); + this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${this.hdfsCodeDir} failed, error is ${error}`); trialJobDetail.status = 'FAILED'; deferred.resolve(true); diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index 6828db89a4..af1d8a5a8c 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -253,6 +253,14 @@ def validate_pai_trial_conifg(experiment_config): experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']: print_error('shmMB should be no more than memoryMB!') exit(1) + #backward compatibility + warning_information = '{0} is no longer supported in NNI, please remove the field from the config file!\ + Please refer to https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/PaiMode.md#run-an-experiment\ + for how to access data and save the output model in trial code' + if experiment_config.get('trial').get('dataDir'): + print_warning(warning_information.format('dataDir')) + if experiment_config.get('trial').get('outputDir'): + print_warning(warning_information.format('outputDir')) def validate_all_content(experiment_config, config_path): '''Validate whether experiment_config is valid'''
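The launcher_utils.py hunk above keeps old config files usable by warning about the removed fields rather than rejecting them. Below is a minimal, standalone sketch of how that check behaves on an old-style config; the config dict is hypothetical, and a plain `print` stands in for nnictl's `print_warning` helper so the snippet runs on its own.

```python
# Hypothetical old-style trial config that still carries the removed fields.
experiment_config = {
    'trial': {
        'image': 'msranni/nni:latest',
        'dataDir': 'hdfs://10.10.10.10:9000/username/nni',
        'outputDir': 'hdfs://10.10.10.10:9000/username/nni',
    }
}

warning_information = (
    '{0} is no longer supported in NNI, please remove the field from the config file! '
    'Please refer to https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/PaiMode.md#run-an-experiment '
    'for how to access data and save the output model in trial code.'
)

for field in ('dataDir', 'outputDir'):
    if experiment_config.get('trial', {}).get(field):
        # launcher_utils.py calls print_warning here; print keeps the sketch self-contained.
        print('WARNING: ' + warning_information.format(field))
```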
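As the updated PaiMode.md above describes, a trial that wants to keep extra artifacts (for example model files) should write them into the directory given by the `NNI_OUTPUT_DIR` environment variable, and the NNI SDK copies that directory to HDFS. A minimal trial-side sketch of that pattern follows; the local fallback path and the placeholder model file are illustrative assumptions, not part of this patch.

```python
import os

# NNI sets NNI_OUTPUT_DIR inside the trial container; fall back to a local
# directory (illustrative only) so the sketch also runs outside an experiment.
output_dir = os.environ.get('NNI_OUTPUT_DIR', './nni_output')
os.makedirs(output_dir, exist_ok=True)

# Anything written under NNI_OUTPUT_DIR is copied by the NNI SDK to
# hdfs://host:port/{username}/nni/{experiments}/{experimentId}/trials/{trialId}/nnioutput
model_path = os.path.join(output_dir, 'model.txt')  # placeholder artifact
with open(model_path, 'w') as model_file:
    model_file.write('trained model parameters would be saved here\n')

print('saved trial output to', model_path)
```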