From d94b165c7e6a047120655c9dcb920857bf31d02d Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Mon, 8 Oct 2018 19:07:41 +0800 Subject: [PATCH 1/7] Change hard-coded root directory to $PWD in PAI container --- .../training_service/pai/hdfsClientUtility.ts | 9 ++++++--- .../training_service/pai/paiTrainingService.ts | 16 +++++++++++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/nni_manager/training_service/pai/hdfsClientUtility.ts b/src/nni_manager/training_service/pai/hdfsClientUtility.ts index 69fc383e6d..21271d2786 100644 --- a/src/nni_manager/training_service/pai/hdfsClientUtility.ts +++ b/src/nni_manager/training_service/pai/hdfsClientUtility.ts @@ -133,10 +133,13 @@ export namespace HDFSClientUtility { deferred.resolve(exist); }); - // Set timeout and reject the promise once reach timeout (5 seconds) - setTimeout(() => deferred.reject(`Check HDFS path ${hdfsPath} exists timeout`), 5000); + let timeoutId : NodeJS.Timer + const delayTimeout : Promise = new Promise((resolve : Function, reject : Function) : void => { + // Set timeout and reject the promise once reach timeout (5 seconds) + setTimeout(() => deferred.reject(`Check HDFS path ${hdfsPath} exists timeout`), 5000); + }); - return deferred.promise; + return Promise.race([deferred.promise, delayTimeout]).finally(() => clearTimeout(timeoutId)); } /** diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index f7f8b3c4e7..794e957413 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -186,8 +186,8 @@ class PAITrainingService implements TrainingService { const nniPaiTrialCommand : string = String.Format( PAI_TRIAL_COMMAND_FORMAT, // PAI will copy job's codeDir into /root directory - `/root/${trialJobId}`, - `/root/${trialJobId}/nnioutput`, + `$PWD/${trialJobId}`, + `$PWD/${trialJobId}/nnioutput`, trialJobId, this.experimentId, this.paiTrialConfig.command, @@ -343,7 +343,17 @@ class PAITrainingService implements TrainingService { deferred.resolve(); } }); - break; + + let timeoutId: NodeJS.Timer; + const timeoutDelay: Promise = new Promise((resolve: Function, reject: Function): void => { + // Set timeout and reject the promise once reach timeout (5 seconds) + timeoutId = setTimeout( + () => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')), + 5000); + }); + + return Promise.race([timeoutDelay, deferred.promise]).finally(() => clearTimeout(timeoutId)); + case TrialConfigMetadataKey.TRIAL_CONFIG: if (!this.paiClusterConfig){ this.log.error('pai cluster config is not initialized'); From 108e3f5e64fd945be0c473882d216dc68674b564 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Wed, 10 Oct 2018 18:11:15 +0800 Subject: [PATCH 2/7] Print exception message for trial keeper rest utils --- tools/trial_tool/rest_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/trial_tool/rest_utils.py b/tools/trial_tool/rest_utils.py index f506653c4e..d6abf0905e 100644 --- a/tools/trial_tool/rest_utils.py +++ b/tools/trial_tool/rest_utils.py @@ -27,7 +27,8 @@ def rest_get(url, timeout): try: response = requests.get(url, timeout=timeout) return response - except Exception: + except Exception as e: + print('Get exception {0} when sending http get to url {1}'.format(str(e), url)) return None def rest_post(url, data, timeout): @@ -36,7 +37,8 @@ def rest_post(url, data, timeout): response = requests.post(url, headers={'Accept': 'application/json', 'Content-Type': 'application/json'},\ data=data, timeout=timeout) return response - except Exception: + except Exception as e: + print('Get exception {0} when sending http post to url {1}'.format(str(e), url)) return None def rest_put(url, data, timeout): @@ -45,7 +47,8 @@ def rest_put(url, data, timeout): response = requests.put(url, headers={'Accept': 'application/json', 'Content-Type': 'application/json'},\ data=data, timeout=timeout) return response - except Exception: + except Exception as e: + print('Get exception {0} when sending http put to url {1}'.format(str(e), url)) return None def rest_delete(url, timeout): @@ -53,5 +56,6 @@ def rest_delete(url, timeout): try: response = requests.delete(url, timeout=timeout) return response - except Exception: + except Exception as e: + print('Get exception {0} when sending http delete to url {1}'.format(str(e), url)) return None From 838aaec89de8fee046abe6a9222ff33ffa34ace9 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 11 Oct 2018 15:25:55 +0800 Subject: [PATCH 3/7] fix parameter file name issue for multi-phase training --- examples/assessors/README.md | 6 ++--- src/nni_manager/common/utils.ts | 23 ++++++++++++++++--- .../local/localTrainingService.ts | 4 ++-- .../pai/paiTrainingService.ts | 8 ++++--- .../remoteMachineTrainingService.ts | 6 ++--- src/sdk/pynni/nni/platform/local.py | 2 +- 6 files changed, 34 insertions(+), 15 deletions(-) diff --git a/examples/assessors/README.md b/examples/assessors/README.md index 843806b3c7..1a56b58f5f 100644 --- a/examples/assessors/README.md +++ b/examples/assessors/README.md @@ -2,10 +2,10 @@ *Assessor receive intermediate result from Trial and decide whether the Trial should be killed. Once the Trial experiment meets the early stop conditions, the assessor will kill the Trial.* -So, if user want to implement a customized Assessor, she/he only need to: +So, if want to implement a customized Assessor, you only need to: -**1) Inherit a tuner of a base Tuner class** +**1) Inherit an assessor of a base Assessor class** ```python from nni.assessor import Assessor @@ -31,7 +31,7 @@ class CustomizedAssessor(Assessor): # you code implement here. ... ``` -**3) Write a script to run Tuner** +**3) Write a script to run Assessor** ```python import argparse diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index 20609598a0..7fba1820dd 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -31,7 +31,7 @@ import * as util from 'util'; import { Database, DataStore } from './datastore'; import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from './experimentStartupInfo'; import { Manager } from './manager'; -import { TrainingService } from './trainingService'; +import { HyperParameters, TrainingService } from './trainingService'; function getExperimentRootDir(): string { return path.join(os.homedir(), 'nni', 'experiments', getExperimentId()); @@ -194,6 +194,23 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, multiPhase: boolean return command; } +/** + * Generate parameter file name based on HyperParameters object + * @param hyperParameters HyperParameters instance + */ +function generateParamFileName(hyperParameters : HyperParameters): string { + assert(hyperParameters !== undefined); + assert(hyperParameters.index >= 0); + + let paramFileName : string; + if(hyperParameters.index == 0) { + paramFileName = 'parameter.cfg'; + } else { + paramFileName = `parameter_${hyperParameters.index}.cfg` + } + return paramFileName; +} + /** * Initialize a pseudo experiment environment for unit test. * Must be paired with `cleanupUnitTest()`. @@ -242,5 +259,5 @@ function getIPV4Address(): string { return ipv4Address; } -export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, getIPV4Address, - mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect }; +export { generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir, + getDefaultDatabaseDir, getIPV4Address, mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect }; diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index b66dd8d68c..709171a114 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -33,7 +33,7 @@ import { HostJobApplicationForm, JobApplicationForm, HyperParameters, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; -import { delay, getExperimentRootDir, uniqueString } from '../../common/utils'; +import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../common/utils'; import { file } from 'tmp'; const tkill = require('tree-kill'); @@ -412,7 +412,7 @@ class LocalTrainingService implements TrainingService { } private async writeParameterFile(directory: string, hyperParameters: HyperParameters): Promise { - const filepath: string = path.join(directory, `parameter_${hyperParameters.index}.cfg`); + const filepath: string = path.join(directory, generateParamFileName(hyperParameters)); await fs.promises.writeFile(filepath, hyperParameters.value, { encoding: 'utf8' }); } } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 794e957413..d1df2da34e 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -20,6 +20,7 @@ 'use strict' +import * as assert from 'assert'; import * as component from '../../common/component'; import * as cpp from 'child-process-promise'; import * as fs from 'fs'; @@ -37,7 +38,7 @@ import { JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; -import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; +import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; import { PAIJobRestServer } from './paiJobRestServer' import { PAITrialJobDetail, PAI_INSTALL_NNI_SHELL_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData'; import { PAIJobInfoCollector } from './paiJobInfoCollector'; @@ -156,11 +157,12 @@ class PAITrainingService implements TrainingService { const runScriptContent : string = PAI_INSTALL_NNI_SHELL_FORMAT; // Write NNI installation file to local tmp files await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' }); - + // Write file content ( parameter.cfg ) to local tmp folders const trialForm : TrialJobApplicationForm = (form) if(trialForm) { - await fs.promises.writeFile(path.join(trialLocalTempFolder, 'parameter.cfg'), trialForm.hyperParameters, { encoding: 'utf8' }); + await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)), + trialForm.hyperParameters.value, { encoding: 'utf8' }); } // Step 1. Prepare PAI job configuration diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index a4be7a1b0d..b57ec194ed 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -36,7 +36,7 @@ import { ObservableTimer } from '../../common/observableTimer'; import { HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; -import { delay, getExperimentRootDir, uniqueString } from '../../common/utils'; +import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../common/utils'; import { GPUSummary } from '../common/gpuData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; @@ -458,7 +458,7 @@ class RemoteMachineTrainingService implements TrainingService { //create tmp trial working folder locally. await cpp.exec(`mkdir -p ${trialLocalTempFolder}`); - // Write file content ( run.sh and parameter_0.cfg ) to local tmp files + // Write file content ( run.sh and parameter.cfg ) to local tmp files await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run.sh'), runScriptContent, { encoding: 'utf8' }); // Copy local tmp files to remote machine @@ -586,7 +586,7 @@ class RemoteMachineTrainingService implements TrainingService { const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); - const fileName: string = `parameter_${hyperParameters.index}.cfg`; + const fileName: string = generateParamFileName(hyperParameters); const localFilepath: string = path.join(trialLocalTempFolder, fileName); await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' }); diff --git a/src/sdk/pynni/nni/platform/local.py b/src/sdk/pynni/nni/platform/local.py index 1c3b196bf4..08fb01f473 100644 --- a/src/sdk/pynni/nni/platform/local.py +++ b/src/sdk/pynni/nni/platform/local.py @@ -49,7 +49,7 @@ def request_next_parameter(): def get_parameters(): global _param_index - params_filepath = os.path.join(_sysdir, 'parameter_{}.cfg'.format(_param_index)) + params_filepath = os.path.join(_sysdir, ('parameter_{}.cfg'.format(_param_index), 'parameter.cfg')[_param_index == 0]) if not os.path.isfile(params_filepath): request_next_parameter() while not os.path.isfile(params_filepath): From a142c18fc4198469e5191975ad1e508819edd1c1 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Fri, 12 Oct 2018 11:57:08 +0800 Subject: [PATCH 4/7] Updated based on comments --- examples/assessors/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/assessors/README.md b/examples/assessors/README.md index 1a56b58f5f..8e6376dce2 100644 --- a/examples/assessors/README.md +++ b/examples/assessors/README.md @@ -2,7 +2,7 @@ *Assessor receive intermediate result from Trial and decide whether the Trial should be killed. Once the Trial experiment meets the early stop conditions, the assessor will kill the Trial.* -So, if want to implement a customized Assessor, you only need to: +So, if users want to implement a customized Assessor, they only need to: **1) Inherit an assessor of a base Assessor class** From 31805330ccd4e0644e94df42ad6aada87406265b Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Fri, 12 Oct 2018 15:53:34 +0800 Subject: [PATCH 5/7] Fix bug, make get_parameters() idompotent --- src/nni_manager/core/nnimanager.ts | 4 ++++ .../training_service/common/trialConfigMetadataKey.ts | 1 + .../training_service/local/localTrainingService.ts | 2 +- .../remote_machine/remoteMachineData.ts | 2 +- .../remote_machine/remoteMachineTrainingService.ts | 10 ++++++++-- src/sdk/pynni/nni/platform/local.py | 10 +++++++++- 6 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 09b1f2310c..badc9e0955 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -116,6 +116,10 @@ class NNIManager implements Manager { await this.storeExperimentProfile(); this.log.debug('Setup tuner...'); + if(expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) { + this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString()); + } + const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase); console.log(`dispatcher command: ${dispatcherCommand}`); this.setupTuner( diff --git a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts index 12df449ee1..334a9604d6 100644 --- a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts +++ b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts @@ -26,6 +26,7 @@ export enum TrialConfigMetadataKey { MACHINE_LIST = 'machine_list', TRIAL_CONFIG = 'trial_config', EXPERIMENT_ID = 'experimentId', + MULTI_PHASE = 'multiPhase', RANDOM_SCHEDULER = 'random_scheduler', PAI_CLUSTER_CONFIG = 'pai_config' } diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index 709171a114..25fae256e7 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -229,7 +229,7 @@ class LocalTrainingService implements TrainingService { * Is multiphase job supported in current training service */ public get isMultiPhaseJobSupported(): boolean { - return false; + return true; } public async cancelTrialJob(trialJobId: string): Promise { diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts index 0cd3a028dc..7237c6367b 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts @@ -19,7 +19,6 @@ 'use strict'; -import { Client } from 'ssh2'; import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { GPUSummary } from '../common/gpuData'; @@ -109,6 +108,7 @@ export enum ScheduleResultType { export const REMOTEMACHINE_RUN_SHELL_FORMAT: string = `#!/bin/bash export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} +export MULTI_PHASE={7} cd $NNI_SYS_DIR echo $$ >{2} eval {3}{4} 2>{5} diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index b57ec194ed..24d145f15c 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -64,6 +64,7 @@ class RemoteMachineTrainingService implements TrainingService { private stopping: boolean = false; private metricsEmitter: EventEmitter; private log: Logger; + private isMultiPhase: boolean = false; constructor(@component.Inject timer: ObservableTimer) { this.metricsEmitter = new EventEmitter(); @@ -222,7 +223,7 @@ class RemoteMachineTrainingService implements TrainingService { * Is multiphase job supported in current training service */ public get isMultiPhaseJobSupported(): boolean { - return false; + return true; } /** @@ -291,6 +292,9 @@ class RemoteMachineTrainingService implements TrainingService { } this.trialConfig = remoteMachineTrailConfig; break; + case TrialConfigMetadataKey.MULTI_PHASE: + this.isMultiPhase = (value === 'true' || value === 'True'); + break; default: //Reject for unknown keys throw new Error(`Uknown key: ${key}`); @@ -453,7 +457,9 @@ class RemoteMachineTrainingService implements TrainingService { `CUDA_VISIBLE_DEVICES=${cuda_visible_device} ` : `CUDA_VISIBLE_DEVICES=" " `, this.trialConfig.command, path.join(trialWorkingFolder, 'stderr'), - path.join(trialWorkingFolder, '.nni', 'code')); + path.join(trialWorkingFolder, '.nni', 'code'), + /** Mark if the trial is multi-phase job */ + this.isMultiPhase); //create tmp trial working folder locally. await cpp.exec(`mkdir -p ${trialLocalTempFolder}`); diff --git a/src/sdk/pynni/nni/platform/local.py b/src/sdk/pynni/nni/platform/local.py index 08fb01f473..c1a2ce8960 100644 --- a/src/sdk/pynni/nni/platform/local.py +++ b/src/sdk/pynni/nni/platform/local.py @@ -36,6 +36,8 @@ _log_file_path = os.path.join(_outputdir, 'trial.log') init_logger(_log_file_path) +_multiphase = os.environ.get('MULTI_PHASE') + _param_index = 0 def request_next_parameter(): @@ -49,7 +51,13 @@ def request_next_parameter(): def get_parameters(): global _param_index - params_filepath = os.path.join(_sysdir, ('parameter_{}.cfg'.format(_param_index), 'parameter.cfg')[_param_index == 0]) + params_file_name = '' + if _multiphase and (_multiphase == 'true' or _multiphase == 'True'): + params_file_name = ('parameter_{}.cfg'.format(_param_index), 'parameter.cfg')[_param_index == 0] + else: + params_file_name = 'parameter.cfg' + + params_filepath = os.path.join(_sysdir, params_file_name) if not os.path.isfile(params_filepath): request_next_parameter() while not os.path.isfile(params_filepath): From b8dbd08c50ffba5902a288b3963975ae9e60a26c Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Tue, 16 Oct 2018 11:38:55 +0800 Subject: [PATCH 6/7] Add idompotent support for get_parameters() in LocalTrainingService --- src/nni_manager/core/nnimanager.ts | 6 ++++++ .../training_service/local/localTrainingService.ts | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index badc9e0955..6b95de4a45 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -116,6 +116,7 @@ class NNIManager implements Manager { await this.storeExperimentProfile(); this.log.debug('Setup tuner...'); + // Set up multiphase config if(expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) { this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString()); } @@ -144,6 +145,11 @@ class NNIManager implements Manager { this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId); const expParams: ExperimentParams = this.experimentProfile.params; + // Set up multiphase config + if(expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) { + this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString()); + } + const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase); console.log(`dispatcher command: ${dispatcherCommand}`); this.setupTuner( diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index 25fae256e7..0c90598dd3 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -97,6 +97,7 @@ class LocalTrainingService implements TrainingService { private rootDir!: string; protected log: Logger; protected localTrailConfig?: TrialConfig; + private isMultiPhase: boolean = false; constructor() { this.eventEmitter = new EventEmitter(); @@ -262,6 +263,9 @@ class LocalTrainingService implements TrainingService { throw new Error('trial config parsed failed'); } break; + case TrialConfigMetadataKey.MULTI_PHASE: + this.isMultiPhase = (value === 'true' || value === 'True'); + break; default: } } @@ -296,7 +300,8 @@ class LocalTrainingService implements TrainingService { { key: 'NNI_PLATFORM', value: 'local' }, { key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory }, { key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id }, - { key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory } + { key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory }, + { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() } ]; } From 9613c84cab007859b76e286ba81f85eb8e4555bc Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Tue, 16 Oct 2018 11:52:23 +0800 Subject: [PATCH 7/7] Add ip address cached to resolve network issue --- src/nni_manager/common/utils.ts | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index 7fba1820dd..36f0bf8c62 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -245,18 +245,27 @@ function cleanupUnitTest(): void { Container.restore(ExperimentStartupInfo); } +let cachedipv4Address : string = ''; /** * Get IPv4 address of current machine */ function getIPV4Address(): string { - let ipv4Address : string = ''; + if (cachedipv4Address && cachedipv4Address.length > 0) { + return cachedipv4Address; + } - for(const item of os.networkInterfaces().eth0) { - if(item.family === 'IPv4') { - ipv4Address = item.address; + if(os.networkInterfaces().eth0) { + for(const item of os.networkInterfaces().eth0) { + if(item.family === 'IPv4') { + cachedipv4Address = item.address; + return cachedipv4Address; + } } + } else { + throw Error('getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.'); } - return ipv4Address; + + throw Error('getIPV4Address() failed because no valid IPv4 address found.') } export { generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir,