diff --git a/docs/HowToChooseTuner.md b/docs/HowToChooseTuner.md index c23edf8b71..e681e2b423 100644 --- a/docs/HowToChooseTuner.md +++ b/docs/HowToChooseTuner.md @@ -91,6 +91,10 @@ _Usage_: Note that SMAC on nni only supports a subset of the types in [search space spec](./SearchSpaceSpec.md), including `choice`, `randint`, `uniform`, `loguniform`, `quniform(q=1)`. +_Installation_: +* Install swig first. (`sudo apt-get install swig` for Ubuntu users) +* Run `nnictl package install --name=SMAC` + _Suggested scenario_: Similar to TPE, SMAC is also a black-box tuner which can be tried in various scenarios, and is suggested when computation resource is limited. It is optimized for discrete hyperparameters, thus, suggested when most of your hyperparameters are discrete. _Usage_: diff --git a/docs/img/nni_arch_overview.png b/docs/img/nni_arch_overview.png index 6403154ab9..621f16a0dd 100644 Binary files a/docs/img/nni_arch_overview.png and b/docs/img/nni_arch_overview.png differ diff --git a/examples/trials/auto-gbdt/config.yml b/examples/trials/auto-gbdt/config.yml index 29fda16ec8..512127f83f 100644 --- a/examples/trials/auto-gbdt/config.yml +++ b/examples/trials/auto-gbdt/config.yml @@ -18,4 +18,4 @@ tuner: trial: command: python3 main.py codeDir: . - gpuNum: 0 + gpuNum: 0 \ No newline at end of file diff --git a/examples/trials/mnist-annotation/mnist.py b/examples/trials/mnist-annotation/mnist.py index d3c0e36c2a..0c0b213cb6 100644 --- a/examples/trials/mnist-annotation/mnist.py +++ b/examples/trials/mnist-annotation/mnist.py @@ -32,7 +32,7 @@ def __init__(self, """@nni.variable(nni.choice(124, 512, 1024), name=self.hidden_size)""" self.hidden_size = hidden_size self.pool_size = pool_size - """@nni.variable(nni.uniform(0.0001, 0.1), name=self.learning_rate)""" + """@nni.variable(nni.loguniform(0.0001, 0.1), name=self.learning_rate)""" self.learning_rate = learning_rate self.x_dim = x_dim self.y_dim = y_dim diff --git a/src/nni_manager/common/manager.ts b/src/nni_manager/common/manager.ts index 674b177731..6c44f36c5f 100644 --- a/src/nni_manager/common/manager.ts +++ b/src/nni_manager/common/manager.ts @@ -85,7 +85,7 @@ interface TrialJobStatistics { } interface NNIManagerStatus { - status: 'INITIALIZED' | 'EXPERIMENT_RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL'; + status: 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL'; errors: string[]; } diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index ef6906483b..47affdca15 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -138,7 +138,7 @@ class NNIManager implements Manager { checkpointDir); this.experimentProfile.startTime = Date.now(); - this.status.status = 'EXPERIMENT_RUNNING'; + this.status.status = 'RUNNING'; await this.storeExperimentProfile(); this.run().catch((err: Error) => { this.criticalError(err); @@ -185,7 +185,7 @@ class NNIManager implements Manager { this.experimentProfile.endTime) { delete this.experimentProfile.endTime; } - this.status.status = 'EXPERIMENT_RUNNING'; + this.status.status = 'RUNNING'; // TO DO: update database record for resume event this.run().catch((err: Error) => { @@ -350,7 +350,7 @@ class NNIManager implements Manager { let count: number = 1; while (this.status.status !== 'STOPPING' && this.status.status !== 'STOPPED') { await delay(1000 * 1); // 1 seconds - if (this.status.status === 'EXPERIMENT_RUNNING') { + if (this.status.status === 'RUNNING') { this.experimentProfile.execDuration += 1; if (count % 10 === 0) { await this.storeExperimentProfile(); @@ -460,15 +460,15 @@ class NNIManager implements Manager { } // check maxtrialnum and maxduration here - // NO_MORE_TRIAL is more like a subset of EXPERIMENT_RUNNING, because during EXPERIMENT_RUNNING tuner + // NO_MORE_TRIAL is more like a subset of RUNNING, because during RUNNING tuner // might tell nnimanager that this is no more trials. In NO_MORE_TRIAL state, the experiment is viewed - // as still running. DONE could be transfered from EXPERIMENT_RUNNING or NO_MORE_TRIAL. - assert(this.status.status === 'EXPERIMENT_RUNNING' || + // as still running. DONE could be transfered from RUNNING or NO_MORE_TRIAL. + assert(this.status.status === 'RUNNING' || this.status.status === 'DONE' || this.status.status === 'NO_MORE_TRIAL'); if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration || this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) { - if (this.status.status === 'EXPERIMENT_RUNNING' || + if (this.status.status === 'RUNNING' || this.status.status === 'NO_MORE_TRIAL') { this.experimentProfile.endTime = Date.now(); await this.storeExperimentProfile(); @@ -480,7 +480,7 @@ class NNIManager implements Manager { await this.storeExperimentProfile(); } if (this.status.status !== 'NO_MORE_TRIAL') { - this.status.status = 'EXPERIMENT_RUNNING'; + this.status.status = 'RUNNING'; } for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) { if (this.waitingTrials.length === 0 || @@ -602,7 +602,7 @@ class NNIManager implements Manager { case NEW_TRIAL_JOB: if (this.status.status === 'NO_MORE_TRIAL') { this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set'); - this.status.status = 'EXPERIMENT_RUNNING'; + this.status.status = 'RUNNING'; } this.waitingTrials.push(content); break; diff --git a/src/nni_manager/rest_server/nniRestServer.ts b/src/nni_manager/rest_server/nniRestServer.ts index e5f0f8de0b..84381746fe 100644 --- a/src/nni_manager/rest_server/nniRestServer.ts +++ b/src/nni_manager/rest_server/nniRestServer.ts @@ -19,11 +19,12 @@ 'use strict'; -import * as express from 'express'; import * as bodyParser from 'body-parser'; +import * as express from 'express'; import * as path from 'path'; import * as component from '../common/component'; import { RestServer } from '../common/restServer' +import { getLogDir } from '../common/utils'; import { createRestHandler } from './restHandler'; /** @@ -35,6 +36,7 @@ import { createRestHandler } from './restHandler'; @component.Singleton export class NNIRestServer extends RestServer { private readonly API_ROOT_URL: string = '/api/v1/nni'; + private readonly LOGS_ROOT_URL: string = '/logs'; /** * constructor to provide NNIRestServer's own rest property, e.g. port @@ -50,6 +52,7 @@ export class NNIRestServer extends RestServer { this.app.use(express.static('static')); this.app.use(bodyParser.json()); this.app.use(this.API_ROOT_URL, createRestHandler(this)); + this.app.use(this.LOGS_ROOT_URL, express.static(getLogDir())); this.app.get('*', (req: express.Request, res: express.Response) => { res.sendFile(path.resolve('static/index.html')); }); diff --git a/src/nni_manager/rest_server/test/mockedNNIManager.ts b/src/nni_manager/rest_server/test/mockedNNIManager.ts index cdd0d64fa3..a9266dad75 100644 --- a/src/nni_manager/rest_server/test/mockedNNIManager.ts +++ b/src/nni_manager/rest_server/test/mockedNNIManager.ts @@ -39,7 +39,7 @@ export const testManagerProvider: Provider = { export class MockedNNIManager extends Manager { public getStatus(): NNIManagerStatus { return { - status: 'EXPERIMENT_RUNNING', + status: 'RUNNING', errors: [] } } diff --git a/src/nni_manager/training_service/pai/hdfsClientUtility.ts b/src/nni_manager/training_service/pai/hdfsClientUtility.ts index e650639624..489fb830f7 100644 --- a/src/nni_manager/training_service/pai/hdfsClientUtility.ts +++ b/src/nni_manager/training_service/pai/hdfsClientUtility.ts @@ -20,12 +20,38 @@ import * as path from 'path'; import * as fs from 'fs'; import { Deferred } from 'ts-deferred'; +import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger } from '../../common/log'; /** * HDFS client utility, including copy file/directory */ export namespace HDFSClientUtility { + /** + * Get NNI experiment root directory + * @param hdfsUserName HDFS user name + */ + function hdfsExpRootDir(hdfsUserName: string): string { + return path.join('/', hdfsUserName, 'nni', 'experiments', getExperimentId()); + } + + /** + * Get NNI experiment code directory + * @param hdfsUserName HDFS user name + */ + export function getHdfsExpCodeDir(hdfsUserName: string): string { + return path.join(hdfsExpRootDir(hdfsUserName), 'codeDir'); + } + + /** + * Get NNI trial working directory + * @param hdfsUserName HDFS user name + * @param trialId NNI trial ID + */ + export function getHdfsTrialWorkDir(hdfsUserName: string, trialId: string): string { + return path.join(hdfsExpRootDir(hdfsUserName), 'trials', trialId); + } + /** * Copy a local file to hdfs directory * diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index bb22f56900..036c206c68 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -63,7 +63,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = `export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} && cd $NNI_SYS_DIR && sh install_nni.sh && python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}' ---pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10}`; +--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}'`; export const PAI_OUTPUT_DIR_FORMAT: string = `hdfs://{0}:9000/`; diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 444f5a40ca..706f5eebff 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -30,7 +30,7 @@ import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { Deferred } from 'ts-deferred'; import { EventEmitter } from 'events'; import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo'; -import { HDFSClientUtility } from './hdfsClientUtility' +import { HDFSClientUtility } from './hdfsClientUtility'; import { MethodNotImplementedError } from '../../common/errors'; import { getLogger, Logger } from '../../common/log'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; @@ -38,7 +38,7 @@ import { JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, NNIManagerIpConfig } from '../../common/trainingService'; -import { countFilesRecursively, delay, generateParamFileName, +import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; import { PAIJobRestServer } from './paiJobRestServer' import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData'; @@ -74,6 +74,7 @@ class PAITrainingService implements TrainingService { private nextTrialSequenceId: number; private paiRestServerPort?: number; private nniManagerIpConfig?: NNIManagerIpConfig; + private copyExpCodeDirPromise?: Promise; constructor() { this.log = getLogger(); @@ -145,11 +146,11 @@ class PAITrainingService implements TrainingService { throw new Error('PAI token is not initialized'); } - if(!this.hdfsBaseDir){ + if(!this.hdfsBaseDir) { throw new Error('hdfsBaseDir is not initialized'); } - if(!this.hdfsOutputHost){ + if(!this.hdfsOutputHost) { throw new Error('hdfsOutputHost is not initialized'); } @@ -160,6 +161,11 @@ class PAITrainingService implements TrainingService { this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); + // Make sure experiment code files is copied from local to HDFS + if(this.copyExpCodeDirPromise) { + await this.copyExpCodeDirPromise; + } + const trialJobId: string = uniqueString(5); const trialSequenceId: number = this.generateSequenceId(); //TODO: use HDFS working folder instead @@ -167,8 +173,7 @@ class PAITrainingService implements TrainingService { const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); //create tmp trial working folder locally. - await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); - await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`); + await cpp.exec(`mkdir -p ${trialLocalTempFolder}`); const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; // Write NNI installation file to local tmp files @@ -182,8 +187,8 @@ class PAITrainingService implements TrainingService { } // Step 1. Prepare PAI job configuration - const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; - const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId); + const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; + const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); const hdfsOutputDir : string = path.join(this.hdfsBaseDir, this.experimentId, trialJobId); const hdfsLogPath : string = String.Format( @@ -215,7 +220,8 @@ class PAITrainingService implements TrainingService { this.paiRestServerPort, hdfsOutputDir, this.hdfsOutputHost, - this.paiClusterConfig.userName + this.paiClusterConfig.userName, + HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName) ).replace(/\r\n|\n|\r/gm, ''); console.log(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); @@ -390,6 +396,7 @@ class PAITrainingService implements TrainingService { } this.hdfsOutputHost = groups['host']; + //TODO: choose to use /${username} as baseDir this.hdfsBaseDir = groups['baseDir']; if(this.hdfsBaseDir === undefined) { this.hdfsBaseDir = "/"; @@ -414,6 +421,11 @@ class PAITrainingService implements TrainingService { } catch(error) { deferred.reject(new Error(`HDFS encounters problem, error is ${error}. Please check hdfsOutputDir host!`)); } + + // Copy experiment files from local folder to HDFS + this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs(this.paiTrialConfig.codeDir, + HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName), + this.hdfsClient); deferred.resolve(); break; diff --git a/src/sdk/pynni/nni/hyperband_advisor/README.md b/src/sdk/pynni/nni/hyperband_advisor/README.md index 2c5363134e..3cbc2922e3 100644 --- a/src/sdk/pynni/nni/hyperband_advisor/README.md +++ b/src/sdk/pynni/nni/hyperband_advisor/README.md @@ -33,6 +33,7 @@ If you use Hyperband, among the hyperparameters (i.e., key-value pairs) received `eta` means `n/eta` configurations from `n` configurations will survive and rerun using more STEPS. Here is a concrete example of `R=81` and `eta=3`: + | | s=4 | s=3 | s=2 | s=1 | s=0 | |------|-----|-----|-----|-----|-----| |i | n r | n r | n r | n r | n r | diff --git a/tools/nni_trial_tool/hdfsClientUtility.py b/tools/nni_trial_tool/hdfsClientUtility.py index 0b6daeb2c4..93396770e5 100644 --- a/tools/nni_trial_tool/hdfsClientUtility.py +++ b/tools/nni_trial_tool/hdfsClientUtility.py @@ -19,10 +19,55 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import os +import posixpath from pyhdfs import HdfsClient +from .log_utils import LogType, nni_log + +def copyHdfsDirectoryToLocal(hdfsDirectory, localDirectory, hdfsClient): + '''Copy directory from HDFS to local''' + if not os.path.exists(localDirectory): + os.makedirs(localDirectory) + try: + listing = hdfsClient.list_status(hdfsDirectory) + except Exception as exception: + nni_log(LogType.Error, 'List hdfs directory {0} error: {1}'.format(hdfsDirectory, str(exception))) + raise exception + + for f in listing: + if f.type == 'DIRECTORY': + subHdfsDirectory = posixpath.join(hdfsDirectory, f.pathSuffix) + subLocalDirectory = os.path.join(localDirectory, f.pathSuffix) + copyHdfsDirectoryToLocal(subHdfsDirectory, subLocalDirectory, hdfsClient) + elif f.type == 'FILE': + hdfsFilePath = posixpath.join(hdfsDirectory, f.pathSuffix) + localFilePath = os.path.join(localDirectory, f.pathSuffix) + copyHdfsFileToLocal(hdfsFilePath, localFilePath, hdfsClient) + else: + raise AssertionError('unexpected type {}'.format(f.type)) + +def copyHdfsFileToLocal(hdfsFilePath, localFilePath, hdfsClient, override=True): + '''Copy file from HDFS to local''' + if not hdfsClient.exists(hdfsFilePath): + raise Exception('HDFS file {} does not exist!'.format(hdfsFilePath)) + try: + file_status = hdfsClient.get_file_status(hdfsFilePath) + if file_status.type != 'FILE': + raise Exception('HDFS file path {} is not a file'.format(hdfsFilePath)) + except Exception as exception: + nni_log(LogType.Error, 'Get hdfs file {0} status error: {1}'.format(hdfsFilePath, str(exception))) + raise exception + + if os.path.exists(localFilePath) and override: + os.remove(localFilePath) + try: + hdfsClient.copy_to_local(hdfsFilePath, localFilePath) + except Exception as exception: + nni_log(LogType.Error, 'Copy hdfs file {0} to {1} error: {2}'.format(hdfsFilePath, localFilePath, str(exception))) + raise exception + nni_log(LogType.Info, 'Successfully copied hdfs file {0} to {1}, {2} bytes'.format(hdfsFilePath, localFilePath, file_status.length)) def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): - '''Copy directory from local to hdfs''' + '''Copy directory from local to HDFS''' if not os.path.exists(localDirectory): raise Exception('Local Directory does not exist!') hdfsClient.mkdirs(hdfsDirectory) @@ -34,19 +79,19 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): try: result = result and copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient) except Exception as exception: - print(exception) + nni_log(LogType.Error, 'Copy local directory {0} to hdfs directory {1} error: {2}'.format(file_path, hdfs_directory, str(exception))) result = False else: hdfs_file_path = os.path.join(hdfsDirectory, file) try: result = result and copyFileToHdfs(file_path, hdfs_file_path, hdfsClient) except Exception as exception: - print(exception) + nni_log(LogType.Error, 'Copy local file {0} to hdfs {1} error: {2}'.format(file_path, hdfs_file_path, str(exception))) result = False return result def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): - '''Copy a local file to hdfs directory''' + '''Copy a local file to HDFS directory''' if not os.path.exists(localFilePath): raise Exception('Local file Path does not exist!') if os.path.isdir(localFilePath): @@ -60,5 +105,5 @@ def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): hdfsClient.copy_from_local(localFilePath, hdfsFilePath) return True except Exception as exception: - print(exception) + nni_log(LogType.Error, 'Copy local file {0} to hdfs file {1} error: {2}'.format(localFilePath, hdfsFilePath, str(exception))) return False \ No newline at end of file diff --git a/tools/nni_trial_tool/trial_keeper.py b/tools/nni_trial_tool/trial_keeper.py index ad6535b951..dc449fa303 100644 --- a/tools/nni_trial_tool/trial_keeper.py +++ b/tools/nni_trial_tool/trial_keeper.py @@ -28,7 +28,7 @@ from pyhdfs import HdfsClient from .constants import HOME_DIR, LOG_DIR, NNI_PLATFORM, STDOUT_FULL_PATH, STDERR_FULL_PATH -from .hdfsClientUtility import copyDirectoryToHdfs +from .hdfsClientUtility import copyDirectoryToHdfs, copyHdfsDirectoryToLocal from .log_utils import LogType, nni_log from .metrics_reader import read_experiment_metrics @@ -42,6 +42,15 @@ def main_loop(args): stdout_file = open(STDOUT_FULL_PATH, 'a+') stderr_file = open(STDERR_FULL_PATH, 'a+') + + try: + hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name, timeout=5) + except Exception as e: + nni_log(LogType.Error, 'Create HDFS client error: ' + str(e)) + raise e + + copyHdfsDirectoryToLocal(args.nni_hdfs_exp_dir, os.getcwd(), hdfs_client) + # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior process = Popen(args.trial_command, shell = True, stdout = stdout_file, stderr = stderr_file) nni_log(LogType.Info, 'Trial keeper spawns a subprocess (pid {0}) to run command: {1}'.format(process.pid, shlex.split(args.trial_command))) @@ -57,7 +66,6 @@ def main_loop(args): # Copy local directory to hdfs for OpenPAI nni_local_output_dir = os.environ['NNI_OUTPUT_DIR'] try: - hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name, timeout=5) if copyDirectoryToHdfs(nni_local_output_dir, args.pai_hdfs_output_dir, hdfs_client): nni_log(LogType.Info, 'copy directory from {0} to {1} success!'.format(nni_local_output_dir, args.pai_hdfs_output_dir)) else: @@ -85,6 +93,7 @@ def trial_keeper_help_info(*args): PARSER.add_argument('--pai_hdfs_output_dir', type=str, help='the output dir of hdfs') PARSER.add_argument('--pai_hdfs_host', type=str, help='the host of hdfs') PARSER.add_argument('--pai_user_name', type=str, help='the username of hdfs') + PARSER.add_argument('--nni_hdfs_exp_dir', type=str, help='nni experiment directory in hdfs') args, unknown = PARSER.parse_known_args() if args.trial_command is None: exit(1)