diff --git a/deployment/Dockerfile b/deployment/Dockerfile new file mode 100644 index 0000000000..9527b87225 --- /dev/null +++ b/deployment/Dockerfile @@ -0,0 +1,13 @@ +FROM nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 + +LABEL maintainer='Microsoft NNI Team' + +# +#Tensorflow 1.10.0 +# +RUN pip3 --no-cache-dir install tensorflow-gpu==1.10.0 + +# +#Keras 2.1.6 +# +RUN pip3 --no-cache-dir install Keras==2.1.6 \ No newline at end of file diff --git a/deployment/Dockerfile.build.base b/deployment/Dockerfile.build.base new file mode 100644 index 0000000000..8fd7bf69aa --- /dev/null +++ b/deployment/Dockerfile.build.base @@ -0,0 +1,48 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +LABEL maintainer='Microsoft NNI Team' + +RUN apt-get update && apt-get install -y --no-install-recommends \ + sudo apt-utils git curl vim unzip openssh-client wget \ + build-essential cmake \ + libopenblas-dev + +# +# Python 3.5 +# +RUN apt-get install -y --no-install-recommends python3.5 python3.5-dev python3-pip python3-tk && \ + pip3 install --no-cache-dir --upgrade pip setuptools && \ + echo "alias python='python3'" >> /root/.bash_aliases && \ + echo "alias pip='pip3'" >> /root/.bash_aliases + +# numpy 1.14.3 scipy 1.1.0 +RUN pip3 --no-cache-dir install \ + numpy==1.14.3 scipy==1.1.0 + +# +#Install node 10.9.0, yarn 1.9.4, NNI v0.1 +# +RUN git clone -b v0.1 https://github.com/Microsoft/nni.git +RUN cd nni && sh install.sh +RUN echo 'PATH=~/.local/node/bin:~/.local/yarn/bin:~/.local/bin:$PATH' >> ~/.bashrc +RUN cd .. && rm -rf nni diff --git a/deployment/README.md b/deployment/README.md new file mode 100644 index 0000000000..b19ff06260 --- /dev/null +++ b/deployment/README.md @@ -0,0 +1,20 @@ +Dockerfile +=== +## 1.Description +This is the Dockerfile of the NNI project. The image includes several popular deep learning frameworks and NNI v0.1 installed from source, so you can run your NNI experiment directly in this Docker container. +## 2.Included Libraries + +``` +Ubuntu 16.04 LTS +CUDA 9.0, CuDNN 7.0 +numpy 1.14.3,scipy 1.1.0 +TensorFlow 1.10.0 +Keras 2.1.6 +NNI v0.1 +``` + +## 3.How to run + + docker build -f Dockerfile.build.base -t nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 . + docker build -t nni/nni . 
+ nvidia-docker run -it nni/nni \ No newline at end of file diff --git a/setup.py b/setup.py index d6703cfb3e..eeee54d075 100644 --- a/setup.py +++ b/setup.py @@ -80,10 +80,17 @@ def run(self): 'psutil', 'pyyaml', 'requests', - 'scipy' + 'scipy', + 'schema' + ], + dependency_links = [ + 'git+https://github.com/hyperopt/hyperopt.git' ], cmdclass={ 'install': CustomInstallCommand + }, + entry_points={ + 'console_scripts': ['nnictl = nnicmd.nnictl:parse_args'] } ) diff --git a/src/nni_manager/common/datastore.ts b/src/nni_manager/common/datastore.ts index 99a7f3df8a..b86b0a95fe 100644 --- a/src/nni_manager/common/datastore.ts +++ b/src/nni_manager/common/datastore.ts @@ -26,14 +26,14 @@ type TrialJobEvent = TrialJobStatus | 'USER_TO_CANCEL' | 'ADD_CUSTOMIZED'; type MetricType = 'PERIODICAL' | 'FINAL' | 'CUSTOM'; interface ExperimentProfileRecord { - readonly timestamp: Date; + readonly timestamp: number; readonly experimentId: number; readonly revision: number; readonly data: ExperimentProfile; } interface TrialJobEventRecord { - readonly timestamp: Date; + readonly timestamp: number; readonly trialJobId: string; readonly event: TrialJobEvent; readonly data?: string; @@ -49,7 +49,7 @@ interface MetricData { } interface MetricDataRecord { - readonly timestamp: Date; + readonly timestamp: number; readonly trialJobId: string; readonly parameterId: string; readonly type: MetricType; @@ -60,8 +60,8 @@ interface MetricDataRecord { interface TrialJobInfo { id: string; status: TrialJobStatus; - startTime?: Date; - endTime?: Date; + startTime?: number; + endTime?: number; hyperParameters?: string; logPath?: string; finalMetricData?: string; @@ -96,4 +96,4 @@ abstract class Database { export { DataStore, Database, TrialJobEvent, MetricType, MetricData, TrialJobInfo, ExperimentProfileRecord, TrialJobEventRecord, MetricDataRecord -} +}; diff --git a/src/nni_manager/common/manager.ts b/src/nni_manager/common/manager.ts index 989e55b81c..10fb9a4227 100644 --- 
a/src/nni_manager/common/manager.ts +++ b/src/nni_manager/common/manager.ts @@ -59,8 +59,8 @@ interface ExperimentProfile { params: ExperimentParams; id: string; execDuration: number; - startTime?: Date; - endTime?: Date; + startTime?: number; + endTime?: number; revision: number; } @@ -69,6 +69,11 @@ interface TrialJobStatistics { trialJobNumber: number; } +interface NNIManagerStatus { + status: 'INITIALIZED' | 'EXPERIMENT_RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED'; + errors: string[]; +} + abstract class Manager { public abstract startExperiment(experimentParams: ExperimentParams): Promise; public abstract resumeExperiment(): Promise; @@ -86,6 +91,7 @@ abstract class Manager { public abstract getMetricData(trialJobId: string, metricType: MetricType): Promise; public abstract getTrialJobStatistics(): Promise; + public abstract getStatus(): NNIManagerStatus; } -export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType }; +export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus }; diff --git a/src/nni_manager/rest_server/server.ts b/src/nni_manager/common/restServer.ts similarity index 78% rename from src/nni_manager/rest_server/server.ts rename to src/nni_manager/common/restServer.ts index d6ab92935e..7929e4344a 100644 --- a/src/nni_manager/rest_server/server.ts +++ b/src/nni_manager/common/restServer.ts @@ -19,28 +19,27 @@ 'use strict'; -import * as bodyParser from 'body-parser'; import * as express from 'express'; import * as http from 'http'; import { Deferred } from 'ts-deferred'; +import { getLogger, Logger } from './log'; -import * as component from '../common/component'; -import { getLogger, Logger } from '../common/log'; -import { Manager } from '../common/manager'; -import { createRestHandler } from './restHandler'; - -@component.Singleton -export class RestServer { - public static readonly DEFAULT_PORT: number = 51188; - private readonly API_ROOT_URL: string = 
'/api/v1/nni'; - private hostName: string = '0.0.0.0'; - private port: number = RestServer.DEFAULT_PORT; +/** + * Abstract base class for creating a RestServer. + * A module that wants to use a RestServer should extend this abstract class + * and implement its own registerRestHandler() function to register routers. + */ +export abstract class RestServer { private startTask!: Deferred; private stopTask!: Deferred; - private app: express.Application = express(); private server!: http.Server; - private log: Logger = getLogger(); + /** These fields are accessible to subclasses */ + protected hostName: string = '0.0.0.0'; + protected port?: number; + protected app: express.Application = express(); + protected log: Logger = getLogger(); + get endPoint(): string { // tslint:disable-next-line:no-http-string return `http://${this.hostName}:${this.port}`; @@ -61,7 +60,7 @@ export class RestServer { this.port = port; } - this.server = this.app.listen(this.port, this.hostName).on('listening', () => { + this.server = this.app.listen(this.port as number, this.hostName).on('listening', () => { this.startTask.resolve(); }).on('error', (e: Error) => { this.startTask.reject(e); @@ -100,8 +99,8 @@ export class RestServer { return this.stopTask.promise; } - private registerRestHandler(): void { - this.app.use(bodyParser.json()); - this.app.use(this.API_ROOT_URL, createRestHandler(this)); - } + /** + * Register the REST handler; left for subclasses to implement + */ + protected abstract registerRestHandler(): void; } diff --git a/src/nni_manager/common/trainingService.ts b/src/nni_manager/common/trainingService.ts index dce65f05e0..0b8708394c 100644 --- a/src/nni_manager/common/trainingService.ts +++ b/src/nni_manager/common/trainingService.ts @@ -58,9 +58,9 @@ interface HostJobApplicationForm extends JobApplicationForm { interface TrialJobDetail { readonly id: string; readonly status: TrialJobStatus; - readonly submitTime: Date; - readonly startTime?: Date; - readonly endTime?: Date; + 
readonly submitTime: number; + readonly startTime?: number; + readonly endTime?: number; readonly tags?: string[]; readonly url?: string; readonly workingDirectory: string; diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index 98beed927a..ba0650ef28 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -19,6 +19,7 @@ 'use strict'; +import * as assert from 'assert'; import { randomBytes } from 'crypto'; import * as fs from 'fs'; import * as os from 'os'; @@ -32,7 +33,7 @@ import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from import { Manager } from './manager'; import { TrainingService } from './trainingService'; -function getExperimentRootDir(): string{ +function getExperimentRootDir(): string { return path.join(os.homedir(), 'nni', 'experiments', getExperimentId()); } @@ -115,6 +116,12 @@ function uniqueString(len: number): string { return String.fromCharCode(...codes); } +function randomSelect(a: T[]): T { + assert(a !== undefined); + + // tslint:disable-next-line:insecure-random + return a[Math.floor(Math.random() * a.length)]; +} function parseArg(names: string[]): string { if (process.argv.length >= 4) { for (let i: number = 2; i < process.argv.length - 1; i++) { @@ -223,4 +230,4 @@ function cleanupUnitTest(): void { } export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, mkDirP, delay, prepareUnitTest, - parseArg, cleanupUnitTest, uniqueString }; + parseArg, cleanupUnitTest, uniqueString, randomSelect }; diff --git a/src/nni_manager/core/nniDataStore.ts b/src/nni_manager/core/nniDataStore.ts index da8b4ca8ae..47c2f01dc3 100644 --- a/src/nni_manager/core/nniDataStore.ts +++ b/src/nni_manager/core/nniDataStore.ts @@ -126,7 +126,7 @@ class NNIDataStore implements DataStore { type: metrics.type, sequence: metrics.sequence, data: metrics.value, - timestamp: new Date() + timestamp: Date.now() })); } diff --git 
a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 79923b1a0a..48d9fa3c83 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -29,7 +29,7 @@ import { getExperimentId } from '../common/experimentStartupInfo'; import { getLogger, Logger } from '../common/log'; import { ExperimentParams, ExperimentProfile, Manager, - ProfileUpdateType, TrialJobStatistics + NNIManagerStatus, ProfileUpdateType, TrialJobStatistics } from '../common/manager'; import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus @@ -56,41 +56,26 @@ class NNIManager implements Manager { private dataStore: DataStore; private experimentProfile: ExperimentProfile; private dispatcherPid: number; + private status: NNIManagerStatus; constructor() { this.currSubmittedTrialNum = 0; this.trialConcurrencyReduction = 0; this.customizedTrials = []; - const experimentId: string = getExperimentId(); this.trainingService = component.get(TrainingService); assert(this.trainingService); this.dispatcherPid = 0; this.log = getLogger(); this.dataStore = component.get(DataStore); - this.experimentProfile = { - id: experimentId, - revision: 0, - execDuration: 0, - params: { - authorName: '', - experimentName: '', - trialConcurrency: 0, - maxExecDuration: 0, // unit: second - maxTrialNum: 0, // maxTrialNum includes all the submitted trial jobs - searchSpace: '', - tuner: { - className: '', - classArgs: {}, - checkpointDir: '' - } - } + this.experimentProfile = this.createEmptyExperimentProfile(); + this.status = { + status: 'INITIALIZED', + errors: [] }; } public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise { - // TO DO: remove this line, and let rest server do data type validation - experimentProfile.startTime = new Date(experimentProfile.startTime); switch (updateType) { case 'TRIAL_CONCURRENCY': 
this.updateTrialConcurrency(experimentProfile.params.trialConcurrency); @@ -140,10 +125,11 @@ class NNIManager implements Manager { 'start', expParams.tuner.checkpointDir); - this.experimentProfile.startTime = new Date(); + this.experimentProfile.startTime = Date.now(); + this.status.status = 'EXPERIMENT_RUNNING'; await this.storeExperimentProfile(); - this.run().catch(err => { - this.log.error(err.stack); + this.run().catch((err: Error) => { + this.criticalError(err); }); return this.experimentProfile.id; } @@ -172,6 +158,8 @@ class NNIManager implements Manager { .filter((job: TrialJobInfo) => job.status === 'WAITING' || job.status === 'RUNNING') .map((job: TrialJobInfo) => this.dataStore.storeTrialJobEvent('FAILED', job.id))); + this.status.status = 'EXPERIMENT_RUNNING'; + // TO DO: update database record for resume event this.run().catch(console.error); } @@ -206,6 +194,7 @@ class NNIManager implements Manager { } public stopExperiment(): Promise { + this.status.status = 'STOPPING'; if (this.trialJobsMaintainer !== undefined) { this.trialJobsMaintainer.setStopLoop(); @@ -227,6 +216,10 @@ class NNIManager implements Manager { return deferred.promise; } + public getStatus(): NNIManagerStatus { + return this.status; + } + public async listTrialJobs(status?: TrialJobStatus): Promise { return this.dataStore.listTrialJobs(status); } @@ -329,16 +322,17 @@ class NNIManager implements Manager { } } await this.trainingService.cleanUp(); - this.experimentProfile.endTime = new Date(); + this.experimentProfile.endTime = Date.now(); await this.storeExperimentProfile(); + this.status.status = 'STOPPED'; } private async periodicallyUpdateExecDuration(): Promise { - const startTime: Date = new Date(); + const startTime: number = Date.now(); const execDuration: number = this.experimentProfile.execDuration; for (; ;) { await delay(1000 * 60 * 10); // 10 minutes - this.experimentProfile.execDuration = execDuration + (Date.now() - startTime.getTime()) / 1000; + 
this.experimentProfile.execDuration = execDuration + (Date.now() - startTime) / 1000; await this.storeExperimentProfile(); } } @@ -349,115 +343,178 @@ class NNIManager implements Manager { return this.dataStore.storeExperimentProfile(this.experimentProfile); } - // tslint:disable-next-line:max-func-body-length - private runInternal(): Promise { - // TO DO: cannot run this method more than once in one NNIManager instance - if (this.dispatcher === undefined) { - throw new Error('Error: tuner has not been setup'); - } - this.trainingService.addTrialJobMetricListener(async (metric: TrialJobMetric) => { - await this.dataStore.storeMetricData(metric.id, metric.data); - if (this.dispatcher === undefined) { - throw new Error('Error: tuner has not been setup'); - } - this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data); - }); - + private async run(): Promise { this.trialJobsMaintainer = new TrialJobs( this.trainingService, this.experimentProfile.execDuration, this.experimentProfile.params.maxExecDuration); + + assert(this.dispatcher !== undefined && this.trialJobsMaintainer !== undefined); + + this.addEventListeners(); + + this.sendInitTunerCommands(); + + await Promise.all([ + this.periodicallyUpdateExecDuration(), + this.trainingService.run(), + this.trialJobsMaintainer.run()]); + } + + private addEventListeners(): void { + // TO DO: cannot run this method more than once in one NNIManager instance + if (this.dispatcher === undefined || this.trialJobsMaintainer === undefined) { + throw new Error('Error: tuner or job maintainer have not been setup'); + } + this.trainingService.addTrialJobMetricListener((metric: TrialJobMetric) => { + this.onTrialJobMetrics(metric).catch((err: Error) => { + this.criticalError(err); + }); + }); + this.trialJobsMaintainer.on(async (event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail) => { - if (trialJobDetail !== undefined) { - this.log.debug(`Job event: ${event}, id: ${trialJobDetail.id}`); - } else { - this.log.debug(`Job 
event: ${event}`); - } - if (this.dispatcher === undefined) { - throw new Error('Error: tuner has not been setup'); - } - switch (event) { - case 'SUCCEEDED': - case 'FAILED': - case 'USER_CANCELED': - case 'SYS_CANCELED': - if (this.trialConcurrencyReduction > 0) { - this.trialConcurrencyReduction--; - } else { - if (this.currSubmittedTrialNum < this.experimentProfile.params.maxTrialNum) { - if (this.customizedTrials.length > 0) { - const hyperParams: string | undefined = this.customizedTrials.shift(); - this.dispatcher.sendCommand(ADD_CUSTOMIZED_TRIAL_JOB, hyperParams); - } else { - this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, '1'); - } - } - } - this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({trial_job_id: trialJobDetail.id, event: event})); - await this.dataStore.storeTrialJobEvent(event, trialJobDetail.id, undefined, trialJobDetail.url); - break; - case 'RUNNING': - await this.dataStore.storeTrialJobEvent(event, trialJobDetail.id, undefined, trialJobDetail.url); - break; - case 'EXPERIMENT_DONE': - this.log.info('Experiment done, cleaning up...'); - await this.experimentDoneCleanUp(); - this.log.info('Experiment done.'); - break; - default: - throw new Error('Error: unrecognized event from trialJobsMaintainer'); - } + this.onTrialJobEvent(event, trialJobDetail).catch((err: Error) => { + this.criticalError(err); + }); }); + this.dispatcher.onCommand((commandType: string, content: string) => { + this.onTunerCommand(commandType, content).catch((err: Error) => { + this.criticalError(err); + }); + }); + } + + private sendInitTunerCommands(): void { + if (this.dispatcher === undefined) { + throw new Error('Error: tuner has not been setup'); + } // TO DO: we should send INITIALIZE command to tuner if user's tuner needs to run init method in tuner this.log.debug(`Send tuner command: update search space: ${this.experimentProfile.params.searchSpace}`); this.dispatcher.sendCommand(UPDATE_SEARCH_SPACE, this.experimentProfile.params.searchSpace); if 
(this.trialConcurrencyReduction !== 0) { - return Promise.reject(new Error('Error: cannot modify trialConcurrency before startExperiment')); + throw new Error('Error: cannot modify trialConcurrency before startExperiment'); } - this.log.debug(`Send tuner command: ${this.experimentProfile.params.trialConcurrency}`) + this.log.debug(`Send tuner command: ${this.experimentProfile.params.trialConcurrency}`); this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(this.experimentProfile.params.trialConcurrency)); - this.dispatcher.onCommand(async (commandType: string, content: string) => { - this.log.info(`Command from tuner: ${commandType}, ${content}`); - if (this.trialJobsMaintainer === undefined) { - throw new Error('Error: trialJobsMaintainer not initialized'); - } - switch (commandType) { - case NEW_TRIAL_JOB: + } + + private async onTrialJobMetrics(metric: TrialJobMetric): Promise { + await this.dataStore.storeMetricData(metric.id, metric.data); + if (this.dispatcher === undefined) { + throw new Error('Error: tuner has not been setup'); + } + this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data); + } + + private async onTrialJobEvent(event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail): Promise { + if (trialJobDetail !== undefined) { + this.log.debug(`Job event: ${event}, id: ${trialJobDetail.id}`); + } else { + this.log.debug(`Job event: ${event}`); + } + if (this.dispatcher === undefined) { + throw new Error('Error: tuner has not been setup'); + } + switch (event) { + case 'SUCCEEDED': + case 'FAILED': + case 'USER_CANCELED': + case 'SYS_CANCELED': + if (this.trialConcurrencyReduction > 0) { + this.trialConcurrencyReduction--; + } else { if (this.currSubmittedTrialNum < this.experimentProfile.params.maxTrialNum) { - this.currSubmittedTrialNum++; - const trialJobAppForm: TrialJobApplicationForm = { - jobType: 'TRIAL', - hyperParameters: content - }; - const trialJobDetail: TrialJobDetail = await 
this.trainingService.submitTrialJob(trialJobAppForm); - this.trialJobsMaintainer.setTrialJob(trialJobDetail.id, Object.assign({}, trialJobDetail)); - assert(trialJobDetail.status === 'WAITING'); - await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, content, trialJobDetail.url); - if (this.currSubmittedTrialNum === this.experimentProfile.params.maxTrialNum) { - this.trialJobsMaintainer.setNoMoreTrials(); + if (this.customizedTrials.length > 0) { + const hyperParams: string | undefined = this.customizedTrials.shift(); + this.dispatcher.sendCommand(ADD_CUSTOMIZED_TRIAL_JOB, hyperParams); + } else { + this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, '1'); } } - break; - case NO_MORE_TRIAL_JOBS: - this.trialJobsMaintainer.setNoMoreTrials(); - break; - case KILL_TRIAL_JOB: - await this.trainingService.cancelTrialJob(JSON.parse(content)); - break; - default: - throw new Error(`Error: unsupported command type: [${commandType}]`); - } - }); + } + this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({trial_job_id: trialJobDetail.id, event: event})); + await this.dataStore.storeTrialJobEvent(event, trialJobDetail.id, undefined, trialJobDetail.url); + break; + case 'RUNNING': + await this.dataStore.storeTrialJobEvent(event, trialJobDetail.id, undefined, trialJobDetail.url); + break; + case 'EXPERIMENT_DONE': + this.log.info('Experiment done, cleaning up...'); + await this.experimentDoneCleanUp(); + this.log.info('Experiment done.'); + break; + default: + throw new Error('Error: unrecognized event from trialJobsMaintainer'); + } + } - return this.trialJobsMaintainer.run(); + private async onTunerCommand(commandType: string, content: string): Promise { + this.log.info(`Command from tuner: ${commandType}, ${content}`); + if (this.trialJobsMaintainer === undefined) { + throw new Error('Error: trialJobsMaintainer not initialized'); + } + switch (commandType) { + case NEW_TRIAL_JOB: + if (this.currSubmittedTrialNum < 
this.experimentProfile.params.maxTrialNum) { + this.currSubmittedTrialNum++; + const trialJobAppForm: TrialJobApplicationForm = { + jobType: 'TRIAL', + hyperParameters: content + }; + const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm); + this.trialJobsMaintainer.setTrialJob(trialJobDetail.id, Object.assign({}, trialJobDetail)); + // TO DO: to uncomment + assert(trialJobDetail.status === 'WAITING'); + await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, content, trialJobDetail.url); + if (this.currSubmittedTrialNum === this.experimentProfile.params.maxTrialNum) { + this.trialJobsMaintainer.setNoMoreTrials(); + } + } + break; + case NO_MORE_TRIAL_JOBS: + this.trialJobsMaintainer.setNoMoreTrials(); + break; + case KILL_TRIAL_JOB: + await this.trainingService.cancelTrialJob(JSON.parse(content)); + break; + default: + throw new Error('Error: unsupported command type from tuner'); + } } - private async run(): Promise { - await Promise.all([ - this.periodicallyUpdateExecDuration(), - this.trainingService.run(), - this.runInternal()]); + private criticalError(err: Error): void { + this.logError(err); + console.error(err); + } + + private logError(err: Error): void { + if (err.stack !== undefined) { + this.log.error(err.stack); + } + this.status.errors.push(err.message); + this.status.status = 'ERROR'; + } + + private createEmptyExperimentProfile(): ExperimentProfile { + return { + id: getExperimentId(), + revision: 0, + execDuration: 0, + params: { + authorName: '', + experimentName: '', + trialConcurrency: 0, + maxExecDuration: 0, // unit: second + maxTrialNum: 0, // maxTrialNum includes all the submitted trial jobs + searchSpace: '', + tuner: { + className: '', + classArgs: {}, + checkpointDir: '' + } + } + }; } } diff --git a/src/nni_manager/core/sqlDatabase.ts b/src/nni_manager/core/sqlDatabase.ts index aeaf475536..4f4755d331 100644 --- a/src/nni_manager/core/sqlDatabase.ts +++ 
b/src/nni_manager/core/sqlDatabase.ts @@ -60,15 +60,15 @@ function loadExperimentProfile(row: any): ExperimentProfile { params: JSON.parse(row.params), id: row.id, execDuration: row.execDuration, - startTime: row.startTime === null ? undefined : new Date(row.startTime), - endTime: row.endTime === null ? undefined : new Date(row.endTime), + startTime: row.startTime === null ? undefined : row.startTime, + endTime: row.endTime === null ? undefined : row.endTime, revision: row.revision }; } function loadTrialJobEvent(row: any): TrialJobEventRecord { return { - timestamp: new Date(row.timestamp), + timestamp: row.timestamp, trialJobId: row.trialJobId, event: row.event, data: row.data === null ? undefined : row.data, @@ -78,7 +78,7 @@ function loadTrialJobEvent(row: any): TrialJobEventRecord { function loadMetricData(row: any): MetricDataRecord { return { - timestamp: new Date(row.timestamp), + timestamp: row.timestamp, trialJobId: row.trialJobId, parameterId: row.parameterId, type: row.type, @@ -132,8 +132,8 @@ class SqlDB implements Database { JSON.stringify(exp.params), exp.id, exp.execDuration, - exp.startTime === undefined ? null : exp.startTime.getTime(), - exp.endTime === undefined ? null : exp.endTime.getTime(), + exp.startTime === undefined ? null : exp.startTime, + exp.endTime === undefined ? 
null : exp.endTime, exp.revision ]; diff --git a/src/nni_manager/core/test/dataStore.test.ts b/src/nni_manager/core/test/dataStore.test.ts index 268221bc73..603daa1ac4 100644 --- a/src/nni_manager/core/test/dataStore.test.ts +++ b/src/nni_manager/core/test/dataStore.test.ts @@ -76,8 +76,8 @@ describe('Unit test for dataStore', () => { }, id: 'exp123', execDuration: 0, - startTime: new Date(), - endTime: new Date(), + startTime: Date.now(), + endTime: Date.now(), revision: 0 } const id: string = profile.id; @@ -128,14 +128,14 @@ describe('Unit test for dataStore', () => { parameter_id: 'abc', type: 'PERIODICAL', value: 'acc: 0.88', - timestamp: new Date() + timestamp: Date.now() }, { trial_job_id: '111', parameter_id: 'abc', type: 'FINAL', value: 'acc: 0.88', - timestamp: new Date() + timestamp: Date.now() } ]; diff --git a/src/nni_manager/core/test/mockedDatastore.ts b/src/nni_manager/core/test/mockedDatastore.ts index cb74857e75..efe2c24b11 100644 --- a/src/nni_manager/core/test/mockedDatastore.ts +++ b/src/nni_manager/core/test/mockedDatastore.ts @@ -118,7 +118,7 @@ class MockedDataStore implements DataStore { async storeTrialJobEvent(event: TrialJobEvent, trialJobId: string, data?: string | undefined): Promise { const dataRecord: TrialJobEventRecord = { event: event, - timestamp: new Date(), + timestamp: Date.now(), trialJobId: trialJobId, data: data } @@ -175,7 +175,7 @@ class MockedDataStore implements DataStore { parameterId: metrics.parameter_id, type: metrics.type, data: metrics.value, - timestamp: new Date() + timestamp: Date.now() }); } @@ -234,13 +234,13 @@ class MockedDataStore implements DataStore { } switch (record.event) { case 'RUNNING': - jobInfo.startTime = new Date(); + jobInfo.startTime = Date.now(); break; case 'SUCCEEDED': case 'FAILED': case 'USER_CANCELED': case 'SYS_CANCELED': - jobInfo.endTime = new Date(); + jobInfo.endTime = Date.now(); } jobInfo.status = this.getJobStatusByLatestEvent(record.event); map.set(record.trialJobId, jobInfo); 
diff --git a/src/nni_manager/core/test/mockedTrainingService.ts b/src/nni_manager/core/test/mockedTrainingService.ts index 4263065235..352022a278 100644 --- a/src/nni_manager/core/test/mockedTrainingService.ts +++ b/src/nni_manager/core/test/mockedTrainingService.ts @@ -34,9 +34,9 @@ class MockedTrainingService extends TrainingService { public jobDetail1: TrialJobDetail = { id: '1234', status: 'SUCCEEDED', - submitTime: new Date(), - startTime: new Date(), - endTime: new Date(), + submitTime: Date.now(), + startTime: Date.now(), + endTime: Date.now(), tags: ['test'], url: 'http://test', workingDirectory: '/tmp/mocked', @@ -47,9 +47,9 @@ class MockedTrainingService extends TrainingService { public jobDetail2: TrialJobDetail = { id: '3456', status: 'SUCCEEDED', - submitTime: new Date(), - startTime: new Date(), - endTime: new Date(), + submitTime: Date.now(), + startTime: Date.now(), + endTime: Date.now(), tags: ['test'], url: 'http://test', workingDirectory: '/tmp/mocked', diff --git a/src/nni_manager/core/test/sqlDatabase.test.ts b/src/nni_manager/core/test/sqlDatabase.test.ts index b925ee7396..7a67e98e96 100644 --- a/src/nni_manager/core/test/sqlDatabase.test.ts +++ b/src/nni_manager/core/test/sqlDatabase.test.ts @@ -62,33 +62,33 @@ const expParams2: ExperimentParams = { }; const profiles: ExperimentProfile[] = [ - { params: expParams1, id: '#1', execDuration: 0, startTime: new Date(), endTime: undefined, revision: 1 }, - { params: expParams1, id: '#1', execDuration: 0, startTime: new Date(), endTime: new Date(), revision: 2 }, - { params: expParams2, id: '#2', execDuration: 0, startTime: new Date(), endTime: new Date(), revision: 2 }, - { params: expParams2, id: '#2', execDuration: 0, startTime: new Date(), endTime: new Date(), revision: 3 } + { params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: undefined, revision: 1 }, + { params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2 }, + { 
params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2 }, + { params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 3 } ]; const events: TrialJobEventRecord[] = [ - { timestamp: new Date(), event: 'WAITING', trialJobId: 'A', data: 'hello' }, // 0 - { timestamp: new Date(), event: 'UNKNOWN', trialJobId: 'B', data: 'world' }, // 1 - { timestamp: new Date(), event: 'RUNNING', trialJobId: 'B', data: undefined }, // 2 - { timestamp: new Date(), event: 'RUNNING', trialJobId: 'A', data: '123' }, // 3 - { timestamp: new Date(), event: 'FAILED', trialJobId: 'A', data: undefined } // 4 + { timestamp: Date.now(), event: 'WAITING', trialJobId: 'A', data: 'hello' }, // 0 + { timestamp: Date.now(), event: 'UNKNOWN', trialJobId: 'B', data: 'world' }, // 1 + { timestamp: Date.now(), event: 'RUNNING', trialJobId: 'B', data: undefined }, // 2 + { timestamp: Date.now(), event: 'RUNNING', trialJobId: 'A', data: '123' }, // 3 + { timestamp: Date.now(), event: 'FAILED', trialJobId: 'A', data: undefined } // 4 ]; const metrics: MetricDataRecord[] = [ - { timestamp: new Date(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 0, data: 1.1 }, // 0 - { timestamp: new Date(), trialJobId: 'B', parameterId: '2', type: 'PERIODICAL', sequence: 0, data: 2.1 }, // 1 - { timestamp: new Date(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 1, data: 1.2 }, // 2 - { timestamp: new Date(), trialJobId: 'A', parameterId: '1', type: 'FINAL', sequence: 0, data: 1.3 }, // 3 - { timestamp: new Date(), trialJobId: 'C', parameterId: '2', type: 'PERIODICAL', sequence: 1, data: 2.1 }, // 4 - { timestamp: new Date(), trialJobId: 'C', parameterId: '2', type: 'FINAL', sequence: 0, data: 2.2 } // 5 + { timestamp: Date.now(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 0, data: 1.1 }, // 0 + { timestamp: Date.now(), trialJobId: 'B', parameterId: '2', type: 
'PERIODICAL', sequence: 0, data: 2.1 }, // 1 + { timestamp: Date.now(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 1, data: 1.2 }, // 2 + { timestamp: Date.now(), trialJobId: 'A', parameterId: '1', type: 'FINAL', sequence: 0, data: 1.3 }, // 3 + { timestamp: Date.now(), trialJobId: 'C', parameterId: '2', type: 'PERIODICAL', sequence: 1, data: 2.1 }, // 4 + { timestamp: Date.now(), trialJobId: 'C', parameterId: '2', type: 'FINAL', sequence: 0, data: 2.2 } // 5 ]; // tslint:disable-next-line:no-any function assertRecordEqual(record: any, value: any): void { - assert.ok(record.timestamp > new Date(2018, 6, 1)); - assert.ok(record.timestamp < new Date()); + assert.ok(record.timestamp > new Date(2018, 6, 1).getTime()); + assert.ok(record.timestamp < Date.now()); for (const key in value) { // tslint:disable-line:no-for-in if (key !== 'timestamp') { diff --git a/src/nni_manager/core/trialJobs.ts b/src/nni_manager/core/trialJobs.ts index b71914f31b..0d36855563 100644 --- a/src/nni_manager/core/trialJobs.ts +++ b/src/nni_manager/core/trialJobs.ts @@ -26,6 +26,9 @@ import { delay } from '../common/utils'; type TrialJobMaintainerEvent = TrialJobStatus | 'EXPERIMENT_DONE'; +/** + * TrialJobs + */ class TrialJobs { private eventEmitter: EventEmitter; private trialJobs: Map; @@ -93,9 +96,9 @@ class TrialJobs { // Do nothing break; case 'RUNNING': - const oldTrialJobDetail = this.trialJobs.get(trialJobId); + const oldTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId); assert(oldTrialJobDetail); - if (oldTrialJobDetail && oldTrialJobDetail.status === "WAITING") { + if (oldTrialJobDetail !== undefined && oldTrialJobDetail.status === "WAITING") { this.trialJobs.set(trialJobId, trialJobDetail); this.eventEmitter.emit('all', trialJobDetail.status, trialJobDetail); } @@ -112,8 +115,8 @@ class TrialJobs { } public async run(): Promise { - const startTime: Date = new Date(); - while ((Date.now() - startTime.getTime()) / 1000 + 
this.pastExecDuration < this.maxExecDuration) { + const startTime: number = Date.now(); + while ((Date.now() - startTime) / 1000 + this.pastExecDuration < this.maxExecDuration) { if (this.stopLoop || (this.noMoreTrials && this.trialJobs.size === 0)) { break; diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index bddac983ff..6d9c9fa64b 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -31,7 +31,7 @@ import { parseArg, uniqueString, mkDirP, getLogDir } from './common/utils'; import { NNIDataStore } from './core/nniDataStore'; import { NNIManager } from './core/nnimanager'; import { SqlDB } from './core/sqlDatabase'; -import { RestServer } from './rest_server/server'; +import { NNIRestServer } from './rest_server/nniRestServer'; import { LocalTrainingServiceForGPU } from './training_service/local/localTrainingServiceForGPU'; import { RemoteMachineTrainingService @@ -64,7 +64,7 @@ function usage(): void { console.info('usage: node main.js --port --mode --start_mode --experiment_id '); } -let port: number = RestServer.DEFAULT_PORT; +let port: number = NNIRestServer.DEFAULT_PORT; const strPort: string = parseArg(['--port', '-p']); if (strPort && strPort.length > 0) { port = parseInt(strPort, 10); @@ -94,7 +94,7 @@ mkDirP(getLogDir()).then(async () => { const log: Logger = getLogger(); try { await initContainer(mode); - const restServer: RestServer = component.get(RestServer); + const restServer: NNIRestServer = component.get(NNIRestServer); await restServer.start(port); log.info(`Rest server listening on: ${restServer.endPoint}`); } catch (err) { diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index bb2ec2339c..46522044fd 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -13,6 +13,7 @@ "chai-as-promised": "^7.1.1", "child-process-promise": "^2.2.1", "express": "^4.16.3", + "express-joi-validator": "^2.0.0", "node-nvidia-smi": "^1.0.0", "rx": "^4.1.0", "sqlite3": "^4.0.2", diff --git 
a/src/nni_manager/rest_server/nniRestServer.ts b/src/nni_manager/rest_server/nniRestServer.ts new file mode 100644 index 0000000000..315affbd97 --- /dev/null +++ b/src/nni_manager/rest_server/nniRestServer.ts @@ -0,0 +1,55 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import * as bodyParser from 'body-parser'; +import * as component from '../common/component'; +import { RestServer } from '../common/restServer' +import { createRestHandler } from './restHandler'; + +/** + * NNI Main rest server, provides rest API to support + * # nnictl CLI tool + * # NNI Web UI + * + */ +@component.Singleton +export class NNIRestServer extends RestServer{ + /** NNI main rest service default port */ + public static readonly DEFAULT_PORT: number = 51188; + + private readonly API_ROOT_URL: string = '/api/v1/nni'; + + /** + * constructor to provide NNIRestServer's own rest property, e.g. port + */ + constructor() { + super(); + this.port = NNIRestServer.DEFAULT_PORT; + } + + /** + * NNIRestServer's own router registration + */ + protected registerRestHandler(): void { + this.app.use(bodyParser.json()); + this.app.use(this.API_ROOT_URL, createRestHandler(this)); + } +} diff --git a/src/nni_manager/rest_server/restHandler.ts b/src/nni_manager/rest_server/restHandler.ts index 8ddc7af918..e94c256ca1 100644 --- a/src/nni_manager/rest_server/restHandler.ts +++ b/src/nni_manager/rest_server/restHandler.ts @@ -28,16 +28,19 @@ import { NNIError, NNIErrorNames } from '../common/errors'; import { isNewExperiment } from '../common/experimentStartupInfo'; import { getLogger, Logger } from '../common/log'; import { ExperimentProfile, Manager, TrialJobStatistics} from '../common/manager'; -import { RestServer } from './server'; +import { ValidationSchemas } from './restValidationSchemas'; +import { NNIRestServer } from './nniRestServer'; import { TensorBoard } from './tensorboard'; +const expressJoi = require('express-joi-validator'); + class NNIRestHandler { - private restServer: RestServer; + private restServer: NNIRestServer; private nniManager: Manager; private tb: TensorBoard; private log: Logger; - constructor(rs: RestServer) { + constructor(rs: NNIRestServer) { this.nniManager = component.get(Manager); this.restServer = rs; this.tb = 
new TensorBoard(); @@ -75,6 +78,15 @@ class NNIRestHandler { this.startTensorBoard(router); this.stopTensorBoard(router); + // Express-joi-validator configuration + router.use((err: any, req: Request, res: Response, next: any) => { + if (err.isBoom) { + this.log.error(err.output.payload); + + return res.status(err.output.statusCode).json(err.output.payload); + } + }); + return router; } @@ -96,7 +108,7 @@ class NNIRestHandler { router.get('/check-status', (req: Request, res: Response) => { const ds: DataStore = component.get(DataStore); ds.init().then(() => { - res.send(); + res.send(this.nniManager.getStatus()); }).catch(async (err: Error) => { this.handle_error(err, res); this.log.error(err.message); @@ -117,7 +129,7 @@ class NNIRestHandler { } private updateExperimentProfile(router: Router): void { - router.put('/experiment', (req: Request, res: Response) => { + router.put('/experiment', expressJoi(ValidationSchemas.UPDATEEXPERIMENT), (req: Request, res: Response) => { this.nniManager.updateExperimentProfile(req.body, req.query.update_type).then(() => { res.send(); }).catch((err: Error) => { @@ -127,7 +139,7 @@ class NNIRestHandler { } private startExperiment(router: Router): void { - router.post('/experiment', (req: Request, res: Response) => { + router.post('/experiment', expressJoi(ValidationSchemas.STARTEXPERIMENT), (req: Request, res: Response) => { if (isNewExperiment()) { this.nniManager.startExperiment(req.body).then((eid: string) => { res.send({ @@ -171,7 +183,9 @@ class NNIRestHandler { } private setClusterMetaData(router: Router): void { - router.put('/experiment/cluster-metadata', async (req: Request, res: Response) => { + router.put( + '/experiment/cluster-metadata', expressJoi(ValidationSchemas.SETCLUSTERMETADATA), + async (req: Request, res: Response) => { // tslint:disable-next-line:no-any const metadata: any = req.body; const keys: string[] = Object.keys(metadata); @@ -241,7 +255,7 @@ class NNIRestHandler { } private startTensorBoard(router: 
Router): void { - router.post('/tensorboard', async (req: Request, res: Response) => { + router.post('/tensorboard', expressJoi(ValidationSchemas.STARTTENSORBOARD), async (req: Request, res: Response) => { const jobIds: string[] = req.query.job_ids.split(','); const tensorboardCmd: string | undefined = req.query.tensorboard_cmd; this.tb.startTensorBoard(jobIds, tensorboardCmd).then((endPoint: string) => { @@ -253,7 +267,7 @@ class NNIRestHandler { } private stopTensorBoard(router: Router): void { - router.delete('/tensorboard', async (req: Request, res: Response) => { + router.delete('/tensorboard', expressJoi(ValidationSchemas.STOPTENSORBOARD), async (req: Request, res: Response) => { const endPoint: string = req.query.endpoint; this.tb.stopTensorBoard(endPoint).then(() => { res.send(); @@ -285,7 +299,7 @@ class NNIRestHandler { } } -export function createRestHandler(rs: RestServer): Router { +export function createRestHandler(rs: NNIRestServer): Router { const handler: NNIRestHandler = new NNIRestHandler(rs); return handler.createRestHandler(); diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts new file mode 100644 index 0000000000..218a8c22c4 --- /dev/null +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -0,0 +1,97 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +'use strict'; + +const joi = require('joi'); + +export namespace ValidationSchemas { + export const SETCLUSTERMETADATA = { + body: { + machine_list: joi.array().items(joi.object({ + username: joi.string().required(), + ip: joi.string().ip().required(), + port: joi.number().min(1).max(65535).required(), + passwd: joi.string().required(), + sshKeyPath: joi.string(), + passphrase: joi.string() + })), + trial_config: joi.object({ + gpuNum: joi.number().min(0).required(), + codeDir: joi.string().min(1).required(), + command: joi.string().min(1).required() + }) + } + }; + export const STARTEXPERIMENT = { + body: { + experimentName: joi.string().required(), + authorName: joi.string(), + maxTrialNum: joi.number().min(0).required(), + trialConcurrency: joi.number().min(0).required(), + searchSpace: joi.string().required(), + maxExecDuration: joi.number().min(0).required(), + tuner: joi.object({ + builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution'), + codeDir: joi.string(), + classFileName: joi.string(), + className: joi.string(), + classArgs: joi.any(), + gpuNum: joi.number().min(0), + checkpointDir: joi.string() + }).required(), + assessor: joi.object({ + builtinAssessorName: joi.string().valid('Medianstop'), + codeDir: joi.string(), + classFileName: joi.string(), + className: joi.string(), + classArgs: joi.any(), + gpuNum: joi.number().min(0), + checkpointDir: joi.string() + }), + clusterMetaData: joi.array().items(joi.object({ + key: joi.string(), + value: 
joi.any() + })) + } + }; + export const UPDATEEXPERIMENT = { + query: { + update_type: joi.string().required().valid('TRIAL_CONCURRENCY', 'MAX_EXEC_DURATION', 'SEARCH_SPACE') + }, + body: { + id: joi.string().required(), + revision: joi.number().min(0).required(), + params: joi.object(STARTEXPERIMENT.body).required(), + execDuration: joi.number().required(), + startTime: joi.number(), + endTime: joi.number() + } + }; + export const STARTTENSORBOARD = { + query: { + job_ids: joi.string().min(5).max(5).required() + } + }; + export const STOPTENSORBOARD = { + query: { + endpoint: joi.string().uri().required() + } + }; +} diff --git a/src/nni_manager/rest_server/test/mockedNNIManager.ts b/src/nni_manager/rest_server/test/mockedNNIManager.ts index f642223854..5acdd072ce 100644 --- a/src/nni_manager/rest_server/test/mockedNNIManager.ts +++ b/src/nni_manager/rest_server/test/mockedNNIManager.ts @@ -26,7 +26,7 @@ import { MetricDataRecord, MetricType, TrialJobInfo } from '../../common/datasto import { MethodNotImplementedError } from '../../common/errors'; import { ExperimentParams, ExperimentProfile, Manager, ProfileUpdateType, - TrialJobStatistics + TrialJobStatistics, NNIManagerStatus } from '../../common/manager'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus @@ -37,6 +37,12 @@ export const testManagerProvider: Provider = { }; export class MockedNNIManager extends Manager { + public getStatus(): NNIManagerStatus { + return { + status: 'EXPERIMENT_RUNNING', + errors: [] + } + } public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise { return Promise.resolve(); } @@ -65,9 +71,9 @@ export class MockedNNIManager extends Manager { const jobDetail: TrialJobDetail = { id: '1234', status: 'RUNNING', - submitTime: new Date(), - startTime: new Date(), - endTime: new Date(), + submitTime: Date.now(), + startTime: Date.now(), + endTime: Date.now(), tags: ['test'], // tslint:disable-next-line:no-http-string 
url: 'http://test', @@ -108,8 +114,8 @@ export class MockedNNIManager extends Manager { const jobInfo: TrialJobInfo = { id: '1234', status: 'SUCCEEDED', - startTime: new Date(), - endTime: new Date() + startTime: Date.now(), + endTime: Date.now() }; deferred.resolve(jobInfo); @@ -137,8 +143,8 @@ export class MockedNNIManager extends Manager { }, id: '2345', execDuration: 0, - startTime: new Date(), - endTime: new Date(), + startTime: Date.now(), + endTime: Date.now(), revision: 0 }; @@ -148,15 +154,15 @@ export class MockedNNIManager extends Manager { const job1: TrialJobInfo = { id: '1234', status: 'SUCCEEDED', - startTime: new Date(), - endTime: new Date(), + startTime: Date.now(), + endTime: Date.now(), finalMetricData: 'lr: 0.01, val accuracy: 0.89, batch size: 256' }; const job2: TrialJobInfo = { id: '3456', status: 'FAILED', - startTime: new Date(), - endTime: new Date(), + startTime: Date.now(), + endTime: Date.now(), finalMetricData: '' }; diff --git a/src/nni_manager/rest_server/test/restserver.test.ts b/src/nni_manager/rest_server/test/restserver.test.ts index 07fb1fa7bc..a6fba116d9 100644 --- a/src/nni_manager/rest_server/test/restserver.test.ts +++ b/src/nni_manager/rest_server/test/restserver.test.ts @@ -32,7 +32,7 @@ import { TrainingService } from '../../common/trainingService'; import { cleanupUnitTest, prepareUnitTest } from '../../common/utils'; import { MockedDataStore } from '../../core/test/mockedDatastore'; import { MockedTrainingService } from '../../core/test/mockedTrainingService'; -import { RestServer } from '../server'; +import { NNIRestServer } from '../nniRestServer'; import { testManagerProvider } from './mockedNNIManager'; describe('Unit test for rest server', () => { @@ -44,7 +44,7 @@ describe('Unit test for rest server', () => { Container.bind(Manager).provider(testManagerProvider); Container.bind(DataStore).to(MockedDataStore); Container.bind(TrainingService).to(MockedTrainingService); - const restServer: RestServer = 
component.get(RestServer); + const restServer: NNIRestServer = component.get(NNIRestServer); restServer.start().then(() => { ROOT_URL = `${restServer.endPoint}/api/v1/nni`; done(); @@ -54,7 +54,7 @@ describe('Unit test for rest server', () => { }); after(() => { - component.get(RestServer).stop(); + component.get(NNIRestServer).stop(); cleanupUnitTest(); }); diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index b07b82b733..50eb9b3846 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -65,16 +65,16 @@ function decodeCommand(data: Buffer): [boolean, string, string, Buffer] { class LocalTrialJobDetail implements TrialJobDetail { public id: string; public status: TrialJobStatus; - public submitTime: Date; - public startTime?: Date; - public endTime?: Date; + public submitTime: number; + public startTime?: number; + public endTime?: number; public tags?: string[]; public url?: string; public workingDirectory: string; public form: JobApplicationForm; public pid?: number; - constructor(id: string, status: TrialJobStatus, submitTime: Date, workingDirectory: string, form: JobApplicationForm) { + constructor(id: string, status: TrialJobStatus, submitTime: number, workingDirectory: string, form: JobApplicationForm) { this.id = id; this.status = status; this.submitTime = submitTime; @@ -152,7 +152,7 @@ class LocalTrainingService implements TrainingService { } if (!alive) { - trialJob.endTime = new Date(); + trialJob.endTime = Date.now(); this.setTrialJobStatus(trialJob, 'FAILED'); try { const state: string = await fs.promises.readFile(path.join(trialJob.workingDirectory, '.nni', 'state'), 'utf8'); @@ -162,7 +162,7 @@ class LocalTrainingService implements TrainingService { if (parseInt(code, 10) === 0) { this.setTrialJobStatus(trialJob, 'SUCCEEDED'); } - trialJob.endTime = new 
Date(parseInt(timestamp, 10)); + trialJob.endTime = parseInt(timestamp, 10); } } catch (error) { //ignore @@ -191,7 +191,7 @@ class LocalTrainingService implements TrainingService { const trialJobDetail: LocalTrialJobDetail = new LocalTrialJobDetail( trialJobId, 'WAITING', - new Date(), + Date.now(), path.join(this.rootDir, 'trials', trialJobId), form); this.jobQueue.push(trialJobId); @@ -339,7 +339,7 @@ class LocalTrainingService implements TrainingService { const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`); this.setTrialJobStatus(trialJobDetail, 'RUNNING'); - trialJobDetail.startTime = new Date(); + trialJobDetail.startTime = Date.now(); trialJobDetail.pid = process.pid; this.setExtraProperties(trialJobDetail, resource); @@ -372,7 +372,7 @@ class LocalTrainingService implements TrainingService { const jobDetail: LocalTrialJobDetail = { id: jobId, status: 'RUNNING', - submitTime: new Date(), + submitTime: Date.now(), workingDirectory: workDir, form: form, pid: process.pid diff --git a/src/nni_manager/training_service/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai/paiJobRestServer.ts new file mode 100644 index 0000000000..6375eee1c5 --- /dev/null +++ b/src/nni_manager/training_service/pai/paiJobRestServer.ts @@ -0,0 +1,20 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + \ No newline at end of file diff --git a/src/nni_manager/training_service/remote_machine/gpuScheduler.ts b/src/nni_manager/training_service/remote_machine/gpuScheduler.ts index b9505ad419..4d0588b45b 100644 --- a/src/nni_manager/training_service/remote_machine/gpuScheduler.ts +++ b/src/nni_manager/training_service/remote_machine/gpuScheduler.ts @@ -19,11 +19,12 @@ 'use strict'; +import * as assert from 'assert'; import { Client } from 'ssh2'; -import { Deferred } from 'ts-deferred'; import { getLogger, Logger } from '../../common/log'; +import { randomSelect } from '../../common/utils'; import { GPUInfo } from '../common/gpuData'; -import { RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineScheduleInfo, ScheduleResultType } from './remoteMachineData'; +import { RemoteMachineMeta, RemoteMachineScheduleResult, ScheduleResultType } from './remoteMachineData'; /** * A simple GPU scheduler implementation @@ -45,82 +46,64 @@ export class GPUScheduler { * Schedule a machine according to the constraints (requiredGPUNum) * @param requiredGPUNum required GPU number */ - public scheduleMachine(requiredGPUNum : Number | undefined, trialJobId : string) : RemoteMachineScheduleResult { - const deferred: Deferred = new Deferred(); - let scheduleResult : RemoteMachineScheduleResult = { - resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU, - scheduleInfo : undefined - }; - - // Step 0: Check if required GPU number not exceeds the total GPU number in all machines - const eligibleRM : 
RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys()).filter((rmMeta : RemoteMachineMeta) => - rmMeta.gpuSummary === undefined || requiredGPUNum === undefined || rmMeta.gpuSummary.gpuCount >= requiredGPUNum ); - if(eligibleRM.length == 0) { + public scheduleMachine(requiredGPUNum: number, trialJobId : string) : RemoteMachineScheduleResult { + assert(requiredGPUNum >= 0); + const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys()); + assert(allRMs.length > 0); + + // Step 1: Check if required GPU number not exceeds the total GPU number in all machines + const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta : RemoteMachineMeta) => + rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || rmMeta.gpuSummary.gpuCount >= requiredGPUNum); + if (eligibleRM.length === 0) { // If the required gpu number exceeds the upper limit of all machine's GPU number // Return REQUIRE_EXCEED_TOTAL directly return ({ - resultType : ScheduleResultType.REQUIRE_EXCEED_TOTAL, - scheduleInfo : undefined + resultType: ScheduleResultType.REQUIRE_EXCEED_TOTAL, + scheduleInfo: undefined }); } - // Step 1: Generate GPU resource map for remote machines - const totalResourceMap : Map = this.gpuResourceDetection(requiredGPUNum); - - // Step 2: Find machine whose GPU can be allocated based on user GPU requirement, and allocate GPU - for (const rmMeta of Array.from(totalResourceMap.keys())) { - const gpuInfos : GPUInfo[] | undefined = totalResourceMap.get(rmMeta); - if(gpuInfos !== undefined && (requiredGPUNum === undefined || gpuInfos.length >= requiredGPUNum)) { - const allocatedGPUIndex : number[] = Array(); - - // Allocate - gpuInfos.forEach((gpuInfo : GPUInfo) => { - rmMeta.gpuReservation.set(gpuInfo.index, trialJobId); - allocatedGPUIndex.push(gpuInfo.index); - }); - - // Construct scheduling return object - const sshClient : Client | undefined = this.machineSSHClientMap.get(rmMeta); - if(sshClient !== undefined) { - this.log.info(`Found available 
machine, trialJobId is ${trialJobId}, ip is ${rmMeta.ip}, gpu allocated is ${allocatedGPUIndex.toString()}`); - // We found the first available machine whose GPU resource can match user requirement - return { - resultType : ScheduleResultType.SUCCEED, - scheduleInfo : { - rmMeta : rmMeta, - client : sshClient, - cuda_visible_device : allocatedGPUIndex.join(',') - } - }; - } - } - } - - // Step 3: If not found machine whose GPU is availabe, then find the first machine whose GPU summary is unknown - for (const rmMeta of Array.from(this.machineSSHClientMap.keys())) { - const client : Client | undefined = this.machineSSHClientMap.get(rmMeta); - if(rmMeta.gpuSummary == undefined && client !== undefined) { - // We found the firstmachine whose GPU summary is unknown - return { - resultType : ScheduleResultType.SUCCEED, - scheduleInfo :{ - rmMeta : rmMeta, - client : client, - //Since gpu information is unknown, make all GPU resources visible to the job - cuda_visible_device : '' - } - }; + // Step 2: Allocate Host/GPU for specified trial job + // Currenty the requireGPUNum parameter for all trial jobs are identical. 
+ if (requiredGPUNum > 0) { + // Trial job requires GPU + const result: RemoteMachineScheduleResult | undefined = this.scheduleGPUHost(requiredGPUNum, trialJobId); + if (result !== undefined) { + return result; } - }; - - this.log.warning(`Scheduler: trialJob id ${trialJobId}, no machine can be scheduled, resolve as TMP_NO_AVAILABLE_GPU `); - // Otherwise, no machine can be scheduled, resolve as TMP_NO_AVAILABLE_GPU + } else { + // Trail job does not need GPU + const allocatedRm: RemoteMachineMeta = this.selectMachine(allRMs); + + return this.allocateHost(requiredGPUNum, allocatedRm, [], trialJobId); + } + this.log.warning(`Scheduler: trialJob id ${trialJobId}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `); + return { resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU, scheduleInfo : undefined }; } + private scheduleGPUHost(requiredGPUNum: number, trialJobId: string): RemoteMachineScheduleResult | undefined { + const totalResourceMap: Map = this.gpuResourceDetection(); + const qualifiedRMs: RemoteMachineMeta[] = []; + totalResourceMap.forEach((gpuInfos: GPUInfo[], rmMeta: RemoteMachineMeta) => { + if (gpuInfos !== undefined && gpuInfos.length >= requiredGPUNum) { + qualifiedRMs.push(rmMeta); + } + }); + if (qualifiedRMs.length > 0) { + const allocatedRm: RemoteMachineMeta = this.selectMachine(qualifiedRMs); + const gpuInfos: GPUInfo[] | undefined = totalResourceMap.get(allocatedRm); + if (gpuInfos !== undefined) { // should always true + return this.allocateHost(requiredGPUNum, allocatedRm, gpuInfos, trialJobId); + } else { + assert(false, 'gpuInfos is undefined'); + } + } + } + /** * Detect available GPU resource for a remote machine * @param rmMeta Remote machine metadata @@ -128,33 +111,56 @@ export class GPUScheduler { * @param availableGPUMap available GPU resource filled by this detection * @returns Available GPU number on this remote machine */ - private gpuResourceDetection(requiredGPUNum : Number | undefined) : Map { + private 
gpuResourceDetection() : Map { const totalResourceMap : Map = new Map(); this.machineSSHClientMap.forEach((client: Client, rmMeta: RemoteMachineMeta) => { // Assgin totoal GPU count as init available GPU number - if(rmMeta.gpuSummary !== undefined) { - const availableGPUs : GPUInfo[] = Array(); - if(rmMeta.gpuReservation === undefined) { + if (rmMeta.gpuSummary !== undefined) { + const availableGPUs: GPUInfo[] = []; + if (rmMeta.gpuReservation === undefined) { rmMeta.gpuReservation = new Map(); } - const gpuReservation = rmMeta.gpuReservation; rmMeta.gpuSummary.gpuInfos.forEach((gpuInfo: GPUInfo) => { - //this.log.info(`GPU index:${gpuInfo.index}, activeProcessNum is ${gpuInfo.activeProcessNum}, GPU reservation is ${JSON.stringify([...gpuReservation])}`); - // if the GPU has active process, OR be reserved by a job, + // if the GPU has active process, OR be reserved by a job, // We should NOT allocate this GPU - if (gpuInfo.activeProcessNum === 0 - && !gpuReservation.has(gpuInfo.index) - && requiredGPUNum !== undefined - && availableGPUs.length < requiredGPUNum) { + if (gpuInfo.activeProcessNum === 0 && !rmMeta.gpuReservation.has(gpuInfo.index)) { availableGPUs.push(gpuInfo); } }); - totalResourceMap.set(rmMeta, availableGPUs); } }); return totalResourceMap; } + + private selectMachine(rmMetas: RemoteMachineMeta[]): RemoteMachineMeta { + assert(rmMetas !== undefined && rmMetas.length > 0); + + return randomSelect(rmMetas); + } + + private selectGPUsForTrial(gpuInfos: GPUInfo[], requiredGPUNum: number): GPUInfo[] { + // Sequentially allocate GPUs + return gpuInfos.slice(0, requiredGPUNum); + } + + private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta, + gpuInfos: GPUInfo[], trialJobId: string): RemoteMachineScheduleResult { + assert(gpuInfos.length >= requiredGPUNum); + const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum); + + allocatedGPUs.forEach((gpuInfo: GPUInfo) => { + rmMeta.gpuReservation.set(gpuInfo.index, 
trialJobId); + }); + + return { + resultType: ScheduleResultType.SUCCEED, + scheduleInfo: { + rmMeta: rmMeta, + cuda_visible_device: allocatedGPUs.map((gpuInfo: GPUInfo) => { return gpuInfo.index; }).join(',') + } + }; + } } diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts index 0be6922a62..1e52458790 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts @@ -87,16 +87,16 @@ export class JobMetrics { export class RemoteMachineTrialJobDetail implements TrialJobDetail { public id: string; public status: TrialJobStatus; - public submitTime: Date; - public startTime?: Date; - public endTime?: Date; + public submitTime: number; + public startTime?: number; + public endTime?: number; public tags?: string[]; public url?: string; public workingDirectory: string; public form: JobApplicationForm; public rmMeta?: RemoteMachineMeta; - constructor(id: string, status: TrialJobStatus, submitTime: Date, workingDirectory: string, form: JobApplicationForm) { + constructor(id: string, status: TrialJobStatus, submitTime: number, workingDirectory: string, form: JobApplicationForm) { this.id = id; this.status = status; this.submitTime = submitTime; @@ -106,9 +106,9 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail { } } -export type RemoteMachineScheduleResult = { scheduleInfo : RemoteMachineScheduleInfo | undefined, resultType : ScheduleResultType}; +export type RemoteMachineScheduleResult = { scheduleInfo : RemoteMachineScheduleInfo | undefined; resultType : ScheduleResultType}; -export type RemoteMachineScheduleInfo = { client: Client; rmMeta : RemoteMachineMeta; cuda_visible_device : string}; +export type RemoteMachineScheduleInfo = { rmMeta : RemoteMachineMeta; cuda_visible_device : string}; export enum ScheduleResultType { /* Schedule succeeded*/ diff --git 
a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 99b3af48ba..772b93ff5d 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -19,6 +19,7 @@ 'use strict'; +import * as assert from 'assert'; import * as cpp from 'child-process-promise'; import { EventEmitter } from 'events'; import * as fs from 'fs'; @@ -54,8 +55,6 @@ import { SSHClientUtility } from './sshClientUtility'; class RemoteMachineTrainingService implements TrainingService { private machineSSHClientMap: Map; private trialJobsMap: Map; - private experimentId: string | undefined; - // Experiment root directory private expRootDir: string; private remoteExpRootDir: string; private trialConfig: TrialConfig | undefined; @@ -72,9 +71,8 @@ class RemoteMachineTrainingService implements TrainingService { this.machineSSHClientMap = new Map(); this.gpuScheduler = new GPUScheduler(this.machineSSHClientMap); this.jobQueue = []; - this.experimentId = getExperimentId(); this.expRootDir = getExperimentRootDir(); - this.remoteExpRootDir = this.getRemoteModeExperimentRootDir(); + this.remoteExpRootDir = this.getRemoteExperimentRootDir(); this.timer = timer; this.log = getLogger(); } @@ -183,7 +181,7 @@ class RemoteMachineTrainingService implements TrainingService { const trialJobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail( trialJobId, 'WAITING', - new Date(), + Date.now(), trialWorkingFolder, form); this.jobQueue.push(trialJobId); @@ -326,6 +324,7 @@ class RemoteMachineTrainingService implements TrainingService { } this.machineSSHClientMap.set(rmMeta, conn); conn.on('ready', async () => { + this.machineSSHClientMap.set(rmMeta, conn); await this.initRemoteMachineOnConnected(rmMeta, conn); if (++connectedRMNum === rmMetaList.length) { deferred.resolve(); @@ 
-392,7 +391,7 @@ class RemoteMachineTrainingService implements TrainingService { trialJobDetail.status = 'RUNNING'; trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialWorkingFolder}`; - trialJobDetail.startTime = new Date(); + trialJobDetail.startTime = Date.now(); trialJobDetail.rmMeta = rmScheduleInfo.rmMeta; deferred.resolve(true); @@ -412,7 +411,13 @@ class RemoteMachineTrainingService implements TrainingService { throw new Error('trial config is not initialized'); } const cuda_visible_device: string = rmScheduleInfo.cuda_visible_device; - const sshClient: Client = rmScheduleInfo.client; + const sshClient: Client | undefined = this.machineSSHClientMap.get(rmScheduleInfo.rmMeta); + if (sshClient === undefined) { + assert(false, 'sshClient is undefined.'); + + // for lint + return; + } const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); await SSHClientUtility.remoteExeCommand(`mkdir -p ${trialWorkingFolder}`, sshClient); @@ -472,9 +477,9 @@ class RemoteMachineTrainingService implements TrainingService { path.join(localDir, 'run.sh'), path.join(remoteDir, 'run.sh'), sshClient); SSHClientUtility.remoteExeCommand(`bash ${path.join(remoteDir, 'run.sh')}`, sshClient); - const jobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(jobId, 'RUNNING', new Date(), remoteDir, form); + const jobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(jobId, 'RUNNING', Date.now(), remoteDir, form); jobDetail.rmMeta = rmMeta; - jobDetail.startTime = new Date(); + jobDetail.startTime = Date.now(); this.trialJobsMap.set(jobId, jobDetail); this.log.debug(`runHostJob: return: ${JSON.stringify(jobDetail)} `); @@ -510,7 +515,7 @@ class RemoteMachineTrainingService implements TrainingService { } else { trialJob.status = 'FAILED'; } - trialJob.endTime = new Date(parseInt(timestamp, 10)); + trialJob.endTime = parseInt(timestamp, 10); } this.log.info(`trailJob status update: ${trialJob.id}, 
${trialJob.status}`); } @@ -536,7 +541,7 @@ class RemoteMachineTrainingService implements TrainingService { return path.join(this.remoteExpRootDir, 'hostjobs', jobId); } - private getRemoteModeExperimentRootDir(): string{ + private getRemoteExperimentRootDir(): string{ return path.join(os.tmpdir(), 'nni', 'experiments', getExperimentId()); } diff --git a/src/nni_manager/tslint.json b/src/nni_manager/tslint.json index 1d1f0ebc8c..bc64bec175 100644 --- a/src/nni_manager/tslint.json +++ b/src/nni_manager/tslint.json @@ -11,5 +11,12 @@ "no-console": [true, "log"], "no-multiline-string": false }, - "rulesDirectory": [] + "rulesDirectory": [], + "linterOptions": { + "exclude": [ + "training_service/test/*", + "rest_server/test/*", + "core/test/*" + ] + } } \ No newline at end of file diff --git a/src/nni_manager/yarn.lock b/src/nni_manager/yarn.lock index 2788546e7f..4f20c6e832 100644 --- a/src/nni_manager/yarn.lock +++ b/src/nni_manager/yarn.lock @@ -343,6 +343,24 @@ body-parser@1.18.2: raw-body "2.3.2" type-is "~1.6.15" +boom@2.6.x: + version "2.6.1" + resolved "https://registry.yarnpkg.com/boom/-/boom-2.6.1.tgz#4dc8ef9b6dfad9c43bbbfbe71fa4c21419f22753" + dependencies: + hoek "2.x.x" + +boxen@1.3.0: + version "1.3.0" + resolved "https://registry.yarnpkg.com/boxen/-/boxen-1.3.0.tgz#55c6c39a8ba58d9c61ad22cd877532deb665a20b" + dependencies: + ansi-align "^2.0.0" + camelcase "^4.0.0" + chalk "^2.0.1" + cli-boxes "^1.0.0" + string-width "^2.0.0" + term-size "^1.2.0" + widest-line "^2.0.0" + brace-expansion@^1.1.7: version "1.1.11" resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd" @@ -582,6 +600,38 @@ etag@~1.8.1: version "1.8.1" resolved "https://registry.yarnpkg.com/etag/-/etag-1.8.1.tgz#41ae2eeb65efa62268aebfea83ac7d79299b0887" +execa@^0.7.0: + version "0.7.0" + resolved "https://registry.yarnpkg.com/execa/-/execa-0.7.0.tgz#944becd34cc41ee32a63a9faf27ad5a65fc59777" + dependencies: + cross-spawn 
"^5.0.1" + get-stream "^3.0.0" + is-stream "^1.1.0" + npm-run-path "^2.0.0" + p-finally "^1.0.0" + signal-exit "^3.0.0" + strip-eof "^1.0.0" + +execa@^0.8.0: + version "0.8.0" + resolved "https://registry.yarnpkg.com/execa/-/execa-0.8.0.tgz#d8d76bbc1b55217ed190fd6dd49d3c774ecfc8da" + dependencies: + cross-spawn "^5.0.1" + get-stream "^3.0.0" + is-stream "^1.1.0" + npm-run-path "^2.0.0" + p-finally "^1.0.0" + signal-exit "^3.0.0" + strip-eof "^1.0.0" + +express-joi-validator@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/express-joi-validator/-/express-joi-validator-2.0.0.tgz#24e26e6a8327f69985ed72588f00e295dc3e3234" + dependencies: + boom "2.6.x" + extend "2.0.x" + joi "6.x.x" + express@^4.16.3: version "4.16.3" resolved "https://registry.yarnpkg.com/express/-/express-4.16.3.tgz#6af8a502350db3246ecc4becf6b5a34d22f7ed53" @@ -617,6 +667,10 @@ express@^4.16.3: utils-merge "1.0.1" vary "~1.1.2" +extend@2.0.x: + version "2.0.2" + resolved "https://registry.yarnpkg.com/extend/-/extend-2.0.2.tgz#1b74985400171b85554894459c978de6ef453ab7" + extend@~3.0.1: version "3.0.2" resolved "https://registry.yarnpkg.com/extend/-/extend-3.0.2.tgz#f8b1136b4071fbd8eb140aff858b1019ec2915fa" @@ -763,6 +817,10 @@ he@1.1.1: version "1.1.1" resolved "https://registry.yarnpkg.com/he/-/he-1.1.1.tgz#93410fd21b009735151f8868c2f271f3427e23fd" +hoek@2.x.x: + version "2.16.3" + resolved "https://registry.yarnpkg.com/hoek/-/hoek-2.16.3.tgz#20bb7403d3cea398e91dc4710a8ff1b8274a25ed" + http-errors@1.6.2: version "1.6.2" resolved "https://registry.yarnpkg.com/http-errors/-/http-errors-1.6.2.tgz#0a002cc85707192a7e7946ceedc11155f60ec736" @@ -852,6 +910,10 @@ isarray@~1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11" +isemail@1.x.x: + version "1.2.0" + resolved "https://registry.yarnpkg.com/isemail/-/isemail-1.2.0.tgz#be03df8cc3e29de4d2c5df6501263f1fa4595e9a" + isexe@^2.0.0: version "2.0.0" resolved 
"https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10" @@ -860,6 +922,15 @@ isstream@~0.1.2: version "0.1.2" resolved "https://registry.yarnpkg.com/isstream/-/isstream-0.1.2.tgz#47e63f7af55afa6f92e1500e690eb8b8529c099a" +joi@6.x.x: + version "6.10.1" + resolved "https://registry.yarnpkg.com/joi/-/joi-6.10.1.tgz#4d50c318079122000fe5f16af1ff8e1917b77e06" + dependencies: + hoek "2.x.x" + isemail "1.x.x" + moment "2.x.x" + topo "1.x.x" + js-tokens@^3.0.2: version "3.0.2" resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-3.0.2.tgz#9866df395102130e38f7f996bceb65443209c25b" @@ -982,6 +1053,10 @@ mocha@^5.2.0: mkdirp "0.5.1" supports-color "5.4.0" +moment@2.x.x: + version "2.22.2" + resolved "https://registry.yarnpkg.com/moment/-/moment-2.22.2.tgz#3c257f9839fc0e93ff53149632239eb90783ff66" + ms@2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/ms/-/ms-2.0.0.tgz#5608aeadfc00be6c2901df5f9861788de0d597c8" @@ -1459,12 +1534,24 @@ tmp@^0.0.33: dependencies: os-tmpdir "~1.0.2" +topo@1.x.x: + version "1.1.0" + resolved "https://registry.yarnpkg.com/topo/-/topo-1.1.0.tgz#e9d751615d1bb87dc865db182fa1ca0a5ef536d5" + dependencies: + hoek "2.x.x" + tough-cookie@~2.3.3: version "2.3.4" resolved "https://registry.yarnpkg.com/tough-cookie/-/tough-cookie-2.3.4.tgz#ec60cee38ac675063ffc97a5c18970578ee83655" dependencies: punycode "^1.4.1" +toxic@^1.0.0: + version "1.0.1" + resolved "https://registry.yarnpkg.com/toxic/-/toxic-1.0.1.tgz#8c2e2528da591100adc3883f2c0e56acfb1c7288" + dependencies: + lodash "^4.17.10" + tree-kill@^1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/tree-kill/-/tree-kill-1.2.0.tgz#5846786237b4239014f05db156b643212d4c6f36" diff --git a/src/webui/src/components/Accuracy.tsx b/src/webui/src/components/Accuracy.tsx index e6e3eb0cf1..412c05bf6b 100644 --- a/src/webui/src/components/Accuracy.tsx +++ b/src/webui/src/components/Accuracy.tsx @@ -58,8 +58,6 @@ class Accuracy extends React.Component<{}, 
ChartState> { yAxis: { name: 'Accuracy', type: 'value', - min: 0, - max: 1, data: yAxis }, series: [{ @@ -85,7 +83,7 @@ class Accuracy extends React.Component<{}, ChartState> { accArr.push(parseFloat(accData[item].finalMetricData.data)); } }); - accY.push({yAxis: accArr}); + accY.push({ yAxis: accArr }); let optionObj = this.getOption(accY[0]); this.setState({ option: optionObj }, () => { if (accArr.length === 0) { diff --git a/src/webui/src/components/Logdetail.tsx b/src/webui/src/components/Logdetail.tsx deleted file mode 100644 index 6f1c829b76..0000000000 --- a/src/webui/src/components/Logdetail.tsx +++ /dev/null @@ -1,146 +0,0 @@ -import * as React from 'react'; -import axios from 'axios'; -import { MANAGER_IP } from '../const'; -import { - message, - Tabs, - Button -} from 'antd'; -const TabPane = Tabs.TabPane; -import '../style/logdetail.css'; - -interface LogState { - trialId: string; - slotLog: string; - processLog: string; -} - -class Logdetail extends React.Component<{}, LogState> { - - public _isMounted = false; - - constructor(props: {}) { - - super(props); - this.state = { - trialId: '', - slotLog: '', - processLog: '' - }; - } - - getJobLog = () => { - - Object.keys(this.props).map(item => { - if (item === 'location') { - if (this._isMounted) { - this.setState({ trialId: this.props[item].state }, () => { - - const { trialId } = this.state; - let id = trialId; - - axios(`${MANAGER_IP}/jobLog`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json;charset=utf-8' - }, - data: { - id - } - }) - .then(res => { - if (res.status === 200 && this._isMounted) { - this.setState({ - slotLog: res.data.trial_slot_log, - processLog: res.data.trial_process_log - }); - } - }); - }); - } - } - }); - } - - getPaiDetail = (id: string) => { - - axios(`${MANAGER_IP}/jobPaiPage`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json;charset=utf-8' - }, - data: { - id - } - }) - .then(res => { - if (res.status === 200) { - 
message.success('Successful send'); - setTimeout(this.openPage(res.data.url), 100); - } - }); - } - - openPage = (pailog: string) => { - window.open(pailog); - } - - paiLog = () => { - - axios(`${MANAGER_IP}/paiPage`, { - method: 'POST' - }) - .then(res => { - if (res.status === 200) { - setTimeout(this.openPage(res.data.url), 200); - } - }); - } - - componentDidMount() { - - this._isMounted = true; - this.getJobLog(); - } - - componentWillUnmount() { - - this._isMounted = false; - } - - render() { - const { trialId, slotLog, processLog } = this.state; - return ( -
-
- - -
{slotLog}
-
- -
{processLog}
-
-
-
-
- - -
-
- ); - } -} - -export default Logdetail; \ No newline at end of file diff --git a/src/webui/src/components/Para.tsx b/src/webui/src/components/Para.tsx index 4456276131..32d099ad8e 100644 --- a/src/webui/src/components/Para.tsx +++ b/src/webui/src/components/Para.tsx @@ -35,6 +35,11 @@ interface ParaObj { parallelAxis: Array; } +interface VisualMapValue { + maxAccuracy: number; + minAccuracy: number; +} + interface ParaState { option: object; paraBack: ParaObj; @@ -42,6 +47,7 @@ interface ParaState { swapAxisArr: Array; percent: number; paraNodata: string; + visualValue: VisualMapValue; } message.config({ @@ -69,9 +75,13 @@ class Para extends React.Component<{}, ParaState> { swapAxisArr: [], percent: 0, paraNodata: '', + visualValue: { + minAccuracy: 0, + maxAccuracy: 1 + } }; } - + hyperParaPic = () => { axios .all([ @@ -110,7 +120,11 @@ class Para extends React.Component<{}, ParaState> { const dimName = Object.keys(speDimName[0]); if (this._isMounted) { this.setState(() => ({ - dimName: dimName + dimName: dimName, + visualValue: { + minAccuracy: accPara.length !== 0 ? Math.min(...accPara) : 0, + maxAccuracy: accPara.length !== 0 ? 
Math.max(...accPara) : 1 + } })); } // search space range and specific value [only number] @@ -159,6 +173,11 @@ class Para extends React.Component<{}, ParaState> { Object.keys(paraYdata).map(item => { paraYdata[item].push(accPara[item]); }); + // according acc to sort ydata + if (paraYdata.length !== 0) { + const len = paraYdata[0].length - 1; + paraYdata.sort((a, b) => b[len] - a[len]); + } this.setState(() => ({ paraBack: { parallelAxis: parallelAxis, @@ -205,6 +224,7 @@ class Para extends React.Component<{}, ParaState> { // deal with response data into pic data getOption = (dataObj: ParaObj) => { + const { visualValue } = this.state; let parallelAxis = dataObj.parallelAxis; let paralleData = dataObj.data; let optionown = { @@ -223,8 +243,6 @@ class Para extends React.Component<{}, ParaState> { borderColor: '#ddd' } }, - feature: { - }, z: 202 }, parallel: { @@ -236,8 +254,8 @@ class Para extends React.Component<{}, ParaState> { }, visualMap: { type: 'continuous', - min: 0, - max: 1, + min: visualValue.minAccuracy, + max: visualValue.maxAccuracy, // gradient color color: ['#fb7c7c', 'yellow', 'lightblue'] }, @@ -363,7 +381,7 @@ class Para extends React.Component<{}, ParaState> {
Hyper Parameter
- {/* top */} + top