Skip to content

Commit

Permalink
Refactoring nnimanager log (microsoft#652)
Browse files Browse the repository at this point in the history
* Pull code (#22)

* Support distributed job for frameworkcontroller (microsoft#612)

support distributed job for frameworkcontroller

* Multiphase doc (microsoft#519)

* multiPhase doc

* updates

* updates

* Add time parser for 'nnictl update duration' (microsoft#632)

Current nnictl update duration only support seconds unit, add a parser for this command to support {s, m, h, d}

* fix experiment state bug (microsoft#629)

* update top README.md (microsoft#622)

* Update README.md

* update (microsoft#634)

* Integration tests refactoring (microsoft#625)

* Integration test refactoring (#21) (microsoft#616)

* Integration test refactoring (#21)

* Refactoring integration tests

* test metrics

* update azure pipeline

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* update trigger

* Integration test refactoring (microsoft#618)

* updates

* updates

* update pipeline (microsoft#619)

* update pipeline

* updates

* updates

* updates

* updates

* updates

* test pipeline (microsoft#623)

* test pipeline

* updates

* updates

* updates

* Update integration test (microsoft#624)

* Update integration test

* updates

* updates

* updates

* updates

* updates

* updates

* Revert "Pull code (#22)"

This reverts commit 62fc165.

* Update nnimanager logs

* updates

* Update README.md

* Revert "Update README.md"

This reverts commit bc67061.

* updates

* updates
  • Loading branch information
chicm-ms authored Jan 25, 2019
1 parent d9c83c0 commit 6d59198
Show file tree
Hide file tree
Showing 14 changed files with 83 additions and 248 deletions.
4 changes: 2 additions & 2 deletions src/nni_manager/common/log.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class BufferSerialEmitter {
@component.Singleton
class Logger {
private DEFAULT_LOGFILE: string = path.join(getLogDir(), 'nnimanager.log');
private level: number = DEBUG;
private level: number = INFO;
private bufferSerialEmitter: BufferSerialEmitter;
private writable: Writable;

Expand Down Expand Up @@ -153,4 +153,4 @@ function getLogger(fileName?: string): Logger {
return component.get(Logger);
}

export { Logger, getLogger };
export { Logger, getLogger, logLevelNameMap };
5 changes: 3 additions & 2 deletions src/nni_manager/common/manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore';
import { TrialJobStatus } from './trainingService';

type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM';
type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';

interface ExperimentParams {
authorName: string;
Expand Down Expand Up @@ -85,7 +86,7 @@ interface TrialJobStatistics {
}

interface NNIManagerStatus {
status: 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';
status: ExperimentStatus;
errors: string[];
}

Expand All @@ -109,4 +110,4 @@ abstract class Manager {
public abstract getStatus(): NNIManagerStatus;
}

export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus };
export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus, ExperimentStatus };
49 changes: 33 additions & 16 deletions src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import { NNIError } from '../common/errors';
import { getExperimentId, setInitTrialSequenceId } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log';
import {
ExperimentParams, ExperimentProfile, Manager,
ExperimentParams, ExperimentProfile, Manager, ExperimentStatus,
NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
} from '../common/manager';
import {
Expand Down Expand Up @@ -112,12 +112,13 @@ class NNIManager implements Manager {
}

public async cancelTrialJobByUser(trialJobId: string): Promise<void> {
this.log.info(`User cancelTrialJob: ${trialJobId}`);
await this.trainingService.cancelTrialJob(trialJobId);
await this.dataStore.storeTrialJobEvent('USER_TO_CANCEL', trialJobId, '');
}

public async startExperiment(expParams: ExperimentParams): Promise<string> {
this.log.debug(`Starting experiment: ${this.experimentProfile.id}`);
this.log.info(`Starting experiment: ${this.experimentProfile.id}`);
this.experimentProfile.params = expParams;
await this.storeExperimentProfile();
this.log.debug('Setup tuner...');
Expand All @@ -138,7 +139,7 @@ class NNIManager implements Manager {
checkpointDir);

this.experimentProfile.startTime = Date.now();
this.status.status = 'RUNNING';
this.setStatus('RUNNING');
await this.storeExperimentProfile();
this.run().catch((err: Error) => {
this.criticalError(err);
Expand All @@ -148,6 +149,7 @@ class NNIManager implements Manager {
}

public async resumeExperiment(): Promise<void> {
this.log.info(`Resuming experiment: ${this.experimentProfile.id}`);
//Fetch back the experiment profile
const experimentId: string = getExperimentId();
this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId);
Expand Down Expand Up @@ -185,7 +187,7 @@ class NNIManager implements Manager {
this.experimentProfile.endTime) {
delete this.experimentProfile.endTime;
}
this.status.status = 'RUNNING';
this.setStatus('RUNNING');

// TO DO: update database record for resume event
this.run().catch((err: Error) => {
Expand All @@ -200,6 +202,7 @@ class NNIManager implements Manager {
}

public async setClusterMetadata(key: string, value: string): Promise<void> {
this.log.info(`NNIManager setClusterMetadata, key: ${key}, value: ${value}`);
let timeoutId: NodeJS.Timer;
// TO DO: move timeout value to constants file
const delay1: Promise<{}> = new Promise((resolve: Function, reject: Function): void => {
Expand All @@ -223,10 +226,10 @@ class NNIManager implements Manager {
}

public async stopExperiment(): Promise<void> {
this.status.status = 'STOPPING';
this.log.info('Experiment done, cleaning up...');
this.setStatus('STOPPING');
this.log.info('Stopping experiment, cleaning up ...');
await this.experimentDoneCleanUp();
this.log.info('Experiment done.');
this.log.info('Experiment stopped.');
}

public async getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> {
Expand Down Expand Up @@ -334,6 +337,7 @@ class NNIManager implements Manager {
if (trialJob.status === 'RUNNING' ||
trialJob.status === 'WAITING') {
try {
this.log.info(`cancelTrialJob: ${trialJob.id}`);
await this.trainingService.cancelTrialJob(trialJob.id);
} catch (error) {
// pid does not exist, do nothing here
Expand All @@ -343,7 +347,7 @@ class NNIManager implements Manager {
await this.trainingService.cleanUp();
this.experimentProfile.endTime = Date.now();
await this.storeExperimentProfile();
this.status.status = 'STOPPED';
this.setStatus('STOPPED');
}

private async periodicallyUpdateExecDuration(): Promise<void> {
Expand All @@ -365,8 +369,8 @@ class NNIManager implements Manager {
throw new Error('Error: tuner has not been setup');
}
while (!['ERROR', 'STOPPING', 'STOPPED'].includes(this.status.status)) {
await delay(1000 * 5);
this.dispatcher.sendCommand(PING);
await delay(1000 * 5);
}
}

Expand All @@ -379,6 +383,7 @@ class NNIManager implements Manager {
const trialJobDetail: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId);
const oldTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
if (oldTrialJobDetail !== undefined && oldTrialJobDetail.status !== trialJobDetail.status) {
this.log.info(`Trial job ${trialJobDetail.id} status changed from ${oldTrialJobDetail.status} to ${trialJobDetail.status}`);
this.trialJobs.set(trialJobId, Object.assign({}, trialJobDetail));
await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, undefined, trialJobDetail);
}
Expand Down Expand Up @@ -478,12 +483,12 @@ class NNIManager implements Manager {
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status !== 'DONE') {
this.status.status = 'NO_MORE_TRIAL';
this.setStatus('NO_MORE_TRIAL');
waitSubmittedToFinish = this.currSubmittedTrialNum;

assert(allFinishedTrialJobNum <= waitSubmittedToFinish);
if (allFinishedTrialJobNum >= waitSubmittedToFinish) {
this.status.status = 'DONE';
this.setStatus('DONE');
this.experimentProfile.endTime = Date.now();
await this.storeExperimentProfile();
// write this log for travis CI
Expand All @@ -496,7 +501,7 @@ class NNIManager implements Manager {
await this.storeExperimentProfile();
}
if (this.status.status !== 'TUNER_NO_MORE_TRIAL') {
this.status.status = 'RUNNING';
this.setStatus('RUNNING');
}
for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) {
if (this.waitingTrials.length === 0 ||
Expand All @@ -515,6 +520,7 @@ class NNIManager implements Manager {
index: 0
}
};
this.log.info(`submitTrialJob: form: ${JSON.stringify(trialJobAppForm)}`);
const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm);
await this.storeMaxSequenceId(trialJobDetail.sequenceId);
this.trialJobs.set(trialJobDetail.id, Object.assign({}, trialJobDetail));
Expand Down Expand Up @@ -558,6 +564,7 @@ class NNIManager implements Manager {
}

private addEventListeners(): void {
this.log.info('Add event listeners');
// TO DO: cannot run this method more than once in one NNIManager instance
if (this.dispatcher === undefined) {
throw new Error('Error: tuner or job maintainer have not been setup');
Expand Down Expand Up @@ -585,6 +592,7 @@ class NNIManager implements Manager {
}

private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
this.log.debug(`NNIManager received trial job metrics: ${metric}`);
await this.dataStore.storeMetricData(metric.id, metric.data);
if (this.dispatcher === undefined) {
throw new Error('Error: tuner has not been setup');
Expand Down Expand Up @@ -612,7 +620,7 @@ class NNIManager implements Manager {
}

private async onTunerCommand(commandType: string, content: string): Promise<void> {
this.log.info(`Command from tuner: ${commandType}, ${content}`);
this.log.info(`NNIManaer received command from dispatcher: ${commandType}, ${content}`);
switch (commandType) {
case INITIALIZED:
// Tuner is intialized, search space is set, request tuner to generate hyper parameters
Expand All @@ -621,7 +629,7 @@ class NNIManager implements Manager {
case NEW_TRIAL_JOB:
if (this.status.status === 'TUNER_NO_MORE_TRIAL') {
this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set');
this.status.status = 'RUNNING';
this.setStatus('RUNNING');
}
this.waitingTrials.push(content);
break;
Expand All @@ -637,14 +645,16 @@ class NNIManager implements Manager {
index: tunerCommand.parameter_index
}
};
this.log.info(`updateTrialJob: job id: ${tunerCommand.trial_job_id}, form: ${JSON.stringify(trialJobForm)}`);
await this.trainingService.updateTrialJob(tunerCommand.trial_job_id, trialJobForm);
await this.dataStore.storeTrialJobEvent(
'ADD_HYPERPARAMETER', tunerCommand.trial_job_id, content, undefined);
break;
case NO_MORE_TRIAL_JOBS:
this.status.status = 'TUNER_NO_MORE_TRIAL';
this.setStatus('TUNER_NO_MORE_TRIAL');
break;
case KILL_TRIAL_JOB:
this.log.info(`cancelTrialJob: ${JSON.parse(content)}`);
await this.trainingService.cancelTrialJob(JSON.parse(content), true);
break;
default:
Expand All @@ -662,7 +672,14 @@ class NNIManager implements Manager {
this.log.error(err.stack);
}
this.status.errors.push(err.message);
this.status.status = 'ERROR';
this.setStatus('ERROR');
}

private setStatus(status: ExperimentStatus): void {
if (status !== this.status.status) {
this.log.info(`Change NNIManager status from: ${this.status.status} to: ${status}`);
this.status.status = status;
}
}

private createEmptyExperimentProfile(): ExperimentProfile {
Expand Down
12 changes: 10 additions & 2 deletions src/nni_manager/core/sqlDatabase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import {
TrialJobEvent,
TrialJobEventRecord
} from '../common/datastore';
import { getLogger, Logger } from '../common/log';
import { ExperimentProfile } from '../common/manager';
import { TrialJobDetail } from '../common/trainingService';

Expand Down Expand Up @@ -95,13 +96,15 @@ function loadMetricData(row: any): MetricDataRecord {

class SqlDB implements Database {
private db!: sqlite3.Database;
private log: Logger = getLogger();
private initTask!: Deferred<void>;

public init(createNew: boolean, dbDir: string): Promise<void> {
if (this.initTask !== undefined) {
return this.initTask.promise;
}
this.initTask = new Deferred<void>();
this.log.debug(`Database directory: ${dbDir}`);
assert(fs.existsSync(dbDir));

// tslint:disable-next-line:no-bitwise
Expand Down Expand Up @@ -144,7 +147,7 @@ class SqlDB implements Database {
exp.maxSequenceId,
exp.revision
];

this.log.trace(`storeExperimentProfile: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<void> = new Deferred<void>();
this.db.run(sql, args, (err: Error | null) => { this.resolve(deferred, err); });

Expand All @@ -161,7 +164,7 @@ class SqlDB implements Database {
sql = 'select * from ExperimentProfile where id=? and revision=?';
args = [experimentId, revision];
}

this.log.trace(`queryExperimentProfile: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<ExperimentProfile[]> = new Deferred<ExperimentProfile[]>();
this.db.all(sql, args, (err: Error | null, rows: any[]) => {
this.resolve(deferred, err, rows, loadExperimentProfile);
Expand All @@ -183,6 +186,7 @@ class SqlDB implements Database {
const sequenceId: number | undefined = jobDetail === undefined ? undefined : jobDetail.sequenceId;
const args: any[] = [timestamp, trialJobId, event, hyperParameter, logPath, sequenceId];

this.log.trace(`storeTrialJobEvent: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<void> = new Deferred<void>();
this.db.run(sql, args, (err: Error | null) => { this.resolve(deferred, err); });

Expand All @@ -205,6 +209,7 @@ class SqlDB implements Database {
args = [trialJobId, event];
}

this.log.trace(`queryTrialJobEvent: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<TrialJobEventRecord[]> = new Deferred<TrialJobEventRecord[]>();
this.db.all(sql, args, (err: Error | null, rows: any[]) => {
this.resolve(deferred, err, rows, loadTrialJobEvent);
Expand All @@ -218,6 +223,7 @@ class SqlDB implements Database {
const json: MetricDataRecord = JSON.parse(data);
const args: any[] = [Date.now(), json.trialJobId, json.parameterId, json.type, json.sequence, JSON.stringify(json.data)];

this.log.trace(`storeMetricData: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<void> = new Deferred<void>();
this.db.run(sql, args, (err: Error | null) => { this.resolve(deferred, err); });

Expand All @@ -240,6 +246,7 @@ class SqlDB implements Database {
args = [trialJobId, metricType];
}

this.log.trace(`queryMetricData: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<MetricDataRecord[]> = new Deferred<MetricDataRecord[]>();
this.db.all(sql, args, (err: Error | null, rows: any[]) => {
this.resolve(deferred, err, rows, loadMetricData);
Expand Down Expand Up @@ -268,6 +275,7 @@ class SqlDB implements Database {
for (const row of (<any[]>rows)) {
data.push(rowLoader(row));
}
this.log.trace(`sql query result: ${JSON.stringify(data)}`);
(<Deferred<T[]>>deferred).resolve(data);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import * as component from './common/component';
import * as fs from 'fs';
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger } from './common/log';
import { getLogger, Logger, logLevelNameMap } from './common/log';
import { Manager } from './common/manager';
import { TrainingService } from './common/trainingService';
import { parseArg, uniqueString, mkDirP, getLogDir } from './common/utils';
Expand Down Expand Up @@ -111,7 +111,7 @@ if (logDir.length > 0) {
}

const logLevel: string = parseArg(['--log_level', '-ll']);
if (logLevel.length > 0 && !['debug', 'info', 'error', 'warning', 'critical'].includes(logLevel)) {
if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) {
console.log(`FATAL: invalid log_level: ${logLevel}`);
}

Expand Down
Loading

0 comments on commit 6d59198

Please sign in to comment.