Skip to content

Commit

Permalink
Configurable nniManager log path and log level (microsoft#644)
Browse files Browse the repository at this point in the history
* Pull code (#22)

* Support distributed job for frameworkcontroller (microsoft#612)

support distributed job for frameworkcontroller

* Multiphase doc (microsoft#519)

* multiPhase doc

* updates

* updates

* Add time parser for 'nnictl update duration' (microsoft#632)

Current nnictl update duration only support seconds unit, add a parser for this command to support {s, m, h, d}

* fix experiment state bug (microsoft#629)

* update top README.md (microsoft#622)

* Update README.md

* update (microsoft#634)

* Integration tests refactoring (microsoft#625)

* Integration test refactoring (#21) (microsoft#616)

* Integration test refactoring (#21)

* Refactoring integration tests

* test metrics

* update azure pipeline

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* update trigger

* Integration test refactoring (microsoft#618)

* updates

* updates

* update pipeline (microsoft#619)

* update pipeline

* updates

* updates

* updates

* updates

* updates

* test pipeline (microsoft#623)

* test pipeline

* updates

* updates

* updates

* Update integration test (microsoft#624)

* Update integration test

* updates

* updates

* updates

* updates

* updates

* updates

* Revert "Pull code (#22)"

This reverts commit 62fc165.

* Configurable nniManager log path

* Configure log level

* add --debug command line for nnictl

* updates
  • Loading branch information
chicm-ms authored Jan 24, 2019
1 parent 0f9fbf8 commit d9c83c0
Show file tree
Hide file tree
Showing 12 changed files with 109 additions and 21 deletions.
14 changes: 12 additions & 2 deletions docs/ExperimentConfig.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,18 @@ machineList:
__nniManagerIp__ sets the IP address of the machine on which the NNI manager process runs. This field is optional; if it is not set, the IP address of the eth0 device will be used instead.

Note: run ifconfig on the NNI manager's machine to check whether the eth0 device exists. If it does not, we recommend setting nniManagerIp explicitly.



* __logDir__
* Description

__logDir__ configures the directory in which the logs and data of the experiment are stored. The default value is `<user home directory>/nni/experiments`.

* __logLevel__
* Description

__logLevel__ sets the log level for the experiment. The available log levels are: `trace, debug, info, warning, error, fatal`. The default value is `info`.


* __tuner__
* Description

Expand Down
2 changes: 2 additions & 0 deletions docs/NNICTLDOC.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ nnictl --version
| ------ | ------ | ------ |------ |
| --config, -c| True| |YAML configuration file of the experiment|
| --port, -p | False| |the port of restful server|
| --debug, -d | False| |Set log level to debug|

* __nnictl resume__

Expand All @@ -62,6 +63,7 @@ nnictl --version
| ------ | ------ | ------ |------ |
| id| False| |The id of the experiment you want to resume|
| --port, -p| False| |Rest port of the experiment you want to resume|
| --debug, -d | False| |Set log level to debug|

* __nnictl stop__
* Description
Expand Down
40 changes: 36 additions & 4 deletions src/nni_manager/common/experimentStartupInfo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
'use strict';

import * as assert from 'assert';
import * as os from 'os';
import * as path from 'path';
import * as component from '../common/component';

@component.Singleton
Expand All @@ -29,15 +31,27 @@ class ExperimentStartupInfo {
private basePort: number = -1;
private initialized: boolean = false;
private initTrialSequenceID: number = 0;
private logDir: string = '';
private logLevel: string = '';

public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void {
/**
 * Records the startup parameters of the experiment on this instance.
 * May only be called once: a second call fails the `!this.initialized` assertion.
 *
 * @param newExperiment whether the experiment is newly created (stored as-is)
 * @param experimentId non-blank experiment id; also used as the last segment of the log directory
 * @param basePort base port of the experiment (stored as-is)
 * @param logDir optional parent directory for the experiment's logs; when omitted or empty,
 *               `<home>/nni/experiments` is used
 * @param logLevel optional log level name; when omitted or empty the stored level stays ''
 */
public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void {
    assert(!this.initialized);
    assert(experimentId.trim().length > 0);

    this.newExperiment = newExperiment;
    this.experimentId = experimentId;
    this.basePort = basePort;
    this.initialized = true;

    // Build the path from the id stored on this instance instead of calling the
    // module-level getExperimentId() helper, so the result does not depend on
    // this object being the instance that helper resolves.
    const parentDir: string = (logDir !== undefined && logDir.length > 0)
        ? logDir
        : path.join(os.homedir(), 'nni', 'experiments');
    this.logDir = path.join(parentDir, this.experimentId);

    // Same emptiness rule as logDir: any non-empty name is recorded.
    if (logLevel !== undefined && logLevel.length > 0) {
        this.logLevel = logLevel;
    }
}

public getExperimentId(): string {
Expand All @@ -58,6 +72,18 @@ class ExperimentStartupInfo {
return this.newExperiment;
}

/**
 * Returns the experiment log directory stored by setStartupInfo().
 * Fails an assertion if the startup info has not been initialized yet.
 */
public getLogDir(): string {
assert(this.initialized);

return this.logDir;
}

/**
 * Returns the log level name stored by setStartupInfo(); this is '' (the field's
 * initial value) when setStartupInfo() did not record a level.
 * Fails an assertion if the startup info has not been initialized yet.
 */
public getLogLevel(): string {
assert(this.initialized);

return this.logLevel;
}

public setInitTrialSequenceId(initSequenceId: number): void {
assert(this.initialized);
this.initTrialSequenceID = initSequenceId;
Expand Down Expand Up @@ -90,9 +116,15 @@ function getInitTrialSequenceId(): number {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getInitTrialSequenceId();
}

function setExperimentStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo).setStartupInfo(newExperiment, experimentId, basePort);
/**
 * Returns the ExperimentStartupInfo instance obtained from the component container.
 */
function getExperimentStartupInfo(): ExperimentStartupInfo {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
}

/**
 * Forwards the startup parameters to the ExperimentStartupInfo instance obtained
 * from the component container.
 *
 * @param newExperiment passed through to setStartupInfo()
 * @param experimentId passed through to setStartupInfo()
 * @param basePort passed through to setStartupInfo()
 * @param logDir optional; passed through unchanged (may be undefined)
 * @param logLevel optional; passed through unchanged (may be undefined)
 */
function setExperimentStartupInfo(
    newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void {
    const startupInfo: ExperimentStartupInfo = component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
    startupInfo.setStartupInfo(newExperiment, experimentId, basePort, logDir, logLevel);
}

export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment,
export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo,
setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId };
24 changes: 21 additions & 3 deletions src/nni_manager/common/log.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,18 @@ import { Writable } from 'stream';
import { WritableStreamBuffer } from 'stream-buffers';
import { format } from 'util';
import * as component from '../common/component';
import { getExperimentStartupInfo } from './experimentStartupInfo';
import { getLogDir } from './utils';

const CRITICAL: number = 1;
// Numeric log levels. Logger methods compare `this.level >= <LEVEL>`, so a larger
// configured level lets more verbose messages through.
const FATAL: number = 1;
const ERROR: number = 2;
const WARNING: number = 3;
const INFO: number = 4;
const DEBUG: number = 5;
const TRACE: number = 6;

// Lookup from the lower-case level name to its numeric level.
const logLevelNameMap: Map<string, number> = new Map<string, number>([
    ['fatal', FATAL],
    ['error', ERROR],
    ['warning', WARNING],
    ['info', INFO],
    ['debug', DEBUG],
    ['trace', TRACE]
]);

class BufferSerialEmitter {
private buffer: Buffer;
Expand Down Expand Up @@ -83,12 +88,25 @@ class Logger {
autoClose: true
});
this.bufferSerialEmitter = new BufferSerialEmitter(this.writable);

const logLevelName: string = getExperimentStartupInfo()
.getLogLevel();
const logLevel: number | undefined = logLevelNameMap.get(logLevelName);
if (logLevel !== undefined) {
this.level = logLevel;
}
}

public close() {
this.writable.destroy();
}

/**
 * Writes a TRACE record through log(); does nothing unless the logger's
 * level is at least TRACE.
 */
public trace(...param: any[]): void {
    if (!(this.level >= TRACE)) {
        return;
    }
    this.log('TRACE', param);
}

public debug(...param: any[]): void {
if (this.level >= DEBUG) {
this.log('DEBUG', param);
Expand All @@ -113,8 +131,8 @@ class Logger {
}
}

public critical(...param: any[]): void {
this.log('CRITICAL', param);
/**
 * Writes a FATAL record through log(). There is no level check in this method,
 * so the record is passed to log() regardless of the configured level.
 */
public fatal(...param: any[]): void {
this.log('FATAL', param);
}

private log(level: string, param: any[]): void {
Expand Down
5 changes: 3 additions & 2 deletions src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,14 @@ import { Container } from 'typescript-ioc';
import * as util from 'util';

import { Database, DataStore } from './datastore';
import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from './experimentStartupInfo';
import { ExperimentStartupInfo, getExperimentId, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo';
import { Manager } from './manager';
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
import { getLogger } from './log';

function getExperimentRootDir(): string {
return path.join(os.homedir(), 'nni', 'experiments', getExperimentId());
return getExperimentStartupInfo()
.getLogDir();
}

function getLogDir(): string{
Expand Down
4 changes: 2 additions & 2 deletions src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import {
import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService';
import { delay, getCheckpointDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils';
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils';
import {
ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
Expand Down Expand Up @@ -670,7 +670,7 @@ class NNIManager implements Manager {
id: getExperimentId(),
revision: 0,
execDuration: 0,
logDir: getLogDir(),
logDir: getExperimentRootDir(),
maxSequenceId: 0,
params: {
authorName: '',
Expand Down
19 changes: 16 additions & 3 deletions src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import { Container, Scope } from 'typescript-ioc';

import * as component from './common/component';
import * as fs from 'fs';
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger } from './common/log';
Expand All @@ -40,10 +41,10 @@ import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';

function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) {
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number, logDirectory: string, experimentLogLevel: string) {
const createNew: boolean = (startExpMode === 'new');
const expId: string = createNew ? uniqueString(8) : resumeExperimentId;
setExperimentStartupInfo(createNew, expId, basePort);
setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel);
}

async function initContainer(platformMode: string): Promise<void> {
Expand Down Expand Up @@ -102,7 +103,19 @@ if (startMode === 'resume' && experimentId.trim().length < 1) {
process.exit(1);
}

initStartupInfo(startMode, experimentId, port);
// Optional parent directory for experiment logs (--log_dir / -ld).
// NOTE(review): presumably parseArg returns '' when the flag is absent — confirm.
// A missing directory is only reported here; startup continues.
const logDir: string = parseArg(['--log_dir', '-ld']);
if (logDir.length > 0 && !fs.existsSync(logDir)) {
    console.log(`FATAL: log_dir ${logDir} does not exist`);
}

// Optional log level (--log_level / -ll). The accepted names are the six levels
// the logger maps to numeric levels: 'trace' and 'fatal' are valid, while the
// former 'critical' name is not.
// An unknown name is only reported here; startup continues.
const logLevel: string = parseArg(['--log_level', '-ll']);
if (logLevel.length > 0 && !['trace', 'debug', 'info', 'warning', 'error', 'fatal'].includes(logLevel)) {
    console.log(`FATAL: invalid log_level: ${logLevel}`);
}

initStartupInfo(startMode, experimentId, port, logDir, logLevel);

mkDirP(getLogDir()).then(async () => {
const log: Logger = getLogger();
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/rest_server/restHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class NNIRestHandler {

// If it's a fatal error, exit process
if(isFatal) {
this.log.critical(err);
this.log.fatal(err);
process.exit(1);
}

Expand Down
2 changes: 2 additions & 0 deletions tools/nni_cmd/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
Optional('multiPhase'): bool,
Optional('multiThread'): bool,
Optional('nniManagerIp'): str,
Optional('logDir'): os.path.isdir,
Optional('logLevel'): Or('trace', 'debug', 'info', 'warning', 'error', 'fatal'),
'useAnnotation': bool,
Optional('advisor'): Or({
'builtinAdvisorName': Or('Hyperband'),
Expand Down
13 changes: 10 additions & 3 deletions tools/nni_cmd/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _generate_installation_path(sitepackages_path):
print_error('Fail to find nni under python library')
exit(1)

def start_rest_server(port, platform, mode, config_file_name, experiment_id=None):
def start_rest_server(port, platform, mode, config_file_name, experiment_id=None, log_dir=None, log_level=None):
'''Run nni manager process'''
nni_config = Config(config_file_name)
if detect_port(port):
Expand All @@ -118,6 +118,10 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
entry_file = os.path.join(entry_dir, 'main.js')

cmds = ['node', entry_file, '--port', str(port), '--mode', platform, '--start_mode', mode]
if log_dir is not None:
cmds += ['--log_dir', log_dir]
if log_level is not None:
cmds += ['--log_level', log_level]
if mode == 'resume':
cmds += ['--experiment_id', experiment_id]
stdout_full_path, stderr_full_path = get_log_path(config_file_name)
Expand Down Expand Up @@ -317,9 +321,12 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
except ModuleNotFoundError as e:
print_error('The tuner %s should be installed through nnictl'%(tuner_name))
exit(1)

log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None
log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None
if log_level not in ['trace', 'debug'] and args.debug:
log_level = 'debug'
# start rest server
rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id)
rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level)
nni_config.set_config('restServerPid', rest_process.pid)
# Deal with annotation
if experiment_config.get('useAnnotation'):
Expand Down
2 changes: 2 additions & 0 deletions tools/nni_cmd/nnictl.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,14 @@ def parse_args():
parser_start = subparsers.add_parser('create', help='create a new experiment')
parser_start.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file')
parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_start.add_argument('--debug', '-d', action='store_true', help=' set log level to debug')
parser_start.set_defaults(func=create_experiment)

# parse resume command
parser_resume = subparsers.add_parser('resume', help='resume a new experiment')
parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume')
parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_resume.add_argument('--debug', '-d', action='store_true', help=' set log level to debug')
parser_resume.set_defaults(func=resume_experiment)

# parse update command
Expand Down
3 changes: 2 additions & 1 deletion tools/nni_trial_tool/log_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@

@unique
class LogType(Enum):
Trace = 'TRACE'
Debug = 'DEBUG'
Info = 'INFO'
Warning = 'WARNING'
Error = 'ERROR'
Critical = 'CRITICAL'
Fatal = 'FATAL'

@unique
class StdOutputType(Enum):
Expand Down

0 comments on commit d9c83c0

Please sign in to comment.