Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Designated gpu devices for NNI trial jobs (#991) #162

Merged
merged 1 commit into from
Apr 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/en_US/ExperimentConfig.md
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,13 @@ machineList:

__image__ set the image to be used in __worker__.

* __localConfig__

__localConfig__ is applicable only if __trainingServicePlatform__ is set to ```local```; otherwise there should not be a __localConfig__ section in the configuration file.
* __gpuIndices__

__gpuIndices__ is used to specify designated GPU devices for NNI. If it is set, only the specified GPU devices are used for NNI trial jobs. Single or multiple GPU indices can be specified; multiple GPU indices are separated by a comma (,), such as ```1``` or ```0,1,3```.

* __machineList__

__machineList__ should be set if __trainingServicePlatform__ is set to remote, or it should be empty.
Expand Down Expand Up @@ -422,6 +429,10 @@ machineList:

__passphrase__ is used to protect ssh key, which could be empty if users don't have passphrase.

* __gpuIndices__

__gpuIndices__ is used to specify designated GPU devices for NNI on this remote machine. If it is set, only the specified GPU devices are used for NNI trial jobs. Single or multiple GPU indices can be specified; multiple GPU indices are separated by a comma (,), such as ```1``` or ```0,1,3```.

* __kubeflowConfig__:

* __operator__
Expand Down
70 changes: 45 additions & 25 deletions src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,57 +21,75 @@

import { Container, Scope } from 'typescript-ioc';

import * as component from './common/component';
import * as fs from 'fs';
import * as component from './common/component';
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger, logLevelNameMap } from './common/log';
import { Manager } from './common/manager';
import { TrainingService } from './common/trainingService';
import { parseArg, uniqueString, mkDirP, getLogDir } from './common/utils';
import { getLogDir, mkDirP, parseArg, uniqueString } from './common/utils';
import { NNIDataStore } from './core/nniDataStore';
import { NNIManager } from './core/nnimanager';
import { SqlDB } from './core/sqlDatabase';
import { NNIRestServer } from './rest_server/nniRestServer';
import { LocalTrainingServiceForGPU } from './training_service/local/localTrainingServiceForGPU';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { LocalTrainingService } from './training_service/local/localTrainingService';
import { PAITrainingService } from './training_service/pai/paiTrainingService';
import {
RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService';
import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';

function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number, logDirectory: string, experimentLogLevel: string) {
function initStartupInfo(
startExpMode: string, resumeExperimentId: string, basePort: number,
logDirectory: string, experimentLogLevel: string): void {
const createNew: boolean = (startExpMode === 'new');
const expId: string = createNew ? uniqueString(8) : resumeExperimentId;
setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel);
}

async function initContainer(platformMode: string): Promise<void> {
if (platformMode === 'local') {
Container.bind(TrainingService).to(LocalTrainingServiceForGPU).scope(Scope.Singleton);
Container.bind(TrainingService)
.to(LocalTrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'remote') {
Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton);
Container.bind(TrainingService)
.to(RemoteMachineTrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'pai') {
Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton);
Container.bind(TrainingService)
.to(PAITrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'kubeflow') {
Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton);
Container.bind(TrainingService)
.to(KubeflowTrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'frameworkcontroller') {
Container.bind(TrainingService).to(FrameworkControllerTrainingService).scope(Scope.Singleton);
}
else {
Container.bind(TrainingService)
.to(FrameworkControllerTrainingService)
.scope(Scope.Singleton);
} else {
throw new Error(`Error: unsupported mode: ${mode}`);
}
Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);
Container.bind(Manager)
.to(NNIManager)
.scope(Scope.Singleton);
Container.bind(Database)
.to(SqlDB)
.scope(Scope.Singleton);
Container.bind(DataStore)
.to(NNIDataStore)
.scope(Scope.Singleton);
const ds: DataStore = component.get(DataStore);

await ds.init();
}

function usage(): void {
console.info('usage: node main.js --port <port> --mode <local/remote/pai/kubeflow/frameworkcontroller> --start_mode <new/resume> --experiment_id <id>');
console.info('usage: node main.js --port <port> --mode \
<local/remote/pai/kubeflow/frameworkcontroller> --start_mode <new/resume> --experiment_id <id>');
}

const strPort: string = parseArg(['--port', '-p']);
Expand Down Expand Up @@ -117,7 +135,8 @@ if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) {

initStartupInfo(startMode, experimentId, port, logDir, logLevel);

mkDirP(getLogDir()).then(async () => {
mkDirP(getLogDir())
.then(async () => {
const log: Logger = getLogger();
try {
await initContainer(mode);
Expand All @@ -128,25 +147,26 @@ mkDirP(getLogDir()).then(async () => {
log.error(`${err.stack}`);
throw err;
}
}).catch((err: Error) => {
})
.catch((err: Error) => {
console.error(`Failed to create log dir: ${err.stack}`);
});

process.on('SIGTERM', async () => {
const log: Logger = getLogger();
let hasError: boolean = false;
try{
try {
const nniManager: Manager = component.get(Manager);
await nniManager.stopExperiment();
const ds: DataStore = component.get(DataStore);
await ds.close();
const restServer: NNIRestServer = component.get(NNIRestServer);
await restServer.stop();
}catch(err){
} catch (err) {
hasError = true;
log.error(`${err.stack}`);
}finally{
} finally {
await log.close();
process.exit(hasError?1:0);
process.exit(hasError ? 1 : 0);
}
})
});
6 changes: 5 additions & 1 deletion src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,12 @@ export namespace ValidationSchemas {
port: joi.number().min(1).max(65535).required(),
passwd: joi.string(),
sshKeyPath: joi.string(),
passphrase: joi.string()
passphrase: joi.string(),
gpuIndices: joi.string()
})),
local_config: joi.object({
gpuIndices: joi.string()
}),
trial_config: joi.object({
image: joi.string().min(1),
codeDir: joi.string().min(1).required(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
*/
export enum TrialConfigMetadataKey {
MACHINE_LIST = 'machine_list',
LOCAL_CONFIG = 'local_config',
TRIAL_CONFIG = 'trial_config',
EXPERIMENT_ID = 'experimentId',
MULTI_PHASE = 'multiPhase',
Expand Down
71 changes: 40 additions & 31 deletions src/nni_manager/training_service/local/gpuScheduler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,18 @@

'use strict';

import { delay } from '../../common/utils';
import { GPUInfo, GPUSummary } from '../common/gpuData';
import { getLogger, Logger } from '../../common/log';
import * as cp from 'child_process';
import * as cpp from 'child-process-promise';
import * as path from 'path';
import * as os from 'os';
import * as cp from 'child_process';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import { String } from 'typescript-string-operations';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData'
import { getLogger, Logger } from '../../common/log';
import { delay } from '../../common/utils';
import { GPU_INFO_COLLECTOR_FORMAT, GPUInfo, GPUSummary } from '../common/gpuData';

/**
* GPUScheduler
* GPUScheduler for local training service
*/
class GPUScheduler {

Expand All @@ -58,45 +57,55 @@ class GPUScheduler {
}
}

/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private async runGpuMetricsCollectorScript(): Promise<void> {
await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
//generate gpu_metrics_collector.sh
let gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT,
this.gpuMetricCollectorScriptFolder,
path.join(this.gpuMetricCollectorScriptFolder, 'pid'),
);
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
cp.exec(`bash ${gpuMetricsCollectorScriptPath}`);
}

public getAvailableGPUIndices(): number[] {
if (this.gpuSummary !== undefined) {
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0).map((info: GPUInfo) => info.index);
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0)
.map((info: GPUInfo) => info.index);
}

return [];
}

public async stop() {
public getSystemGpuCount(): number {
if (this.gpuSummary !== undefined) {
return this.gpuSummary.gpuCount;
}

return 0;
}

public async stop(): Promise<void> {
this.stopping = true;
try {
const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
await cpp.exec(`pkill -P ${pid}`);
await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`);
} catch (error){
} catch (error) {
this.log.error(`GPU scheduler error: ${error}`);
}
}

private async updateGPUSummary() {
const cmdresult = await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
if(cmdresult && cmdresult.stdout) {
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private async runGpuMetricsCollectorScript(): Promise<void> {
await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
//generate gpu_metrics_collector.sh
const gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT,
this.gpuMetricCollectorScriptFolder,
path.join(this.gpuMetricCollectorScriptFolder, 'pid')
);
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
cp.exec(`bash ${gpuMetricsCollectorScriptPath}`);
}

private async updateGPUSummary(): Promise<void> {
const cmdresult: cpp.childProcessPromise.Result =
await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
if (cmdresult && cmdresult.stdout) {
this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
} else {
this.log.error('Could not get gpu metrics information!');
Expand Down
Loading