Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Merge pull request #80 from Microsoft/master
Browse files Browse the repository at this point in the history
merge master
  • Loading branch information
SparkSnail authored Nov 29, 2018
2 parents a0371e9 + cf3d434 commit e1ad56b
Show file tree
Hide file tree
Showing 25 changed files with 214 additions and 82 deletions.
4 changes: 2 additions & 2 deletions examples/trials/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ if __name__ == '__main__':

**2)Get configure from Tuner**

User import ```nni``` and use ```nni.get_next_parameter()``` to recive configure. Please noted **10**, **24** and **25** line in the following code.
User import ```nni``` and use ```nni.get_next_parameter()``` to receive configure. Please noted **10**, **24** and **25** line in the following code.


```python
Expand Down Expand Up @@ -165,7 +165,7 @@ def train(args, params):
...
```

Here is the complete exampe:
Here is the complete example:


```python
Expand Down
2 changes: 1 addition & 1 deletion examples/trials/auto-gbdt/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
This project is for automaticlly tuning parameters for GBDT.
This project is for automatically tuning parameters for GBDT.
'''
import logging

Expand Down
4 changes: 2 additions & 2 deletions examples/trials/ga_squad/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,8 @@ Here is an example of the model configuration, which is passed from the tuner to

Every model configuration will has a "layers" section, which is a JSON list of layer definitions. The definition of each layer is also a JSON object, where:

* `type` is the type of the layer. 0, 1, 2, 3, 4 corresponde to attention, self-attention, RNN, input and output layer respectively.
* `size` is the length of the output. "x", "y" corresponde to document length / question length, respectively.
* `type` is the type of the layer. 0, 1, 2, 3, 4 correspond to attention, self-attention, RNN, input and output layer respectively.
* `size` is the length of the output. "x", "y" correspond to document length / question length, respectively.
* `input_size` is the number of inputs the layer has.
* `input` is the indices of layers taken as input of this layer.
* `output` is the indices of layers use this layer's output as their input.
Expand Down
4 changes: 2 additions & 2 deletions examples/trials/ga_squad/trial.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@

def get_config():
'''
Get config from arument parser.
Get config from argument parser.
'''
parser = argparse.ArgumentParser(
description='This program is using genetic algorithm to search architecture for SQuAD.')
Expand Down Expand Up @@ -86,7 +86,7 @@ def get_id(word_dict, word):

def load_embedding(path):
'''
return embedding for a specif file by given file path.
return embedding for a specific file by given file path.
'''
EMBEDDING_DIM = 300
embedding_dict = {}
Expand Down
6 changes: 3 additions & 3 deletions examples/trials/mnist-annotation/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

class MnistNetwork(object):
'''
MnistNetwork is for initlizing and building basic network for mnist.
MnistNetwork is for initializing and building basic network for mnist.
'''
def __init__(self,
channel_1_num,
Expand Down Expand Up @@ -211,7 +211,7 @@ def main(params):
logger.debug('Send final result done.')


def generate_defualt_params():
def generate_default_params():
'''
Generate default parameters for mnist network.
'''
Expand All @@ -232,7 +232,7 @@ def generate_defualt_params():
if __name__ == '__main__':
'''@nni.get_next_parameter()'''
try:
main(generate_defualt_params())
main(generate_default_params())
except Exception as exception:
logger.exception(exception)
raise
2 changes: 1 addition & 1 deletion examples/trials/mnist/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

class MnistNetwork(object):
'''
MnistNetwork is for initlizing and building basic network for mnist.
MnistNetwork is for initializing and building basic network for mnist.
'''
def __init__(self,
channel_1_num,
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/common/datastore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ abstract class Database {
public abstract queryExperimentProfile(experimentId: string, revision?: number): Promise<ExperimentProfile[]>;
public abstract queryLatestExperimentProfile(experimentId: string): Promise<ExperimentProfile>;
public abstract storeTrialJobEvent(
event: TrialJobEvent, trialJobId: string, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void>;
event: TrialJobEvent, trialJobId: string, timestamp: number, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void>;
public abstract queryTrialJobEvent(trialJobId?: string, event?: TrialJobEvent): Promise<TrialJobEventRecord[]>;
public abstract storeMetricData(trialJobId: string, data: string): Promise<void>;
public abstract queryMetricData(trialJobId?: string, type?: MetricType): Promise<MetricDataRecord[]>;
Expand Down
39 changes: 37 additions & 2 deletions src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import * as assert from 'assert';
import { randomBytes } from 'crypto';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
Expand All @@ -32,6 +33,7 @@ import { Database, DataStore } from './datastore';
import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from './experimentStartupInfo';
import { Manager } from './manager';
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
import { getLogger } from './log';

function getExperimentRootDir(): string {
return path.join(os.homedir(), 'nni', 'experiments', getExperimentId());
Expand Down Expand Up @@ -287,5 +289,38 @@ function getJobCancelStatus(isEarlyStopped: boolean): TrialJobStatus {
return isEarlyStopped ? 'EARLY_STOPPED' : 'USER_CANCELED';
}

export {getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getJobCancelStatus,
getDefaultDatabaseDir, getIPV4Address, mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect };
/**
* Utility method to calculate file numbers under a directory, recursively
* @param directory directory name
*/
function countFilesRecursively(directory: string, timeoutMilliSeconds?: number): Promise<number> {
if(!fs.existsSync(directory)) {
throw Error(`Direcotory ${directory} doesn't exist`);
}

const deferred: Deferred<number> = new Deferred<number>();

let timeoutId : NodeJS.Timer
const delayTimeout : Promise<number> = new Promise((resolve : Function, reject : Function) : void => {
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId = setTimeout(() => {
reject(new Error(`Timeout: path ${directory} has too many files`));
}, 5000);
});

let fileCount: number = -1;
cpp.exec(`find ${directory} -type f | wc -l`).then((result) => {
if(result.stdout && parseInt(result.stdout)) {
fileCount = parseInt(result.stdout);
}
deferred.resolve(fileCount);
});

return Promise.race([deferred.promise, delayTimeout]).finally(() => {
clearTimeout(timeoutId);
});
}

export {countFilesRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address,
mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect };
21 changes: 20 additions & 1 deletion src/nni_manager/core/nniDataStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,21 @@ class NNIDataStore implements DataStore {
event: TrialJobEvent, trialJobId: string, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void> {
this.log.debug(`storeTrialJobEvent: event: ${event}, data: ${hyperParameter}, jobDetail: ${JSON.stringify(jobDetail)}`);

return this.db.storeTrialJobEvent(event, trialJobId, hyperParameter, jobDetail).catch(
// Use the timestamp in jobDetail as TrialJobEvent timestamp for different events
let timestamp: number | undefined;
if (event === 'WAITING' && jobDetail) {
timestamp = jobDetail.submitTime;
} else if (event === 'RUNNING' && jobDetail) {
timestamp = jobDetail.startTime;
} else if (['EARLY_STOPPED', 'SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED'].includes(event) && jobDetail) {
timestamp = jobDetail.endTime;
}
// Use current time as timestamp if timestamp is not assigned from jobDetail
if (timestamp === undefined) {
timestamp = Date.now();
}

return this.db.storeTrialJobEvent(event, trialJobId, timestamp, hyperParameter, jobDetail).catch(
(err: Error) => {
throw new NNIError('Datastore error', `Datastore error: ${err.message}`, err);
}
Expand Down Expand Up @@ -272,6 +286,11 @@ class NNIDataStore implements DataStore {
if (record.logPath !== undefined) {
jobInfo.logPath = record.logPath;
}
// Initially assign WAITING timestamp as job's start time,
// If there is RUNNING state event, it will be updated as RUNNING state timestamp
if (jobInfo.startTime === undefined && record.timestamp !== undefined) {
jobInfo.startTime = record.timestamp;
}
break;
case 'SUCCEEDED':
case 'FAILED':
Expand Down
4 changes: 2 additions & 2 deletions src/nni_manager/core/sqlDatabase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,11 @@ class SqlDB implements Database {
}

public storeTrialJobEvent(
event: TrialJobEvent, trialJobId: string, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void> {
event: TrialJobEvent, trialJobId: string, timestamp: number, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void> {
const sql: string = 'insert into TrialJobEvent values (?,?,?,?,?,?)';
const logPath: string | undefined = jobDetail === undefined ? undefined : jobDetail.url;
const sequenceId: number | undefined = jobDetail === undefined ? undefined : jobDetail.sequenceId;
const args: any[] = [Date.now(), trialJobId, event, hyperParameter, logPath, sequenceId];
const args: any[] = [timestamp, trialJobId, event, hyperParameter, logPath, sequenceId];

const deferred: Deferred<void> = new Deferred<void>();
this.db.run(sql, args, (err: Error | null) => { this.resolve(deferred, err); });
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/core/test/sqlDatabase.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ describe('core/sqlDatabase', () => {
await (<SqlDB>db).storeExperimentProfile(profile);
}
for (const event of events) {
await (<SqlDB>db).storeTrialJobEvent(<TrialJobEvent>event.event, event.trialJobId, event.data);
await (<SqlDB>db).storeTrialJobEvent(<TrialJobEvent>event.event, event.trialJobId, Date.now(), event.data);
}
for (const metric of metrics) {
await (<SqlDB>db).storeMetricData(metric.trialJobId, JSON.stringify(metric));
Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ export namespace ValidationSchemas {
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0),
command: joi.string().min(1),
virtualCluster: joi.string(),
worker: joi.object({
replicas: joi.number().min(1).required(),
image: joi.string().min(1),
Expand Down
48 changes: 48 additions & 0 deletions src/nni_manager/training_service/common/util.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { getLogger } from "common/log";

/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

'use strict';

import { countFilesRecursively } from '../../common/utils'

/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
*
* @param codeDir codeDir in nni config file
* @returns file number under codeDir
*/
export async function validateCodeDir(codeDir: string) : Promise<number> {
let fileCount: number | undefined;

try {
fileCount = await countFilesRecursively(codeDir);
} catch(error) {
throw new Error(`Call count file error: ${error}`);
}

if(fileCount && fileCount > 1000) {
const errMessage: string = `Too many files(${fileCount} found}) in ${codeDir},`
+ ` please check if it's a valid code dir`;
throw new Error(errMessage);
}

return fileCount;
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import { KubeflowClusterConfig, kubeflowOperatorMap, KubeflowTrialConfig, NFSCon
import { KubeflowTrialJobDetail } from './kubeflowData';
import { KubeflowJobRestServer } from './kubeflowJobRestServer';
import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
import { validateCodeDir } from '../common/util';
import { AzureStorageClientUtility } from './azureStorageClientUtils';
import * as azureStorage from 'azure-storage';

Expand Down Expand Up @@ -360,6 +361,15 @@ class KubeflowTrainingService implements TrainingService {

this.kubeflowTrialConfig = <KubeflowTrialConfig>JSON.parse(value);
assert(this.kubeflowClusterConfig !== undefined && this.kubeflowTrialConfig.worker !== undefined);

// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(this.kubeflowTrialConfig.codeDir);
} catch(error) {
this.log.error(error);
return Promise.reject(new Error(error));
}

break;
default:
break;
Expand Down
14 changes: 12 additions & 2 deletions src/nni_manager/training_service/pai/paiConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ export class PAIJobConfig{
// List of taskRole, one task role at least
public taskRoles: PAITaskRole[];

//The virtual cluster job runs on.
public readonly virtualCluster: string;

/**
* Constructor
* @param jobName Name for the job, need to be unique
Expand All @@ -77,13 +80,15 @@ export class PAIJobConfig{
* @param outputDir Output directory on HDFS
* @param taskRoles List of taskRole, one task role at least
*/
constructor(jobName: string, image : string, dataDir : string, outputDir : string, codeDir : string, taskRoles : PAITaskRole[]){
constructor(jobName: string, image : string, dataDir : string, outputDir : string, codeDir : string,
taskRoles : PAITaskRole[], virtualCluster: string) {
this.jobName = jobName;
this.image = image;
this.dataDir = dataDir;
this.outputDir = outputDir;
this.codeDir = codeDir;
this.taskRoles = taskRoles;
this.virtualCluster = virtualCluster;
}
}

Expand Down Expand Up @@ -112,13 +117,18 @@ export class NNIPAITrialConfig extends TrialConfig{
public readonly dataDir: string;
public outputDir: string;

constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string, dataDir: string, outputDir: string) {
//The virtual cluster job runs on. If omitted, the job will run on default virtual cluster
public virtualCluster?: string;

constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number,
image: string, dataDir: string, outputDir: string, virtualCluster?: string) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.image = image;
this.dataDir = dataDir;
this.outputDir = outputDir;
this.virtualCluster = virtualCluster;
}
}

20 changes: 16 additions & 4 deletions src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ import {
JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, NNIManagerIpConfig
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { countFilesRecursively, delay, generateParamFileName,
getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { String } from 'typescript-string-operations';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
import { validateCodeDir } from '../common/util';

var WebHDFS = require('webhdfs');

Expand Down Expand Up @@ -236,9 +238,10 @@ class PAITrainingService implements TrainingService {
this.paiTrialConfig.outputDir,
// codeDir
`$PAI_DEFAULT_FS_URI${hdfsCodeDir}`,
// TODO: Add Virutal Cluster
// PAI Task roles
paiTaskRoles);
paiTaskRoles,
// Add Virutal Cluster
this.paiTrialConfig.virtualCluster === undefined ? 'default' : this.paiTrialConfig.virtualCluster.toString());

// Step 2. Upload code files in codeDir onto HDFS
try {
Expand Down Expand Up @@ -393,7 +396,16 @@ class PAITrainingService implements TrainingService {
this.paiClusterConfig.host
).replace(/\r\n|\n|\r/gm, '');
}


// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(this.paiTrialConfig.codeDir);
} catch(error) {
this.log.error(error);
deferred.reject(new Error(error));
break;
}

const hdfsDirContent = this.paiTrialConfig.outputDir.match(this.hdfsDirPattern);

if(hdfsDirContent === null) {
Expand Down
Loading

0 comments on commit e1ad56b

Please sign in to comment.