Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Merge pull request #18 from microsoft/master
Browse files Browse the repository at this point in the history
pull code
  • Loading branch information
chicm-ms authored May 23, 2019
2 parents ccb2211 + feb6f3b commit 841d467
Show file tree
Hide file tree
Showing 14 changed files with 228 additions and 36 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ Targeting at openness and advancing state-of-art technology, [Microsoft Research
* [OpenPAI](https://github.com/Microsoft/pai) : an open source platform that provides complete AI model training and resource management capabilities, it is easy to extend and supports on-premise, cloud and hybrid environments in various scale.
* [FrameworkController](https://github.com/Microsoft/frameworkcontroller) : an open source general-purpose Kubernetes Pod Controller that orchestrate all kinds of applications on Kubernetes by a single controller.
* [MMdnn](https://github.com/Microsoft/MMdnn) : A comprehensive, cross-framework solution to convert, visualize and diagnose deep neural network models. The "MM" in MMdnn stands for model management and "dnn" is an acronym for deep neural network.
* [SPTAG](https://github.com/Microsoft/SPTAG) : Space Partition Tree And Graph (SPTAG) is an open source library for large scale vector approximate nearest neighbor search scenario.

We encourage researchers and students leverage these projects to accelerate the AI development and research.

## **Install & Verify**
Expand Down
21 changes: 18 additions & 3 deletions src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -357,13 +357,18 @@ function countFilesRecursively(directory: string, timeoutMilliSeconds?: number):
});

let fileCount: number = -1;
cpp.exec(`find ${directory} -type f | wc -l`).then((result) => {
let cmd: string;
if(process.platform === "win32") {
cmd = `powershell "Get-ChildItem -Path ${directory} -Recurse -File | Measure-Object | %{$_.Count}"`
} else {
cmd = `find ${directory} -type f | wc -l`;
}
cpp.exec(cmd).then((result) => {
if(result.stdout && parseInt(result.stdout)) {
fileCount = parseInt(result.stdout);
}
deferred.resolve(fileCount);
});

return Promise.race([deferred.promise, delayTimeout]).finally(() => {
clearTimeout(timeoutId);
});
Expand Down Expand Up @@ -459,6 +464,16 @@ function getNewLine(): string{
}
}

/**
* Use '/' to join path instead of '\' for all kinds of platform
* @param path
*/
function unixPathJoin(...paths: any[]): string {
const dir: string = paths.filter((path: any) => path !== '').join('/');
if (dir === '') return '.';
return dir;
}

export {countFilesRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, unixPathJoin,
mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine };
17 changes: 17 additions & 0 deletions src/nni_manager/config/kubeflow/pytorchjob-crd-v1beta2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"kind": "CustomResourceDefinition",
"spec": {
"scope": "Namespaced",
"version": "v1beta2",
"group": "kubeflow.org",
"names": {
"kind": "PyTorchJob",
"plural": "pytorchjobs",
"singular": "pytorchjob"
}
},
"apiVersion": "apiextensions.k8s.io/v1beta2",
"metadata": {
"name": "pytorchjobs.kubeflow.org"
}
}
17 changes: 17 additions & 0 deletions src/nni_manager/config/kubeflow/tfjob-crd-v1beta2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"kind": "CustomResourceDefinition",
"spec": {
"scope": "Namespaced",
"version": "v1beta2",
"group": "kubeflow.org",
"names": {
"kind": "TFJob",
"plural": "tfjobs",
"singular": "tfjob"
}
},
"apiVersion": "apiextensions.k8s.io/v1beta2",
"metadata": {
"name": "tfjobs.kubeflow.org"
}
}
6 changes: 5 additions & 1 deletion src/nni_manager/core/ipcInterface.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { EventEmitter } from 'events';
import { Readable, Writable } from 'stream';
import { NNIError } from '../common/errors';
import { getLogger, Logger } from '../common/log';
import { getLogDir } from '../common/utils';
import * as CommandType from './commands';

const ipcOutgoingFd: number = 3;
Expand Down Expand Up @@ -106,7 +107,10 @@ class IpcInterface {
this.logger.warning('Commands jammed in buffer!');
}
} catch (err) {
throw NNIError.FromError(err, 'Dispatcher Error: ');
throw NNIError.FromError(
err,
`Dispatcher Error, please check this dispatcher log file for more detailed information: ${getLogDir()}/dispatcher.log . `
);
}
}

Expand Down
11 changes: 10 additions & 1 deletion src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,16 @@ mkDirP(getLogDir())
console.error(`Failed to create log dir: ${err.stack}`);
});

process.on('SIGTERM', async () => {
function getStopSignal(): any {
if (process.platform === "win32") {
return 'SIGBREAK';
}
else{
return 'SIGTERM';
}
}

process.on(getStopSignal(), async () => {
const log: Logger = getLogger();
let hasError: boolean = false;
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,35 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
*/
public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
operatorApiVersion: string): KubernetesCRDClient {
if(kubeflowOperator === 'tf-operator') {
if(operatorApiVersion == 'v1alpha2') {
return new TFOperatorClientV1Alpha2();
} else if(operatorApiVersion == 'v1beta1') {
return new TFOperatorClientV1Beta1();
}
} else if(kubeflowOperator === 'pytorch-operator') {
if(operatorApiVersion == 'v1alpha2') {
return new PytorchOperatorClientV1Alpha2();
} else if(operatorApiVersion == 'v1beta1') {
return new PytorchOperatorClientV1Beta1();
switch(kubeflowOperator) {
case 'tf-operator': {
switch(operatorApiVersion) {
case 'v1alpha2': {
return new TFOperatorClientV1Alpha2();
}
case 'v1beta1': {
return new TFOperatorClientV1Beta1();
}
case 'v1beta2': {
return new TFOperatorClientV1Beta2();
}
}
break;
}
case 'pytorch-operator': {
switch(operatorApiVersion) {
case 'v1alpha2': {
return new PyTorchOperatorClientV1Alpha2();
}
case 'v1beta1': {
return new PyTorchOperatorClientV1Beta1();
}
case 'v1beta2': {
return new PyTorchOperatorClientV1Beta2();
}
}
}
}

throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
}
}
Expand Down Expand Up @@ -85,7 +100,26 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
}
}

class PytorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
/**
* constructor, to initialize tfjob CRD definition
*/
public constructor() {
super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/kubeflow/tfjob-crd-v1beta2.json', 'utf8'));
this.client.addCustomResourceDefinition(this.crdSchema);
}

protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').tfjobs;
}

public get containerName(): string {
return 'tensorflow';
}
}

class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
/**
* constructor, to initialize tfjob CRD definition
*/
Expand All @@ -104,7 +138,7 @@ class PytorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
}
}

class PytorchOperatorClientV1Beta1 extends KubernetesCRDClient {
class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
/**
* constructor, to initialize tfjob CRD definition
*/
Expand All @@ -123,5 +157,24 @@ class PytorchOperatorClientV1Beta1 extends KubernetesCRDClient {
}
}

class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
/**
* constructor, to initialize tfjob CRD definition
*/
public constructor() {
super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1beta2.json', 'utf8'));
this.client.addCustomResourceDefinition(this.crdSchema);
}

protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').pytorchjobs;
}

public get containerName(): string {
return 'pytorch';
}
}

export { KubeflowOperatorClient, GeneralK8sClient };

Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import { MethodNotImplementedError } from '../../../common/errors';
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ;
export type DistTrainRole = 'worker' | 'ps' | 'master';
export type KubeflowJobStatus = 'Created' | 'Running' | 'Failed' | 'Succeeded';
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1';
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2';

export class KubeflowClusterConfig extends KubernetesClusterConfig {
public readonly operator: KubeflowOperator;
Expand Down
9 changes: 6 additions & 3 deletions src/nni_manager/training_service/pai/hdfsClientUtility.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import * as fs from 'fs';
import { Deferred } from 'ts-deferred';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger } from '../../common/log';
import { unixPathJoin } from '../../common/utils'

/**
* HDFS client utility, including copy file/directory
Expand All @@ -32,15 +33,15 @@ export namespace HDFSClientUtility {
* @param hdfsUserName HDFS user name
*/
function hdfsExpRootDir(hdfsUserName: string): string {
return path.join('/', hdfsUserName, 'nni', 'experiments', getExperimentId());
return '/' + unixPathJoin(hdfsUserName, 'nni', 'experiments', getExperimentId());
}

/**
* Get NNI experiment code directory
* @param hdfsUserName HDFS user name
*/
export function getHdfsExpCodeDir(hdfsUserName: string): string {
return path.join(hdfsExpRootDir(hdfsUserName), 'codeDir');
return unixPathJoin(hdfsExpRootDir(hdfsUserName), 'codeDir');
}

/**
Expand All @@ -49,7 +50,9 @@ export namespace HDFSClientUtility {
* @param trialId NNI trial ID
*/
export function getHdfsTrialWorkDir(hdfsUserName: string, trialId: string): string {
return path.join(hdfsExpRootDir(hdfsUserName), 'trials', trialId);
let root = hdfsExpRootDir(hdfsUserName)
console.log(root)
return unixPathJoin(root, 'trials', trialId);
}

/**
Expand Down
7 changes: 4 additions & 3 deletions src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ import { delay, generateParamFileName,
getExperimentRootDir, getIPV4Address, getVersion, uniqueString } from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { validateCodeDir } from '../common/util';
import { validateCodeDir, execMkdir } from '../common/util';
import { unixPathJoin } from '../../common/utils'
import { HDFSClientUtility } from './hdfsClientUtility';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
import { PAI_LOG_PATH_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiData';
Expand Down Expand Up @@ -406,12 +407,12 @@ class PAITrainingService implements TrainingService {
}

// Step 1. Prepare PAI job configuration
const hdfsOutputDir : string = path.join(this.hdfsBaseDir, this.experimentId, trialJobId);
const hdfsOutputDir : string = unixPathJoin(this.hdfsBaseDir, this.experimentId, trialJobId);
const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId);

const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
await execMkdir(trialLocalTempFolder);

const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
Expand Down
5 changes: 3 additions & 2 deletions test/generate_ts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def convert_command():

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote'], default='pai')
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local'], default='pai')
parser.add_argument("--nni_docker_image", type=str)
parser.add_argument("--nni_manager_ip", type=str)
# args for PAI
Expand All @@ -111,4 +111,5 @@ def convert_command():
args = parser.parse_args()

update_training_service_config(args)
convert_command()
if args.ts == 'local':
convert_command()
2 changes: 1 addition & 1 deletion test/pipelines-it-local-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
displayName: 'Install dependencies for integration tests'
- script: |
cd test
python generate_ts_config.py
python generate_ts_config.py --ts local
displayName: 'generate config files'
- script: |
cd test
Expand Down
65 changes: 65 additions & 0 deletions test/pipelines-it-pai-windows.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
jobs:
- job: 'build_docker_image'
timeoutInMinutes: 0
pool:
vmImage: 'Ubuntu 16.04'
steps:
- script: python3 -m pip install --upgrade pip setuptools --user
displayName: 'Install python tools'

- script: |
cd deployment/pypi
echo 'building prerelease package...'
make build
ls $(Build.SourcesDirectory)/deployment/pypi/dist/
condition: eq( variables['build_docker_img'], 'true' )
displayName: 'build nni bdsit_wheel'
- script: |
if [ $(build_docker_img) = 'true' ]
then
cd deployment/pypi
docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
echo 'updating docker file for installing nni from local...'
# update Dockerfile to install NNI in docker image from whl file built in last step
sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
cat ../docker/Dockerfile
export IMG_TAG=`date -u +%y%m%d%H%M`
echo 'build and upload docker image'
docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
docker push $(test_docker_img_name):$IMG_TAG
export TEST_IMG=$(test_docker_img_name):$IMG_TAG
cd ../../
else
export TEST_IMG=$(existing_docker_img)
fi
echo "##vso[task.setvariable variable=TEST_IMG]$TEST_IMG"
displayName: 'build docker image'
- script:
echo $TEST_IMG
echo "##vso[task.setvariable variable=docker_image;isOutput=true]$TEST_IMG"
name: setvariableStep
displayName: 'set image variable'

- job: 'integration_test_pai'
timeoutInMinutes: 0
dependsOn: build_docker_image
variables:
docker_image: $[ dependencies.build_docker_image.outputs['setvariableStep.docker_image'] ]

steps:
- script: |
set PATH=$(ENV_PATH)
python --version
powershell.exe -file install.ps1
displayName: 'Install nni toolkit via source code'
- script: |
cd test
set PATH=$(ENV_PATH)
python --version
python generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) --vc $(pai_virtual_cluster) --nni_docker_image $(docker_image) --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip)
python config_test.py --ts pai --exclude multi_phase,smac,bohb
displayName: 'Examples and advanced features tests on pai'
Loading

0 comments on commit 841d467

Please sign in to comment.