From d77a99c9966f8944817dff02950b20f7c7f30bb3 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Dec 2018 11:17:44 +0800 Subject: [PATCH 01/16] fix remote bug --- src/nni_manager/rest_server/restValidationSchemas.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index b1dc5293a7..bfb1ff24d2 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -28,7 +28,7 @@ export namespace ValidationSchemas { username: joi.string().required(), ip: joi.string().ip().required(), port: joi.number().min(1).max(65535).required(), - passwd: joi.string().required(), + passwd: joi.string(), sshKeyPath: joi.string(), passphrase: joi.string() })), From 7cb03f99d86efbc90ddae5809dfb0f412e666417 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Dec 2018 17:17:34 +0800 Subject: [PATCH 02/16] add document --- README.md | 11 +++-- docs/FrameworkControllerMode.md | 84 +++++++++++++++++++++++++++++++++ docs/KubeflowMode.md | 2 +- 3 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 docs/FrameworkControllerMode.md diff --git a/README.md b/README.md index 4d57b720a0..0972751e95 100644 --- a/README.md +++ b/README.md @@ -76,11 +76,12 @@ You can use these commands to get more information about the experiment commands description 1. nnictl experiment show show the information of experiments 2. nnictl trial ls list all of trial jobs -3. nnictl log stderr show stderr log content -4. nnictl log stdout show stdout log content -5. nnictl stop stop an experiment -6. nnictl trial kill kill a trial job by id -7. nnictl --help get help information about nnictl +3. nnictl top monitor the status of running experiments +4. nnictl log stderr show stderr log content +5. nnictl log stdout show stdout log content +6. nnictl stop stop an experiment +7. 
nnictl trial kill kill a trial job by id +8. nnictl --help get help information about nnictl ----------------------------------------------------------------------- ``` diff --git a/docs/FrameworkControllerMode.md b/docs/FrameworkControllerMode.md new file mode 100644 index 0000000000..00bf25b267 --- /dev/null +++ b/docs/FrameworkControllerMode.md @@ -0,0 +1,84 @@ +**Run an Experiment on FrameworkController** +=== +NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, and you have to set a kubernetes cluster before using frameworkcontroller. + +## Set up Kubernetes Service and kubeconfig +FrameworkController has same prerequisites as kubeflow mode except that you don't need to install kubeflow. Please refer the [document](./KubeflowMode.md) to set up your kubernetes cluster and other prerequisites for nni. + +## Set up FrameworkController +Follow the [guideline](https://github.com/Microsoft/frameworkcontroller/tree/master/example/run) to set up the frameworkcontroller in the kubernetes cluster, nni support frameworkcontroller by the statefulset mode. + +## Design +Please refer the design of [kubeflow training service](./KubeflowMode.md), frameworkcontroller training service pipeline is similar with kubeflow training service. 
+ +## Example + +The frameworkcontroller config file format is: +``` +authorName: default +experimentName: example_mnist +trialConcurrency: 1 +maxExecDuration: 10h +maxTrialNum: 100 +#choice: local, remote, pai, kubeflow, frameworkcontroller +trainingServicePlatform: frameworkcontroller +searchSpacePath: ~/nni/examples/trials/mnist/search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +#assessor: +# builtinAssessorName: Medianstop +# classArgs: +# optimize_mode: maximize +# gpuNum: 0 +trial: + codeDir: ~/nni/examples/trials/mnist + taskRoles: + - name: worker + taskNum: 1 + command: python3 mnist.py + gpuNum: 1 + cpuNum: 1 + memoryMB: 8192 + image: msranni/nni:latest + frameworkAttemptCompletionPolicy: + minFailedTaskCount: 1 + minSucceededTaskCount: 1 +frameworkcontrollerConfig: + storage: nfs + nfs: + server: {your_nfs_server} + path: {your_nfs_server_exported_path} +``` +If you use Azure Kubernetes Service, you should set `kubeflowConfig` in your config yaml file as follows: +``` +frameworkcontrollerConfig: + storage: azureStorage + keyVault: + vaultName: {your_vault_name} + name: {your_secert_name} + azureStorage: + accountName: {your_storage_account_name} + azureShare: {your_azure_share_name} +``` +Note: You should explicitly set `trainingServicePlatform: frameworkcontroller` in nni config yaml file if you want to start experiment in kubeflow mode. + +The trial's config format for nni frameworkcontroller mode is a simple version of frameworkcontroller's offical config, you could refer the [tensorflow example](https://github.com/Microsoft/frameworkcontroller/blob/master/example/framework/scenario/tensorflow/cpu/tensorflowdistributedtrainingwithcpu.yaml) for deeply understanding. 
+Trial configuration in frameworkcontroller mode have the following configuration keys: +* taskRoles: you could set multiple task roles in config file, and each task role is a basic unit to process in kubernetes cluster. + * name: the name of task role specified, like "worker", "ps", "master". + * taskNum: the replica number of the task role. + * command: the users' command to be used in the container. + * gpuNum: the number of gpu device used in container. + * cpuNum: the number of cpu device used in container. + * memoryMB: the memory limitaion to be specified in container. + * image: the docker image used to create pod and run the program. + * frameworkAttemptCompletionPolicy: the policy to run framework, please refer the [user-manual](https://github.com/Microsoft/frameworkcontroller/blob/master/doc/user-manual.md) to get the specific information. + +## How to run example +After you prepare a config file, you could run your experiment by nnictl. The way to start an experiment on frameworkcontroller is similar to kubeflow, please the [document](./KubeflowMode.md) for more information. \ No newline at end of file diff --git a/docs/KubeflowMode.md b/docs/KubeflowMode.md index 2c4721b971..c312c1e3b7 100644 --- a/docs/KubeflowMode.md +++ b/docs/KubeflowMode.md @@ -100,7 +100,7 @@ Trial configuration in kubeflow mode have the following configuration keys: * gpuNum * image * Required key. In kubeflow mode, your trial program will be scheduled by Kubernetes to run in [Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/). This key is used to specify the Docker image used to create the pod where your trail program will run. - * We already build a docker image [nnimsra/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. 
The docker file used to build this image can be found at [here](../deployment/Dockerfile.build.base). You can either use this image directly in your config file, or build your own image based on it. + * We already build a docker image [msranni/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](../deployment/Dockerfile.build.base). You can either use this image directly in your config file, or build your own image based on it. * ps (optional). This config section is used to configure tensorflow parameter server role. Once complete to fill nni experiment config file and save (for example, save as exp_kubeflow.yaml), then run the following command From 44d156550e97e27bf365569810668933d4234023 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Dec 2018 17:21:06 +0800 Subject: [PATCH 03/16] add document --- docs/FrameworkControllerMode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/FrameworkControllerMode.md b/docs/FrameworkControllerMode.md index 00bf25b267..a977e20e20 100644 --- a/docs/FrameworkControllerMode.md +++ b/docs/FrameworkControllerMode.md @@ -1,6 +1,6 @@ **Run an Experiment on FrameworkController** === -NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, and you have to set a kubernetes cluster before using frameworkcontroller. +NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. 
FrameworkController is built to orchestrate all kinds of applications on Kubernetes, you could use frameworkcontroller as a training service to run your experiment. ## Set up Kubernetes Service and kubeconfig FrameworkController has same prerequisites as kubeflow mode except that you don't need to install kubeflow. Please refer the [document](./KubeflowMode.md) to set up your kubernetes cluster and other prerequisites for nni. From 7ab7386d403987a29b805df027a042232ba8d259 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Dec 2018 17:26:53 +0800 Subject: [PATCH 04/16] update --- docs/FrameworkControllerMode.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/FrameworkControllerMode.md b/docs/FrameworkControllerMode.md index a977e20e20..15d3edbc48 100644 --- a/docs/FrameworkControllerMode.md +++ b/docs/FrameworkControllerMode.md @@ -9,7 +9,7 @@ FrameworkController has same prerequisites as kubeflow mode except that you don' Follow the [guideline](https://github.com/Microsoft/frameworkcontroller/tree/master/example/run) to set up the frameworkcontroller in the kubernetes cluster, nni support frameworkcontroller by the statefulset mode. ## Design -Please refer the design of [kubeflow training service](./KubeflowMode.md), frameworkcontroller training service pipeline is similar with kubeflow training service. +Please refer the design of [kubeflow training service](./KubeflowMode.md), frameworkcontroller training service pipeline is similar to kubeflow training service. 
## Example @@ -31,11 +31,11 @@ tuner: classArgs: #choice: maximize, minimize optimize_mode: maximize -#assessor: -# builtinAssessorName: Medianstop -# classArgs: -# optimize_mode: maximize -# gpuNum: 0 +assessor: + builtinAssessorName: Medianstop + classArgs: + optimize_mode: maximize + gpuNum: 0 trial: codeDir: ~/nni/examples/trials/mnist taskRoles: @@ -55,7 +55,7 @@ frameworkcontrollerConfig: server: {your_nfs_server} path: {your_nfs_server_exported_path} ``` -If you use Azure Kubernetes Service, you should set `kubeflowConfig` in your config yaml file as follows: +If you use Azure Kubernetes Service, you should set `frameworkcontrollerConfig` in your config yaml file as follows: ``` frameworkcontrollerConfig: storage: azureStorage @@ -66,9 +66,9 @@ frameworkcontrollerConfig: accountName: {your_storage_account_name} azureShare: {your_azure_share_name} ``` -Note: You should explicitly set `trainingServicePlatform: frameworkcontroller` in nni config yaml file if you want to start experiment in kubeflow mode. +Note: You should explicitly set `trainingServicePlatform: frameworkcontroller` in nni config yaml file if you want to start experiment in frameworkcontrollerConfig mode. -The trial's config format for nni frameworkcontroller mode is a simple version of frameworkcontroller's offical config, you could refer the [tensorflow example](https://github.com/Microsoft/frameworkcontroller/blob/master/example/framework/scenario/tensorflow/cpu/tensorflowdistributedtrainingwithcpu.yaml) for deeply understanding. +The trial's config format for nni frameworkcontroller mode is a simple version of frameworkcontroller's offical config, you could refer the [tensorflow example of frameworkcontroller](https://github.com/Microsoft/frameworkcontroller/blob/master/example/framework/scenario/tensorflow/cpu/tensorflowdistributedtrainingwithcpu.yaml) for deeply understanding. 
Trial configuration in frameworkcontroller mode have the following configuration keys: * taskRoles: you could set multiple task roles in config file, and each task role is a basic unit to process in kubernetes cluster. * name: the name of task role specified, like "worker", "ps", "master". From d9e1ea8261f8ede301e2a1ce18c0d2027a8a5f76 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Dec 2018 17:42:12 +0800 Subject: [PATCH 05/16] update --- docs/KubeflowMode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/KubeflowMode.md b/docs/KubeflowMode.md index c312c1e3b7..4f830f0e82 100644 --- a/docs/KubeflowMode.md +++ b/docs/KubeflowMode.md @@ -1,6 +1,6 @@ **Run an Experiment on Kubeflow** === -Now NNI supports running experiment on [Kubeflow](https://github.com/kubeflow/kubeflow), called kubeflow mode. Before starting to use NNI kubeflow mode, you should have a kubernetes cluster, either on-prem or [Azure Kubernetes Service(AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/), a Ubuntu machine on which [kubeconfig](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) is setup to connect to your kubernetes cluster. If you are not familiar with kubernetes, [here](https://kubernetes.io/docs/tutorials/kubernetes-basics/) is a goot start. In kubeflow mode, your trial program will run as kubeflow job in kubernetes cluster. +Now NNI supports running experiment on [Kubeflow](https://github.com/kubeflow/kubeflow), called kubeflow mode. Before starting to use NNI kubeflow mode, you should have a kubernetes cluster, either on-prem or [Azure Kubernetes Service(AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/), a Ubuntu machine on which [kubeconfig](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) is setup to connect to your kubernetes cluster. 
If you are not familiar with kubernetes, [here](https://kubernetes.io/docs/tutorials/kubernetes-basics/) is a good start. In kubeflow mode, your trial program will run as kubeflow job in kubernetes cluster. ## Prerequisite for on-premises Kubernetes Service 1. A **Kubernetes** cluster using Kubernetes 1.8 or later. Follow this [guideline](https://kubernetes.io/docs/setup/) to set up Kubernetes From 2c225a840f16124107b6ae2a57b98422298239f5 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Dec 2018 17:47:56 +0800 Subject: [PATCH 06/16] update --- docs/FrameworkControllerMode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/FrameworkControllerMode.md b/docs/FrameworkControllerMode.md index 15d3edbc48..3f93f112f8 100644 --- a/docs/FrameworkControllerMode.md +++ b/docs/FrameworkControllerMode.md @@ -81,4 +81,4 @@ Trial configuration in frameworkcontroller mode have the following configuration * frameworkAttemptCompletionPolicy: the policy to run framework, please refer the [user-manual](https://github.com/Microsoft/frameworkcontroller/blob/master/doc/user-manual.md) to get the specific information. ## How to run example -After you prepare a config file, you could run your experiment by nnictl. The way to start an experiment on frameworkcontroller is similar to kubeflow, please the [document](./KubeflowMode.md) for more information. \ No newline at end of file +After you prepare a config file, you could run your experiment by nnictl. The way to start an experiment on frameworkcontroller is similar to kubeflow, please refer the [document](./KubeflowMode.md) for more information. 
\ No newline at end of file From be23f553b4f50e38d298fffc1a3332d21f2535fb Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Sat, 29 Dec 2018 15:41:29 +0800 Subject: [PATCH 07/16] update --- docs/FrameworkControllerMode.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/FrameworkControllerMode.md b/docs/FrameworkControllerMode.md index 3f93f112f8..0273d87321 100644 --- a/docs/FrameworkControllerMode.md +++ b/docs/FrameworkControllerMode.md @@ -1,15 +1,15 @@ **Run an Experiment on FrameworkController** === -NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, you could use frameworkcontroller as a training service to run your experiment. +NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, now you can use frameworkcontroller as the training service to run NNI experiment. ## Set up Kubernetes Service and kubeconfig -FrameworkController has same prerequisites as kubeflow mode except that you don't need to install kubeflow. Please refer the [document](./KubeflowMode.md) to set up your kubernetes cluster and other prerequisites for nni. +FrameworkController has similar prerequisites as kubeflow mode, like K8S installation and storage preparation, but you don't need to install kubeflow. Please refer Kubeflow mode [document](./KubeflowMode.md) to set up your kubernetes cluster and other prerequisites for nni. ## Set up FrameworkController -Follow the [guideline](https://github.com/Microsoft/frameworkcontroller/tree/master/example/run) to set up the frameworkcontroller in the kubernetes cluster, nni support frameworkcontroller by the statefulset mode. 
+Follow the [guideline](https://github.com/Microsoft/frameworkcontroller/tree/master/example/run) to set up frameworkcontroller in the kubernetes cluster, nni support frameworkcontroller by the statefulset mode. ## Design -Please refer the design of [kubeflow training service](./KubeflowMode.md), frameworkcontroller training service pipeline is similar to kubeflow training service. +Please refer the design of [kubeflow training service](./KubeflowMode.md), frameworkcontroller training service pipeline is similar. ## Example @@ -68,7 +68,7 @@ frameworkcontrollerConfig: ``` Note: You should explicitly set `trainingServicePlatform: frameworkcontroller` in nni config yaml file if you want to start experiment in frameworkcontrollerConfig mode. -The trial's config format for nni frameworkcontroller mode is a simple version of frameworkcontroller's offical config, you could refer the [tensorflow example of frameworkcontroller](https://github.com/Microsoft/frameworkcontroller/blob/master/example/framework/scenario/tensorflow/cpu/tensorflowdistributedtrainingwithcpu.yaml) for deeply understanding. +The trial's config format for nni frameworkcontroller mode is a simple version of frameworkcontroller's offical config, you could refer the [tensorflow example of frameworkcontroller](https://github.com/Microsoft/frameworkcontroller/blob/master/example/framework/scenario/tensorflow/cpu/tensorflowdistributedtrainingwithcpu.yaml) for deep understanding. Trial configuration in frameworkcontroller mode have the following configuration keys: * taskRoles: you could set multiple task roles in config file, and each task role is a basic unit to process in kubernetes cluster. * name: the name of task role specified, like "worker", "ps", "master". 
From 91612098cf4e29ea5427b0ee00f11812b9422af6 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 3 Jan 2019 16:56:37 +0800 Subject: [PATCH 08/16] fix remote issue --- .../remote_machine/remoteMachineTrainingService.ts | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 5302d90bbb..7ab2adf943 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -110,15 +110,14 @@ class RemoteMachineTrainingService implements TrainingService { /** * List submitted trial jobs */ - public listTrialJobs(): Promise { + public async listTrialJobs(): Promise { const jobs: TrialJobDetail[] = []; const deferred: Deferred = new Deferred(); - - this.trialJobsMap.forEach(async (value: RemoteMachineTrialJobDetail, key: string) => { + for (const [key, value] of this.trialJobsMap) { if (value.form.jobType === 'TRIAL') { jobs.push(await this.getTrialJob(key)); } - }); + }; deferred.resolve(jobs); return deferred.promise; From e661c5528353e656db8de6a65b0c392bf90362f6 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 3 Jan 2019 17:10:50 +0800 Subject: [PATCH 09/16] fix forEach --- .../kubernetes/kubernetesTrainingService.ts | 6 +++--- src/nni_manager/training_service/pai/paiTrainingService.ts | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 631b0e98ea..a97ee63d57 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -81,14 +81,14 @@ abstract class 
KubernetesTrainingService { } } - public listTrialJobs(): Promise { + public async listTrialJobs(): Promise { const jobs: TrialJobDetail[] = []; - this.trialJobsMap.forEach(async (value: KubernetesTrialJobDetail, key: string) => { + for (const [key, value] of this.trialJobsMap) { if (value.form.jobType === 'TRIAL') { jobs.push(await this.getTrialJob(key)); } - }); + }; return Promise.resolve(jobs); } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index c1ef8ccb60..a19a225d09 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -104,11 +104,11 @@ class PAITrainingService implements TrainingService { public async listTrialJobs(): Promise { const jobs: TrialJobDetail[] = []; - this.trialJobsMap.forEach(async (value: PAITrialJobDetail, key: string) => { + for (const [key, value] of this.trialJobsMap) { if (value.form.jobType === 'TRIAL') { jobs.push(await this.getTrialJob(key)); } - }); + }; return Promise.resolve(jobs); } From 4fec2cc7c6b2b744e65dab21b9bee729b2f590e7 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 7 Jan 2019 14:15:15 +0800 Subject: [PATCH 10/16] update doc according to comments --- docs/FrameworkControllerMode.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/docs/FrameworkControllerMode.md b/docs/FrameworkControllerMode.md index 0273d87321..805b0d0b50 100644 --- a/docs/FrameworkControllerMode.md +++ b/docs/FrameworkControllerMode.md @@ -1,9 +1,25 @@ **Run an Experiment on FrameworkController** === -NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, now you can use frameworkcontroller as the training service to run NNI experiment. 
+NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, you don't need to install kubeflow for specific deeplearning framework like tf-operator or pytorch-operator any more. Now you can use frameworkcontroller as the training service to run NNI experiment. + +## Prerequisite for on-premises Kubernetes Service +1. A **Kubernetes** cluster using Kubernetes 1.8 or later. Follow this [guideline](https://kubernetes.io/docs/setup/) to set up Kubernetes +2. Prepare a **kubeconfig** file, which will be used by NNI to interact with your kubernetes API server. By default, NNI manager will use $(HOME)/.kube/config as kubeconfig file's path. You can also specify other kubeconfig files by setting the **KUBECONFIG** environment variable. Refer this [guideline]( https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig) to learn more about kubeconfig. +3. If your NNI trial job needs GPU resource, you should follow this [guideline](https://github.com/NVIDIA/k8s-device-plugin) to configure **Nvidia device plugin for Kubernetes**. +4. Prepare a **NFS server** and export a general purpose mount (we recommend to map your NFS server path in `root_squash option`, otherwise permission issue may raise when nni copy files to NFS. Refer this [page](https://linux.die.net/man/5/exports) to learn what root_squash option is), or **Azure File Storage**. +5. Install **NFS client** on the machine where you install NNI and run nnictl to create experiment. Run this command to install NFSv4 client: + ``` + apt-get install nfs-common + ``` + +6. Install **NNI**, follow the install guide [here](GetStarted.md). + +## Prerequisite for Azure Kubernetes Service +1. 
NNI supports frameworkcontroller based on Azure Kubernetes Service, follow the [guideline](https://azure.microsoft.com/en-us/services/kubernetes-service/) to set up Azure Kubernetes Service. +2. Install [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) and __kubectl__. Use `az login` to set azure account, and connect kubectl client to AKS, refer this [guideline](https://docs.microsoft.com/en-us/azure/aks/kubernetes-walkthrough#connect-to-the-cluster). +3. Follow the [guideline](https://docs.microsoft.com/en-us/azure/storage/common/storage-quickstart-create-account?tabs=portal) to create azure file storage account. If you use Azure Kubernetes Service, nni need Azure Storage Service to store code files and the output files. +4. To access Azure storage service, nni need the access key of the storage account, and nni use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) Service to protect your private key. Set up Azure Key Vault Service, add a secret to Key Vault to store the access key of Azure storage account. Follow this [guideline](https://docs.microsoft.com/en-us/azure/key-vault/quick-create-cli) to store the access key. -## Set up Kubernetes Service and kubeconfig -FrameworkController has similar prerequisites as kubeflow mode, like K8S installation and storage preparation, but you don't need to install kubeflow. Please refer Kubeflow mode [document](./KubeflowMode.md) to set up your kubernetes cluster and other prerequisites for nni. ## Set up FrameworkController Follow the [guideline](https://github.com/Microsoft/frameworkcontroller/tree/master/example/run) to set up frameworkcontroller in the kubernetes cluster, nni support frameworkcontroller by the statefulset mode. @@ -78,7 +94,7 @@ Trial configuration in frameworkcontroller mode have the following configuration * cpuNum: the number of cpu device used in container. * memoryMB: the memory limitaion to be specified in container.
* image: the docker image used to create pod and run the program. - * frameworkAttemptCompletionPolicy: the policy to run framework, please refer the [user-manual](https://github.com/Microsoft/frameworkcontroller/blob/master/doc/user-manual.md) to get the specific information. + * frameworkAttemptCompletionPolicy: the policy to run framework, please refer the [user-manual](https://github.com/Microsoft/frameworkcontroller/blob/master/doc/user-manual.md#frameworkattemptcompletionpolicy) to get the specific information. Users could use the policy to control the pod, for example, if ps does not stop, only worker stops, this completionpolicy could help stop ps. ## How to run example After you prepare a config file, you could run your experiment by nnictl. The way to start an experiment on frameworkcontroller is similar to kubeflow, please refer the [document](./KubeflowMode.md) for more information. \ No newline at end of file From 11fec6f1e9d97cabe15e78128e63018ffc227f85 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 7 Jan 2019 14:40:58 +0800 Subject: [PATCH 11/16] update --- docs/KubeflowMode.md | 149 +++++++++++++++++++++++++++++++++---------- 1 file changed, 114 insertions(+), 35 deletions(-) diff --git a/docs/KubeflowMode.md b/docs/KubeflowMode.md index 4f830f0e82..12664ced34 100644 --- a/docs/KubeflowMode.md +++ b/docs/KubeflowMode.md @@ -28,64 +28,140 @@ Kubeflow training service instantiates a kubernetes rest client to interact with For each trial, we will upload all the files in your local codeDir path (configured in nni_config.yaml) together with NNI generated files like parameter.cfg into a storage volumn. Right now we support two kinds of storage volumns: [nfs](https://en.wikipedia.org/wiki/Network_File_System) and [azure file storage](https://azure.microsoft.com/en-us/services/storage/files/), you should configure the storage volumn in nni config yaml file.
After files are prepared, Kubeflow training service will call K8S rest API to create kubeflow jobs ([tf-operator](https://github.com/kubeflow/tf-operator) job or [pytorch-operator](https://github.com/kubeflow/pytorch-operator) job) in K8S, and mount your storage volumn into the job's pod. Output files of kubeflow job, like stdout, stderr, trial.log or model files, will also be copied back to the storage volumn. NNI will show the storage volumn's URL for each trial in WebUI, to allow user browse the log files and job's output files. +## Supported operator +NNI only supports tf-operator and pytorch-operator of kubeflow, other operators are not tested. +Users could set operator type in config file. +``` +operator: tf-operator +``` +If users want to use tf-operator, they could set `ps` and `worker` in trial config. If users want to use pytorch-operator, they could set `master` and `worker` in trial config. + +## Supported storage type +NNI supports NFS and Azure Storage to store the code and output files, users could set storage type in config file and set the corresponding config. +The settings for NFS storage are as follows: +``` +kubeflowConfig: + operator: tf-operator + apiVersion: v1alpha2 + storage: nfs + nfs: + # Your NFS server IP, like 10.10.10.10 + server: {your_nfs_server_ip} + # Your NFS server export path, like /var/nfs/nni + path: {your_nfs_server_export_path} +``` +If you use Azure storage, you should set `kubeflowConfig` in your config yaml file as follows: +``` +kubeflowConfig: + operator: tf-operator + apiVersion: v1alpha2 + storage: azureStorage + keyVault: + vaultName: {your_vault_name} + name: {your_secert_name} + azureStorage: + accountName: {your_storage_account_name} + azureShare: {your_azure_share_name} +``` + + ## Run an experiment -Use `examples/trials/mnist` as an example. The nni config yaml file's content is like: +Use `examples/trials/mnist` as an example. This is a tensorflow job, and use tf-operator of kubeflow.
The nni config yaml file's content is like: ``` -authorName: your_name +authorName: default experimentName: example_mnist -# how many trials could be concurrently running -trialConcurrency: 4 -# maximum experiment running duration -maxExecDuration: 3h -# empty means never stop -maxTrialNum: 100 -# choice: local, remote, pai, kubeflow +trialConcurrency: 2 +maxExecDuration: 1h +maxTrialNum: 20 +#choice: local, remote, pai, kubeflow trainingServicePlatform: kubeflow -# choice: true, false +searchSpacePath: search_space.json +#choice: true, false useAnnotation: false tuner: + #choice: TPE, Random, Anneal, Evolution builtinTunerName: TPE classArgs: #choice: maximize, minimize optimize_mode: maximize +assessor: + builtinAssessorName: Medianstop + classArgs: + optimize_mode: maximize + gpuNum: 0 trial: - codeDir: ~/nni/examples/trials/mnist - ps: - replicas: 1 - command: python mnist-keras.py - gpuNum: 0 + codeDir: . + worker: + replicas: 2 + command: python3 dist_mnist.py + gpuNum: 1 cpuNum: 1 memoryMB: 8196 - image: {your_docker_image_for_tensorflow_ps} - worker: - replicas: 1 - command: python mnist-keras.py - gpuNum: 2 + image: msranni/nni:latest + ps: + replicas: 1 + command: python3 dist_mnist.py + gpuNum: 0 cpuNum: 1 memoryMB: 8196 - image: {your_docker_image_for_tensorflow_worker} + image: msranni/nni:latest kubeflowConfig: operator: tf-operator + apiVersion: v1alpha2 storage: nfs nfs: - server: {your_nfs_server} - path: {your_nfs_server_exported_path} + # Your NFS server IP, like 10.10.10.10 + server: {your_nfs_server_ip} + # Your NFS server export path, like /var/nfs/nni + path: {your_nfs_server_export_path} ``` -If you use Azure Kubernetes Service, you should set `kubeflowConfig` in your config yaml file as follows: + +Note: You should explicitly set `trainingServicePlatform: kubeflow` in nni config yaml file if you want to start experiment in kubeflow mode. 
+ +If you want to run PyTorch jobs, you could set your config file as follows: ``` +authorName: default +experimentName: example_mnist_distributed_pytorch +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +#choice: local, remote, pai, kubeflow +trainingServicePlatform: kubeflow +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: minimize +trial: + codeDir: . + master: + replicas: 1 + command: python3 dist_mnist.py + gpuNum: 1 + cpuNum: 1 + memoryMB: 2048 + image: msranni/nni:latest + worker: + replicas: 1 + command: python3 dist_mnist.py + gpuNum: 0 + cpuNum: 1 + memoryMB: 2048 + image: msranni/nni:latest kubeflowConfig: - operator: tf-operator - storage: azureStorage - keyVault: - vaultName: {your_vault_name} - name: {your_secert_name} - azureStorage: - accountName: {your_storage_account_name} - azureShare: {your_azure_share_name} + operator: pytorch-operator + apiVersion: v1alpha2 + nfs: + # Your NFS server IP, like 10.10.10.10 + server: {your_nfs_server_ip} + # Your NFS server export path, like /var/nfs/nni + path: {your_nfs_server_export_path} ``` -Note: You should explicitly set `trainingServicePlatform: kubeflow` in nni config yaml file if you want to start experiment in kubeflow mode. - Trial configuration in kubeflow mode have the following configuration keys: * codeDir * code directory, where you put training code and config files @@ -101,13 +177,16 @@ Trial configuration in kubeflow mode have the following configuration keys: * image * Required key. In kubeflow mode, your trial program will be scheduled by Kubernetes to run in [Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/). This key is used to specify the Docker image used to create the pod where your trail program will run.
* We already build a docker image [msranni/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](../deployment/Dockerfile.build.base). You can either use this image directly in your config file, or build your own image based on it. + * apiVersion + * Required key. The API version of your Kubeflow operator, set under `kubeflowConfig` (for example, `v1alpha2`). * ps (optional). This config section is used to configure tensorflow parameter server role. +* master (optional). This config section is used to configure the PyTorch master role. Once complete to fill nni experiment config file and save (for example, save as exp_kubeflow.yaml), then run the following command ``` nnictl create --config exp_kubeflow.yaml ``` -to start the experiment in kubeflow mode. NNI will create Kubeflow tfjob for each trial, and the job name format is something like `nni_exp_{experiment_id}_trial_{trial_id}`. +to start the experiment in kubeflow mode. NNI will create Kubeflow tfjob or pytorchjob for each trial, and the job name format is something like `nni_exp_{experiment_id}_trial_{trial_id}`. You can see the kubeflow tfjob created by NNI in your Kubernetes dashboard. Notice: In kubeflow mode, NNIManager will start a rest server and listen on a port which is your NNI WebUI's port plus 1. For example, if your WebUI port is `8080`, the rest server will listen on `8081`, to receive metrics from trial job running in Kubernetes. So you should `enable 8081` TCP port in your firewall rule to allow incoming traffic.
From a03a1912048d73af61868305012d5c39e503e957 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 7 Jan 2019 14:44:28 +0800 Subject: [PATCH 12/16] update --- docs/KubeflowMode.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/KubeflowMode.md b/docs/KubeflowMode.md index 12664ced34..2beb69b59a 100644 --- a/docs/KubeflowMode.md +++ b/docs/KubeflowMode.md @@ -31,8 +31,15 @@ For each trial, we will upload all the files in your local codeDir path (configu ## Supported operator NNI only support tf-operator and pytorch-operator of kubeflow, other operators is not tested. Users could set operator type in config file. +The setting of tf-operator: ``` -operator: tf-operator +kubeflowConfig: + operator: tf-operator +``` +The setting of pytorch-operator: +``` +kubeflowConfig: + operator: pytorch-operator ``` If users want to use tf-operator, he could set `ps` and `worker` in trial config. If users want to use pytorch-operator, he could set `master` and `worker` in trial config. 
@@ -41,8 +48,6 @@ NNI support NFS and Azure Storage to store the code and output files, users coul The setting for NFS storage are as follows: ``` kubeflowConfig: - operator: tf-operator - apiVersion: v1alpha2 storage: nfs nfs: # Your NFS server IP, like 10.10.10.10 @@ -53,8 +58,6 @@ kubeflowConfig: If you use Azure storage, you should set `kubeflowConfig` in your config yaml file as follows: ``` kubeflowConfig: - operator: tf-operator - apiVersion: v1alpha2 storage: azureStorage keyVault: vaultName: {your_vault_name} From 7c7832cef0579539479460d07e1282e898489868 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 7 Jan 2019 15:17:22 +0800 Subject: [PATCH 13/16] update --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1d56023093..58b034fc78 100644 --- a/README.md +++ b/README.md @@ -83,12 +83,11 @@ You can use these commands to get more information about the experiment commands description 1. nnictl experiment show show the information of experiments 2. nnictl trial ls list all of trial jobs -3. nnictl top monitor the status of running experiments -4. nnictl log stderr show stderr log content -5. nnictl log stdout show stdout log content -6. nnictl stop stop an experiment -7. nnictl trial kill kill a trial job by id -8. nnictl --help get help information about nnictl +3. nnictl log stderr show stderr log content +4. nnictl log stdout show stdout log content +5. nnictl stop stop an experiment +6. nnictl trial kill kill a trial job by id +7. 
nnictl --help get help information about nnictl ----------------------------------------------------------------------- ``` From 85c015dc71a60879440b1a64dfa2e9f8b395f75e Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 8 Jan 2019 15:30:37 +0800 Subject: [PATCH 14/16] remove 'any more' --- docs/FrameworkControllerMode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/FrameworkControllerMode.md b/docs/FrameworkControllerMode.md index 805b0d0b50..c54c33756b 100644 --- a/docs/FrameworkControllerMode.md +++ b/docs/FrameworkControllerMode.md @@ -1,6 +1,6 @@ **Run an Experiment on FrameworkController** === -NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, you don't need to install kubeflow for specific deeplearning framework like tf-operator or pytorch-operator any more. Now you can use frameworkcontroller as the training service to run NNI experiment. +NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, you don't need to install kubeflow for specific deeplearning framework like tf-operator or pytorch-operator. Now you can use frameworkcontroller as the training service to run NNI experiment. ## Prerequisite for on-premises Kubernetes Service 1. A **Kubernetes** cluster using Kubernetes 1.8 or later. 
Follow this [guideline](https://kubernetes.io/docs/setup/) to set up Kubernetes From f46f8a5bc07bd01ebddfbf32d8571bab450f4db1 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 21 Jun 2019 14:28:13 +0800 Subject: [PATCH 15/16] init --- .../training_service/local/localTrainingService.ts | 7 ++++--- src/nni_manager/types/tail-stream/index.d.ts | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index 31a90db695..2805b0f5b2 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -355,7 +355,8 @@ class LocalTrainingService implements TrainingService { this.log.info('Stopping local machine training service...'); this.stopping = true; for (const stream of this.jobStreamMap.values()) { - stream.destroy(); + stream.end(0) + stream.emit('end') } if (this.gpuScheduler !== undefined) { await this.gpuScheduler.stop(); @@ -372,7 +373,8 @@ class LocalTrainingService implements TrainingService { if (stream === undefined) { throw new Error(`Could not find stream in trial ${trialJob.id}`); } - stream.destroy(); + stream.end(0) + stream.emit('end') this.jobStreamMap.delete(trialJob.id); } } @@ -567,7 +569,6 @@ class LocalTrainingService implements TrainingService { buffer = remain; } }); - this.jobStreamMap.set(trialJobDetail.id, stream); } diff --git a/src/nni_manager/types/tail-stream/index.d.ts b/src/nni_manager/types/tail-stream/index.d.ts index 7ca08cb1bf..f7f3abb376 100644 --- a/src/nni_manager/types/tail-stream/index.d.ts +++ b/src/nni_manager/types/tail-stream/index.d.ts @@ -1,7 +1,8 @@ declare module 'tail-stream' { export interface Stream { on(type: 'data', callback: (data: Buffer) => void): void; - destroy(): void; + end(data: number): void; + emit(data: string): void; } export function createReadStream(path: string): 
Stream; } \ No newline at end of file From f41cbfa082f38370a817df684205e48ce14a9068 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Jun 2019 14:21:59 +0800 Subject: [PATCH 16/16] add related issue --- src/nni_manager/training_service/local/localTrainingService.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index 2805b0f5b2..d8aff325da 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -373,6 +373,7 @@ class LocalTrainingService implements TrainingService { if (stream === undefined) { throw new Error(`Could not find stream in trial ${trialJob.id}`); } + // Close the tail stream via end() + emit('end') rather than destroy(); see https://github.com/Juul/tail-stream/issues/20 stream.end(0) stream.emit('end') this.jobStreamMap.delete(trialJob.id);