Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
nnictl experiment cleanup (#1186)
Browse files Browse the repository at this point in the history
* fix remote bug

* add document

* add document

* update

* update

* update

* update

* fix remote issue

* fix forEach

* update doc according to comments

* update

* update

* update

* remove 'any more'

* init

* update for remote and hdfs

* remove unused code

* fix comments

* fix comments

* fix comments

* fix comments
  • Loading branch information
SparkSnail authored and leckie-chn committed Jun 25, 2019
1 parent c84ba25 commit 8329d30
Show file tree
Hide file tree
Showing 4 changed files with 245 additions and 6 deletions.
47 changes: 47 additions & 0 deletions docs/en_US/Nnictl.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ nnictl support commands:
* [nnictl trial](#trial)
* [nnictl top](#top)
* [nnictl experiment](#experiment)
* [nnictl platform](#platform)
* [nnictl config](#config)
* [nnictl log](#log)
* [nnictl webui](#webui)
Expand Down Expand Up @@ -370,6 +371,26 @@ Debug mode will disable version check function in Trialkeeper.
nnictl experiment list
```
* __nnictl experiment delete__
* Description
Delete one or all experiments, it includes log, result, environment information and cache. It uses to delete useless experiment result, or save disk space.
* Usage
```bash
nnictl experiment delete [OPTIONS]
```
* Options
|Name, shorthand|Required|Default|Description|
|------|------|------ |------|
|id| False| |ID of the experiment|
<a name="export"></a>
* __nnictl experiment export__
Expand Down Expand Up @@ -456,6 +477,32 @@ Debug mode will disable version check function in Trialkeeper.
nnictl experiment import [experiment_id] -f experiment_data.json
```

<a name="platform"></a>
![](https://placehold.it/15/1589F0/000000?text=+) `Manage platform information`

* __nnictl platform clean__

* Description

It uses to clean up disk on a target platform. The provided YAML file includes the information of target platform, and it follows the same schema as the NNI configuration file.

* Note

if the target platform is being used by other users, it may cause unexpected errors to others.

* Usage

```bash
nnictl platform clean [OPTIONS]
```

* Options

|Name, shorthand|Required|Default|Description|
|------|------|------ |------|
|--config| True| |the path of yaml config file used when create an experiment|


<a name="config"></a>
![](https://placehold.it/15/1589F0/000000?text=+) `nnictl config show`

Expand Down
13 changes: 13 additions & 0 deletions tools/nni_cmd/nnictl.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,19 @@ def parse_args():
parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids')
parser_experiment_list.add_argument('all', nargs='?', help='list all of experiments')
parser_experiment_list.set_defaults(func=experiment_list)
parser_experiment_clean = parser_experiment_subparsers.add_parser('delete', help='clean up the experiment data')
parser_experiment_clean.add_argument('id', nargs='?', help='the id of experiment')
parser_experiment_clean.add_argument('--all', action='store_true', default=False, help='delete all of experiments')
parser_experiment_clean.set_defaults(func=experiment_clean)

#parse experiment command
parser_platform = subparsers.add_parser('platform', help='get platform information')
#add subparsers for parser_experiment
parser_platform_subparsers = parser_platform.add_subparsers()
parser_platform_clean = parser_platform_subparsers.add_parser('clean', help='clean up the platform data')
parser_platform_clean.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file')
parser_platform_clean.set_defaults(func=platform_clean)

#import tuning data
parser_import_data = parser_experiment_subparsers.add_parser('import', help='import additional data')
parser_import_data.add_argument('id', nargs='?', help='the id of experiment')
Expand Down
177 changes: 171 additions & 6 deletions tools/nni_cmd/nnictl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,20 @@
import json
import datetime
import time
import re
from pathlib import Path
from pyhdfs import HdfsClient, HdfsFileNotFoundException
import shutil
from subprocess import call, check_output
from nni_annotation import expand_annotations
from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response
from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_data_url
from .config_utils import Config, Experiments
from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \
EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT
from .common_utils import print_normal, print_error, print_warning, detect_process
from .common_utils import print_normal, print_error, print_warning, detect_process, get_yml_content
from .command_utils import check_output_command, kill_command
from .ssh_utils import create_ssh_sftp_client, remove_remote_directory

def get_experiment_time(port):
'''get the startTime and endTime of an experiment'''
Expand Down Expand Up @@ -73,10 +78,11 @@ def update_experiment():
if status:
experiment_config.update_experiment(key, 'status', status)

def check_experiment_id(args):
def check_experiment_id(args, update=True):
'''check if the id is valid
'''
update_experiment()
if update:
update_experiment()
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
if not experiment_dict:
Expand Down Expand Up @@ -170,7 +176,7 @@ def get_config_filename(args):
'''get the file name of config file'''
experiment_id = check_experiment_id(args)
if experiment_id is None:
print_error('Please set the experiment id!')
print_error('Please set correct experiment id!')
exit(1)
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
Expand All @@ -180,7 +186,7 @@ def get_experiment_port(args):
'''get the port of experiment'''
experiment_id = check_experiment_id(args)
if experiment_id is None:
print_error('Please set the experiment id!')
print_error('Please set correct experiment id!')
exit(1)
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
Expand Down Expand Up @@ -373,6 +379,166 @@ def webui_url(args):
nni_config = Config(get_config_filename(args))
print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl'))))

def local_clean(directory):
'''clean up local data'''
print_normal('removing folder {0}'.format(directory))
try:
shutil.rmtree(directory)
except FileNotFoundError as err:
print_error('{0} does not exist!'.format(directory))

def remote_clean(machine_list, experiment_id=None):
'''clean up remote data'''
for machine in machine_list:
passwd = machine.get('passwd')
userName = machine.get('username')
host = machine.get('ip')
port = machine.get('port')
if experiment_id:
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments', experiment_id])
else:
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments'])
sftp = create_ssh_sftp_client(host, port, userName, passwd)
print_normal('removing folder {0}'.format(host + ':' + str(port) + remote_dir))
remove_remote_directory(sftp, remote_dir)

def hdfs_clean(host, user_name, output_dir, experiment_id=None):
'''clean up hdfs data'''
hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name, webhdfs_path='/webhdfs/api/v1', timeout=5)
if experiment_id:
full_path = '/' + '/'.join([user_name, 'nni', 'experiments', experiment_id])
else:
full_path = '/' + '/'.join([user_name, 'nni', 'experiments'])
print_normal('removing folder {0} in hdfs'.format(full_path))
hdfs_client.delete(full_path, recursive=True)
if output_dir:
pattern = re.compile('hdfs://(?P<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?P<baseDir>/.*)?')
match_result = pattern.match(output_dir)
if match_result:
output_host = match_result.group('host')
output_dir = match_result.group('baseDir')
#check if the host is valid
if output_host != host:
print_warning('The host in {0} is not consistent with {1}'.format(output_dir, host))
else:
if experiment_id:
output_dir = output_dir + '/' + experiment_id
print_normal('removing folder {0} in hdfs'.format(output_dir))
hdfs_client.delete(output_dir, recursive=True)

def experiment_clean(args):
'''clean up the experiment data'''
experiment_id_list = []
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
if args.all:
experiment_id_list = list(experiment_dict.keys())
else:
if args.id is None:
print_error('please set experiment id!')
exit(1)
if args.id not in experiment_dict:
print_error('can not find id {0}!'.format(args.id))
exit(1)
experiment_id_list.append(args.id)
while True:
print('INFO: This action will delete experiment {0}, and it’s not recoverable.'.format(' '.join(experiment_id_list)))
inputs = input('INFO: do you want to continue?[y/N]:')
if not inputs.lower() or inputs.lower() in ['n', 'no']:
exit(0)
elif inputs.lower() not in ['y', 'n', 'yes', 'no']:
print_warning('please input Y or N!')
else:
break
for experiment_id in experiment_id_list:
nni_config = Config(experiment_dict[experiment_id]['fileName'])
platform = nni_config.get_config('experimentConfig').get('trainingServicePlatform')
experiment_id = nni_config.get_config('experimentId')
if platform == 'remote':
machine_list = nni_config.get_config('experimentConfig').get('machineList')
remote_clean(machine_list, experiment_id)
elif platform == 'pai':
host = nni_config.get_config('experimentConfig').get('paiConfig').get('host')
user_name = nni_config.get_config('experimentConfig').get('paiConfig').get('userName')
output_dir = nni_config.get_config('experimentConfig').get('trial').get('outputDir')
hdfs_clean(host, user_name, output_dir, experiment_id)
elif platform != 'local':
#TODO: support all platforms
print_warning('platform {0} clean up not supported yet!'.format(platform))
exit(0)
#clean local data
home = str(Path.home())
local_dir = nni_config.get_config('experimentConfig').get('logDir')
if not local_dir:
local_dir = os.path.join(home, 'nni', 'experiments', experiment_id)
local_clean(local_dir)
experiment_config = Experiments()
print_normal('removing metadata of experiment {0}'.format(experiment_id))
experiment_config.remove_experiment(experiment_id)
print_normal('Finish!')

def get_platform_dir(config_content):
'''get the dir list to be deleted'''
platform = config_content.get('trainingServicePlatform')
dir_list = []
if platform == 'remote':
machine_list = config_content.get('machineList')
for machine in machine_list:
host = machine.get('ip')
port = machine.get('port')
dir_list.append(host + ':' + str(port) + '/tmp/nni/experiments')
elif platform == 'pai':
pai_config = config_content.get('paiConfig')
host = config_content.get('paiConfig').get('host')
user_name = config_content.get('paiConfig').get('userName')
output_dir = config_content.get('trial').get('outputDir')
dir_list.append('hdfs://{0}:9000/{1}/nni/experiments'.format(host, user_name))
if output_dir:
dir_list.append(output_dir)
return dir_list

def platform_clean(args):
'''clean up the experiment data'''
config_path = os.path.abspath(args.config)
if not os.path.exists(config_path):
print_error('Please set correct config path!')
exit(1)
config_content = get_yml_content(config_path)
platform = config_content.get('trainingServicePlatform')
if platform not in ['remote', 'pai']:
print_normal('platform {0} not supported!'.format(platform))
exit(0)
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
update_experiment()
id_list = list(experiment_dict.keys())
dir_list = get_platform_dir(config_content)
if not dir_list:
print_normal('No folder of NNI caches is found!')
exit(1)
while True:
print_normal('This command will remove below folders of NNI caches. If other users are using experiments on below hosts, it will be broken.')
for dir in dir_list:
print(' ' + dir)
inputs = input('INFO: do you want to continue?[y/N]:')
if not inputs.lower() or inputs.lower() in ['n', 'no']:
exit(0)
elif inputs.lower() not in ['y', 'n', 'yes', 'no']:
print_warning('please input Y or N!')
else:
break
if platform == 'remote':
machine_list = config_content.get('machineList')
for machine in machine_list:
remote_clean(machine_list, None)
elif platform == 'pai':
pai_config = config_content.get('paiConfig')
host = config_content.get('paiConfig').get('host')
user_name = config_content.get('paiConfig').get('userName')
output_dir = config_content.get('trial').get('outputDir')
hdfs_clean(host, user_name, output_dir, None)
print_normal('Done!')

def experiment_list(args):
'''get the information of all experiments'''
experiment_config = Experiments()
Expand All @@ -393,7 +559,6 @@ def experiment_list(args):
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!')
experiment_information = ""
for key in experiment_id_list:

experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
Expand Down
14 changes: 14 additions & 0 deletions tools/nni_cmd/ssh_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,17 @@ def create_ssh_sftp_client(host_ip, port, username, password):
return sftp
except Exception as exception:
print_error('Create ssh client error %s\n' % exception)

def remove_remote_directory(sftp, directory):
'''remove a directory in remote machine'''
try:
files = sftp.listdir(directory)
for file in files:
filepath = '/'.join([directory, file])
try:
sftp.remove(filepath)
except IOError:
remove_remote_directory(sftp, filepath)
sftp.rmdir(directory)
except IOError as err:
print_error(err)

0 comments on commit 8329d30

Please sign in to comment.