Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

nnictl experiment cleanup #1186

Merged
merged 94 commits into from
Jun 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
94 commits
Select commit Hold shift + click to select a range
d77a99c
fix remote bug
Dec 25, 2018
695d866
Merge pull request #106 from Microsoft/master
SparkSnail Dec 25, 2018
b7e9799
Merge pull request #107 from Microsoft/master
SparkSnail Dec 27, 2018
7cb03f9
add document
Dec 27, 2018
44d1565
add document
Dec 27, 2018
7ab7386
update
Dec 27, 2018
d9e1ea8
update
Dec 27, 2018
2c225a8
update
Dec 27, 2018
be23f55
update
Dec 29, 2018
6f760ab
Merge pull request #108 from Microsoft/master
SparkSnail Jan 2, 2019
9161209
fix remote issue
Jan 3, 2019
e661c55
fix forEach
Jan 3, 2019
4e5d836
Merge pull request #109 from Microsoft/master
SparkSnail Jan 3, 2019
f80e737
fix conflict
Jan 4, 2019
aefc219
Merge branch 'Microsoft-master'
Jan 4, 2019
4fec2cc
update doc according to comments
Jan 7, 2019
dc45661
Merge pull request #111 from Microsoft/master
SparkSnail Jan 7, 2019
11fec6f
update
Jan 7, 2019
a03a191
update
Jan 7, 2019
7c7832c
update
Jan 7, 2019
2c862dc
Merge pull request #112 from Microsoft/master
SparkSnail Jan 8, 2019
85c015d
remove 'any more'
Jan 8, 2019
85cb472
Merge branch 'master' of https://github.com/SparkSnail/nni
Jan 8, 2019
3784355
Merge pull request #113 from Microsoft/master
SparkSnail Jan 9, 2019
d91c980
Merge pull request #114 from Microsoft/master
SparkSnail Jan 14, 2019
9786650
Merge pull request #115 from Microsoft/master
SparkSnail Jan 17, 2019
ef176d2
Merge pull request #116 from Microsoft/master
SparkSnail Jan 22, 2019
1089e80
Merge pull request #117 from Microsoft/master
SparkSnail Jan 23, 2019
627e823
Merge pull request #119 from Microsoft/master
SparkSnail Jan 24, 2019
b633c26
Merge pull request #120 from Microsoft/master
SparkSnail Jan 25, 2019
035d58b
Merge pull request #121 from Microsoft/master
SparkSnail Feb 11, 2019
cd549df
Merge pull request #122 from Microsoft/master
SparkSnail Feb 12, 2019
964743a
Merge pull request #123 from Microsoft/master
SparkSnail Feb 12, 2019
8422992
Merge pull request #124 from Microsoft/master
SparkSnail Feb 13, 2019
40391ec
Merge pull request #125 from Microsoft/master
SparkSnail Feb 18, 2019
1d84526
Merge pull request #126 from Microsoft/master
SparkSnail Feb 20, 2019
1852457
Merge pull request #127 from Microsoft/master
SparkSnail Feb 23, 2019
754a354
Merge pull request #128 from Microsoft/master
SparkSnail Feb 24, 2019
1ee9735
Merge pull request #129 from Microsoft/master
SparkSnail Feb 25, 2019
9f4485c
Merge pull request #130 from Microsoft/master
SparkSnail Feb 25, 2019
b1c3774
Merge pull request #131 from Microsoft/master
SparkSnail Feb 25, 2019
5d7923e
Merge pull request #132 from Microsoft/master
SparkSnail Feb 25, 2019
281f3dc
Merge pull request #133 from Microsoft/master
SparkSnail Feb 26, 2019
2ce9157
Merge pull request #134 from Microsoft/master
SparkSnail Feb 26, 2019
571a7af
Merge pull request #135 from Microsoft/master
SparkSnail Feb 28, 2019
f09d51a
Merge pull request #136 from Microsoft/master
SparkSnail Mar 1, 2019
41a9a59
Merge pull request #137 from Microsoft/master
SparkSnail Mar 5, 2019
21165b5
Merge pull request #138 from Microsoft/master
SparkSnail Mar 7, 2019
d25f7b5
Merge pull request #139 from Microsoft/master
SparkSnail Mar 11, 2019
17e719e
Merge pull request #140 from Microsoft/master
SparkSnail Mar 12, 2019
e25ffbd
Merge pull request #141 from Microsoft/master
SparkSnail Mar 13, 2019
5e777d2
Merge pull request #142 from Microsoft/master
SparkSnail Mar 14, 2019
6ff24a5
Merge pull request #143 from Microsoft/master
SparkSnail Mar 18, 2019
ccf6c04
Merge pull request #144 from Microsoft/master
SparkSnail Mar 20, 2019
eb5e21c
Merge pull request #145 from Microsoft/master
SparkSnail Mar 20, 2019
f796c60
Merge pull request #146 from Microsoft/master
SparkSnail Mar 21, 2019
e1ae623
Merge pull request #147 from Microsoft/master
SparkSnail Mar 22, 2019
ec41d56
Merge pull request #148 from Microsoft/master
SparkSnail Mar 25, 2019
080ae00
Merge pull request #149 from Microsoft/master
SparkSnail Mar 26, 2019
f0a2d39
Merge pull request #150 from Microsoft/master
SparkSnail Mar 26, 2019
77526d3
Merge pull request #152 from Microsoft/master
SparkSnail Apr 1, 2019
d95c351
Merge pull request #155 from Microsoft/master
SparkSnail Apr 3, 2019
346d49d
Merge pull request #156 from Microsoft/master
SparkSnail Apr 11, 2019
6af4b86
Merge pull request #158 from Microsoft/master
SparkSnail Apr 12, 2019
cf5336d
Merge pull request #159 from Microsoft/master
SparkSnail Apr 15, 2019
aec4977
Merge pull request #160 from Microsoft/master
SparkSnail Apr 16, 2019
b1dfaff
Merge pull request #161 from Microsoft/master
SparkSnail Apr 18, 2019
6c9360a
Merge pull request #162 from Microsoft/master
SparkSnail Apr 19, 2019
0663218
Merge pull request #163 from Microsoft/master
SparkSnail Apr 22, 2019
5187b2c
Merge pull request #164 from Microsoft/master
SparkSnail Apr 22, 2019
5032694
Merge pull request #165 from Microsoft/master
SparkSnail Apr 23, 2019
c577553
Merge pull request #166 from Microsoft/master
SparkSnail May 5, 2019
93d6502
Merge pull request #167 from Microsoft/master
SparkSnail May 6, 2019
b5eab4b
Merge pull request #168 from microsoft/master
SparkSnail May 10, 2019
f39d69e
Merge pull request #169 from microsoft/master
SparkSnail May 14, 2019
a030505
Merge pull request #170 from microsoft/master
SparkSnail May 15, 2019
c7ca451
Merge pull request #171 from microsoft/master
SparkSnail May 17, 2019
40bae6e
Merge pull request #172 from microsoft/master
SparkSnail May 26, 2019
c5acd8c
Merge pull request #173 from microsoft/master
SparkSnail May 27, 2019
bee8f84
Merge pull request #174 from microsoft/master
SparkSnail May 28, 2019
e1a4a80
Merge pull request #175 from microsoft/master
SparkSnail May 30, 2019
8a9b2cb
Merge pull request #177 from microsoft/v0.8
SparkSnail Jun 3, 2019
cbf88f7
Merge pull request #181 from microsoft/master
SparkSnail Jun 5, 2019
0235102
Merge pull request #182 from microsoft/master
SparkSnail Jun 10, 2019
9352cc8
Merge pull request #183 from microsoft/master
SparkSnail Jun 14, 2019
da1a9b8
init
Jun 19, 2019
aa561c4
update for remote and hdfs
Jun 20, 2019
d48ad02
Merge pull request #184 from microsoft/master
SparkSnail Jun 20, 2019
acb23a5
fix conflict
Jun 20, 2019
61192da
remove unused code
Jun 20, 2019
a97f5a3
fix comments
Jun 24, 2019
53fd68b
fix comments
Jun 24, 2019
01f375f
fix comments
Jun 25, 2019
dc42087
fix comments
Jun 25, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions docs/en_US/Nnictl.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ nnictl support commands:
* [nnictl trial](#trial)
* [nnictl top](#top)
* [nnictl experiment](#experiment)
* [nnictl platform](#platform)
* [nnictl config](#config)
* [nnictl log](#log)
* [nnictl webui](#webui)
Expand Down Expand Up @@ -370,6 +371,26 @@ Debug mode will disable version check function in Trialkeeper.
nnictl experiment list
```

* __nnictl experiment delete__

* Description

Delete one or all experiments, it includes log, result, environment information and cache. It uses to delete useless experiment result, or save disk space.

* Usage

```bash
nnictl experiment delete [OPTIONS]
```

* Options

|Name, shorthand|Required|Default|Description|
|------|------|------ |------|
|id| False| |ID of the experiment|
squirrelsc marked this conversation as resolved.
Show resolved Hide resolved



<a name="export"></a>

* __nnictl experiment export__
Expand Down Expand Up @@ -456,6 +477,32 @@ Debug mode will disable version check function in Trialkeeper.
nnictl experiment import [experiment_id] -f experiment_data.json
```

<a name="platform"></a>
![](https://placehold.it/15/1589F0/000000?text=+) `Manage platform information`

* __nnictl platform clean__

* Description

It uses to clean up disk on a target platform. The provided YAML file includes the information of target platform, and it follows the same schema as the NNI configuration file.

* Note

if the target platform is being used by other users, it may cause unexpected errors to others.

* Usage

```bash
nnictl platform clean [OPTIONS]
```

* Options

|Name, shorthand|Required|Default|Description|
|------|------|------ |------|
|--config| True| |the path of yaml config file used when create an experiment|


<a name="config"></a>
![](https://placehold.it/15/1589F0/000000?text=+) `nnictl config show`

Expand Down
13 changes: 13 additions & 0 deletions tools/nni_cmd/nnictl.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,19 @@ def parse_args():
parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids')
parser_experiment_list.add_argument('all', nargs='?', help='list all of experiments')
parser_experiment_list.set_defaults(func=experiment_list)
parser_experiment_clean = parser_experiment_subparsers.add_parser('delete', help='clean up the experiment data')
parser_experiment_clean.add_argument('id', nargs='?', help='the id of experiment')
parser_experiment_clean.add_argument('--all', action='store_true', default=False, help='delete all of experiments')
parser_experiment_clean.set_defaults(func=experiment_clean)

#parse experiment command
parser_platform = subparsers.add_parser('platform', help='get platform information')
#add subparsers for parser_experiment
parser_platform_subparsers = parser_platform.add_subparsers()
parser_platform_clean = parser_platform_subparsers.add_parser('clean', help='clean up the platform data')
parser_platform_clean.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file')
parser_platform_clean.set_defaults(func=platform_clean)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doc

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed, added doc in Nnictl.md


#import tuning data
parser_import_data = parser_experiment_subparsers.add_parser('import', help='import additional data')
parser_import_data.add_argument('id', nargs='?', help='the id of experiment')
Expand Down
177 changes: 171 additions & 6 deletions tools/nni_cmd/nnictl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,20 @@
import json
import datetime
import time
import re
from pathlib import Path
from pyhdfs import HdfsClient, HdfsFileNotFoundException
import shutil
from subprocess import call, check_output
from nni_annotation import expand_annotations
from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response
from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_data_url
from .config_utils import Config, Experiments
from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \
EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT
from .common_utils import print_normal, print_error, print_warning, detect_process
from .common_utils import print_normal, print_error, print_warning, detect_process, get_yml_content
from .command_utils import check_output_command, kill_command
from .ssh_utils import create_ssh_sftp_client, remove_remote_directory

def get_experiment_time(port):
'''get the startTime and endTime of an experiment'''
Expand Down Expand Up @@ -73,10 +78,11 @@ def update_experiment():
if status:
experiment_config.update_experiment(key, 'status', status)

def check_experiment_id(args):
def check_experiment_id(args, update=True):
'''check if the id is valid
'''
update_experiment()
if update:
update_experiment()
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
if not experiment_dict:
Expand Down Expand Up @@ -170,7 +176,7 @@ def get_config_filename(args):
'''get the file name of config file'''
experiment_id = check_experiment_id(args)
if experiment_id is None:
print_error('Please set the experiment id!')
print_error('Please set correct experiment id!')
exit(1)
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
Expand All @@ -180,7 +186,7 @@ def get_experiment_port(args):
'''get the port of experiment'''
experiment_id = check_experiment_id(args)
if experiment_id is None:
print_error('Please set the experiment id!')
print_error('Please set correct experiment id!')
exit(1)
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
Expand Down Expand Up @@ -373,6 +379,166 @@ def webui_url(args):
nni_config = Config(get_config_filename(args))
print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl'))))

def local_clean(directory):
SparkSnail marked this conversation as resolved.
Show resolved Hide resolved
'''clean up local data'''
print_normal('removing folder {0}'.format(directory))
try:
shutil.rmtree(directory)
except FileNotFoundError as err:
print_error('{0} does not exist!'.format(directory))

def remote_clean(machine_list, experiment_id=None):
'''clean up remote data'''
for machine in machine_list:
passwd = machine.get('passwd')
userName = machine.get('username')
host = machine.get('ip')
port = machine.get('port')
if experiment_id:
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments', experiment_id])
else:
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments'])
sftp = create_ssh_sftp_client(host, port, userName, passwd)
print_normal('removing folder {0}'.format(host + ':' + str(port) + remote_dir))
remove_remote_directory(sftp, remote_dir)

def hdfs_clean(host, user_name, output_dir, experiment_id=None):
'''clean up hdfs data'''
hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name, webhdfs_path='/webhdfs/api/v1', timeout=5)
if experiment_id:
full_path = '/' + '/'.join([user_name, 'nni', 'experiments', experiment_id])
else:
full_path = '/' + '/'.join([user_name, 'nni', 'experiments'])
print_normal('removing folder {0} in hdfs'.format(full_path))
hdfs_client.delete(full_path, recursive=True)
if output_dir:
pattern = re.compile('hdfs://(?P<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?P<baseDir>/.*)?')
match_result = pattern.match(output_dir)
if match_result:
output_host = match_result.group('host')
output_dir = match_result.group('baseDir')
#check if the host is valid
if output_host != host:
print_warning('The host in {0} is not consistent with {1}'.format(output_dir, host))
else:
if experiment_id:
output_dir = output_dir + '/' + experiment_id
print_normal('removing folder {0} in hdfs'.format(output_dir))
hdfs_client.delete(output_dir, recursive=True)

def experiment_clean(args):
'''clean up the experiment data'''
experiment_id_list = []
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
if args.all:
experiment_id_list = list(experiment_dict.keys())
else:
if args.id is None:
print_error('please set experiment id!')
exit(1)
if args.id not in experiment_dict:
print_error('can not find id {0}!'.format(args.id))
exit(1)
experiment_id_list.append(args.id)
while True:
print('INFO: This action will delete experiment {0}, and it’s not recoverable.'.format(' '.join(experiment_id_list)))
inputs = input('INFO: do you want to continue?[y/N]:')
if not inputs.lower() or inputs.lower() in ['n', 'no']:
exit(0)
elif inputs.lower() not in ['y', 'n', 'yes', 'no']:
print_warning('please input Y or N!')
else:
break
for experiment_id in experiment_id_list:
nni_config = Config(experiment_dict[experiment_id]['fileName'])
platform = nni_config.get_config('experimentConfig').get('trainingServicePlatform')
experiment_id = nni_config.get_config('experimentId')
if platform == 'remote':
machine_list = nni_config.get_config('experimentConfig').get('machineList')
remote_clean(machine_list, experiment_id)
elif platform == 'pai':
host = nni_config.get_config('experimentConfig').get('paiConfig').get('host')
user_name = nni_config.get_config('experimentConfig').get('paiConfig').get('userName')
output_dir = nni_config.get_config('experimentConfig').get('trial').get('outputDir')
hdfs_clean(host, user_name, output_dir, experiment_id)
elif platform != 'local':
#TODO: support all platforms
print_warning('platform {0} clean up not supported yet!'.format(platform))
exit(0)
#clean local data
home = str(Path.home())
local_dir = nni_config.get_config('experimentConfig').get('logDir')
if not local_dir:
local_dir = os.path.join(home, 'nni', 'experiments', experiment_id)
local_clean(local_dir)
experiment_config = Experiments()
print_normal('removing metadata of experiment {0}'.format(experiment_id))
experiment_config.remove_experiment(experiment_id)
print_normal('Finish!')

def get_platform_dir(config_content):
'''get the dir list to be deleted'''
platform = config_content.get('trainingServicePlatform')
dir_list = []
if platform == 'remote':
machine_list = config_content.get('machineList')
for machine in machine_list:
host = machine.get('ip')
port = machine.get('port')
dir_list.append(host + ':' + str(port) + '/tmp/nni/experiments')
elif platform == 'pai':
pai_config = config_content.get('paiConfig')
host = config_content.get('paiConfig').get('host')
user_name = config_content.get('paiConfig').get('userName')
output_dir = config_content.get('trial').get('outputDir')
dir_list.append('hdfs://{0}:9000/{1}/nni/experiments'.format(host, user_name))
if output_dir:
dir_list.append(output_dir)
return dir_list

def platform_clean(args):
'''clean up the experiment data'''
config_path = os.path.abspath(args.config)
if not os.path.exists(config_path):
print_error('Please set correct config path!')
exit(1)
config_content = get_yml_content(config_path)
platform = config_content.get('trainingServicePlatform')
if platform not in ['remote', 'pai']:
print_normal('platform {0} not supported!'.format(platform))
exit(0)
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
update_experiment()
id_list = list(experiment_dict.keys())
dir_list = get_platform_dir(config_content)
if not dir_list:
print_normal('No folder of NNI caches is found!')
exit(1)
while True:
print_normal('This command will remove below folders of NNI caches. If other users are using experiments on below hosts, it will be broken.')
for dir in dir_list:
print(' ' + dir)
inputs = input('INFO: do you want to continue?[y/N]:')
if not inputs.lower() or inputs.lower() in ['n', 'no']:
exit(0)
elif inputs.lower() not in ['y', 'n', 'yes', 'no']:
print_warning('please input Y or N!')
else:
break
if platform == 'remote':
machine_list = config_content.get('machineList')
for machine in machine_list:
remote_clean(machine_list, None)
elif platform == 'pai':
pai_config = config_content.get('paiConfig')
host = config_content.get('paiConfig').get('host')
user_name = config_content.get('paiConfig').get('userName')
output_dir = config_content.get('trial').get('outputDir')
hdfs_clean(host, user_name, output_dir, None)
print_normal('Done!')

def experiment_list(args):
'''get the information of all experiments'''
experiment_config = Experiments()
Expand All @@ -393,7 +559,6 @@ def experiment_list(args):
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!')
experiment_information = ""
for key in experiment_id_list:

experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
Expand Down
14 changes: 14 additions & 0 deletions tools/nni_cmd/ssh_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,17 @@ def create_ssh_sftp_client(host_ip, port, username, password):
return sftp
except Exception as exception:
print_error('Create ssh client error %s\n' % exception)

def remove_remote_directory(sftp, directory):
'''remove a directory in remote machine'''
try:
files = sftp.listdir(directory)
for file in files:
filepath = '/'.join([directory, file])
try:
sftp.remove(filepath)
except IOError:
remove_remote_directory(sftp, filepath)
sftp.rmdir(directory)
except IOError as err:
print_error(err)