-
Notifications
You must be signed in to change notification settings - Fork 1.8k
nnictl experiment cleanup #1186
Changes from 90 commits
d77a99c
695d866
b7e9799
7cb03f9
44d1565
7ab7386
d9e1ea8
2c225a8
be23f55
6f760ab
9161209
e661c55
4e5d836
f80e737
aefc219
4fec2cc
dc45661
11fec6f
a03a191
7c7832c
2c862dc
85c015d
85cb472
3784355
d91c980
9786650
ef176d2
1089e80
627e823
b633c26
035d58b
cd549df
964743a
8422992
40391ec
1d84526
1852457
754a354
1ee9735
9f4485c
b1c3774
5d7923e
281f3dc
2ce9157
571a7af
f09d51a
41a9a59
21165b5
d25f7b5
17e719e
e25ffbd
5e777d2
6ff24a5
ccf6c04
eb5e21c
f796c60
e1ae623
ec41d56
080ae00
f0a2d39
77526d3
d95c351
346d49d
6af4b86
cf5336d
aec4977
b1dfaff
6c9360a
0663218
5187b2c
5032694
c577553
93d6502
b5eab4b
f39d69e
a030505
c7ca451
40bae6e
c5acd8c
bee8f84
e1a4a80
8a9b2cb
cbf88f7
0235102
9352cc8
da1a9b8
aa561c4
d48ad02
acb23a5
61192da
a97f5a3
53fd68b
01f375f
dc42087
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,15 +24,20 @@ | |
import json | ||
import datetime | ||
import time | ||
import re | ||
from pathlib import Path | ||
from pyhdfs import HdfsClient, HdfsFileNotFoundException | ||
import shutil | ||
from subprocess import call, check_output | ||
from nni_annotation import expand_annotations | ||
from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response | ||
from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_data_url | ||
from .config_utils import Config, Experiments | ||
from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \ | ||
EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT | ||
from .common_utils import print_normal, print_error, print_warning, detect_process | ||
from .common_utils import print_normal, print_error, print_warning, detect_process, get_yml_content | ||
from .command_utils import check_output_command, kill_command | ||
from .ssh_utils import create_ssh_sftp_client, remote_remove_directory | ||
|
||
def get_experiment_time(port): | ||
'''get the startTime and endTime of an experiment''' | ||
|
@@ -73,10 +78,11 @@ def update_experiment(): | |
if status: | ||
experiment_config.update_experiment(key, 'status', status) | ||
|
||
def check_experiment_id(args): | ||
def check_experiment_id(args, update=True): | ||
'''check if the id is valid | ||
''' | ||
update_experiment() | ||
if update: | ||
update_experiment() | ||
experiment_config = Experiments() | ||
experiment_dict = experiment_config.get_all_experiments() | ||
if not experiment_dict: | ||
|
@@ -170,7 +176,7 @@ def get_config_filename(args): | |
'''get the file name of config file''' | ||
experiment_id = check_experiment_id(args) | ||
if experiment_id is None: | ||
print_error('Please set the experiment id!') | ||
print_error('Please set correct experiment id!') | ||
exit(1) | ||
experiment_config = Experiments() | ||
experiment_dict = experiment_config.get_all_experiments() | ||
|
@@ -180,7 +186,7 @@ def get_experiment_port(args): | |
'''get the port of experiment''' | ||
experiment_id = check_experiment_id(args) | ||
if experiment_id is None: | ||
print_error('Please set the experiment id!') | ||
print_error('Please set correct experiment id!') | ||
exit(1) | ||
experiment_config = Experiments() | ||
experiment_dict = experiment_config.get_all_experiments() | ||
|
@@ -373,6 +379,146 @@ def webui_url(args): | |
nni_config = Config(get_config_filename(args)) | ||
print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl')))) | ||
|
||
def local_clean(directory): | ||
SparkSnail marked this conversation as resolved.
Show resolved
Hide resolved
|
||
'''clean up local data''' | ||
print_normal('cleaning up {0}'.format(directory)) | ||
try: | ||
shutil.rmtree(directory) | ||
except FileNotFoundError as err: | ||
print_error('{0} does not exist!'.format(directory)) | ||
|
||
def remote_clean(nni_config): | ||
'''clean up remote data''' | ||
machine_list = nni_config.get_config('experimentConfig').get('machineList') | ||
for machine in machine_list: | ||
passwd = machine.get('passwd') | ||
userName = machine.get('username') | ||
host = machine.get('ip') | ||
port = machine.get('port') | ||
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments', nni_config.get_config('experimentId')]) | ||
sftp = create_ssh_sftp_client(host, port, userName, passwd) | ||
print_normal('cleaning up {0}'.format(host + ':' + str(port) + remote_dir)) | ||
remote_remove_directory(sftp, remote_dir) | ||
|
||
def hdfs_clean(nni_config): | ||
'''clean up hdfs data''' | ||
host = nni_config.get_config('experimentConfig').get('paiConfig').get('host') | ||
user_name = nni_config.get_config('experimentConfig').get('paiConfig').get('userName') | ||
hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name, webhdfs_path='/webhdfs/api/v1', timeout=5) | ||
full_path = '/' + '/'.join([user_name, 'nni', 'experiments', nni_config.get_config('experimentId')]) | ||
print_normal('deleting {0} in hdfs'.format(full_path)) | ||
hdfs_client.delete(full_path, recursive=True) | ||
output_dir = nni_config.get_config('experimentConfig').get('paiConfig').get('outputDir') | ||
if output_dir: | ||
pattern = re.compile('hdfs://(?P<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?P<baseDir>/.*)?') | ||
match_result = pattern.match(output_dir) | ||
if match_result: | ||
output_host = match_result.group('host') | ||
output_directory = match_result.group('baseDir') | ||
if output_host == host: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you need to give a warning when output_host and host do not match? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed; a warning is now given in the code. |
||
print_normal('deleting {0} in hdfs'.format(output_directory)) | ||
hdfs_client.delete(output_directory, recursive=True) | ||
|
||
def experiment_clean(args): | ||
'''clean up the experiment data''' | ||
nni_config = Config(get_config_filename(args)) | ||
QuanluZhang marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make sure the metadata in .local/nnictl/.experiment and the corresponding folders are also removed. |
||
while True: | ||
print('INFO: clean up all data for experiment {0}'.format(args.id)) | ||
inputs = input('INFO: do you want to continue?[y/n]:') | ||
if not inputs.lower() or inputs.lower() in ['n', 'no']: | ||
print_normal('Exit!') | ||
exit(0) | ||
elif inputs.lower() not in ['y', 'n', 'yes', 'no']: | ||
print_warning('please input Y or N!') | ||
else: | ||
break | ||
#clean local data | ||
home = str(Path.home()) | ||
local_dir = os.path.join(home, 'nni', 'experiments', str(args.id)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You assume that the data is under ~/nni/experiments. I remember that users can configure the path where the data/log is stored. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed; logDir is now used first. |
||
local_clean(local_dir) | ||
platform = nni_config.get_config('experimentConfig').get('trainingServicePlatform') | ||
if platform == 'remote': | ||
remote_clean(nni_config) | ||
elif platform == 'pai': | ||
hdfs_clean(nni_config) | ||
elif platform != 'local': | ||
#TODO: support all platforms | ||
print_warning('platform {0} clean up not supported yet!'.format(platform)) | ||
exit(0) | ||
experiment_config = Experiments() | ||
experiment_config.remove_experiment(args.id) | ||
print_normal('Success!') | ||
|
||
def get_platform_dir(experiment_dict, platform): | ||
'''get directories in platform''' | ||
dir_list = [] | ||
for key in experiment_dict.keys(): | ||
if platform != experiment_dict[key]['platform']: | ||
continue | ||
file_name = experiment_dict[key].get('fileName') | ||
nni_config = Config(file_name) | ||
if platform == 'remote': | ||
machine_list = nni_config.get_config('experimentConfig').get('machineList') | ||
for machine in machine_list: | ||
host = machine.get('ip') | ||
port = machine.get('port') | ||
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments', nni_config.get_config('experimentId')]) | ||
dir_list.append(host + ':' + str(port) + remote_dir) | ||
elif platform == 'pai': | ||
user_name = nni_config.get_config('experimentConfig').get('paiConfig').get('userName') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you need to check whether this PAI config is the same as the command arg config? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, this function gets all of the directories on the related platform; the args config only has one directory. |
||
full_path = '/' + '/'.join([user_name, 'nni', 'experiments', nni_config.get_config('experimentId')]) | ||
dir_list.append(full_path) | ||
output_dir = nni_config.get_config('experimentConfig').get('paiConfig').get('outputDir') | ||
dir_list.append(output_dir) | ||
else: | ||
print_normal('not supported platform!') | ||
exit(1) | ||
return dir_list | ||
|
||
def platform_clean(args): | ||
'''clean up the experiment data''' | ||
config_path = os.path.abspath(args.config) | ||
if not os.path.exists(config_path): | ||
print_error('Please set correct config path!') | ||
exit(1) | ||
experiment_config = get_yml_content(config_path) | ||
platform = experiment_config.get('trainingServicePlatform') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 'trainingServicePlatform'? What is the spec for the config file? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The config file is the same as the YAML file used to create an experiment, like
|
||
experiment_config = Experiments() | ||
experiment_dict = experiment_config.get_all_experiments() | ||
update_experiment() | ||
id_list = list(experiment_dict.keys()) | ||
dir_list = get_platform_dir(experiment_dict, platform) | ||
if not dir_list: | ||
print_normal('No data to be deleted in platform {0}!'.format(platform)) | ||
exit(1) | ||
while True: | ||
print_normal('going to clean up all data in:') | ||
for dir in dir_list: | ||
print(' ' + dir) | ||
inputs = input('INFO: do you want to continue?[y/n]:') | ||
if not inputs.lower() or inputs.lower() in ['n', 'no']: | ||
print_normal('Exit!') | ||
exit(0) | ||
elif inputs.lower() not in ['y', 'n', 'yes', 'no']: | ||
print_warning('please input Y or N!') | ||
else: | ||
break | ||
if platform == 'remote': | ||
for key in id_list: | ||
if experiment_dict[key]['platform'] == 'remote': | ||
file_name = experiment_dict[key]['fileName'] | ||
nni_config = Config(file_name) | ||
remote_clean(nni_config) | ||
experiment_config.remove_experiment(key) | ||
elif platform == 'pai': | ||
for key in id_list: | ||
if experiment_dict[key]['platform'] == 'pai': | ||
file_name = experiment_dict[key]['fileName'] | ||
nni_config = Config(file_name) | ||
hdfs_clean(nni_config) | ||
experiment_config.remove_experiment(key) | ||
print_normal('Success!') | ||
|
||
def experiment_list(args): | ||
'''get the information of all experiments''' | ||
experiment_config = Experiments() | ||
|
@@ -393,7 +539,6 @@ def experiment_list(args): | |
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!') | ||
experiment_information = "" | ||
for key in experiment_id_list: | ||
|
||
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\ | ||
experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) | ||
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,3 +57,17 @@ def create_ssh_sftp_client(host_ip, port, username, password): | |
return sftp | ||
except Exception as exception: | ||
print_error('Create ssh client error %s\n' % exception) | ||
|
||
def remote_remove_directory(sftp, directory): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. remove_remote_directory is better. Verb+noun is the right pattern for function names. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
'''remove a directory in remote machine''' | ||
try: | ||
files = sftp.listdir(directory) | ||
for file in files: | ||
filepath = '/'.join([directory, file]) | ||
try: | ||
sftp.remove(filepath) | ||
except IOError: | ||
remote_remove_directory(sftp, filepath) | ||
sftp.rmdir(directory) | ||
except IOError as err: | ||
print_error(err) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
doc
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed, added doc in Nnictl.md