diff --git a/docs/en_US/Nnictl.md b/docs/en_US/Nnictl.md index a0fe0b50c4..e7c6dbdc78 100644 --- a/docs/en_US/Nnictl.md +++ b/docs/en_US/Nnictl.md @@ -15,6 +15,7 @@ nnictl support commands: * [nnictl trial](#trial) * [nnictl top](#top) * [nnictl experiment](#experiment) +* [nnictl platform](#platform) * [nnictl config](#config) * [nnictl log](#log) * [nnictl webui](#webui) @@ -370,6 +371,26 @@ Debug mode will disable version check function in Trialkeeper. nnictl experiment list ``` +* __nnictl experiment delete__ + + * Description + + Delete one or all experiments, including their logs, results, environment information and cache. Use it to remove useless experiment results or to free disk space. + + * Usage + + ```bash + nnictl experiment delete [OPTIONS] + ``` + + * Options + + |Name, shorthand|Required|Default|Description| + |------|------|------ |------| + |id| False| |ID of the experiment| + + + * __nnictl experiment export__ @@ -456,6 +477,32 @@ Debug mode will disable version check function in Trialkeeper. nnictl experiment import [experiment_id] -f experiment_data.json ``` + +![](https://placehold.it/15/1589F0/000000?text=+) `Manage platform information` + +* __nnictl platform clean__ + + * Description + + Use it to clean up disk space on a target platform. The provided YAML file describes the target platform and follows the same schema as the NNI configuration file. + + * Note + + If the target platform is being used by other users, this command may cause unexpected errors for them. 
+ + * Usage + + ```bash + nnictl platform clean [OPTIONS] + ``` + + * Options + + |Name, shorthand|Required|Default|Description| + |------|------|------ |------| + |--config| True| |the path of yaml config file used when creating an experiment| + + ![](https://placehold.it/15/1589F0/000000?text=+) `nnictl config show` diff --git a/tools/nni_cmd/nnictl.py b/tools/nni_cmd/nnictl.py index 286f643ab3..3fc908e1b6 100644 --- a/tools/nni_cmd/nnictl.py +++ b/tools/nni_cmd/nnictl.py @@ -121,6 +121,19 @@ def parse_args(): parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids') parser_experiment_list.add_argument('all', nargs='?', help='list all of experiments') parser_experiment_list.set_defaults(func=experiment_list) + parser_experiment_clean = parser_experiment_subparsers.add_parser('delete', help='clean up the experiment data') + parser_experiment_clean.add_argument('id', nargs='?', help='the id of experiment') + parser_experiment_clean.add_argument('--all', action='store_true', default=False, help='delete all of experiments') + parser_experiment_clean.set_defaults(func=experiment_clean) + + #parse platform command + parser_platform = subparsers.add_parser('platform', help='get platform information') + #add subparsers for parser_platform + parser_platform_subparsers = parser_platform.add_subparsers() + parser_platform_clean = parser_platform_subparsers.add_parser('clean', help='clean up the platform data') + parser_platform_clean.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file') + parser_platform_clean.set_defaults(func=platform_clean) + #import tuning data parser_import_data = parser_experiment_subparsers.add_parser('import', help='import additional data') parser_import_data.add_argument('id', nargs='?', help='the id of experiment') diff --git a/tools/nni_cmd/nnictl_utils.py b/tools/nni_cmd/nnictl_utils.py index 94a3cb9f37..2333fd6c0b 100644 --- 
a/tools/nni_cmd/nnictl_utils.py +++ b/tools/nni_cmd/nnictl_utils.py @@ -24,6 +24,10 @@ import json import datetime import time +import re +from pathlib import Path +from pyhdfs import HdfsClient, HdfsFileNotFoundException +import shutil from subprocess import call, check_output from nni_annotation import expand_annotations from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response @@ -31,8 +35,9 @@ from .config_utils import Config, Experiments from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \ EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT -from .common_utils import print_normal, print_error, print_warning, detect_process +from .common_utils import print_normal, print_error, print_warning, detect_process, get_yml_content from .command_utils import check_output_command, kill_command +from .ssh_utils import create_ssh_sftp_client, remove_remote_directory def get_experiment_time(port): '''get the startTime and endTime of an experiment''' @@ -73,10 +78,11 @@ def update_experiment(): if status: experiment_config.update_experiment(key, 'status', status) -def check_experiment_id(args): +def check_experiment_id(args, update=True): '''check if the id is valid ''' - update_experiment() + if update: + update_experiment() experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() if not experiment_dict: @@ -170,7 +176,7 @@ def get_config_filename(args): '''get the file name of config file''' experiment_id = check_experiment_id(args) if experiment_id is None: - print_error('Please set the experiment id!') + print_error('Please set correct experiment id!') exit(1) experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() @@ -180,7 +186,7 @@ def get_experiment_port(args): '''get the port of experiment''' experiment_id = check_experiment_id(args) if experiment_id is None: - 
# ---- tools/nni_cmd/nnictl_utils.py: clean-up commands ----

def _confirm_or_exit(print_notice):
    '''Ask the user for a y/N confirmation before a destructive action.

    print_notice is a zero-argument callable that prints what is about to be
    removed; it is shown again after every invalid answer. An empty answer,
    'n' or 'no' terminates the process with exit(0); 'y' or 'yes' returns.
    '''
    while True:
        print_notice()
        answer = input('INFO: do you want to continue?[y/N]:').strip().lower()
        if answer in ('', 'n', 'no'):
            exit(0)
        if answer in ('y', 'yes'):
            return
        print_warning('please input Y or N!')

def local_clean(directory):
    '''Remove a local directory tree.

    A missing directory is reported with print_error instead of raising;
    any other error raised by shutil.rmtree propagates to the caller.
    '''
    print_normal('removing folder {0}'.format(directory))
    try:
        shutil.rmtree(directory)
    except FileNotFoundError:
        print_error('{0} does not exist!'.format(directory))

def remote_clean(machine_list, experiment_id=None):
    '''Remove the NNI experiment folder on every machine of machine_list.

    machine_list holds dicts read with the keys 'ip', 'port', 'username' and
    'passwd'. With experiment_id only /tmp/nni/experiments/<experiment_id> is
    removed, otherwise the whole /tmp/nni/experiments folder.
    '''
    remote_dir = '/tmp/nni/experiments'
    if experiment_id:
        remote_dir = remote_dir + '/' + experiment_id
    for machine in machine_list or []:
        host = machine.get('ip')
        port = machine.get('port')
        sftp = create_ssh_sftp_client(host, port, machine.get('username'), machine.get('passwd'))
        if sftp is None:
            # create_ssh_sftp_client prints the connection error itself and then
            # falls through without a return value, so there is nothing to clean with.
            print_warning('skip cleaning {0}:{1}, ssh connection is not available'.format(host, port))
            continue
        print_normal('removing folder {0}'.format('{0}:{1}{2}'.format(host, port, remote_dir)))
        remove_remote_directory(sftp, remote_dir)

def hdfs_clean(host, user_name, output_dir, experiment_id=None):
    '''Remove NNI experiment data stored in hdfs.

    Always removes /<user_name>/nni/experiments[/<experiment_id>]. When
    output_dir is an hdfs://<ipv4>[:port]/<path> URI on the same host, the
    folder <path>[/<experiment_id>] is removed as well.
    '''
    hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name, webhdfs_path='/webhdfs/api/v1', timeout=5)
    full_path = '/{0}/nni/experiments'.format(user_name)
    if experiment_id:
        full_path = full_path + '/' + experiment_id
    print_normal('removing folder {0} in hdfs'.format(full_path))
    hdfs_client.delete(full_path, recursive=True)
    if not output_dir:
        return
    # The group names must match the .group('host') / .group('baseDir') lookups below.
    pattern = re.compile(r'hdfs://(?P<host>([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]{2,5})?(?P<baseDir>/.*)?')
    match_result = pattern.match(output_dir)
    if not match_result:
        return
    #check if the host is valid
    if match_result.group('host') != host:
        print_warning('The host in {0} is not consistent with {1}'.format(output_dir, host))
        return
    # baseDir is an optional group, so it may be None when the URI has no path.
    target_dir = (match_result.group('baseDir') or '').rstrip('/')
    if experiment_id:
        target_dir = target_dir + '/' + experiment_id
    if not target_dir:
        # Never issue a recursive delete on the hdfs root.
        print_warning('skip removing {0}: it does not name a folder below the hdfs root'.format(output_dir))
        return
    print_normal('removing folder {0} in hdfs'.format(target_dir))
    hdfs_client.delete(target_dir, recursive=True)

def experiment_clean(args):
    '''Delete the data of one experiment (args.id) or of all experiments (args.all).

    After a y/N confirmation, the remote or hdfs data is removed for the
    'remote' and 'pai' platforms, then the local log directory and the
    experiment metadata. Experiments on other non-local platforms are
    skipped with a warning so the remaining experiments are still handled.
    '''
    experiment_config = Experiments()
    experiment_dict = experiment_config.get_all_experiments()
    if args.all:
        if not experiment_dict:
            print_normal('There is no experiment to delete!')
            exit(0)
        experiment_id_list = list(experiment_dict)
    else:
        if args.id is None:
            print_error('please set experiment id!')
            exit(1)
        if not experiment_dict or args.id not in experiment_dict:
            print_error('can not find id {0}!'.format(args.id))
            exit(1)
        experiment_id_list = [args.id]

    def print_notice():
        print('INFO: This action will delete experiment {0}, and it’s not recoverable.'.format(' '.join(experiment_id_list)))

    _confirm_or_exit(print_notice)
    for experiment_key in experiment_id_list:
        nni_config = Config(experiment_dict[experiment_key]['fileName'])
        experiment_content = nni_config.get_config('experimentConfig')
        platform = experiment_content.get('trainingServicePlatform')
        # The id recorded in the experiment's own config file is the one used for every path.
        experiment_id = nni_config.get_config('experimentId')
        if platform == 'remote':
            remote_clean(experiment_content.get('machineList'), experiment_id)
        elif platform == 'pai':
            pai_config = experiment_content.get('paiConfig') or {}
            trial_config = experiment_content.get('trial') or {}
            hdfs_clean(pai_config.get('host'), pai_config.get('userName'), trial_config.get('outputDir'), experiment_id)
        elif platform != 'local':
            #TODO: support all platforms
            print_warning('platform {0} clean up not supported yet!'.format(platform))
            # Leave this experiment untouched but keep processing the others.
            continue
        #clean local data
        local_dir = experiment_content.get('logDir')
        if not local_dir:
            local_dir = os.path.join(str(Path.home()), 'nni', 'experiments', experiment_id)
        local_clean(local_dir)
        experiment_config = Experiments()
        print_normal('removing metadata of experiment {0}'.format(experiment_id))
        experiment_config.remove_experiment(experiment_id)
    print_normal('Finish!')

def get_platform_dir(config_content):
    '''Return the list of folders that a platform clean would remove.

    config_content is the parsed experiment config. For 'remote' there is one
    '<ip>:<port>/tmp/nni/experiments' entry per machine; for 'pai' the hdfs
    experiments folder of the user plus trial.outputDir when it is set.
    Any other platform yields an empty list.
    '''
    platform = config_content.get('trainingServicePlatform')
    dir_list = []
    if platform == 'remote':
        for machine in config_content.get('machineList') or []:
            dir_list.append('{0}:{1}/tmp/nni/experiments'.format(machine.get('ip'), machine.get('port')))
    elif platform == 'pai':
        pai_config = config_content.get('paiConfig') or {}
        trial_config = config_content.get('trial') or {}
        dir_list.append('hdfs://{0}:9000/{1}/nni/experiments'.format(pai_config.get('host'), pai_config.get('userName')))
        output_dir = trial_config.get('outputDir')
        if output_dir:
            dir_list.append(output_dir)
    return dir_list

def platform_clean(args):
    '''Remove all NNI experiment folders on the platform described by args.config.

    Only the 'remote' and 'pai' platforms are handled. The folders are listed
    and a y/N confirmation is requested before anything is removed.
    '''
    config_path = os.path.abspath(args.config)
    if not os.path.exists(config_path):
        print_error('Please set correct config path!')
        exit(1)
    config_content = get_yml_content(config_path)
    platform = config_content.get('trainingServicePlatform')
    if platform not in ['remote', 'pai']:
        print_normal('platform {0} not supported!'.format(platform))
        exit(0)
    # Kept from the original flow: refreshes the recorded status of known experiments.
    update_experiment()
    dir_list = get_platform_dir(config_content)
    if not dir_list:
        print_normal('No folder of NNI caches is found!')
        exit(1)

    def print_notice():
        print_normal('This command will remove below folders of NNI caches. If other users are using experiments on below hosts, it will be broken.')
        for folder in dir_list:
            print(' ' + folder)

    _confirm_or_exit(print_notice)
    if platform == 'remote':
        # remote_clean already walks the whole machine list, so call it once.
        remote_clean(config_content.get('machineList'), None)
    elif platform == 'pai':
        pai_config = config_content.get('paiConfig') or {}
        trial_config = config_content.get('trial') or {}
        hdfs_clean(pai_config.get('host'), pai_config.get('userName'), trial_config.get('outputDir'), None)
    print_normal('Done!')

# ---- tools/nni_cmd/ssh_utils.py ----

def remove_remote_directory(sftp, directory):
    '''Recursively remove a directory on a remote machine through sftp.

    Every entry is first removed as a file; when that raises IOError the entry
    is handled as a sub-directory. An IOError on the directory itself
    (listing or rmdir) is reported with print_error and not raised.
    '''
    try:
        for entry in sftp.listdir(directory):
            entry_path = '/'.join([directory, entry])
            try:
                sftp.remove(entry_path)
            except IOError:
                remove_remote_directory(sftp, entry_path)
        sftp.rmdir(directory)
    except IOError as err:
        print_error(err)