From 0dab726db73ae8896a15627e9bf1483fe26b015e Mon Sep 17 00:00:00 2001 From: Zejun Lin <871886504@qq.com> Date: Tue, 16 Oct 2018 10:16:52 +0800 Subject: [PATCH] Update ci (#175) * Update RemoteMachineMode.md (#63) * Remove unused classes for SQuAD QA example. * Remove more unused functions for SQuAD QA example. * Fix default dataset config. * Add Makefile README (#64) * update document (#92) * Edit readme.md * updated a word * Update GetStarted.md * Update GetStarted.md * refact readme, getstarted and write your trial md. * Update README.md * Update WriteYourTrial.md * Update WriteYourTrial.md * Update WriteYourTrial.md * Update WriteYourTrial.md * Fix nnictl bugs and add new feature (#75) * fix nnictl bug * fix nnictl create bug * add experiment status logic * add more information for nnictl * fix Evolution Tuner bug * refactor code * fix code in updater.py * fix nnictl --help * fix classArgs bug * update check response.status_code logic * remove Buffer warning (#100) * update readme in ga_squad * update readme * fix typo * Update README.md * Update README.md * Update README.md * Add support for debugging mode * modify CI cuz of refracting exp stop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop --- examples/trials/ga_squad/README.md | 2 +- src/nni_manager/core/nnimanager.ts | 1 + test/naive/.gitignore | 5 + test/naive/README.md | 19 +++ test/naive/expected_assessor_result.txt | 1 - test/naive/expected_tuner_result.txt | 1 - test/naive/naive_assessor.py | 6 +- test/naive/naive_tuner.py | 8 +- test/naive/run.py | 147 +++++++++++++++--------- tools/nnicmd/launcher.py | 2 + 10 files changed, 130 insertions(+), 62 deletions(-) create mode 100644 test/naive/.gitignore create mode 100644 test/naive/README.md diff --git a/examples/trials/ga_squad/README.md b/examples/trials/ga_squad/README.md index 
ab8ba853f7..8bc80e2b38 100644 --- a/examples/trials/ga_squad/README.md +++ b/examples/trials/ga_squad/README.md @@ -251,4 +251,4 @@ Every model configuration will has a "layers" section, which is a JSON list of l * `input_size` is the number of inputs the layer has. * `input` is the indices of layers taken as input of this layer. * `output` is the indices of layers use this layer's output as their input. - * `is_delete` means whether the layer is still available. \ No newline at end of file + * `is_delete` means whether the layer is still available. diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 7061efd29f..06d3959970 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -406,6 +406,7 @@ class NNIManager implements Manager { suspendStartTime = Date.now(); } this.status.status = 'SUSPENDED'; + this.log.info('Experiment suspended.'); } else { if (this.status.status === 'SUSPENDED') { assert(suspendStartTime !== 0); diff --git a/test/naive/.gitignore b/test/naive/.gitignore new file mode 100644 index 0000000000..d082c9bc5a --- /dev/null +++ b/test/naive/.gitignore @@ -0,0 +1,5 @@ +__pycache__ + +tuner_search_space.json +tuner_result.txt +assessor_result.txt \ No newline at end of file diff --git a/test/naive/README.md b/test/naive/README.md new file mode 100644 index 0000000000..04cbb5bca3 --- /dev/null +++ b/test/naive/README.md @@ -0,0 +1,19 @@ +## Usage +To test before installing: + ./run.py --preinstall +To test the integrity of installation: + ./run.py +It will print `PASS` in green eventually if everything works well. + +## Details +* This test case tests the communication between trials and tuner/assessor. +* The naive trials receive an integer `x` as a parameter and report `x`, `x²`, `x³`, ... , `x¹⁰` as metrics. +* The naive tuner simply generates the sequence of natural numbers and prints received metrics to `tuner_result.txt`. 
+* The naive assessor kills trials when `sum(metrics) % 11 == 1`, and prints killed trials to `assessor_result.txt`. +* When the tuner or the assessor exits with an exception, it appends `ERROR` to its corresponding result file. +* When the experiment is suspended, meaning it is successfully done in this case, `Experiment suspended` can be detected in the `nnimanager.log` file. + +## Issues +* Private APIs are used to detect whether tuner and assessor have terminated successfully. +* The output of REST server is not tested. +* Remote machine training service is not tested. \ No newline at end of file diff --git a/test/naive/expected_assessor_result.txt b/test/naive/expected_assessor_result.txt index e78ad44112..3c28700db5 100644 --- a/test/naive/expected_assessor_result.txt +++ b/test/naive/expected_assessor_result.txt @@ -4,4 +4,3 @@ 5 3 7 2 8 3 -DONE diff --git a/test/naive/expected_tuner_result.txt b/test/naive/expected_tuner_result.txt index 1d82ca68d6..a2b43fb2b2 100644 --- a/test/naive/expected_tuner_result.txt +++ b/test/naive/expected_tuner_result.txt @@ -2,4 +2,3 @@ 6 60466176 9 3486784401 10 10000000000 -DONE diff --git a/test/naive/naive_assessor.py b/test/naive/naive_assessor.py index 16c89d0484..4d42df7683 100644 --- a/test/naive/naive_assessor.py +++ b/test/naive/naive_assessor.py @@ -1,10 +1,13 @@ import logging +import os from nni.assessor import Assessor, AssessResult _logger = logging.getLogger('NaiveAssessor') _logger.info('start') -_result = open('/tmp/nni_assessor_result.txt', 'w') + +_pwd = os.path.dirname(__file__) +_result = open(os.path.join(_pwd, 'assessor_result.txt'), 'w') class NaiveAssessor(Assessor): def __init__(self, optimize_mode): @@ -30,7 +33,6 @@ def assess_trial(self, trial_job_id, trial_history): return AssessResult.Good def _on_exit(self): - _result.write('DONE\n') _result.close() def _on_error(self): diff --git a/test/naive/naive_tuner.py b/test/naive/naive_tuner.py index 71750678c0..9ff98d6961 100644 --- a/test/naive/naive_tuner.py 
+++ b/test/naive/naive_tuner.py @@ -1,11 +1,14 @@ import json import logging +import os from nni.tuner import Tuner _logger = logging.getLogger('NaiveTuner') _logger.info('start') -_result = open('/tmp/nni_tuner_result.txt', 'w') + +_pwd = os.path.dirname(__file__) +_result = open(os.path.join(_pwd, 'tuner_result.txt'), 'w') class NaiveTuner(Tuner): def __init__(self, optimize_mode): @@ -24,11 +27,10 @@ def receive_trial_result(self, parameter_id, parameters, reward): def update_search_space(self, search_space): _logger.info('update_search_space: %s' % search_space) - with open('/tmp/nni_tuner_search_space.json', 'w') as file_: + with open(os.path.join(_pwd, 'tuner_search_space.json'), 'w') as file_: json.dump(search_space, file_) def _on_exit(self): - _result.write('DONE\n') _result.close() def _on_error(self): diff --git a/test/naive/run.py b/test/naive/run.py index 0c77023eb6..a3e4019600 100644 --- a/test/naive/run.py +++ b/test/naive/run.py @@ -4,6 +4,8 @@ import json import os import subprocess +import requests +import sys import time import traceback @@ -11,75 +13,112 @@ RED = '\33[31m' CLEAR = '\33[0m' -def read_last_line(file_name): - try: - *_, last_line = open(file_name) - return last_line.strip() - except (FileNotFoundError, ValueError): - return None - -def run(): - os.environ['PATH'] = os.environ['PATH'] + ':' + os.environ['PWD'] - - with contextlib.suppress(FileNotFoundError): - os.remove('tuner_search_space.txt') - with contextlib.suppress(FileNotFoundError): - os.remove('tuner_result.txt') - with contextlib.suppress(FileNotFoundError): - os.remove('/tmp/nni_assessor_result.txt') - - proc = subprocess.run(['nnictl', 'create', '--config', 'local.yml']) - assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode - - print('Spawning trials...') - current_trial = 0 - - for _ in range(60): +class Integration_test(): + def __init__(self): + self.experiment_url = 'http://localhost:51188/api/v1/nni/experiment' + self.experiment_id 
= None + self.experiment_suspended_signal = '"Experiment suspended"' + + def read_last_line(self, file_name): + try: + *_, last_line = open(file_name) + return last_line.strip() + except (FileNotFoundError, ValueError): + return None + + def fetch_experiment_config(self): + experiment_profile = requests.get(self.experiment_url) + self.experiment_id = json.loads(experiment_profile.text)['id'] + self.experiment_path = os.path.join(os.environ['HOME'], 'nni/experiments', self.experiment_id) + self.nnimanager_log_path = os.path.join(self.experiment_path, 'log', 'nnimanager.log') + + def check_experiment_status(self): + assert os.path.exists(self.nnimanager_log_path), 'Experiment starts failed' + cmds = ['cat', self.nnimanager_log_path, '|', 'grep', self.experiment_suspended_signal] + completed_process = subprocess.run(' '.join(cmds), shell = True) + + return completed_process.returncode == 0 + + def remove_files(self, file_list): + for file_path in file_list: + with contextlib.suppress(FileNotFoundError): + os.remove(file_path) + + def run(self, installed = True): + if not installed: + os.environ['PATH'] = os.environ['PATH'] + ':' + os.environ['PWD'] + sdk_path = os.path.abspath('../../src/sdk/pynni') + cmd_path = os.path.abspath('../../tools') + pypath = os.environ.get('PYTHONPATH') + if pypath: + pypath = ':'.join([pypath, sdk_path, cmd_path]) + else: + pypath = ':'.join([sdk_path, cmd_path]) + os.environ['PYTHONPATH'] = pypath + + to_remove = ['tuner_search_space.json', 'tuner_result.txt', 'assessor_result.txt'] + self.remove_files(to_remove) + + proc = subprocess.run(['nnictl', 'create', '--config', 'local.yml']) + assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode + + print('Spawning trials...') time.sleep(1) + self.fetch_experiment_config() + current_trial = 0 + + for _ in range(60): + time.sleep(1) - tuner_status = read_last_line('/tmp/nni_tuner_result.txt') - assessor_status = read_last_line('/tmp/nni_assessor_result.txt') + 
tuner_status = self.read_last_line('tuner_result.txt') + assessor_status = self.read_last_line('assessor_result.txt') + experiment_status = self.check_experiment_status() - assert tuner_status != 'ERROR', 'Tuner exited with error' - assert assessor_status != 'ERROR', 'Assessor exited with error' + assert tuner_status != 'ERROR', 'Tuner exited with error' + assert assessor_status != 'ERROR', 'Assessor exited with error' - if tuner_status == 'DONE' and assessor_status == 'DONE': - break + if experiment_status: + break - if tuner_status is not None: - for line in open('/tmp/nni_tuner_result.txt'): - if line.strip() in ('DONE', 'ERROR'): - break - trial = int(line.split(' ')[0]) - if trial > current_trial: - current_trial = trial - print('Trial #%d done' % trial) + if tuner_status is not None: + for line in open('tuner_result.txt'): + if line.strip() == 'ERROR': + break + trial = int(line.split(' ')[0]) + if trial > current_trial: + current_trial = trial + print('Trial #%d done' % trial) - assert tuner_status == 'DONE' and assessor_status == 'DONE', 'Failed to finish in 1 min' + assert experiment_status, 'Failed to finish in 1 min' - ss1 = json.load(open('search_space.json')) - ss2 = json.load(open('/tmp/nni_tuner_search_space.json')) - assert ss1 == ss2, 'Tuner got wrong search space' + ss1 = json.load(open('search_space.json')) + ss2 = json.load(open('tuner_search_space.json')) + assert ss1 == ss2, 'Tuner got wrong search space' - tuner_result = set(open('/tmp/nni_tuner_result.txt')) - expected = set(open('expected_tuner_result.txt')) - # Trials may complete before NNI gets assessor's result, - # so it is possible to have more final result than expected - assert tuner_result.issuperset(expected), 'Bad tuner result' + # Waiting for naive_trial to report_final_result + time.sleep(2) + tuner_result = set(open('tuner_result.txt')) + expected = set(open('expected_tuner_result.txt')) + # Trials may complete before NNI gets assessor's result, + # so it is possible to have 
more final result than expected + assert tuner_result.issuperset(expected), 'Bad tuner result' - assessor_result = set(open('/tmp/nni_assessor_result.txt')) - expected = set(open('expected_assessor_result.txt')) - assert assessor_result == expected, 'Bad assessor result' + assessor_result = set(open('assessor_result.txt')) + expected = set(open('expected_assessor_result.txt')) + assert assessor_result == expected, 'Bad assessor result' if __name__ == '__main__': + installed = (sys.argv[-1] != '--preinstall') + + ic = Integration_test() try: - run() + ic.run(installed) # TODO: check the output of rest server print(GREEN + 'PASS' + CLEAR) except Exception as error: print(RED + 'FAIL' + CLEAR) print('%r' % error) traceback.print_exc() - raise error - - subprocess.run(['nnictl', 'stop', '--port', '51188']) + sys.exit(1) + finally: + subprocess.run(['nnictl', 'stop']) diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index b223551bea..cdc3ff5889 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -114,6 +114,8 @@ def set_pai_config(experiment_config, port): if not response or not response.status_code == 200: if response is not None: err_message = response.text + with open(STDERR_FULL_PATH, 'a+') as fout: + fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message #set trial_config