Update ci (#175)

* Update RemoteMachineMode.md (#63) * Remove unused classes for SQuAD QA example. * Remove more unused functions for SQuAD QA example. * Fix default dataset config. * Add Makefile README (#64) * update document (#92) * Edit readme.md * updated a word * Update GetStarted.md * Update GetStarted.md * refact readme, getstarted and write your trial md. * Update README.md * Update WriteYourTrial.md * Update WriteYourTrial.md * Update WriteYourTrial.md * Update WriteYourTrial.md * Fix nnictl bugs and add new feature (#75) * fix nnictl bug * fix nnictl create bug * add experiment status logic * add more information for nnictl * fix Evolution Tuner bug * refactor code * fix code in updater.py * fix nnictl --help * fix classArgs bug * update check response.status_code logic * remove Buffer warning (#100) * update readme in ga_squad * update readme * fix typo * Update README.md * Update README.md * Update README.md * Add support for debugging mode * modify CI cuz of refracting exp stop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop * update CI for expstop
microsoft · Oct 16, 2018 · 0dab726 · 0dab726
1 parent 1392c93
commit 0dab726
Show file tree

Hide file tree

Showing 10 changed files with 130 additions and 62 deletions.
diff --git a/examples/trials/ga_squad/README.md b/examples/trials/ga_squad/README.md
@@ -251,4 +251,4 @@ Every model configuration will has a "layers" section, which is a JSON list of l
  * `input_size` is the number of inputs the layer has.
  * `input` is the indices of layers taken as input of this layer.
  * `output` is the indices of layers use this layer's output as their input.
- * `is_delete` means whether the layer is still available.
+ * `is_delete` means whether the layer is still available.
diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts
@@ -406,6 +406,7 @@ class NNIManager implements Manager {
                     suspendStartTime = Date.now();
                 }
                 this.status.status = 'SUSPENDED';
+                this.log.info('Experiment suspended.');
             } else {
                 if (this.status.status === 'SUSPENDED') {
                     assert(suspendStartTime !== 0);

diff --git a/test/naive/.gitignore b/test/naive/.gitignore
@@ -0,0 +1,5 @@
+__pycache__
+
+tuner_search_space.json
+tuner_result.txt
+assessor_result.txt
diff --git a/test/naive/README.md b/test/naive/README.md
@@ -0,0 +1,19 @@
+## Usage
+To test before installing:
+    ./run.py --preinstall
+To test the integrity of installation:
+    ./run.py
+It will print `PASS` in green eventually if everything works well.
+
+## Details
+* This test case tests the communication between trials and tuner/assessor.
+* The naive trials receive an integer `x` as a parameter and report `x`, `x²`, `x³`, ... , `x¹⁰` as metrics.
+* The naive tuner simply generates the sequence of natural numbers and prints received metrics to `tuner_result.txt`.
+* The naive assessor kills trials when `sum(metrics) % 11 == 1` and prints killed trials to `assessor_result.txt`.
+* When the tuner or the assessor exits with an exception, it appends `ERROR` to the corresponding result file.
+* When the experiment is suspended, meaning it is successfully done in this case, `Experiment suspended` can be detected in the `nnimanager.log` file.
+
+## Issues
+* Private APIs are used to detect whether tuner and assessor have terminated successfully. 
+* The output of REST server is not tested.
+* Remote machine training service is not tested.
diff --git a/test/naive/expected_assessor_result.txt b/test/naive/expected_assessor_result.txt
@@ -4,4 +4,3 @@
 5 3
 7 2
 8 3
-DONE
diff --git a/test/naive/expected_tuner_result.txt b/test/naive/expected_tuner_result.txt
@@ -2,4 +2,3 @@
 6 60466176
 9 3486784401
 10 10000000000
-DONE
diff --git a/test/naive/naive_assessor.py b/test/naive/naive_assessor.py
@@ -1,10 +1,13 @@
 import logging
+import os
 
 from nni.assessor import Assessor, AssessResult
 
 _logger = logging.getLogger('NaiveAssessor')
 _logger.info('start')
-_result = open('/tmp/nni_assessor_result.txt', 'w')
+
+_pwd = os.path.dirname(__file__)
+_result = open(os.path.join(_pwd, 'assessor_result.txt'), 'w')
 
 class NaiveAssessor(Assessor):
     def __init__(self, optimize_mode):
@@ -30,7 +33,6 @@ def assess_trial(self, trial_job_id, trial_history):
         return AssessResult.Good
 
     def _on_exit(self):
-        _result.write('DONE\n')
         _result.close()
 
     def _on_error(self):

diff --git a/test/naive/naive_tuner.py b/test/naive/naive_tuner.py
@@ -1,11 +1,14 @@
 import json
 import logging
+import os
 
 from nni.tuner import Tuner
 
 _logger = logging.getLogger('NaiveTuner')
 _logger.info('start')
-_result = open('/tmp/nni_tuner_result.txt', 'w')
+
+_pwd = os.path.dirname(__file__)
+_result = open(os.path.join(_pwd, 'tuner_result.txt'), 'w')
 
 class NaiveTuner(Tuner):
     def __init__(self, optimize_mode):
@@ -24,11 +27,10 @@ def receive_trial_result(self, parameter_id, parameters, reward):
 
     def update_search_space(self, search_space):
         _logger.info('update_search_space: %s' % search_space)
-        with open('/tmp/nni_tuner_search_space.json', 'w') as file_:
+        with open(os.path.join(_pwd, 'tuner_search_space.json'), 'w') as file_:
             json.dump(search_space, file_)
 
     def _on_exit(self):
-        _result.write('DONE\n')
         _result.close()
 
     def _on_error(self):

diff --git a/test/naive/run.py b/test/naive/run.py
@@ -4,82 +4,121 @@
 import json
 import os
 import subprocess
+import requests
+import sys
 import time
 import traceback
 
 GREEN = '\33[32m'
 RED = '\33[31m'
 CLEAR = '\33[0m'
 
-def read_last_line(file_name):
-    try:
-        *_, last_line = open(file_name)
-        return last_line.strip()
-    except (FileNotFoundError, ValueError):
-        return None
-
-def run():
-    os.environ['PATH'] = os.environ['PATH'] + ':' + os.environ['PWD']
-
-    with contextlib.suppress(FileNotFoundError):
-        os.remove('tuner_search_space.txt')
-    with contextlib.suppress(FileNotFoundError):
-        os.remove('tuner_result.txt')
-    with contextlib.suppress(FileNotFoundError):
-        os.remove('/tmp/nni_assessor_result.txt')
-
-    proc = subprocess.run(['nnictl', 'create', '--config', 'local.yml'])
-    assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode
-
-    print('Spawning trials...')
-    current_trial = 0
-
-    for _ in range(60):
+class Integration_test():
+    def __init__(self):
+        self.experiment_url = 'http://localhost:51188/api/v1/nni/experiment'
+        self.experiment_id = None
+        self.experiment_suspended_signal = '"Experiment suspended"'
+
+    def read_last_line(self, file_name):
+        try:
+            *_, last_line = open(file_name)
+            return last_line.strip()
+        except (FileNotFoundError, ValueError):
+            return None
+
+    def fetch_experiment_config(self):
+        experiment_profile = requests.get(self.experiment_url)
+        self.experiment_id = json.loads(experiment_profile.text)['id']
+        self.experiment_path = os.path.join(os.environ['HOME'], 'nni/experiments', self.experiment_id)
+        self.nnimanager_log_path = os.path.join(self.experiment_path, 'log', 'nnimanager.log')
+
+    def check_experiment_status(self):
+        assert os.path.exists(self.nnimanager_log_path), 'Experiment starts failed'
+        cmds = ['cat', self.nnimanager_log_path, '|', 'grep', self.experiment_suspended_signal]
+        completed_process = subprocess.run(' '.join(cmds), shell = True)
+
+        return completed_process.returncode == 0
+
+    def remove_files(self, file_list):
+        for file_path in file_list:
+            with contextlib.suppress(FileNotFoundError):
+                os.remove(file_path)
+
+    def run(self, installed = True):
+        if not installed:
+            os.environ['PATH'] = os.environ['PATH'] + ':' + os.environ['PWD']
+            sdk_path = os.path.abspath('../../src/sdk/pynni')
+            cmd_path = os.path.abspath('../../tools')
+            pypath = os.environ.get('PYTHONPATH')
+            if pypath:
+                pypath = ':'.join([pypath, sdk_path, cmd_path])
+            else:
+                pypath = ':'.join([sdk_path, cmd_path])
+            os.environ['PYTHONPATH'] = pypath
+
+        to_remove = ['tuner_search_space.json', 'tuner_result.txt', 'assessor_result.txt']
+        self.remove_files(to_remove)
+
+        proc = subprocess.run(['nnictl', 'create', '--config', 'local.yml'])
+        assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode
+
+        print('Spawning trials...')
         time.sleep(1)
+        self.fetch_experiment_config()
+        current_trial = 0
+
+        for _ in range(60):
+            time.sleep(1)
 
-        tuner_status = read_last_line('/tmp/nni_tuner_result.txt')
-        assessor_status = read_last_line('/tmp/nni_assessor_result.txt')
+            tuner_status = self.read_last_line('tuner_result.txt')
+            assessor_status = self.read_last_line('assessor_result.txt')
+            experiment_status = self.check_experiment_status()
 
-        assert tuner_status != 'ERROR', 'Tuner exited with error'
-        assert assessor_status != 'ERROR', 'Assessor exited with error'
+            assert tuner_status != 'ERROR', 'Tuner exited with error'
+            assert assessor_status != 'ERROR', 'Assessor exited with error'
 
-        if tuner_status == 'DONE' and assessor_status == 'DONE':
-            break
+            if experiment_status:
+                break
 
-        if tuner_status is not None:
-            for line in open('/tmp/nni_tuner_result.txt'):
-                if line.strip() in ('DONE', 'ERROR'):
-                    break
-                trial = int(line.split(' ')[0])
-                if trial > current_trial:
-                    current_trial = trial
-                    print('Trial #%d done' % trial)
+            if tuner_status is not None:
+                for line in open('tuner_result.txt'):
+                    if line.strip() == 'ERROR':
+                        break
+                    trial = int(line.split(' ')[0])
+                    if trial > current_trial:
+                        current_trial = trial
+                        print('Trial #%d done' % trial)
 
-    assert tuner_status == 'DONE' and assessor_status == 'DONE', 'Failed to finish in 1 min'
+        assert experiment_status, 'Failed to finish in 1 min'
 
-    ss1 = json.load(open('search_space.json'))
-    ss2 = json.load(open('/tmp/nni_tuner_search_space.json'))
-    assert ss1 == ss2, 'Tuner got wrong search space'
+        ss1 = json.load(open('search_space.json'))
+        ss2 = json.load(open('tuner_search_space.json'))
+        assert ss1 == ss2, 'Tuner got wrong search space'
 
-    tuner_result = set(open('/tmp/nni_tuner_result.txt'))
-    expected = set(open('expected_tuner_result.txt'))
-    # Trials may complete before NNI gets assessor's result,
-    # so it is possible to have more final result than expected
-    assert tuner_result.issuperset(expected), 'Bad tuner result'
+        # Waiting for naive_trial to report_final_result
+        time.sleep(2)
+        tuner_result = set(open('tuner_result.txt'))
+        expected = set(open('expected_tuner_result.txt'))
+        # Trials may complete before NNI gets assessor's result,
+        # so it is possible to have more final result than expected
+        assert tuner_result.issuperset(expected), 'Bad tuner result'
 
-    assessor_result = set(open('/tmp/nni_assessor_result.txt'))
-    expected = set(open('expected_assessor_result.txt'))
-    assert assessor_result == expected, 'Bad assessor result'
+        assessor_result = set(open('assessor_result.txt'))
+        expected = set(open('expected_assessor_result.txt'))
+        assert assessor_result == expected, 'Bad assessor result'
 
 if __name__ == '__main__':
+    installed = (sys.argv[-1] != '--preinstall')
+
+    ic = Integration_test()
     try:
-        run()
+        ic.run(installed)
         # TODO: check the output of rest server
         print(GREEN + 'PASS' + CLEAR)
     except Exception as error:
         print(RED + 'FAIL' + CLEAR)
         print('%r' % error)
         traceback.print_exc()
-        raise error
-
-    subprocess.run(['nnictl', 'stop', '--port', '51188'])
+        sys.exit(1)
+    finally:
+        subprocess.run(['nnictl', 'stop'])
diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py
@@ -114,6 +114,8 @@ def set_pai_config(experiment_config, port):
     if not response or not response.status_code == 200:
         if response is not None:
             err_message = response.text
+            with open(STDERR_FULL_PATH, 'a+') as fout:
+                fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':')))
         return False, err_message
 
     #set trial_config
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,4 +4,3 @@ @@
 3
 2
 3
-    DONE