From e6234d1002eb529455fafde168f30ee98fe7cc0f Mon Sep 17 00:00:00 2001
From: DeeperMind <1155077043@link.cuhk.edu.hk>
Date: Fri, 1 Mar 2019 20:05:15 -0800
Subject: [PATCH 1/6] support multiple trials
---
examples/nasjob-example-RL.yaml | 2 +
.../SuggestionParam.py | 54 +++----
pkg/suggestion/nasrl_service.py | 145 ++++++++++--------
3 files changed, 115 insertions(+), 86 deletions(-)
diff --git a/examples/nasjob-example-RL.yaml b/examples/nasjob-example-RL.yaml
index 3942aed635e..3e2946436d8 100644
--- a/examples/nasjob-example-RL.yaml
+++ b/examples/nasjob-example-RL.yaml
@@ -150,6 +150,8 @@ spec:
suggestionSpec:
suggestionAlgorithm: "nasrl"
suggestionParameters:
+ - name: "num_trials"
+ value: "1"
- name: "lstm_num_cells"
value: "64"
- name: "lstm_num_layers"
diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py b/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py
index ae9f1d19777..fc75b073062 100644
--- a/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py
+++ b/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py
@@ -1,34 +1,36 @@
def parseSuggestionParam(params_raw):
param_standard = {
- "lstm_num_cells": ['value', int, [1, 'inf']],
- "lstm_num_layers": ['value', int, [1, 'inf']],
- "lstm_keep_prob": ['value', float, [0.0, 1.0]],
- "optimizer": ['categorical', str, ["adam", "momentum", "sgd"]],
- "init_learning_rate": ['value', float, [1e-6, 1.0]],
- "lr_decay_start": ['value', int, [0, 'inf']],
- "lr_decay_every": ['value', int, [1, 'inf']],
- "lr_decay_rate": ['value', float, [0.0, 1.0]],
- "skip-target": ['value', float, [0.0, 1.0]],
- "skip-weight": ['value', float, [0.0, 'inf']],
- "l2_reg": ['value', float, [0.0, 'inf']],
- "entropy_weight": ['value', float, [0.0, 'inf']],
- "baseline_decay": ['value', float, [0.0, 1.0]],
+ "num_trials": ['value', int, [1, 'inf']],
+ "lstm_num_cells": ['value', int, [1, 'inf']],
+ "lstm_num_layers": ['value', int, [1, 'inf']],
+ "lstm_keep_prob": ['value', float, [0.0, 1.0]],
+ "optimizer": ['categorical', str, ["adam", "momentum", "sgd"]],
+ "init_learning_rate": ['value', float, [1e-6, 1.0]],
+ "lr_decay_start": ['value', int, [0, 'inf']],
+ "lr_decay_every": ['value', int, [1, 'inf']],
+ "lr_decay_rate": ['value', float, [0.0, 1.0]],
+ "skip-target": ['value', float, [0.0, 1.0]],
+ "skip-weight": ['value', float, [0.0, 'inf']],
+ "l2_reg": ['value', float, [0.0, 'inf']],
+ "entropy_weight": ['value', float, [0.0, 'inf']],
+ "baseline_decay": ['value', float, [0.0, 1.0]],
}
suggestion_params = {
- "lstm_num_cells": 64,
- "lstm_num_layers": 1,
- "lstm_keep_prob": 1.0,
- "optimizer": "adam",
- "init_learning_rate": 1e-3,
- "lr_decay_start": 0,
- "lr_decay_every": 1000,
- "lr_decay_rate": 0.9,
- "skip-target": 0.4,
- "skip-weight": 0.8,
- "l2_reg": 0,
- "entropy_weight": 1e-4,
- "baseline_decay": 0.9999
+ "num_trials": 1,
+ "lstm_num_cells": 64,
+ "lstm_num_layers": 1,
+ "lstm_keep_prob": 1.0,
+ "optimizer": "adam",
+ "init_learning_rate": 1e-3,
+ "lr_decay_start": 0,
+ "lr_decay_every": 1000,
+ "lr_decay_rate": 0.9,
+ "skip-target": 0.4,
+ "skip-weight": 0.8,
+ "l2_reg": 0,
+ "entropy_weight": 1e-4,
+ "baseline_decay": 0.9999
}
def checktype(param_name, param_value, check_mode, supposed_type, supposed_range=None):
diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py
index d571784ad5e..d447180b103 100644
--- a/pkg/suggestion/nasrl_service.py
+++ b/pkg/suggestion/nasrl_service.py
@@ -10,9 +10,11 @@
import json
import os
+
MANAGER_ADDRESS = "vizier-core"
MANAGER_PORT = 6789
+
class NAS_RL_StudyJob(object):
def __init__(self, request, logger):
self.logger = logger
@@ -20,7 +22,7 @@ def __init__(self, request, logger):
self.param_id = request.param_id
self.study_name = None
self.tf_graph = tf.Graph()
- self.prev_trial_id = None
+ self.prev_trial_ids = list()
self.ctrl_cache_file = "ctrl_cache/{}/{}.ckpt".format(request.study_id, request.study_id)
self.ctrl_step = 0
self.is_first_run = True
@@ -33,11 +35,13 @@ def __init__(self, request, logger):
self.search_space = None
self.opt_direction = None
self.objective_name = None
+ self.num_trials = 1
self.logger.info("-" * 100 + "\nSetting Up Suggestion for StudyJob ID {}\n".format(request.study_id) + "-" * 100)
self._get_study_param()
self._get_suggestion_param()
self._setup_controller()
+ self.num_trials = self.suggestion_config["num_trials"]
self.logger.info("Suggestion for StudyJob {} (ID: {}) has been initialized.\n".format(self.study_name, self.study_id))
def _get_study_param(self):
@@ -188,7 +192,10 @@ def GetSuggestions(self, request, context):
self.logger.info("First time running suggestion for {}. Random architecture will be given.".format(study.study_name))
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
- arc = sess.run(controller_ops["sample_arc"])
+ candidates = list()
+ for _ in range(study.num_trials):
+ candidates.append(sess.run(controller_ops["sample_arc"]))
+
# TODO: will use PVC to store the checkpoint to protect against unexpected suggestion pod restart
saver.save(sess, study.ctrl_cache_file)
@@ -199,90 +206,108 @@ def GetSuggestions(self, request, context):
saver.restore(sess, study.ctrl_cache_file)
valid_acc = ctrl.reward
- result = self.GetEvaluationResult(study)
+ results = self.GetEvaluationResult(study)
+ avg_result = sum(results) / len(results)
+
+ self.logger.info("Evaluation results of previous trials: {}".format(str(results)[1:-1]))
+ self.logger.info("The average is {}".format(avg_result))
# This lstm cell is designed to maximize the metrics
# However, if the user want to minimize the metrics, we can take the negative of the result
if study.opt_direction == api_pb2.MINIMIZE:
- result = -result
+ avg_result = -avg_result
loss, entropy, lr, gn, bl, skip, _ = sess.run(
fetches=run_ops,
- feed_dict={valid_acc: result})
+ feed_dict={valid_acc: avg_result})
self.logger.info("Suggetion updated. LSTM Controller Reward: {}".format(loss))
- arc = sess.run(controller_ops["sample_arc"])
+ candidates = list()
+ for _ in range(study.num_trials):
+ candidates.append(sess.run(controller_ops["sample_arc"]))
saver.save(sess, study.ctrl_cache_file)
-
- arc = arc.tolist()
- organized_arc = [0 for _ in range(study.num_layers)]
- record = 0
- for l in range(study.num_layers):
- organized_arc[l] = arc[record: record + l + 1]
- record += l + 1
-
- nn_config = dict()
- nn_config['num_layers'] = study.num_layers
- nn_config['input_size'] = study.input_size
- nn_config['output_size'] = study.output_size
- nn_config['embedding'] = dict()
- for l in range(study.num_layers):
- opt = organized_arc[l][0]
- nn_config['embedding'][opt] = study.search_space[opt].get_dict()
-
- organized_arc_json = json.dumps(organized_arc)
- nn_config_json = json.dumps(nn_config)
-
- organized_arc_str = str(organized_arc_json).replace('\"', '\'')
- nn_config_str = str(nn_config_json).replace('\"', '\'')
-
- self.logger.info("\nNew Neural Network Architecture (internal representation):")
- self.logger.info(organized_arc_json)
- self.logger.info("\nCorresponding Seach Space Description:")
- self.logger.info(nn_config_str)
- self.logger.info("")
-
- trials = []
- trials.append(api_pb2.Trial(
- study_id=request.study_id,
- parameter_set=[
- api_pb2.Parameter(
- name="architecture",
- value=organized_arc_str,
- parameter_type= api_pb2.CATEGORICAL),
- api_pb2.Parameter(
- name="nn_config",
- value=nn_config_str,
- parameter_type= api_pb2.CATEGORICAL)
- ],
+
+ organized_candidates = list()
+ trials = list()
+
+ for i in range(study.num_trials):
+ arc = candidates[i].tolist()
+ organized_arc = [0 for _ in range(study.num_layers)]
+ record = 0
+ for l in range(study.num_layers):
+ organized_arc[l] = arc[record: record + l + 1]
+ record += l + 1
+ organized_candidates.append(organized_arc)
+
+ nn_config = dict()
+ nn_config['num_layers'] = study.num_layers
+ nn_config['input_size'] = study.input_size
+ nn_config['output_size'] = study.output_size
+ nn_config['embedding'] = dict()
+ for l in range(study.num_layers):
+ opt = organized_arc[l][0]
+ nn_config['embedding'][opt] = study.search_space[opt].get_dict()
+
+ organized_arc_json = json.dumps(organized_arc)
+ nn_config_json = json.dumps(nn_config)
+
+ organized_arc_str = str(organized_arc_json).replace('\"', '\'')
+ nn_config_str = str(nn_config_json).replace('\"', '\'')
+
+ self.logger.info("\nNeural Network Architecture Candidate #{} (internal representation):".format(i))
+ self.logger.info(organized_arc_json)
+ self.logger.info("\nCorresponding Seach Space Description:")
+ self.logger.info(nn_config_str)
+
+ trials.append(api_pb2.Trial(
+ study_id=request.study_id,
+ parameter_set=[
+ api_pb2.Parameter(
+ name="architecture",
+ value=organized_arc_str,
+ parameter_type= api_pb2.CATEGORICAL),
+ api_pb2.Parameter(
+ name="nn_config",
+ value=nn_config_str,
+ parameter_type= api_pb2.CATEGORICAL)
+ ],
+ )
)
- )
+ self.logger.info("")
channel = grpc.beta.implementations.insecure_channel(MANAGER_ADDRESS, MANAGER_PORT)
with api_pb2.beta_create_Manager_stub(channel) as client:
for i, t in enumerate(trials):
ctrep = client.CreateTrial(api_pb2.CreateTrialRequest(trial=t), 10)
trials[i].trial_id = ctrep.trial_id
- self.logger.info("Trial {} Created\n".format(ctrep.trial_id))
- study.prev_trial_id = ctrep.trial_id
-
+ self.logger.info("Trial {} Created".format(ctrep.trial_id))
+ study.prev_trial_ids.append(ctrep.trial_id)
+ self.logger.info("")
+
study.ctrl_step += 1
return api_pb2.GetSuggestionsReply(trials=trials)
def GetEvaluationResult(self, study):
- worker_list = []
channel = grpc.beta.implementations.insecure_channel(MANAGER_ADDRESS, MANAGER_PORT)
with api_pb2.beta_create_Manager_stub(channel) as client:
- gwfrep = client.GetWorkerFullInfo(api_pb2.GetWorkerFullInfoRequest(study_id=study.study_id, trial_id=study.prev_trial_id, only_latest_log=True), 10)
- worker_list = gwfrep.worker_full_infos
+ gwfrep = client.GetWorkerFullInfo(api_pb2.GetWorkerFullInfoRequest(study_id=study.study_id, only_latest_log=True), 10)
+ trials_list = gwfrep.worker_full_infos
+
+ completed = True
+ for trial in trials_list:
+ completed = completed and (trial.Worker.status == api_pb2.COMPLETED)
+
+
+ if completed:
+ metrics = list()
- for w in worker_list:
- if w.Worker.status == api_pb2.COMPLETED:
- for ml in w.metrics_logs:
+ for t in trials_list:
+ for ml in t.metrics_logs:
if ml.name == study.objective_name:
- self.logger.info("Evaluation result of previous candidate: {}".format(ml.values[-1].value))
- return float(ml.values[-1].value)
+ metrics.append(float(ml.values[-1].value))
+ return metrics
+
# TODO: add support for multiple trials
From deba168f4ea0142832a7b1ae90cb8adea6ff55bd Mon Sep 17 00:00:00 2001
From: DeeperMind <1155077043@link.cuhk.edu.hk>
Date: Fri, 1 Mar 2019 23:49:49 -0800
Subject: [PATCH 2/6] adjust To Do
---
pkg/suggestion/NAS_Reinforcement_Learning/README.md | 6 +++---
pkg/suggestion/nasrl_service.py | 2 --
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md
index 89b9735f1ff..d946e7ffbdf 100644
--- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md
+++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md
@@ -122,6 +122,6 @@ This neural architecture can be visualized as
![a neural netowrk architecure example](example.png)
## To Do
-1. Add support for multiple trials
-2. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell`
-3. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts
+1. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell`
+2. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts
+3. Add `RequestCount` to the API so that the suggestion can clean up the information of completed studies.
diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py
index d447180b103..86b02372cf9 100644
--- a/pkg/suggestion/nasrl_service.py
+++ b/pkg/suggestion/nasrl_service.py
@@ -309,5 +309,3 @@ def GetEvaluationResult(self, study):
metrics.append(float(ml.values[-1].value))
return metrics
-
- # TODO: add support for multiple trials
From ef3ff9614d45b643954cdcca81e2d02510321499 Mon Sep 17 00:00:00 2001
From: DeeperMind <1155077043@link.cuhk.edu.hk>
Date: Sat, 2 Mar 2019 00:15:20 -0800
Subject: [PATCH 3/6] language improvement in README.md
---
pkg/suggestion/NAS_Reinforcement_Learning/README.md | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md
index d946e7ffbdf..25be4cbd88f 100644
--- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md
+++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md
@@ -25,9 +25,8 @@ If n = 12, m = 6, the definition of an architecture will be like:
There are n rows, the ith row has i elements and describes the ith layer. Please notice that layer 0 is the input and is not included in this definition.
-In each row:
-The first integer ranges from 0 to m-1, indicates the operation in this layer.
-The next (i-1) integers is either 0 or 1. The kth (k>=2) integer indicates whether (k-2)th layer has a skip connection with this layer. (There will always be a connection from (k-1)th layer to kth layer)
+In each row, the first integer ranges from 0 to m-1 and indicates the operation in this layer.
+Starting from the second position, the kth integer is a boolean value that indicates whether the (k-2)th layer has a skip connection to this layer. (There is always a connection from the (k-1)th layer to the kth layer.)
## Output of `GetSuggestion()`
The output of `GetSuggestion()` consists of two parts: `architecture` and `nn_config`.
From d3289041299d920589bcf0da1bd1fafbc840f87b Mon Sep 17 00:00:00 2001
From: DeeperMind <1155077043@link.cuhk.edu.hk>
Date: Mon, 4 Mar 2019 20:49:21 -0800
Subject: [PATCH 4/6] fix several problems
---
examples/nasjob-example-RL.yaml | 3 +-
.../NAS_Reinforcement_Learning/Controller.py | 4 +-
.../SuggestionParam.py | 2 -
pkg/suggestion/nasrl_service.py | 42 ++++++++++---------
4 files changed, 26 insertions(+), 25 deletions(-)
diff --git a/examples/nasjob-example-RL.yaml b/examples/nasjob-example-RL.yaml
index 3e2946436d8..a64215857d9 100644
--- a/examples/nasjob-example-RL.yaml
+++ b/examples/nasjob-example-RL.yaml
@@ -149,9 +149,8 @@ spec:
restartPolicy: Never
suggestionSpec:
suggestionAlgorithm: "nasrl"
+ requestNumber: 3
suggestionParameters:
- - name: "num_trials"
- value: "1"
- name: "lstm_num_cells"
value: "64"
- name: "lstm_num_layers"
diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/Controller.py b/pkg/suggestion/NAS_Reinforcement_Learning/Controller.py
index 2c87868a573..937dc130b1a 100755
--- a/pkg/suggestion/NAS_Reinforcement_Learning/Controller.py
+++ b/pkg/suggestion/NAS_Reinforcement_Learning/Controller.py
@@ -31,7 +31,7 @@ def __init__(self,
logger=None):
self.logger = logger
- self.logger.info("Building Controller")
+ self.logger.info(">>> Building Controller")
self.num_layers = num_layers
self.num_operations = num_operations
@@ -87,7 +87,7 @@ def _create_params(self):
def _build_sampler(self):
"""Build the sampler ops and the log_prob ops."""
- self.logger.info("Building Controller Sampler")
+ self.logger.info(">>> Building Controller Sampler")
anchors = []
anchors_w_1 = []
diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py b/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py
index fc75b073062..ace456782b3 100644
--- a/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py
+++ b/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py
@@ -1,6 +1,5 @@
def parseSuggestionParam(params_raw):
param_standard = {
- "num_trials": ['value', int, [1, 'inf']],
"lstm_num_cells": ['value', int, [1, 'inf']],
"lstm_num_layers": ['value', int, [1, 'inf']],
"lstm_keep_prob": ['value', float, [0.0, 1.0]],
@@ -17,7 +16,6 @@ def parseSuggestionParam(params_raw):
}
suggestion_params = {
- "num_trials": 1,
"lstm_num_cells": 64,
"lstm_num_layers": 1,
"lstm_keep_prob": 1.0,
diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py
index 86b02372cf9..8e860ec1fa3 100644
--- a/pkg/suggestion/nasrl_service.py
+++ b/pkg/suggestion/nasrl_service.py
@@ -20,6 +20,7 @@ def __init__(self, request, logger):
self.logger = logger
self.study_id = request.study_id
self.param_id = request.param_id
+ self.num_trials = request.request_number
self.study_name = None
self.tf_graph = tf.Graph()
self.prev_trial_ids = list()
@@ -35,14 +36,12 @@ def __init__(self, request, logger):
self.search_space = None
self.opt_direction = None
self.objective_name = None
- self.num_trials = 1
self.logger.info("-" * 100 + "\nSetting Up Suggestion for StudyJob ID {}\n".format(request.study_id) + "-" * 100)
self._get_study_param()
self._get_suggestion_param()
self._setup_controller()
- self.num_trials = self.suggestion_config["num_trials"]
- self.logger.info("Suggestion for StudyJob {} (ID: {}) has been initialized.\n".format(self.study_name, self.study_id))
+ self.logger.info(">>> Suggestion for StudyJob {} (ID: {}) has been initialized.\n".format(self.study_name, self.study_id))
def _get_study_param(self):
# this function need to
@@ -115,7 +114,7 @@ def print_search_space(self):
self.logger.warning("Error! The Suggestion has not yet been initialized!")
return
- self.logger.info("Search Space for StudyJob {} (ID: {}):".format(self.study_name, self.study_id))
+ self.logger.info(">>> Search Space for StudyJob {} (ID: {}):".format(self.study_name, self.study_id))
for opt in self.search_space:
opt.print_op(self.logger)
self.logger.info("There are {} operations in total.\n".format(self.num_operations))
@@ -125,12 +124,13 @@ def print_suggestion_params(self):
self.logger.warning("Error! The Suggestion has not yet been initialized!")
return
- self.logger.info("Parameters of LSTM Controller for StudyJob {} (ID: {}):".format(self.study_name, self.study_id))
+ self.logger.info(">>> Parameters of LSTM Controller for StudyJob {} (ID: {}):".format(self.study_name, self.study_id))
for spec in self.suggestion_config:
if len(spec) > 13:
self.logger.info("{}: \t{}".format(spec, self.suggestion_config[spec]))
else:
self.logger.info("{}: \t\t{}".format(spec, self.suggestion_config[spec]))
+ self.logger.info("RequestNumber:\t\t{}".format(self.num_trials))
self.logger.info("")
@@ -189,7 +189,7 @@ def GetSuggestions(self, request, context):
controller_ops["train_op"]]
if study.is_first_run:
- self.logger.info("First time running suggestion for {}. Random architecture will be given.".format(study.study_name))
+ self.logger.info(">>> First time running suggestion for {}. Random architecture will be given.".format(study.study_name))
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
candidates = list()
@@ -209,8 +209,7 @@ def GetSuggestions(self, request, context):
results = self.GetEvaluationResult(study)
avg_result = sum(results) / len(results)
- self.logger.info("Evaluation results of previous trials: {}".format(str(results)[1:-1]))
- self.logger.info("The average is {}".format(avg_result))
+ self.logger.info(">>> Evaluation results of previous trials: {}. The average is {}".format(str(results)[1:-1], avg_result))
# This lstm cell is designed to maximize the metrics
# However, if the user want to minimize the metrics, we can take the negative of the result
@@ -220,7 +219,7 @@ def GetSuggestions(self, request, context):
loss, entropy, lr, gn, bl, skip, _ = sess.run(
fetches=run_ops,
feed_dict={valid_acc: avg_result})
- self.logger.info("Suggetion updated. LSTM Controller Reward: {}".format(loss))
+ self.logger.info(">>> Suggetion updated. LSTM Controller Reward: {}".format(loss))
candidates = list()
for _ in range(study.num_trials):
@@ -255,9 +254,9 @@ def GetSuggestions(self, request, context):
organized_arc_str = str(organized_arc_json).replace('\"', '\'')
nn_config_str = str(nn_config_json).replace('\"', '\'')
- self.logger.info("\nNeural Network Architecture Candidate #{} (internal representation):".format(i))
+ self.logger.info("\n>>> Neural Network Architecture Candidate #{} (internal representation):".format(i))
self.logger.info(organized_arc_json)
- self.logger.info("\nCorresponding Seach Space Description:")
+ self.logger.info("\n>>> Corresponding Seach Space Description:")
self.logger.info(nn_config_str)
trials.append(api_pb2.Trial(
@@ -275,14 +274,18 @@ def GetSuggestions(self, request, context):
)
)
+ self.prev_trial_ids = list()
self.logger.info("")
channel = grpc.beta.implementations.insecure_channel(MANAGER_ADDRESS, MANAGER_PORT)
with api_pb2.beta_create_Manager_stub(channel) as client:
for i, t in enumerate(trials):
ctrep = client.CreateTrial(api_pb2.CreateTrialRequest(trial=t), 10)
trials[i].trial_id = ctrep.trial_id
- self.logger.info("Trial {} Created".format(ctrep.trial_id))
- study.prev_trial_ids.append(ctrep.trial_id)
+ self.prev_trial_ids.append(ctrep.trial_id)
+
+ self.logger.info(">>> {} Trials were created:".format(study.num_trials))
+ for t in self.prev_trial_ids:
+ self.logger.info(t)
self.logger.info("")
study.ctrl_step += 1
@@ -296,16 +299,17 @@ def GetEvaluationResult(self, study):
trials_list = gwfrep.worker_full_infos
completed = True
- for trial in trials_list:
- completed = completed and (trial.Worker.status == api_pb2.COMPLETED)
+ for t in trials_list:
+ if t.Worker.trial_id in self.prev_trial_ids:
+ completed = completed and (t.Worker.status == api_pb2.COMPLETED)
-
if completed:
metrics = list()
for t in trials_list:
- for ml in t.metrics_logs:
- if ml.name == study.objective_name:
- metrics.append(float(ml.values[-1].value))
+ if t.Worker.trial_id in self.prev_trial_ids:
+ for ml in t.metrics_logs:
+ if ml.name == study.objective_name:
+ metrics.append(float(ml.values[-1].value))
return metrics
From 2f415f348f463356ed35198489b0b590990d5e6a Mon Sep 17 00:00:00 2001
From: DeeperMind <1155077043@link.cuhk.edu.hk>
Date: Tue, 5 Mar 2019 13:46:39 -0800
Subject: [PATCH 5/6] fix a potential problem
---
pkg/suggestion/nasrl_service.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py
index 8e860ec1fa3..d60baec0f4c 100644
--- a/pkg/suggestion/nasrl_service.py
+++ b/pkg/suggestion/nasrl_service.py
@@ -298,12 +298,12 @@ def GetEvaluationResult(self, study):
gwfrep = client.GetWorkerFullInfo(api_pb2.GetWorkerFullInfoRequest(study_id=study.study_id, only_latest_log=True), 10)
trials_list = gwfrep.worker_full_infos
- completed = True
+ completed_count = 0
for t in trials_list:
- if t.Worker.trial_id in self.prev_trial_ids:
- completed = completed and (t.Worker.status == api_pb2.COMPLETED)
+ if t.Worker.trial_id in self.prev_trial_ids and t.Worker.status == api_pb2.COMPLETED:
+ completed_count += 1
- if completed:
+ if completed_count == study.num_trials:
metrics = list()
for t in trials_list:
From 8986a7f9d62ab597d5539fcf1896ae9ab5fbd658 Mon Sep 17 00:00:00 2001
From: DeeperMind <1155077043@link.cuhk.edu.hk>
Date: Tue, 5 Mar 2019 15:48:55 -0800
Subject: [PATCH 6/6] handle GetEvaluationResult() returning None
---
pkg/suggestion/nasrl_service.py | 45 ++++++++++++++++++---------------
1 file changed, 25 insertions(+), 20 deletions(-)
diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py
index d60baec0f4c..ebba3e13524 100644
--- a/pkg/suggestion/nasrl_service.py
+++ b/pkg/suggestion/nasrl_service.py
@@ -9,6 +9,7 @@
from logging import getLogger, StreamHandler, INFO, DEBUG
import json
import os
+import time
MANAGER_ADDRESS = "vizier-core"
@@ -206,19 +207,23 @@ def GetSuggestions(self, request, context):
saver.restore(sess, study.ctrl_cache_file)
valid_acc = ctrl.reward
- results = self.GetEvaluationResult(study)
- avg_result = sum(results) / len(results)
+ result = self.GetEvaluationResult(study)
- self.logger.info(">>> Evaluation results of previous trials: {}. The average is {}".format(str(results)[1:-1], avg_result))
+ # In some rare cases, GetEvaluationResult() may return None
+ # if GetSuggestions() is called before all the trials are completed
+ while result is None:
+ self.logger.warning(">>> GetEvaluationResult() returns None")
+ time.sleep(20)
+ result = self.GetEvaluationResult(study)
- # This lstm cell is designed to maximize the metrics
- # However, if the user want to minimize the metrics, we can take the negative of the result
+ # This LSTM network is designed to maximize the metrics
+ # However, if the user wants to minimize the metrics, we can take the negative of the result
if study.opt_direction == api_pb2.MINIMIZE:
- avg_result = -avg_result
+ result = -result
loss, entropy, lr, gn, bl, skip, _ = sess.run(
fetches=run_ops,
- feed_dict={valid_acc: avg_result})
+ feed_dict={valid_acc: result})
self.logger.info(">>> Suggetion updated. LSTM Controller Reward: {}".format(loss))
candidates = list()
@@ -254,7 +259,7 @@ def GetSuggestions(self, request, context):
organized_arc_str = str(organized_arc_json).replace('\"', '\'')
nn_config_str = str(nn_config_json).replace('\"', '\'')
- self.logger.info("\n>>> Neural Network Architecture Candidate #{} (internal representation):".format(i))
+ self.logger.info("\n>>> New Neural Network Architecture Candidate #{} (internal representation):".format(i))
self.logger.info(organized_arc_json)
self.logger.info("\n>>> Corresponding Seach Space Description:")
self.logger.info(nn_config_str)
@@ -298,18 +303,18 @@ def GetEvaluationResult(self, study):
gwfrep = client.GetWorkerFullInfo(api_pb2.GetWorkerFullInfoRequest(study_id=study.study_id, only_latest_log=True), 10)
trials_list = gwfrep.worker_full_infos
- completed_count = 0
+ completed_trials = dict()
for t in trials_list:
if t.Worker.trial_id in self.prev_trial_ids and t.Worker.status == api_pb2.COMPLETED:
- completed_count += 1
+ for ml in t.metrics_logs:
+ if ml.name == study.objective_name:
+ completed_trials[t.Worker.trial_id] = float(ml.values[-1].value)
- if completed_count == study.num_trials:
- metrics = list()
-
- for t in trials_list:
- if t.Worker.trial_id in self.prev_trial_ids:
- for ml in t.metrics_logs:
- if ml.name == study.objective_name:
- metrics.append(float(ml.values[-1].value))
-
- return metrics
+ if len(completed_trials) == study.num_trials:
+ self.logger.info(">>> Evaluation results of previous trials:")
+ for k in completed_trials:
+ self.logger.info("{}: {}".format(k, completed_trials[k]))
+ avg_metrics = sum(completed_trials.values()) / study.num_trials
+ self.logger.info("The average is {}\n".format(avg_metrics))
+
+ return avg_metrics
\ No newline at end of file