Adds a myriad of tunable training params
We can now select and configure the optimizer, and conversion to FP16 (where possible) is available as an option, which at the very least improves memory usage during training. We can also perform image augmentation during training, control label smoothing, etc.
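
For illustration, a minimal sketch of how an optimizer choice flows through the new hparams, using get_default_hparams and configure_optimizer from this commit (the values and the import path are just examples; _replace is the standard namedtuple method):

    # Assumes the training lib is importable; adjust the path to your checkout.
    from make_nsfw_model_lib import configure_optimizer, get_default_hparams

    hparams = get_default_hparams()._replace(
        optimizer='adam',   # one of: adadelta, adagrad, adam, ftrl, sgd, rmsprop
        adam_beta1=0.9,
        adam_beta2=0.999,
        opt_epsilon=1e-7)
    optimizer = configure_optimizer(hparams)  # -> tf.keras.optimizers.Adam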

We also always do early stopping automatically once validation accuracy stops improving, and since I'm an old man with the old memes, I've set the target training epochs to 9001 because early stopping is what actually controls the run.
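
A self-contained sketch of that pattern on toy data (the callback settings mirror the ones train_model now registers; everything else here is illustrative):

    import numpy as np
    import tensorflow as tf

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', min_delta=0.0001, patience=1)

    model = tf.keras.Sequential(
        [tf.keras.layers.Dense(2, activation='softmax', input_shape=(4,))])
    model.compile(optimizer='sgd', loss='categorical_crossentropy',
                  metrics=['accuracy'])

    x = np.random.rand(256, 4).astype('float32')
    y = tf.keras.utils.to_categorical(np.random.randint(0, 2, 256), 2)
    history = model.fit(x, y, validation_split=0.25, epochs=9001,
                        callbacks=[early_stop], verbose=0)
    print(len(history.history['loss']))  # stops long before 9001 epochs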

Basically I brought over a bunch of configuration stuff from automl and merged it in, fulfilling a number of TODOs in the original training code that was brought in from tfhub.
TechnikEmpire committed May 14, 2020
1 parent 931f107 commit efea3ad
Showing 4 changed files with 210 additions and 69 deletions.
65 changes: 64 additions & 1 deletion training/make_nsfw_model.py
@@ -128,6 +128,47 @@
flags.DEFINE_bool(
    "is_deprecated_tfhub_module", False,
    "Whether or not the supplied TF hub module is old and from Tensorflow 1.")
flags.DEFINE_float(
    "label_smoothing", _DEFAULT_HPARAMS.label_smoothing,
    "The degree of label smoothing to use.")
flags.DEFINE_float(
    "validation_split", _DEFAULT_HPARAMS.validation_split,
    "The fraction of data to use for validation.")
flags.DEFINE_string(
    'optimizer', _DEFAULT_HPARAMS.optimizer,
    'The name of the optimizer, one of "adadelta", "adagrad", "adam", '
    '"ftrl", "sgd" or "rmsprop".')
flags.DEFINE_float(
    'adadelta_rho', _DEFAULT_HPARAMS.adadelta_rho,
    'The decay rate for adadelta.')
flags.DEFINE_float(
    'adagrad_initial_accumulator_value',
    _DEFAULT_HPARAMS.adagrad_initial_accumulator_value,
    'Starting value for the AdaGrad accumulators.')
flags.DEFINE_float(
    'adam_beta1', _DEFAULT_HPARAMS.adam_beta1,
    'The exponential decay rate for the 1st moment estimates.')
flags.DEFINE_float(
    'adam_beta2', _DEFAULT_HPARAMS.adam_beta2,
    'The exponential decay rate for the 2nd moment estimates.')
flags.DEFINE_float(
    'opt_epsilon', _DEFAULT_HPARAMS.opt_epsilon,
    'Epsilon term for the optimizer.')
flags.DEFINE_float(
    'ftrl_learning_rate_power', _DEFAULT_HPARAMS.ftrl_learning_rate_power,
    'The learning rate power.')
flags.DEFINE_float(
    'ftrl_initial_accumulator_value',
    _DEFAULT_HPARAMS.ftrl_initial_accumulator_value,
    'Starting value for the FTRL accumulators.')
flags.DEFINE_float(
    'ftrl_l1', _DEFAULT_HPARAMS.ftrl_l1,
    'The FTRL l1 regularization strength.')
flags.DEFINE_float(
    'ftrl_l2', _DEFAULT_HPARAMS.ftrl_l2,
    'The FTRL l2 regularization strength.')
flags.DEFINE_float(
    'rmsprop_momentum', _DEFAULT_HPARAMS.rmsprop_momentum,
    'Momentum for RMSProp.')
flags.DEFINE_float(
    'rmsprop_decay', _DEFAULT_HPARAMS.rmsprop_decay,
    'Decay term for RMSProp.')
flags.DEFINE_bool(
    "do_data_augmentation", False,
    "Whether or not to do data augmentation.")
flags.DEFINE_bool(
    "use_mixed_precision", False,
    "Whether or not to use NVIDIA mixed precision. Requires an NVIDIA card "
    "with compute capability 7.0 or higher.")

FLAGS = flags.FLAGS

@@ -140,7 +181,26 @@ def _get_hparams_from_flags():
      batch_size=FLAGS.batch_size,
      learning_rate=FLAGS.learning_rate,
      momentum=FLAGS.momentum,
-     dropout_rate=FLAGS.dropout_rate)
      dropout_rate=FLAGS.dropout_rate,
      label_smoothing=FLAGS.label_smoothing,
      validation_split=FLAGS.validation_split,
      optimizer=FLAGS.optimizer,
      adadelta_rho=FLAGS.adadelta_rho,
      adagrad_initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value,
      adam_beta1=FLAGS.adam_beta1,
      adam_beta2=FLAGS.adam_beta2,
      opt_epsilon=FLAGS.opt_epsilon,
      ftrl_learning_rate_power=FLAGS.ftrl_learning_rate_power,
      ftrl_initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
      ftrl_l1=FLAGS.ftrl_l1,
      ftrl_l2=FLAGS.ftrl_l2,
      rmsprop_momentum=FLAGS.rmsprop_momentum,
      rmsprop_decay=FLAGS.rmsprop_decay,
      do_data_augmentation=FLAGS.do_data_augmentation,
      use_mixed_precision=FLAGS.use_mixed_precision)


def _check_keras_dependencies():
@@ -178,6 +238,9 @@ def main(args):
"""Main function to be called by absl.app.run() after flag parsing."""
del args

#policy = mixed_precision.Policy('mixed_float16')
#mixed_precision.set_policy(policy)

#tf.config.gpu.set_per_process_memory_fraction(0.75)
#tf.config.gpu.set_per_process_memory_growth(False)
physical_devices = tf.config.list_physical_devices('GPU')
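For context on the two mixed-precision routes this commit touches, here is a sketch assuming a TF 2.1-era install: the Keras policy API (left commented out above) versus the optimizer graph-rewrite wrapper that train_model below actually uses. Only the optimizer value here is an example.

    import tensorflow as tf
    from tensorflow.keras.mixed_precision import experimental as mixed_precision

    # Route A (commented out in this commit): a global Keras policy.
    # Layers compute in float16 while variables stay float32.
    # policy = mixed_precision.Policy('mixed_float16')
    # mixed_precision.set_policy(policy)

    # Route B (what train_model uses): wrap any optimizer so TensorFlow's
    # graph rewrite inserts float16 casts and applies automatic loss scaling.
    opt = tf.keras.optimizers.RMSprop(learning_rate=0.005)
    opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt)
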
130 changes: 113 additions & 17 deletions training/make_nsfw_model_lib.py
Original file line number Diff line number Diff line change
@@ -21,11 +21,13 @@
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import multiprocessing
from pathlib import Path
from absl import app
from absl import flags
from absl import logging
from tensorflow import keras
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from tensorflow.core.framework import attr_value_pb2
from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import node_def_pb2
@@ -67,11 +69,57 @@ def get_default_image_dir():
  return tf.keras.utils.get_file("flower_photos",
                                 _DEFAULT_IMAGE_URL, untar=True)

def configure_optimizer(hparams):
  """Configures the optimizer used for training.

  Args:
    hparams: An HParams instance carrying the optimizer name and its
      per-optimizer settings.

  Returns:
    An instance of an optimizer.

  Raises:
    ValueError: if hparams.optimizer is not recognized.
  """
  if hparams.optimizer == 'adadelta':
    optimizer = tf.keras.optimizers.Adadelta(
        hparams.learning_rate,
        rho=hparams.adadelta_rho,
        epsilon=hparams.opt_epsilon)
  elif hparams.optimizer == 'adagrad':
    optimizer = tf.keras.optimizers.Adagrad(
        hparams.learning_rate,
        initial_accumulator_value=hparams.adagrad_initial_accumulator_value)
  elif hparams.optimizer == 'adam':
    optimizer = tf.keras.optimizers.Adam(
        hparams.learning_rate,
        beta_1=hparams.adam_beta1,
        beta_2=hparams.adam_beta2,
        epsilon=hparams.opt_epsilon)
  elif hparams.optimizer == 'ftrl':
    optimizer = tf.keras.optimizers.Ftrl(
        hparams.learning_rate,
        learning_rate_power=hparams.ftrl_learning_rate_power,
        initial_accumulator_value=hparams.ftrl_initial_accumulator_value,
        l1_regularization_strength=hparams.ftrl_l1,
        l2_regularization_strength=hparams.ftrl_l2)
  elif hparams.optimizer == 'rmsprop':
    optimizer = tf.keras.optimizers.RMSprop(
        learning_rate=hparams.learning_rate,
        rho=hparams.rmsprop_decay,  # Keras calls the RMSProp decay term "rho".
        momentum=hparams.rmsprop_momentum,
        epsilon=hparams.opt_epsilon)
  elif hparams.optimizer == 'sgd':
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=hparams.learning_rate,
        momentum=hparams.momentum)
  else:
    raise ValueError('Optimizer [%s] was not recognized' % hparams.optimizer)
  return optimizer


class HParams(
    collections.namedtuple("HParams", [
        "train_epochs", "do_fine_tuning", "batch_size", "learning_rate",
-       "momentum", "dropout_rate"
        "momentum", "dropout_rate", "label_smoothing", "validation_split",
        "optimizer", "adadelta_rho", "adagrad_initial_accumulator_value",
        "adam_beta1", "adam_beta2", "opt_epsilon", "ftrl_learning_rate_power",
        "ftrl_initial_accumulator_value", "ftrl_l1", "ftrl_l2",
        "rmsprop_momentum", "rmsprop_decay", "do_data_augmentation",
        "use_mixed_precision"
    ])):
  """The hyperparameters for make_image_classifier.
@@ -93,11 +141,28 @@ def get_default_hparams():
      batch_size=32,
      learning_rate=0.005,
      momentum=0.9,
-     dropout_rate=0.2)
      dropout_rate=0.2,
      label_smoothing=0.1,
      validation_split=0.20,
      optimizer='rmsprop',
      adadelta_rho=0.95,
      adagrad_initial_accumulator_value=0.1,
      adam_beta1=0.9,
      adam_beta2=0.999,
      opt_epsilon=1.0,
      ftrl_learning_rate_power=-0.5,
      ftrl_initial_accumulator_value=0.1,
      ftrl_l1=0.0,
      ftrl_l2=0.0,
      rmsprop_momentum=0.9,
      rmsprop_decay=0.9,
      do_data_augmentation=False,
      use_mixed_precision=False)


def _get_data_with_keras(image_dir, image_size, batch_size,
-                        do_data_augmentation=False):
                         validation_size=0.2, do_data_augmentation=False):
"""Gets training and validation data via keras_preprocessing.
Args:
Expand Down Expand Up @@ -126,7 +191,7 @@ def _get_data_with_keras(image_dir, image_size, batch_size,
"""
  datagen_kwargs = dict(rescale=1./255,
                        # TODO(b/139467904): Expose this as a flag.
-                       validation_split=.20)
                        validation_split=validation_size)
  dataflow_kwargs = dict(target_size=image_size, batch_size=batch_size,
                         interpolation="bilinear")

@@ -143,7 +208,8 @@
        **datagen_kwargs)
  else:
    train_datagen = valid_datagen

  train_generator = train_datagen.flow_from_directory(
      image_dir, subset="training", shuffle=True, **dataflow_kwargs)

  indexed_labels = [(index, label)
@@ -215,14 +281,27 @@ def build_model(module_layer, hparams, image_size, num_classes):
    The full classifier model.
  """
  # TODO(b/139467904): Expose the hyperparameters below as flags.
- model = tf.keras.Sequential([
-     tf.keras.Input(shape=(image_size[0], image_size[1], 3), name='input', dtype='float32'), module_layer,
-     tf.keras.layers.Dropout(rate=hparams.dropout_rate),
-     tf.keras.layers.Dense(
-         num_classes,
-         kernel_regularizer=tf.keras.regularizers.l2(0.0001)),
-     tf.keras.layers.Activation('softmax', dtype='float32', name='prediction')
- ])

  if hparams.dropout_rate is not None and hparams.dropout_rate > 0:
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(image_size[0], image_size[1], 3), name='input', dtype='float32'),
        module_layer,
        tf.keras.layers.Dropout(rate=hparams.dropout_rate),
        tf.keras.layers.Dense(
            num_classes,
            kernel_regularizer=tf.keras.regularizers.l2(0.0001)),
        tf.keras.layers.Activation('softmax', dtype='float32', name='prediction')
    ])
  else:
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(image_size[0], image_size[1], 3), name='input', dtype='float32'),
        module_layer,
        tf.keras.layers.Dense(
            num_classes,
            kernel_regularizer=None),
        tf.keras.layers.Activation('softmax', dtype='float32', name='prediction')
    ])

  model.summary()
  return model

@@ -249,20 +328,33 @@ def train_model(model, hparams, train_data_and_size, valid_data_and_size):
  Returns:
    The tf.keras.callbacks.History object returned by tf.keras.Model.fit().
  """
  earlystop_callback = tf.keras.callbacks.EarlyStopping(
      monitor='val_accuracy', min_delta=0.0001,
      patience=1)

  train_data, train_size = train_data_and_size
  valid_data, valid_size = valid_data_and_size
  # TODO(b/139467904): Expose this hyperparameter as a flag.
- loss = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1)
  loss = tf.keras.losses.CategoricalCrossentropy(
      label_smoothing=hparams.label_smoothing)

  if hparams.use_mixed_precision:
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        configure_optimizer(hparams))
  else:
    optimizer = configure_optimizer(hparams)

  model.compile(
-     optimizer=tf.keras.optimizers.SGD(
-         lr=hparams.learning_rate, momentum=hparams.momentum),
      optimizer=optimizer,
      loss=loss,
      metrics=["accuracy"])
  steps_per_epoch = train_size // hparams.batch_size
  validation_steps = valid_size // hparams.batch_size
  return model.fit(
      train_data,
      use_multiprocessing=False,
      workers=multiprocessing.cpu_count() - 1,
      epochs=hparams.train_epochs,
      callbacks=[earlystop_callback],
      steps_per_epoch=steps_per_epoch,
      validation_data=valid_data,
      validation_steps=validation_steps)
@@ -404,13 +496,17 @@ def make_image_classifier(tfhub_module, image_dir, hparams,
    must be omitted or set to that same value.
  """
  print("Using hparams:")
  for key, value in hparams._asdict().items():
    print("\t{0} : {1}".format(key, value))

  module_layer = hub.KerasLayer(tfhub_module, trainable=hparams.do_fine_tuning)

  image_size = _image_size_for_module(module_layer, requested_image_size)
  print("Using module {} with image size {}".format(
      tfhub_module, image_size))
  train_data_and_size, valid_data_and_size, labels = _get_data_with_keras(
-     image_dir, image_size, hparams.batch_size)
      image_dir, image_size, hparams.batch_size, hparams.validation_split,
      hparams.do_data_augmentation)
  print("Found", len(labels), "classes:", ", ".join(labels))

  model = build_model(module_layer, hparams, image_size, len(labels))
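The do_data_augmentation branch of _get_data_with_keras is collapsed in this view. A hedged sketch of what such a branch typically looks like with keras_preprocessing; the specific augmentation ranges below are illustrative assumptions, not values from this diff:

    import tensorflow as tf

    datagen_kwargs = dict(rescale=1. / 255, validation_split=0.20)

    # Shared kwargs plus augmentation knobs; the ranges are examples only.
    train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        rotation_range=40, horizontal_flip=True,
        width_shift_range=0.2, height_shift_range=0.2,
        shear_range=0.2, zoom_range=0.2,
        **datagen_kwargs)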
