Source code for deepobs.tensorflow.runners.runner

"""Module implementing StandardRunner."""

from __future__ import print_function

import abc
import importlib
from copy import deepcopy

import numpy as np
import tensorflow as tf

from deepobs import config as global_config
from deepobs.abstract_runner.abstract_runner import Runner

from .. import config, testproblems
from . import runner_utils


class TFRunner(Runner):
    def __init__(self, optimizer_class, hyperparameter_names):
        super(TFRunner, self).__init__(optimizer_class, hyperparameter_names)

    @staticmethod
    def init_summary(loss, learning_rate_var, batch_size, tb_log_dir):
        """Initializes the tensorboard summaries."""
        # Per-iteration summaries.
        mb_train_loss_summary = tf.summary.scalar(
            "training/minibatch_train_losses",
            loss,
            collections=[tf.GraphKeys.SUMMARIES, "per_iteration"],
        )
        # Per-epoch summaries.
        lr_summary = tf.summary.scalar(
            "hyperparams/learning_rate",
            learning_rate_var,
            collections=[tf.GraphKeys.SUMMARIES, "per_epoch"],
        )
        batch_summary = tf.summary.scalar(
            "hyperparams/batch_size",
            batch_size,
            collections=[tf.GraphKeys.SUMMARIES, "per_epoch"],
        )
        per_iter_summaries = tf.summary.merge_all(key="per_iteration")
        per_epoch_summaries = tf.summary.merge_all(key="per_epoch")
        summary_writer = tf.summary.FileWriter(tb_log_dir)
        return per_iter_summaries, per_epoch_summaries, summary_writer

    @staticmethod
    def write_per_epoch_summary(
        sess,
        loss_,
        acc_,
        current_step,
        per_epoch_summaries,
        summary_writer,
        phase,
    ):
        """Writes the tensorboard epoch summary."""
        if phase == "TEST":
            tag = "epoch/test_"
        elif phase == "TRAIN":
            tag = "epoch/train_"
        elif phase == "VALID":
            tag = "epoch/valid_"
        else:
            raise NotImplementedError(
                "Phase " + phase + " not implemented for write_per_epoch_summary()."
            )
        summary = tf.Summary()
        summary.value.add(tag=tag + "loss_", simple_value=loss_)
        summary.value.add(tag=tag + "acc_", simple_value=acc_)
        per_epoch_summary_ = sess.run(per_epoch_summaries)
        summary_writer.add_summary(per_epoch_summary_, current_step)
        summary_writer.add_summary(summary, current_step)
        summary_writer.flush()
        return

    @staticmethod
    def write_per_iter_summary(sess, per_iter_summaries, summary_writer, current_step):
        """Writes the tensorboard iteration summary."""
        per_iter_summary_ = sess.run(per_iter_summaries)
        summary_writer.add_summary(per_iter_summary_, current_step)

    @staticmethod
    def create_testproblem(testproblem, batch_size, l2_reg, random_seed):
        """Sets up the deepobs.tensorflow.testproblems.testproblem instance.

        Args:
            testproblem (str): The name of the testproblem.
            batch_size (int): Batch size that is used for training.
            l2_reg (float): L2 regularization factor.
            random_seed (int): The random seed of the framework.

        Returns:
            deepobs.tensorflow.testproblems.testproblem: An instance of
            deepobs.tensorflow.testproblems.testproblem.
        """
        # Find the testproblem by name and instantiate it with batch size and
        # L2 regularization. A local module of the same name takes precedence
        # over the built-in testproblems.
        try:
            testproblem_mod = importlib.import_module(testproblem)
            testproblem_cls = getattr(testproblem_mod, testproblem)
            print("Loading local testproblem.")
        except (ImportError, AttributeError):
            testproblem_cls = getattr(testproblems, testproblem)
        if l2_reg is not None:
            tproblem = testproblem_cls(batch_size, l2_reg)
        else:
            tproblem = testproblem_cls(batch_size)

        # Set up the testproblem.
        tf.reset_default_graph()
        tf.set_random_seed(random_seed)
        tproblem.set_up()
        return tproblem

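    # Usage sketch (illustrative comment, not part of the module): building a
    # testproblem instance directly via this helper. "quadratic_deep" is one of
    # the built-in DeepOBS testproblems; the batch size, regularization and
    # seed below are arbitrary placeholder values.
    #
    #     tproblem = TFRunner.create_testproblem(
    #         "quadratic_deep", batch_size=128, l2_reg=None, random_seed=42
    #     )
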
# Wrapper functions for the evaluation phase.
    @staticmethod
    def evaluate(tproblem, sess, loss, phase):
        """Computes average loss and accuracy in the evaluation phase.

        Args:
            tproblem (deepobs.tensorflow.testproblems.testproblem): The testproblem instance.
            sess (tensorflow.Session): The current TensorFlow session.
            loss: The TensorFlow operation that computes the loss.
            phase (str): The phase of the evaluation. Must be one of 'TRAIN',
                'VALID' or 'TEST'.
        """
        if phase == "TEST":
            sess.run(tproblem.test_init_op)
            msg = "TEST:"
        elif phase == "TRAIN":
            sess.run(tproblem.train_eval_init_op)
            msg = "TRAIN:"
        elif phase == "VALID":
            sess.run(tproblem.valid_init_op)
            msg = "VALID:"
        else:
            raise NotImplementedError(
                "Phase " + phase + " not implemented for evaluate()."
            )

        # Compute average loss and (if applicable) accuracy.
        loss_ = 0.0
        num_iters = 0.0
        acc_ = 0.0
        if tproblem.accuracy is not None:
            while True:
                try:
                    l_, a_ = sess.run([loss, tproblem.accuracy])
                    loss_ += l_
                    acc_ += a_
                    num_iters += 1.0
                except tf.errors.OutOfRangeError:
                    break
        else:  # Accuracy is None.
            acc_ = 0.0
            while True:
                try:
                    l_ = sess.run(loss)
                    loss_ += l_
                    num_iters += 1.0
                except tf.errors.OutOfRangeError:
                    break
        loss_ /= num_iters
        acc_ /= num_iters

        # Print and return the results.
        if acc_ != 0.0:
            print("{0:s} loss {1:g}, acc {2:f}".format(msg, loss_, acc_))
        else:
            print("{0:s} loss {1:g}".format(msg, loss_))
        return loss_, acc_

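    # Usage sketch (illustrative comment, not part of the module): evaluate()
    # expects an already set-up testproblem, an open session, and the loss
    # operation that training() builds, e.g.
    #
    #     loss = tf.reduce_mean(tproblem.losses) + tproblem.regularizer
    #     valid_loss, valid_acc = TFRunner.evaluate(tproblem, sess, loss, phase="VALID")
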
    def evaluate_all(
        self,
        n,
        num_epochs,
        tproblem,
        sess,
        loss,
        tb_log,
        per_epoch_summaries,
        summary_writer,
        train_losses,
        valid_losses,
        test_losses,
        train_accuracies,
        valid_accuracies,
        test_accuracies,
    ):
        print("********************************")
        print("Evaluating after {0:d} of {1:d} epochs...".format(n, num_epochs))

        loss_, acc_ = self.evaluate(tproblem, sess, loss, phase="TRAIN")
        if tb_log:
            current_step = len(train_losses)
            self.write_per_epoch_summary(
                sess,
                loss_,
                acc_,
                current_step,
                per_epoch_summaries,
                summary_writer,
                phase="TRAIN",
            )
        train_losses.append(loss_)
        train_accuracies.append(acc_)

        loss_, acc_ = self.evaluate(tproblem, sess, loss, phase="VALID")
        if tb_log:
            current_step = len(train_losses)
            self.write_per_epoch_summary(
                sess,
                loss_,
                acc_,
                current_step,
                per_epoch_summaries,
                summary_writer,
                phase="VALID",
            )
        valid_losses.append(loss_)
        valid_accuracies.append(acc_)

        loss_, acc_ = self.evaluate(tproblem, sess, loss, phase="TEST")
        if tb_log:
            current_step = len(test_losses)
            self.write_per_epoch_summary(
                sess,
                loss_,
                acc_,
                current_step,
                per_epoch_summaries,
                summary_writer,
                phase="TEST",
            )
        test_losses.append(loss_)
        test_accuracies.append(acc_)
        print("********************************")

    @abc.abstractmethod
    def training(
        self,
        tproblem,
        hyperparams,
        num_epochs,
        print_train_iter,
        train_log_interval,
        tb_log,
        tb_log_dir,
        **training_params
    ):
        return

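# Subclassing sketch (illustrative comment): a concrete runner derives from
# TFRunner and implements training(), returning the metrics dictionary that the
# DeepOBS output format expects. StandardRunner below is the reference
# implementation; a custom runner would follow the same pattern, e.g.
#
#     class MyRunner(TFRunner):
#         def training(self, tproblem, hyperparams, num_epochs,
#                      print_train_iter, train_log_interval,
#                      tb_log, tb_log_dir, **training_params):
#             ...  # build loss/step, run the epoch loop, collect metrics
#             return output
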
class StandardRunner(TFRunner):
    def __init__(self, optimizer_class, hyperparameter_names):
        super(StandardRunner, self).__init__(optimizer_class, hyperparameter_names)

    def training(
        self,
        tproblem,
        hyperparams,
        num_epochs,
        print_train_iter,
        train_log_interval,
        tb_log,
        tb_log_dir,
    ):
        loss = tf.reduce_mean(tproblem.losses) + tproblem.regularizer

        # Set up the optimizer. The learning rate is kept in a (non-trainable)
        # variable so that it can be changed during training.
        global_step = tf.Variable(0, trainable=False)
        learning_rate = hyperparams["learning_rate"]
        learning_rate_var = tf.Variable(learning_rate, trainable=False)
        hyperparams_ = deepcopy(hyperparams)
        hyperparams_.pop("learning_rate")
        opt = self._optimizer_class(learning_rate=learning_rate_var, **hyperparams_)

        # Call the optimizer's minimize on loss to update all variables in the
        # TRAINABLE_VARIABLES collection (with a dependency on performing all
        # ops in the UPDATE_OPS collection, for batch norm, etc.).
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            # Try to pass the global step; otherwise don't pass it.
            try:
                step = opt.minimize(loss, global_step=global_step)
            except TypeError:
                step = opt.minimize(loss)

        # Lists to track train/valid/test loss and accuracy.
        train_losses = []
        valid_losses = []
        test_losses = []
        minibatch_train_losses = []
        train_accuracies = []
        valid_accuracies = []
        test_accuracies = []

        # Tensorboard summaries.
        if tb_log:
            batch_size = tproblem._batch_size
            per_iter_summaries, per_epoch_summaries, summary_writer = self.init_summary(
                loss, learning_rate_var, batch_size, tb_log_dir
            )
        else:
            # Make sure that these are assigned for evaluate_all().
            per_epoch_summaries = None
            summary_writer = None

        # Start a TensorFlow session and initialize the variables.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        sess.run(tf.global_variables_initializer())

        # Start of training loop.
        for n in range(num_epochs + 1):
            # Evaluate at the beginning of each epoch.
            self.evaluate_all(
                n,
                num_epochs,
                tproblem,
                sess,
                loss,
                tb_log,
                per_epoch_summaries,
                summary_writer,
                train_losses,
                valid_losses,
                test_losses,
                train_accuracies,
                valid_accuracies,
                test_accuracies,
            )

            # Break from the train loop after the last round of evaluation.
            if n == num_epochs:
                break

            # Training.
            sess.run(tproblem.train_init_op)
            s = 0
            while True:
                try:
                    if s % train_log_interval == 0:
                        # Training step, with logging since we hit the
                        # train_log_interval.
                        _, loss_ = sess.run([step, loss])
                        minibatch_train_losses.append(loss_.astype(float))
                        if tb_log:
                            self.write_per_iter_summary(
                                sess, per_iter_summaries, summary_writer, s
                            )
                        if print_train_iter:
                            print(
                                "Epoch {0:d}, step {1:d}: loss {2:g}".format(
                                    n, s, loss_
                                )
                            )
                    else:
                        sess.run(step)
                    s += 1
                except tf.errors.OutOfRangeError:
                    break

            # Break from training if the loss becomes non-finite.
            if not np.isfinite(loss_):
                self._abort_routine(
                    n,
                    num_epochs,
                    train_losses,
                    valid_losses,
                    test_losses,
                    train_accuracies,
                    valid_accuracies,
                    test_accuracies,
                    minibatch_train_losses,
                )
                break
            else:
                continue

        sess.close()
        # --- End of training loop.

        # Put results into the output dictionary.
        output = {
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "test_losses": test_losses,
            "train_accuracies": train_accuracies,
            "valid_accuracies": valid_accuracies,
            "test_accuracies": test_accuracies,
            "minibatch_train_losses": minibatch_train_losses,
        }

        return output

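# Usage sketch (illustrative comment): how a StandardRunner is typically
# constructed and run. The hyperparameter specification format and the run()
# entry point come from the DeepOBS runner interface; the optimizer choice, the
# "quadratic_deep" testproblem and the concrete values are placeholders.
#
#     import tensorflow as tf
#     from deepobs import tensorflow as tfobs
#
#     optimizer_class = tf.train.MomentumOptimizer
#     hyperparams = {
#         "learning_rate": {"type": float},
#         "momentum": {"type": float, "default": 0.99},
#         "use_nesterov": {"type": bool, "default": False},
#     }
#     runner = tfobs.runners.StandardRunner(optimizer_class, hyperparams)
#     runner.run(
#         testproblem="quadratic_deep",
#         hyperparams={"learning_rate": 1e-2, "momentum": 0.9},
#         num_epochs=10,
#     )
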
class LearningRateScheduleRunner(TFRunner):
    def __init__(self, optimizer_class, hyperparameter_names):
        super(LearningRateScheduleRunner, self).__init__(
            optimizer_class, hyperparameter_names
        )

    def _add_training_params_to_argparse(self, parser, args, training_params):
        try:
            args["lr_sched_epochs"] = training_params["lr_sched_epochs"]
        except KeyError:
            parser.add_argument(
                "--lr_sched_epochs",
                nargs="+",
                type=int,
                help="""One or more epoch numbers (positive integers) that mark
                learning rate changes. The base learning rate has to be passed
                via '--learning_rate' and the factors by which to change have to
                be passed via '--lr_sched_factors'. Example: '--learning_rate 0.3
                --lr_sched_epochs 50 100 --lr_sched_factors 0.1 0.01' will start
                with a learning rate of 0.3, then decrease to 0.1*0.3=0.03 after
                training for 50 epochs, and decrease to 0.01*0.3=0.003 after
                training for 100 epochs.""",
            )

        try:
            args["lr_sched_factors"] = training_params["lr_sched_factors"]
        except KeyError:
            parser.add_argument(
                "--lr_sched_factors",
                nargs="+",
                type=float,
                help="""One or more factors (floats) by which to change the
                learning rate. The base learning rate has to be passed via
                '--learning_rate' and the epochs at which to change the learning
                rate have to be passed via '--lr_sched_epochs'. Example:
                '--learning_rate 0.3 --lr_sched_epochs 50 100 --lr_sched_factors
                0.1 0.01' will start with a learning rate of 0.3, then decrease
                to 0.1*0.3=0.03 after training for 50 epochs, and decrease to
                0.01*0.3=0.003 after training for 100 epochs.""",
            )

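    # Resulting command-line usage (illustrative comment): when these arguments
    # are not supplied programmatically, a runner script built around this class
    # could be invoked along the lines of
    #
    #     python my_runner_script.py quadratic_deep --learning_rate 0.3 \
    #         --lr_sched_epochs 50 100 --lr_sched_factors 0.1 0.01
    #
    # which decays the base learning rate of 0.3 to 0.03 after 50 epochs and to
    # 0.003 after 100 epochs, as described in the help texts above. The script
    # name and testproblem here are placeholders.
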
    def training(
        self,
        tproblem,
        hyperparams,
        num_epochs,
        print_train_iter,
        train_log_interval,
        tb_log,
        tb_log_dir,
        # The following are the training_params.
        lr_sched_epochs=None,
        lr_sched_factors=None,
    ):
        """Performs the training and stores the metrics.

        Args:
            tproblem (deepobs.[tensorflow/pytorch].testproblems.testproblem): The
                testproblem instance to train on.
            hyperparams (dict): The optimizer hyperparameters to use for the training.
            num_epochs (int): The number of training epochs.
            print_train_iter (bool): Whether to print the training progress at
                every train_log_interval.
            train_log_interval (int): Mini-batch interval for logging.
            tb_log (bool): Whether to use tensorboard logging or not.
            tb_log_dir (str): The path where to save tensorboard events.
            lr_sched_epochs (list): The epochs where to adjust the learning rate.
            lr_sched_factors (list): The corresponding factors by which to adjust
                the learning rate.

        Returns:
            dict: The logged metrics. Is of the form:
                {'test_losses': [...],
                 'valid_losses': [...],
                 'train_losses': [...],
                 'test_accuracies': [...],
                 'valid_accuracies': [...],
                 'train_accuracies': [...],
                 'minibatch_train_losses': [...]}
            where the metric values are lists that were filled during training.
        """
        loss = tf.reduce_mean(tproblem.losses) + tproblem.regularizer

        # Set up the optimizer and create the learning rate schedule. The
        # learning rate is kept in a (non-trainable) variable; this is necessary
        # to apply the lr_sched later.
        global_step = tf.Variable(0, trainable=False)
        learning_rate = hyperparams["learning_rate"]
        learning_rate_var = tf.Variable(learning_rate, trainable=False)
        hyperparams_ = deepcopy(hyperparams)
        hyperparams_.pop("learning_rate")
        opt = self._optimizer_class(learning_rate=learning_rate_var, **hyperparams_)
        lr_schedule = runner_utils.make_lr_schedule(
            learning_rate, lr_sched_epochs, lr_sched_factors
        )

        # Call the optimizer's minimize on loss to update all variables in the
        # TRAINABLE_VARIABLES collection (with a dependency on performing all
        # ops in the UPDATE_OPS collection, for batch norm, etc.).
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            # Try to pass the global step; otherwise don't pass it.
            try:
                step = opt.minimize(loss, global_step=global_step)
            except TypeError:
                step = opt.minimize(loss)

        # Lists to track train/valid/test loss and accuracy.
        train_losses = []
        valid_losses = []
        test_losses = []
        minibatch_train_losses = []
        train_accuracies = []
        valid_accuracies = []
        test_accuracies = []

        # Tensorboard summaries.
        if tb_log:
            batch_size = tproblem._batch_size
            per_iter_summaries, per_epoch_summaries, summary_writer = self.init_summary(
                loss, learning_rate_var, batch_size, tb_log_dir
            )
        else:
            # Make sure that these are assigned for evaluate_all().
            per_epoch_summaries = None
            summary_writer = None

        # Start a TensorFlow session and initialize the variables.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        sess.run(tf.global_variables_initializer())

        # Start of training loop.
        for n in range(num_epochs + 1):
            # Evaluate at the beginning of each epoch.
            self.evaluate_all(
                n,
                num_epochs,
                tproblem,
                sess,
                loss,
                tb_log,
                per_epoch_summaries,
                summary_writer,
                train_losses,
                valid_losses,
                test_losses,
                train_accuracies,
                valid_accuracies,
                test_accuracies,
            )

            # Break from the train loop after the last round of evaluation.
            if n == num_epochs:
                break

            # Training.
            if n in lr_schedule:
                sess.run(learning_rate_var.assign(lr_schedule[n]))
                print("Setting learning rate to {0:f}".format(lr_schedule[n]))
            sess.run(tproblem.train_init_op)
            s = 0
            while True:
                try:
                    if s % train_log_interval == 0:
                        # Training step, with logging since we hit the
                        # train_log_interval.
                        _, loss_ = sess.run([step, loss])
                        minibatch_train_losses.append(loss_.astype(float))
                        if tb_log:
                            self.write_per_iter_summary(
                                sess, per_iter_summaries, summary_writer, s
                            )
                        if print_train_iter:
                            print(
                                "Epoch {0:d}, step {1:d}: loss {2:g}".format(
                                    n, s, loss_
                                )
                            )
                    else:
                        sess.run(step)
                    s += 1
                except tf.errors.OutOfRangeError:
                    break

            # Break from training if the loss becomes non-finite.
            if not np.isfinite(loss_):
                self._abort_routine(
                    n,
                    num_epochs,
                    train_losses,
                    valid_losses,
                    test_losses,
                    train_accuracies,
                    valid_accuracies,
                    test_accuracies,
                    minibatch_train_losses,
                )
                break
            else:
                continue

        sess.close()
        # --- End of training loop.

        # Put results into the output dictionary.
        output = {
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "test_losses": test_losses,
            "train_accuracies": train_accuracies,
            "valid_accuracies": valid_accuracies,
            "test_accuracies": test_accuracies,
            "minibatch_train_losses": minibatch_train_losses,
        }

        return output

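# Usage sketch (illustrative comment): the schedule parameters are passed as
# extra training_params to run(). The optimizer, the "mnist_mlp" testproblem
# and the concrete values below are placeholders.
#
#     import tensorflow as tf
#     from deepobs import tensorflow as tfobs
#
#     runner = tfobs.runners.LearningRateScheduleRunner(
#         tf.train.GradientDescentOptimizer,
#         hyperparameter_names={"learning_rate": {"type": float}},
#     )
#     runner.run(
#         testproblem="mnist_mlp",
#         hyperparams={"learning_rate": 0.3},
#         num_epochs=100,
#         lr_sched_epochs=[50, 100],
#         lr_sched_factors=[0.1, 0.01],
#     )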