Source code for deepobs.tensorflow.runners.standard_runner

"""Module implementing StandardRunner."""

from __future__ import print_function

import argparse
import os
import json
import importlib
import tensorflow as tf

from .. import config
from .. import testproblems
from . import runner_utils


class StandardRunner(object):
    """Provides functionality to run optimizers on DeepOBS testproblems,
    including the logging of important performance metrics.

    Args:
        optimizer_class: Optimizer class, which should inherit from
            tf.train.Optimizer and/or obey the same interface for
            ``.minimize()``.
        hyperparams: A list describing the optimizer's hyperparameters other
            than learning rate. Each entry of the list is a dictionary
            describing one of the hyperparameters. This dictionary is expected
            to have the following two fields:

                - hyperparams["name"] must contain the name of the parameter
                  (i.e., the exact name of the corresponding keyword argument
                  to the optimizer class' init function).
                - hyperparams["type"] specifies the type of the parameter
                  (e.g., ``int``, ``float``, ``bool``).

            Optionally, the dictionary can have a third field indexed by the
            key "default", which specifies a default value for the
            hyperparameter.

    Example
    -------
    >>> optimizer_class = tf.train.MomentumOptimizer
    >>> hyperparams = [
            {"name": "momentum", "type": float},
            {"name": "use_nesterov", "type": bool, "default": False}]
    >>> runner = StandardRunner(optimizer_class, hyperparams)

    """

    def __init__(self, optimizer_class, hyperparams):
        """Creates a new StandardRunner.

        Args:
            optimizer_class: Optimizer class, which should inherit from
                tf.train.Optimizer and/or obey the same interface for
                ``.minimize()``.
            hyperparams: A list describing the optimizer's hyperparameters
                other than learning rate. Each entry of the list is a
                dictionary describing one of the hyperparameters. This
                dictionary is expected to have the following two fields:

                    - hyperparams["name"] must contain the name of the
                      parameter (i.e., the exact name of the corresponding
                      keyword argument to the optimizer class' init function).
                    - hyperparams["type"] specifies the type of the parameter
                      (e.g., ``int``, ``float``, ``bool``).

                Optionally, the dictionary can have a third field indexed by
                the key "default", which specifies a default value for the
                hyperparameter.

        Example:
            optimizer_class = tf.train.MomentumOptimizer
            hyperparams = [
                {"name": "momentum", "type": float},
                {"name": "use_nesterov", "type": bool, "default": False}]
            runner = StandardRunner(optimizer_class, hyperparams)
        """

        self._optimizer_class = optimizer_class
        self._optimizer_name = optimizer_class.__name__
        self._hyperparams = hyperparams
    # This function is a wrapper around _run() which grabs all non-specified
    # arguments from the command line.
    def run(self,
            testproblem=None,
            weight_decay=None,
            batch_size=None,
            num_epochs=None,
            learning_rate=None,
            lr_sched_epochs=None,
            lr_sched_factors=None,
            random_seed=None,
            data_dir=None,
            output_dir=None,
            train_log_interval=None,
            print_train_iter=None,
            tf_logging=None,
            no_logs=None,
            **optimizer_hyperparams):
        """Runs a given optimizer on a DeepOBS testproblem.

        This method receives all relevant options to run the optimizer on a
        DeepOBS testproblem, including the hyperparameters of the optimizer,
        which can be passed as keyword arguments (based on the names provided
        via ``hyperparams`` in the init function).

        Options which are *not* passed here will automatically be added as
        command line arguments. (Some of those will be required, others will
        have defaults; run the script with the ``--help`` flag to see a
        description of the command line interface.)

        Training statistics (train/test loss/accuracy) are collected and will
        be saved to a ``JSON`` output file, together with metadata. The
        training statistics can optionally also be saved in TensorFlow output
        files and read during training using `Tensorboard`.

        Args:
            testproblem (str): Name of a DeepOBS test problem.
            weight_decay (float): The weight decay factor to use.
            batch_size (int): The mini-batch size to use.
            num_epochs (int): The number of epochs to train.
            learning_rate (float): The learning rate to use. This will function
                as the base learning rate when implementing a schedule using
                ``lr_sched_epochs`` and ``lr_sched_factors`` (see below).
            lr_sched_epochs (list): A list of epoch numbers (positive integers)
                that mark learning rate changes. The base learning rate is
                passed via ``learning_rate`` and the factors by which to change
                are passed via ``lr_sched_factors``. Example:
                ``learning_rate=0.3``, ``lr_sched_epochs=[50, 100]``,
                ``lr_sched_factors=[0.1, 0.01]`` will start with a learning
                rate of ``0.3``, then decrease to ``0.1*0.3=0.03`` after
                training for ``50`` epochs, and decrease to ``0.01*0.3=0.003``
                after training for ``100`` epochs.
            lr_sched_factors (list): A list of factors (floats) by which to
                change the learning rate. The base learning rate has to be
                passed via ``learning_rate`` and the epochs at which to change
                the learning rate have to be passed via ``lr_sched_epochs``.
                Example: ``learning_rate=0.3``, ``lr_sched_epochs=[50, 100]``,
                ``lr_sched_factors=[0.1, 0.01]`` will start with a learning
                rate of ``0.3``, then decrease to ``0.1*0.3=0.03`` after
                training for ``50`` epochs, and decrease to ``0.01*0.3=0.003``
                after training for ``100`` epochs.
            random_seed (int): Random seed to use. If unspecified, it defaults
                to ``42``.
            data_dir (str): Path to the DeepOBS data directory. If unspecified,
                DeepOBS uses its default `/data_deepobs`.
            output_dir (str): Path to the output directory. Within this
                directory, subfolders for the testproblem and the optimizer are
                automatically created. If unspecified, defaults to ``results``.
            train_log_interval (int): Interval of steps at which to log
                training loss. If unspecified it defaults to ``10``.
            print_train_iter (bool): If ``True``, training loss is printed to
                screen. If unspecified it defaults to ``False``.
            tf_logging (bool): If ``True``, log all statistics with TensorFlow
                summaries, which can be viewed in real time with Tensorboard.
                If unspecified it defaults to ``False``.
            no_logs (bool): If ``True``, no ``JSON`` files are created. If
                unspecified it defaults to ``False``.
            optimizer_hyperparams (dict): Keyword arguments for the
                hyperparameters of the optimizer. These are the ones specified
                in the ``hyperparams`` dictionary passed to the ``__init__``.
        """
        # We will go through all the arguments and check whether they have
        # been passed to this function. If yes, we collect the (name, value)
        # pairs in ``args``. If not, we add corresponding command line
        # arguments.
        args = {}
        parser = argparse.ArgumentParser(
            description="Run {0:s} on a DeepOBS test problem.".format(
                self._optimizer_name))

        if testproblem is None:
            parser.add_argument(
                "testproblem",
                help="""Name of the DeepOBS testproblem
                (e.g. 'cifar10_3c3d').""")
        else:
            args["testproblem"] = testproblem

        if weight_decay is None:
            parser.add_argument(
                "--weight_decay",
                "--wd",
                type=float,
                help="""Factor used for the weight decay. If not given, the
                default weight decay for this model is used. Note that not all
                models use weight decay and this value will be ignored in such
                a case.""")
        else:
            args["weight_decay"] = weight_decay

        if batch_size is None:
            parser.add_argument(
                "--batch_size",
                "--bs",
                required=True,
                type=int,
                help="The batch size (positive integer).")
        else:
            args["batch_size"] = batch_size

        if num_epochs is None:
            parser.add_argument(
                "-N",
                "--num_epochs",
                required=True,
                type=int,
                help="Total number of training epochs.")
        else:
            args["num_epochs"] = num_epochs

        if learning_rate is None:
            parser.add_argument(
                "--learning_rate",
                "--lr",
                required=True,
                type=float,
                help="""Learning rate (positive float) to use. Can be used as
                the base of a learning rate schedule when used in conjunction
                with --lr_sched_epochs and --lr_sched_factors.""")
        else:
            args["learning_rate"] = learning_rate

        if lr_sched_epochs is None:
            parser.add_argument(
                "--lr_sched_epochs",
                nargs="+",
                type=int,
                help="""One or more epoch numbers (positive integers) that mark
                learning rate changes. The base learning rate has to be passed
                via '--learning_rate' and the factors by which to change have
                to be passed via '--lr_sched_factors'. Example: '--lr 0.3
                --lr_sched_epochs 50 100 --lr_sched_factors 0.1 0.01' will
                start with a learning rate of 0.3, then decrease to
                0.1*0.3=0.03 after training for 50 epochs, and decrease to
                0.01*0.3=0.003 after training for 100 epochs.""")
        else:
            args["lr_sched_epochs"] = lr_sched_epochs

        if lr_sched_factors is None:
            parser.add_argument(
                "--lr_sched_factors",
                nargs="+",
                type=float,
                help="""One or more factors (floats) by which to change the
                learning rate. The base learning rate has to be passed via
                '--learning_rate' and the epochs at which to change the
                learning rate have to be passed via '--lr_sched_epochs'.
                Example: '--lr 0.3 --lr_sched_epochs 50 100 --lr_sched_factors
                0.1 0.01' will start with a learning rate of 0.3, then decrease
                to 0.1*0.3=0.03 after training for 50 epochs, and decrease to
                0.01*0.3=0.003 after training for 100 epochs.""")
        else:
            args["lr_sched_factors"] = lr_sched_factors

        if random_seed is None:
            parser.add_argument(
                "-r",
                "--random_seed",
                type=int,
                default=42,
                help="An integer to set as tensorflow's random seed.")
        else:
            args["random_seed"] = random_seed

        if data_dir is None:
            parser.add_argument(
                "--data_dir",
                help="""Path to the base data dir. If not specified, DeepOBS
                uses its default.""")
        else:
            args["data_dir"] = data_dir

        if output_dir is None:
            parser.add_argument(
                "--output_dir",
                type=str,
                default="results",
                help="""Path to the base directory in which output files will
                be stored. Results will automatically be sorted into
                subdirectories of the form 'testproblem/optimizer'.""")
        else:
            args["output_dir"] = output_dir

        if train_log_interval is None:
            parser.add_argument(
                "--train_log_interval",
                type=int,
                default=10,
                help="Interval of steps at which training loss is logged.")
        else:
            args["train_log_interval"] = train_log_interval

        if print_train_iter is None:
            parser.add_argument(
                "--print_train_iter",
                action="store_const",
                const=True,
                default=False,
                help="""Add this flag to print mini-batch training loss to
                stdout on each (logged) iteration.""")
        else:
            args["print_train_iter"] = print_train_iter

        if tf_logging is None:
            parser.add_argument(
                "--tf_logging",
                action="store_const",
                const=True,
                default=False,
                help="""Add this flag to log statistics using tensorflow
                (to view in tensorboard).""")
        else:
            args["tf_logging"] = tf_logging

        if no_logs is None:
            parser.add_argument(
                "--no_logs",
                action="store_const",
                const=True,
                default=False,
                help="""Add this flag to not save any json logging files.""")
        else:
            args["no_logs"] = no_logs

        # Optimizer hyperparams
        for hp in self._hyperparams:
            hp_name = hp["name"]
            if hp_name in optimizer_hyperparams:
                args[hp_name] = optimizer_hyperparams[hp_name]
            else:  # hp_name not in optimizer_hyperparams
                hp_type = hp["type"]
                if "default" in hp:
                    hp_default = hp["default"]
                    parser.add_argument(
                        "--{0:s}".format(hp_name),
                        type=hp_type,
                        default=hp_default,
                        help="""Hyperparameter {0:s} of {1:s} ({2:s};
                        defaults to {3:s}).""".format(hp_name,
                                                      self._optimizer_name,
                                                      str(hp_type),
                                                      str(hp_default)))
                else:
                    parser.add_argument(
                        "--{0:s}".format(hp_name),
                        type=hp_type,
                        required=True,
                        help="Hyperparameter {0:s} of {1:s} ({2:s}).".format(
                            hp_name, self._optimizer_name, str(hp_type)))

        # Get the command line arguments and add them to the ``args`` dict.
        # Then call the _run function with those arguments.
        cmdline_args = vars(parser.parse_args())
        args.update(cmdline_args)

        self._run(**args)
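    # Illustrative sketch, not part of the original module: ``run()`` lets you
    # fix some options in code and leave the rest to the command line. The
    # testproblem name and values below are placeholders.
    #
    #     runner = StandardRunner(tf.train.MomentumOptimizer,
    #                             [{"name": "momentum", "type": float}])
    #     runner.run(testproblem="mnist_2c2d", num_epochs=5, momentum=0.9)
    #
    # Here ``testproblem``, ``num_epochs`` and ``momentum`` are set in code,
    # while the remaining options (e.g. the required ``--batch_size`` and
    # ``--learning_rate``) are parsed from the command line.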
    def _run(self, testproblem, weight_decay, batch_size, num_epochs,
             learning_rate, lr_sched_epochs, lr_sched_factors, random_seed,
             data_dir, output_dir, train_log_interval, print_train_iter,
             tf_logging, no_logs, **optimizer_hyperparams):
        """Performs the actual run, given all the arguments."""

        # Set data directory of DeepOBS.
        if data_dir is not None:
            config.set_data_dir(data_dir)

        # Find testproblem by name and instantiate with batch size and weight
        # decay.
        try:
            # If a module with the testproblem's name can be imported locally,
            # use it.
            testproblem_mod = importlib.import_module(testproblem)
            testproblem_cls = getattr(testproblem_mod, testproblem)
            print("Loading local testproblem.")
        except:
            # Otherwise, fall back to the testproblems shipped with DeepOBS.
            testproblem_cls = getattr(testproblems, testproblem)
        if weight_decay is not None:
            tproblem = testproblem_cls(batch_size, weight_decay)
        else:
            tproblem = testproblem_cls(batch_size)

        # Set up the testproblem.
        tf.reset_default_graph()
        tf.set_random_seed(random_seed)
        tproblem.set_up()
        loss = tf.reduce_mean(tproblem.losses) + tproblem.regularizer

        # Set up the optimizer and create learning rate schedule.
        global_step = tf.Variable(0, trainable=False)
        learning_rate_var = tf.Variable(learning_rate, trainable=False)
        opt = self._optimizer_class(learning_rate_var, **optimizer_hyperparams)
        lr_schedule = runner_utils.make_lr_schedule(
            learning_rate, lr_sched_epochs, lr_sched_factors)

        # Call the optimizer's minimize on loss to update all variables in the
        # TRAINABLE_VARIABLES collection (with a dependency on performing all
        # ops in the UPDATE_OPS collection for batch norm, etc.).
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            step = opt.minimize(loss, global_step=global_step)

        # Create output folder.
        if not no_logs:
            run_folder_name, file_name = runner_utils.make_run_name(
                weight_decay, batch_size, num_epochs, learning_rate,
                lr_sched_epochs, lr_sched_factors, random_seed,
                **optimizer_hyperparams)
            directory = os.path.join(output_dir, testproblem,
                                     self._optimizer_name, run_folder_name)
            if not os.path.exists(directory):
                os.makedirs(directory)

        # Lists to track train/test loss and accuracy.
        train_losses = []
        test_losses = []
        minibatch_train_losses = []
        train_accuracies = []
        test_accuracies = []

        # Tensorboard summaries
        if tf_logging:
            # per iteration
            mb_train_loss_summary = tf.summary.scalar(
                "training/minibatch_train_losses",
                loss,
                collections=[tf.GraphKeys.SUMMARIES, "per_iteration"])
            # per epoch
            lr_summary = tf.summary.scalar(
                "hyperparams/learning_rate",
                learning_rate_var,
                collections=[tf.GraphKeys.SUMMARIES, "per_epoch"])
            batch_summary = tf.summary.scalar(
                "hyperparams/batch_size",
                batch_size,
                collections=[tf.GraphKeys.SUMMARIES, "per_epoch"])
            per_iter_summaries = tf.summary.merge_all(key="per_iteration")
            per_epoch_summaries = tf.summary.merge_all(key="per_epoch")
            summary_writer = tf.summary.FileWriter(directory)

        # Start tensorflow session and initialize variables.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Wrapper function for the evaluation phase.
        def evaluate(test=True):
            """Computes average loss and accuracy in the evaluation phase."""
            if test:
                sess.run(tproblem.test_init_op)
                msg = "TEST:"
                loss_list = test_losses
                acc_list = test_accuracies
            else:
                sess.run(tproblem.train_eval_init_op)
                msg = "TRAIN:"
                loss_list = train_losses
                acc_list = train_accuracies

            # Compute average loss and (if applicable) accuracy.
            loss_ = 0.0
            num_iters = 0.0
            acc_ = 0.0
            if tproblem.accuracy is not None:
                while True:
                    try:
                        l_, a_ = sess.run([loss, tproblem.accuracy])
                        loss_ += l_
                        acc_ += a_
                        num_iters += 1.0
                    except tf.errors.OutOfRangeError:
                        break
            else:  # accuracy is None
                acc_ = 0.0
                while True:
                    try:
                        l_ = sess.run(loss)
                        loss_ += l_
                        num_iters += 1.0
                    except tf.errors.OutOfRangeError:
                        break
            loss_ /= num_iters
            acc_ /= num_iters

            # Print and log the results.
            loss_list.append(loss_)
            acc_list.append(acc_)

            # Log results to tensorflow summaries.
            if tf_logging:
                if test:
                    tag = "epoch/test_"
                else:
                    tag = "epoch/train_"
                summary = tf.Summary()
                summary.value.add(tag=tag + "loss_", simple_value=loss_)
                summary.value.add(tag=tag + "acc_", simple_value=acc_)
                per_epoch_summary_ = sess.run(per_epoch_summaries)
                summary_writer.add_summary(per_epoch_summary_,
                                           len(loss_list) - 1)
                summary_writer.add_summary(summary, len(loss_list) - 1)
                summary_writer.flush()

            print("{0:s} loss {1:g}, acc {2:f}".format(msg, loss_, acc_))

        # Start of training loop.
        for n in range(num_epochs + 1):
            # Evaluate at beginning of epoch.
            print("********************************")
            print("Evaluating after {0:d} of {1:d} epochs...".format(
                n, num_epochs))
            evaluate(test=False)
            evaluate(test=True)
            print("********************************")

            # Break from train loop after the last round of evaluation.
            if n == num_epochs:
                break

            # Training
            if n in lr_schedule:
                sess.run(learning_rate_var.assign(lr_schedule[n]))
                print("Setting learning rate to {0:f}".format(lr_schedule[n]))
            sess.run(tproblem.train_init_op)
            s = 0
            while True:
                try:
                    # Training step, with logging if we hit the
                    # train_log_interval.
                    if s % train_log_interval == 0:
                        if tf_logging:
                            _, loss_, per_iter_summary_ = sess.run(
                                [step, loss, per_iter_summaries])
                            summary_writer.add_summary(per_iter_summary_,
                                                       sess.run(global_step))
                        else:
                            _, loss_ = sess.run([step, loss])
                        minibatch_train_losses.append(loss_.astype(float))
                        if print_train_iter:
                            print("Epoch {0:d}, step {1:d}: loss {2:g}".format(
                                n, s, loss_))
                    else:
                        sess.run(step)
                    s += 1
                except tf.errors.OutOfRangeError:
                    break

        sess.close()
        # --- End of training loop.

        # Put results into output dictionary.
        output = {
            "train_losses": train_losses,
            "test_losses": test_losses,
            "minibatch_train_losses": minibatch_train_losses
        }
        if tproblem.accuracy is not None:
            output["train_accuracies"] = train_accuracies
            output["test_accuracies"] = test_accuracies

        # Put all run parameters into output dictionary.
        output["optimizer"] = self._optimizer_name
        output["testproblem"] = testproblem
        output["weight_decay"] = weight_decay
        output["batch_size"] = batch_size
        output["num_epochs"] = num_epochs
        output["learning_rate"] = learning_rate
        output["lr_sched_epochs"] = lr_sched_epochs
        output["lr_sched_factors"] = lr_sched_factors
        output["random_seed"] = random_seed
        output["train_log_interval"] = train_log_interval

        # Add optimizer hyperparameters as a sub-dictionary.
        output["hyperparams"] = optimizer_hyperparams

        # Dump output into json file.
        if not no_logs:
            with open(os.path.join(directory, file_name + ".json"), "w") as f:
                json.dump(output, f)
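

# ---------------------------------------------------------------------------
# Usage sketch, not part of the original module: it mirrors the example from
# the class docstring. The command line shown in the comment below is
# illustrative; 'cifar10_3c3d' is one of the DeepOBS testproblems.
if __name__ == "__main__":
    # Build a runner for TensorFlow's momentum optimizer with one required
    # hyperparameter ("momentum") and one optional one ("use_nesterov").
    example_runner = StandardRunner(
        tf.train.MomentumOptimizer,
        [{"name": "momentum", "type": float},
         {"name": "use_nesterov", "type": bool, "default": False}])

    # All options (testproblem, --batch_size, --learning_rate, --num_epochs,
    # --momentum, ...) are read from the command line, e.g.:
    #     python standard_runner.py cifar10_3c3d --bs 128 --lr 0.3 -N 100 \
    #         --momentum 0.9
    example_runner.run()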