Source code for deepobs.tensorflow.runners.standard_runner

"""Module implementing StandardRunner."""

from __future__ import print_function

import argparse
import os
import json
import importlib
import tensorflow as tf

from .. import config
from .. import testproblems
from . import runner_utils


class StandardRunner(object):
    """Provides functionality to run optimizers on DeepOBS testproblems,
    including the logging of important performance metrics.

    Args:
        optimizer_class: Optimizer class, which should inherit from
            tf.train.Optimizer and/or obey the same interface for
            ``.minimize()``.
        hyperparams: A list describing the optimizer's hyperparameters other
            than learning rate. Each entry of the list is a dictionary
            describing one of the hyperparameters. This dictionary is expected
            to have the following two fields:

                - hyperparams["name"] must contain the name of the parameter
                  (i.e., the exact name of the corresponding keyword argument
                  to the optimizer class' init function).
                - hyperparams["type"] specifies the type of the parameter
                  (e.g., ``int``, ``float``, ``bool``).

            Optionally, the dictionary can have a third field indexed by the
            key "default", which specifies a default value for the
            hyperparameter.

    Example
    -------
    >>> optimizer_class = tf.train.MomentumOptimizer
    >>> hyperparams = [
            {"name": "momentum", "type": float},
            {"name": "use_nesterov", "type": bool, "default": False}]
    >>> runner = StandardRunner(optimizer_class, hyperparams)

    """

    def __init__(self, optimizer_class, hyperparams):
        """Creates a new StandardRunner.

        Args:
            optimizer_class: Optimizer class, which should inherit from
                tf.train.Optimizer and/or obey the same interface for
                ``.minimize()``.
            hyperparams: A list describing the optimizer's hyperparameters
                other than learning rate. Each entry of the list is a
                dictionary describing one of the hyperparameters. This
                dictionary is expected to have the following two fields:

                    - hyperparams["name"] must contain the name of the
                      parameter (i.e., the exact name of the corresponding
                      keyword argument to the optimizer class' init function).
                    - hyperparams["type"] specifies the type of the parameter
                      (e.g., ``int``, ``float``, ``bool``).

                Optionally, the dictionary can have a third field indexed by
                the key "default", which specifies a default value for the
                hyperparameter.

        Example:
            optimizer_class = tf.train.MomentumOptimizer
            hyperparams = [
                {"name": "momentum", "type": float},
                {"name": "use_nesterov", "type": bool, "default": False}]
            runner = StandardRunner(optimizer_class, hyperparams)
        """

        self._optimizer_class = optimizer_class
        self._optimizer_name = optimizer_class.__name__
        self._hyperparams = hyperparams
    # This function is a wrapper around _run() which grabs all non-specified
    # arguments from the command line.
    def run(self,
            testproblem=None,
            weight_decay=None,
            batch_size=None,
            num_epochs=None,
            learning_rate=None,
            lr_sched_epochs=None,
            lr_sched_factors=None,
            random_seed=None,
            data_dir=None,
            output_dir=None,
            train_log_interval=None,
            print_train_iter=None,
            tf_logging=None,
            no_logs=None,
            **optimizer_hyperparams):
        """Runs a given optimizer on a DeepOBS testproblem.

        This method receives all relevant options to run the optimizer on a
        DeepOBS testproblem, including the hyperparameters of the optimizer,
        which can be passed as keyword arguments (based on the names provided
        via ``hyperparams`` in the init function).

        Options which are *not* passed here will automatically be added as
        command line arguments. (Some of those will be required, others will
        have defaults; run the script with the ``--help`` flag to see a
        description of the command line interface.)

        Training statistics (train/test loss/accuracy) are collected and will
        be saved to a ``JSON`` output file, together with metadata. The
        training statistics can optionally also be saved in TensorFlow output
        files and read during training using `Tensorboard`.

        Args:
            testproblem (str): Name of a DeepOBS test problem.
            weight_decay (float): The weight decay factor to use.
            batch_size (int): The mini-batch size to use.
            num_epochs (int): The number of epochs to train.
            learning_rate (float): The learning rate to use. This will function
                as the base learning rate when implementing a schedule using
                ``lr_sched_epochs`` and ``lr_sched_factors`` (see below).
            lr_sched_epochs (list): A list of epoch numbers (positive integers)
                that mark learning rate changes. The base learning rate is
                passed via ``learning_rate`` and the factors by which to change
                are passed via ``lr_sched_factors``. Example:
                ``learning_rate=0.3``, ``lr_sched_epochs=[50, 100]``,
                ``lr_sched_factors=[0.1, 0.01]`` will start with a learning
                rate of ``0.3``, then decrease to ``0.1*0.3=0.03`` after
                training for ``50`` epochs, and decrease to ``0.01*0.3=0.003``
                after training for ``100`` epochs.
            lr_sched_factors (list): A list of factors (floats) by which to
                change the learning rate. The base learning rate has to be
                passed via ``learning_rate`` and the epochs at which to change
                the learning rate have to be passed via ``lr_sched_epochs``.
                Example: ``learning_rate=0.3``, ``lr_sched_epochs=[50, 100]``,
                ``lr_sched_factors=[0.1, 0.01]`` will start with a learning
                rate of ``0.3``, then decrease to ``0.1*0.3=0.03`` after
                training for ``50`` epochs, and decrease to ``0.01*0.3=0.003``
                after training for ``100`` epochs.
            random_seed (int): Random seed to use. If unspecified, it defaults
                to ``42``.
            data_dir (str): Path to the DeepOBS data directory. If unspecified,
                DeepOBS uses its default `/data_deepobs`.
            output_dir (str): Path to the output directory. Within this
                directory, subfolders for the testproblem and the optimizer are
                automatically created. If unspecified, defaults to ``results``.
            train_log_interval (int): Interval of steps at which to log
                training loss. If unspecified it defaults to ``10``.
            print_train_iter (bool): If ``True``, training loss is printed to
                screen. If unspecified it defaults to ``False``.
            tf_logging (bool): If ``True``, log all statistics with TensorFlow
                summaries, which can be viewed in real time with Tensorboard.
                If unspecified it defaults to ``False``.
            no_logs (bool): If ``True``, no ``JSON`` files are created. If
                unspecified it defaults to ``False``.
            optimizer_hyperparams (dict): Keyword arguments for the
                hyperparameters of the optimizer. These are the ones specified
                in the ``hyperparams`` dictionary passed to the ``__init__``.
        """
        # We will go through all the arguments and check whether they have
        # been passed to this function. If yes, we collect the (name, value)
        # pairs in ``args``. If not, we add corresponding command line
        # arguments.
        args = {}
        parser = argparse.ArgumentParser(
            description="Run {0:s} on a DeepOBS test problem.".format(
                self._optimizer_name))

        if testproblem is None:
            parser.add_argument(
                "testproblem",
                help="""Name of the DeepOBS testproblem
                (e.g. 'cifar10_3c3d').""")
        else:
            args["testproblem"] = testproblem

        if weight_decay is None:
            parser.add_argument(
                "--weight_decay",
                "--wd",
                type=float,
                help="""Factor used for the weight decay. If not given, the
                default weight decay for this model is used. Note that not all
                models use weight decay and this value will be ignored in such
                a case.""")
        else:
            args["weight_decay"] = weight_decay

        if batch_size is None:
            parser.add_argument(
                "--batch_size",
                "--bs",
                required=True,
                type=int,
                help="The batch size (positive integer).")
        else:
            args["batch_size"] = batch_size

        if num_epochs is None:
            parser.add_argument(
                "-N",
                "--num_epochs",
                required=True,
                type=int,
                help="Total number of training epochs.")
        else:
            args["num_epochs"] = num_epochs

        if learning_rate is None:
            parser.add_argument(
                "--learning_rate",
                "--lr",
                required=True,
                type=float,
                help="""Learning rate (positive float) to use. Can be used as
                the base of a learning rate schedule when used in conjunction
                with --lr_sched_epochs and --lr_sched_factors.""")
        else:
            args["learning_rate"] = learning_rate

        if lr_sched_epochs is None:
            parser.add_argument(
                "--lr_sched_epochs",
                nargs="+",
                type=int,
                help="""One or more epoch numbers (positive integers) that mark
                learning rate changes. The base learning rate has to be passed
                via '--learning_rate' and the factors by which to change have
                to be passed via '--lr_sched_factors'. Example: '--lr 0.3
                --lr_sched_epochs 50 100 --lr_sched_factors 0.1 0.01' will
                start with a learning rate of 0.3, then decrease to
                0.1*0.3=0.03 after training for 50 epochs, and decrease to
                0.01*0.3=0.003 after training for 100 epochs.""")
        else:
            args["lr_sched_epochs"] = lr_sched_epochs

        if lr_sched_factors is None:
            parser.add_argument(
                "--lr_sched_factors",
                nargs="+",
                type=float,
                help="""One or more factors (floats) by which to change the
                learning rate. The base learning rate has to be passed via
                '--learning_rate' and the epochs at which to change the
                learning rate have to be passed via '--lr_sched_epochs'.
                Example: '--lr 0.3 --lr_sched_epochs 50 100 --lr_sched_factors
                0.1 0.01' will start with a learning rate of 0.3, then decrease
                to 0.1*0.3=0.03 after training for 50 epochs, and decrease to
                0.01*0.3=0.003 after training for 100 epochs.""")
        else:
            args["lr_sched_factors"] = lr_sched_factors

        if random_seed is None:
            parser.add_argument(
                "-r",
                "--random_seed",
                type=int,
                default=42,
                help="An integer to set as tensorflow's random seed.")
        else:
            args["random_seed"] = random_seed

        if data_dir is None:
            parser.add_argument(
                "--data_dir",
                help="""Path to the base data dir. If not specified, DeepOBS
                uses its default.""")
        else:
            args["data_dir"] = data_dir

        if output_dir is None:
            parser.add_argument(
                "--output_dir",
                type=str,
                default="results",
                help="""Path to the base directory in which output files will
                be stored. Results will automatically be sorted into
                subdirectories of the form 'testproblem/optimizer'.""")
        else:
            args["output_dir"] = output_dir

        if train_log_interval is None:
            parser.add_argument(
                "--train_log_interval",
                type=int,
                default=10,
                help="Interval of steps at which training loss is logged.")
        else:
            args["train_log_interval"] = train_log_interval

        if print_train_iter is None:
            parser.add_argument(
                "--print_train_iter",
                action="store_const",
                const=True,
                default=False,
                help="""Add this flag to print mini-batch training loss to
                stdout on each (logged) iteration.""")
        else:
            args["print_train_iter"] = print_train_iter

        if tf_logging is None:
            parser.add_argument(
                "--tf_logging",
                action="store_const",
                const=True,
                default=False,
                help="""Add this flag to log statistics using tensorflow
                (to view in tensorboard).""")
        else:
            args["tf_logging"] = tf_logging

        if no_logs is None:
            parser.add_argument(
                "--no_logs",
                action="store_const",
                const=True,
                default=False,
                help="""Add this flag to not save any json logging files.""")
        else:
            args["no_logs"] = no_logs

        # Optimizer hyperparams
        for hp in self._hyperparams:
            hp_name = hp["name"]
            if hp_name in optimizer_hyperparams:
                args[hp_name] = optimizer_hyperparams[hp_name]
            else:  # hp_name not in optimizer_hyperparams
                hp_type = hp["type"]
                if "default" in hp:
                    hp_default = hp["default"]
                    parser.add_argument(
                        "--{0:s}".format(hp_name),
                        type=hp_type,
                        default=hp_default,
                        help="""Hyperparameter {0:s} of {1:s} ({2:s};
                        defaults to {3:s}).""".format(hp_name,
                                                      self._optimizer_name,
                                                      str(hp_type),
                                                      str(hp_default)))
                else:
                    parser.add_argument(
                        "--{0:s}".format(hp_name),
                        type=hp_type,
                        required=True,
                        help="Hyperparameter {0:s} of {1:s} ({2:s}).".format(
                            hp_name, self._optimizer_name, str(hp_type)))

        # Get the command line arguments and add them to the ``args`` dict.
        # Then call the _run function with those arguments.
        cmdline_args = vars(parser.parse_args())
        args.update(cmdline_args)

        self._run(**args)
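    # Illustrative sketch, not part of the original module: ``run()`` lets you
    # fix some options in code and leave the rest to the command line. The
    # testproblem name and values below are placeholders.
    #
    #     runner = StandardRunner(tf.train.MomentumOptimizer,
    #                             [{"name": "momentum", "type": float}])
    #     runner.run(testproblem="mnist_2c2d", num_epochs=5, momentum=0.9)
    #
    # Here ``testproblem``, ``num_epochs`` and ``momentum`` are set in code,
    # while the remaining options (e.g. the required ``--batch_size`` and
    # ``--learning_rate``) are parsed from the command line.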
    def _run(self, testproblem, weight_decay, batch_size, num_epochs,
             learning_rate, lr_sched_epochs, lr_sched_factors, random_seed,
             data_dir, output_dir, train_log_interval, print_train_iter,
             tf_logging, no_logs, **optimizer_hyperparams):
        """Performs the actual run, given all the arguments."""

        # Set data directory of DeepOBS.
        if data_dir is not None:
            config.set_data_dir(data_dir)

        # Find testproblem by name and instantiate with batch size and weight
        # decay.
        try:
            # If a module with the testproblem's name can be imported locally,
            # use it.
            testproblem_mod = importlib.import_module(testproblem)
            testproblem_cls = getattr(testproblem_mod, testproblem)
            print("Loading local testproblem.")
        except:
            # Otherwise, fall back to the testproblems shipped with DeepOBS.
            testproblem_cls = getattr(testproblems, testproblem)
        if weight_decay is not None:
            tproblem = testproblem_cls(batch_size, weight_decay)
        else:
            tproblem = testproblem_cls(batch_size)

        # Set up the testproblem.
        tf.reset_default_graph()
        tf.set_random_seed(random_seed)
        tproblem.set_up()
        loss = tf.reduce_mean(tproblem.losses) + tproblem.regularizer

        # Set up the optimizer and create learning rate schedule.
        global_step = tf.Variable(0, trainable=False)
        learning_rate_var = tf.Variable(learning_rate, trainable=False)
        opt = self._optimizer_class(learning_rate_var, **optimizer_hyperparams)
        lr_schedule = runner_utils.make_lr_schedule(
            learning_rate, lr_sched_epochs, lr_sched_factors)

        # Call the optimizer's minimize on loss to update all variables in the
        # TRAINABLE_VARIABLES collection (with a dependency on performing all
        # ops in the UPDATE_OPS collection for batch norm, etc.).
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            step = opt.minimize(loss, global_step=global_step)

        # Create output folder.
        if not no_logs:
            run_folder_name, file_name = runner_utils.make_run_name(
                weight_decay, batch_size, num_epochs, learning_rate,
                lr_sched_epochs, lr_sched_factors, random_seed,
                **optimizer_hyperparams)
            directory = os.path.join(output_dir, testproblem,
                                     self._optimizer_name, run_folder_name)
            if not os.path.exists(directory):
                os.makedirs(directory)

        # Lists to track train/test loss and accuracy.
        train_losses = []
        test_losses = []
        minibatch_train_losses = []
        train_accuracies = []
        test_accuracies = []

        # Tensorboard summaries
        if tf_logging:
            # per iteration
            mb_train_loss_summary = tf.summary.scalar(
                "training/minibatch_train_losses",
                loss,
                collections=[tf.GraphKeys.SUMMARIES, "per_iteration"])
            # per epoch
            lr_summary = tf.summary.scalar(
                "hyperparams/learning_rate",
                learning_rate_var,
                collections=[tf.GraphKeys.SUMMARIES, "per_epoch"])
            batch_summary = tf.summary.scalar(
                "hyperparams/batch_size",
                batch_size,
                collections=[tf.GraphKeys.SUMMARIES, "per_epoch"])
            per_iter_summaries = tf.summary.merge_all(key="per_iteration")
            per_epoch_summaries = tf.summary.merge_all(key="per_epoch")
            summary_writer = tf.summary.FileWriter(directory)

        # Start tensorflow session and initialize variables.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Wrapper function for the evaluation phase.
        def evaluate(test=True):
            """Computes average loss and accuracy in the evaluation phase."""
            if test:
                sess.run(tproblem.test_init_op)
                msg = "TEST:"
                loss_list = test_losses
                acc_list = test_accuracies
            else:
                sess.run(tproblem.train_eval_init_op)
                msg = "TRAIN:"
                loss_list = train_losses
                acc_list = train_accuracies

            # Compute average loss and (if applicable) accuracy.
            loss_ = 0.0
            num_iters = 0.0
            acc_ = 0.0
            if tproblem.accuracy is not None:
                while True:
                    try:
                        l_, a_ = sess.run([loss, tproblem.accuracy])
                        loss_ += l_
                        acc_ += a_
                        num_iters += 1.0
                    except tf.errors.OutOfRangeError:
                        break
            else:  # accuracy is None
                acc_ = 0.0
                while True:
                    try:
                        l_ = sess.run(loss)
                        loss_ += l_
                        num_iters += 1.0
                    except tf.errors.OutOfRangeError:
                        break
            loss_ /= num_iters
            acc_ /= num_iters

            # Print and log the results.
            loss_list.append(loss_)
            acc_list.append(acc_)

            # Log results to tensorflow summaries.
            if tf_logging:
                if test:
                    tag = "epoch/test_"
                else:
                    tag = "epoch/train_"
                summary = tf.Summary()
                summary.value.add(tag=tag + "loss_", simple_value=loss_)
                summary.value.add(tag=tag + "acc_", simple_value=acc_)
                per_epoch_summary_ = sess.run(per_epoch_summaries)
                summary_writer.add_summary(per_epoch_summary_,
                                           len(loss_list) - 1)
                summary_writer.add_summary(summary, len(loss_list) - 1)
                summary_writer.flush()

            print("{0:s} loss {1:g}, acc {2:f}".format(msg, loss_, acc_))

        # Start of training loop.
        for n in range(num_epochs + 1):
            # Evaluate at beginning of epoch.
            print("********************************")
            print("Evaluating after {0:d} of {1:d} epochs...".format(
                n, num_epochs))
            evaluate(test=False)
            evaluate(test=True)
            print("********************************")

            # Break from train loop after the last round of evaluation.
            if n == num_epochs:
                break

            # Training
            if n in lr_schedule:
                sess.run(learning_rate_var.assign(lr_schedule[n]))
                print("Setting learning rate to {0:f}".format(lr_schedule[n]))
            sess.run(tproblem.train_init_op)
            s = 0
            while True:
                try:
                    # Training step, with logging if we hit the
                    # train_log_interval.
                    if s % train_log_interval == 0:
                        if tf_logging:
                            _, loss_, per_iter_summary_ = sess.run(
                                [step, loss, per_iter_summaries])
                            summary_writer.add_summary(per_iter_summary_,
                                                       sess.run(global_step))
                        else:
                            _, loss_ = sess.run([step, loss])
                        minibatch_train_losses.append(loss_.astype(float))
                        if print_train_iter:
                            print("Epoch {0:d}, step {1:d}: loss {2:g}".format(
                                n, s, loss_))
                    else:
                        sess.run(step)
                    s += 1
                except tf.errors.OutOfRangeError:
                    break

        sess.close()
        # --- End of training loop.

        # Put results into output dictionary.
        output = {
            "train_losses": train_losses,
            "test_losses": test_losses,
            "minibatch_train_losses": minibatch_train_losses
        }
        if tproblem.accuracy is not None:
            output["train_accuracies"] = train_accuracies
            output["test_accuracies"] = test_accuracies

        # Put all run parameters into output dictionary.
        output["optimizer"] = self._optimizer_name
        output["testproblem"] = testproblem
        output["weight_decay"] = weight_decay
        output["batch_size"] = batch_size
        output["num_epochs"] = num_epochs
        output["learning_rate"] = learning_rate
        output["lr_sched_epochs"] = lr_sched_epochs
        output["lr_sched_factors"] = lr_sched_factors
        output["random_seed"] = random_seed
        output["train_log_interval"] = train_log_interval

        # Add optimizer hyperparameters as a sub-dictionary.
        output["hyperparams"] = optimizer_hyperparams

        # Dump output into json file.
        if not no_logs:
            with open(os.path.join(directory, file_name + ".json"), "w") as f:
                json.dump(output, f)
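

# ---------------------------------------------------------------------------
# Usage sketch, not part of the original module: it mirrors the example from
# the class docstring. The command line shown in the comment below is
# illustrative; 'cifar10_3c3d' is one of the DeepOBS testproblems.
if __name__ == "__main__":
    # Build a runner for TensorFlow's momentum optimizer with one required
    # hyperparameter ("momentum") and one optional one ("use_nesterov").
    example_runner = StandardRunner(
        tf.train.MomentumOptimizer,
        [{"name": "momentum", "type": float},
         {"name": "use_nesterov", "type": bool, "default": False}])

    # All options (testproblem, --batch_size, --learning_rate, --num_epochs,
    # --momentum, ...) are read from the command line, e.g.:
    #     python standard_runner.py cifar10_3c3d --bs 128 --lr 0.3 -N 100 \
    #         --momentum 0.9
    example_runner.run()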