# Source code for deepobs.tensorflow.datasets.tolstoi

# -*- coding: utf-8 -*-
"""Tolstoi DeepOBS dataset."""

import os
import numpy as np
import tensorflow as tf
from . import dataset
from .. import config


class tolstoi(dataset.DataSet):
    """DeepOBS data set class for character prediction on `War and Peace` by\
 Leo Tolstoi.

    Args:
      batch_size (int): The mini-batch size to use. Note that, if
          ``batch_size`` is not a divider of the dataset size the remainder is
          dropped in each epoch (after shuffling).
      seq_length (int): Sequence length to be modeled in each step.
          Defaults to ``50``.
      train_eval_size (int): Size of the train eval dataset.
          Defaults to ``653 237``, the size of the test set.

    Attributes:
      batch: A tuple ``(x, y)`` of tensors, yielding batches of tolstoi data
          (``x`` with shape ``(batch_size, seq_length)``) and (``y`` with
          shape ``(batch_size, seq_length)`` which is ``x`` shifted by one).
          Executing these tensors raises a ``tf.errors.OutOfRangeError`` after
          one epoch.
      train_init_op: A tensorflow operation initializing the dataset for the
          training phase.
      train_eval_init_op: A tensorflow operation initializing the testproblem
          for evaluating on training data.
      test_init_op: A tensorflow operation initializing the testproblem for
          evaluating on test data.
      phase: A string-value tf.Variable that is set to ``train``,
          ``train_eval`` or ``test``, depending on the current phase. This can
          be used by testproblems to adapt their behavior to this phase.
    """

    def __init__(self, batch_size, seq_length=50, train_eval_size=653237):
        """Creates a new Tolstoi instance.

        Args:
          batch_size (int): The mini-batch size to use. Note that, if
              ``batch_size`` is not a divider of the dataset size the
              remainder is dropped in each epoch (after shuffling).
          seq_length (int): Sequence length to be modeled in each step.
              Defaults to ``50``.
          train_eval_size (int): Size of the train eval dataset.
              Defaults to ``653 237``, the size of the test set.
        """
        self._name = "tolstoi"
        self._seq_length = seq_length
        self._train_eval_size = train_eval_size
        super(tolstoi, self).__init__(batch_size)

    def _make_dataset(self, filepath):
        """Creates a Tolstoi data set (helper used by ``.make_*_dataset``
        below).

        Args:
          filepath (str): Filepath to the .npy file containing the data set.

        Returns:
          A tf.data.Dataset yielding batches of Tolstoi data.

        Raises:
          ValueError: If the data set is too small to produce even a single
              batch for the given ``batch_size`` and ``seq_length``.
        """
        # Load the array of character ids, determine the number of batches
        # that can be produced, given batch size and sequence length
        arr = np.load(filepath)
        num_batches = int(
            np.floor(
                (np.size(arr) - 1) / (self._batch_size * self._seq_length)))
        if num_batches == 0:
            raise ValueError(
                "This dataset is too small to use with this batch size "
                "and sequence length.")
        # Create input and output, where output is the text shifted by one
        # character
        x = arr[:num_batches * self._batch_size * self._seq_length]
        y = arr[1:num_batches * self._batch_size * self._seq_length + 1]
        # Split into batches and put into arrays X, Y, such that X[i,:] is
        # the i-th batch
        x_batches = np.split(x.reshape(self._batch_size, -1), num_batches, 1)
        y_batches = np.split(y.reshape(self._batch_size, -1), num_batches, 1)
        X = np.array(x_batches)
        Y = np.array(y_batches)
        with tf.name_scope(self._name):
            # Keep the data pipeline on CPU so GPU memory is free for the
            # model; prefetch a few batches to overlap loading and compute.
            with tf.device('/cpu:0'):
                data = tf.data.Dataset.from_tensor_slices((X, Y))
                data = data.prefetch(buffer_size=4)
                return data

    def _make_train_dataset(self):
        """Creates the Tolstoi training dataset.

        Returns:
          A tf.data.Dataset instance with batches of training data.
        """
        filepath = os.path.join(config.get_data_dir(), "tolstoi", "train.npy")
        return self._make_dataset(filepath)

    def _make_train_eval_dataset(self):
        """Creates the Tolstoi train eval dataset.

        Returns:
          A tf.data.Dataset instance with batches of training eval data.
        """
        # Reuse a prefix of the (already built) training dataset so train
        # evaluation sees the same data distribution as training.
        return self._train_dataset.take(
            self._train_eval_size // self._batch_size)

    def _make_test_dataset(self):
        """Creates the Tolstoi test dataset.

        Returns:
          A tf.data.Dataset instance with batches of test data.
        """
        filepath = os.path.join(config.get_data_dir(), "tolstoi", "test.npy")
        return self._make_dataset(filepath)