# Source code for deepobs.pytorch.datasets.tolstoi

# -*- coding: utf-8 -*-
"""Tolstoi DeepOBS dataset."""

import os

import numpy as np
import torch
from torch.utils import data as dat

from .. import config
from . import dataset


class tolstoi(dataset.DataSet):
    """DeepOBS data set class for character prediction on `War and Peace` by
    Leo Tolstoi.

    The character ids are read from ``train.npy`` / ``test.npy`` in the
    ``tolstoi`` folder of the DeepOBS data directory and are arranged into
    pre-built mini-batches of shape ``(batch_size, seq_length)``. The target of
    every input sequence is the same text shifted by one character.

    Args:
        batch_size (int): The mini-batch size to use. Trailing characters that
            do not fill a complete mini-batch of ``batch_size`` sequences are
            dropped.
        seq_length (int): Sequence length to be modeled in each step.
            Defaults to ``50``.
        train_eval_size (int): Number of training characters to use for the
            train eval data set. Defaults to ``653237``, intended to match the
            size of the test set.
    """

    def __init__(self, batch_size, seq_length=50, train_eval_size=653237):
        """Creates a new Tolstoi instance.

        Args:
            batch_size (int): The mini-batch size to use. Trailing characters
                that do not fill a complete mini-batch of ``batch_size``
                sequences are dropped.
            seq_length (int): Sequence length to be modeled in each step.
                Defaults to ``50``.
            train_eval_size (int): Number of training characters to use for
                the train eval data set. Defaults to ``653237``, intended to
                match the size of the test set.
        """
        # These attributes are assigned before calling the base constructor;
        # presumably the base class invokes the ``_make_*`` hooks below, which
        # read them -- confirm against ``dataset.DataSet``.
        self._name = "tolstoi"
        self._seq_length = seq_length
        self._train_eval_size = train_eval_size
        super(tolstoi, self).__init__(batch_size)

    def _make_dataloader(self, filepath):
        """Builds the data set of pre-batched sequences stored in ``filepath``.

        Despite its name, this method returns a ``TensorDataset`` (not a
        ``DataLoader``) whose i-th element already is the i-th mini-batch.

        Args:
            filepath (str): Path to a ``.npy`` file holding the character ids
                (assumed to be a flat, one-dimensional array).

        Returns:
            torch.utils.data.TensorDataset: Inputs and targets, each of shape
            ``(num_batches, batch_size, seq_length)``.

        Raises:
            ValueError: If the file holds too few characters to form a single
                mini-batch with the configured batch size and sequence length.
        """
        arr = np.load(filepath)

        # One character is held back because the targets are the inputs
        # shifted by one position. Integer division avoids the float round
        # trip of ``floor(a / b)``.
        chars_per_batch = self._batch_size * self._seq_length
        num_batches = int((np.size(arr) - 1) // chars_per_batch)

        # ``<= 0`` (rather than ``== 0``) also rejects an empty array, for
        # which the quotient above is negative.
        if num_batches <= 0:
            raise ValueError(
                "This dataset is to small to use with this batch size "
                "and sequence length."
            )

        # Create input and output, where output is the text shifted by one
        # character.
        num_chars = num_batches * chars_per_batch
        x = arr[:num_chars]
        y = arr[1 : num_chars + 1]

        # Lay the text out as ``batch_size`` rows of contiguous characters and
        # cut the columns into ``num_batches`` chunks of ``seq_length``. Row r
        # of batch i therefore continues row r of batch i - 1.
        x_batches = np.split(x.reshape(self._batch_size, -1), num_batches, 1)
        y_batches = np.split(y.reshape(self._batch_size, -1), num_batches, 1)

        # Stack so that X[i] / Y[i] is the i-th mini-batch.
        X = np.array(x_batches)
        Y = np.array(y_batches)

        return dat.TensorDataset(torch.from_numpy(X), torch.from_numpy(Y))

    def _make_train_dataloader(self):
        """Builds the training data set from ``tolstoi/train.npy``.

        Returns:
            torch.utils.data.TensorDataset: See ``_make_dataloader``.
        """
        filepath = os.path.join(config.get_data_dir(), "tolstoi", "train.npy")
        return self._make_dataloader(filepath)

    def _make_train_eval_dataloader(self):
        """Builds the train eval data set from the first training mini-batches.

        Takes the leading ``train_eval_size // (batch_size * seq_length)``
        mini-batches of ``self._train_dataloader`` (which this class never
        assigns itself; presumably the base class stores the result of
        ``_make_train_dataloader`` there -- confirm). The count is capped at
        the number of mini-batches actually available, so a
        ``train_eval_size`` larger than the training set yields the whole
        training set instead of an out-of-range index error.

        Returns:
            torch.utils.data.TensorDataset: The selected inputs and targets.
        """
        requested = self._train_eval_size // (
            self._batch_size * self._seq_length
        )
        num_batches = min(requested, len(self._train_dataloader))
        indices = np.arange(num_batches)
        train_eval_set = self._train_dataloader[indices]
        return dat.TensorDataset(train_eval_set[0], train_eval_set[1])

    def _make_test_dataloader(self):
        """Builds the test data set from ``tolstoi/test.npy``.

        Returns:
            torch.utils.data.TensorDataset: See ``_make_dataloader``.
        """
        filepath = os.path.join(config.get_data_dir(), "tolstoi", "test.npy")
        return self._make_dataloader(filepath)