Source code for snntoolbox.datasets.utils

# -*- coding: utf-8 -*-
"""
The main purpose of this module is to load a dataset from disk and feed it to
the toolbox in one of the formats it can handle.

For details see

.. autosummary::
    :nosignatures:

    get_dataset

@author: rbodo
"""

import json
import os

from configparser import NoOptionError
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from snntoolbox.utils.utils import import_helpers


def get_dataset(config):
    """Get dataset, either from ``.npz`` files or ``keras.ImageDataGenerator``.

    Returns dictionaries with keys ``x_test`` and ``y_test`` if the data set
    was loaded in ``.npz`` format, or with a ``dataflow`` key if data will be
    loaded from ``.jpg``, ``.png``, or ``.bmp`` files by a
    ``keras.ImageDataGenerator``.

    The loader is selected as follows:

    1. If the option ``keras_dataset`` in section ``input`` is set (and
       ``keras_rewiring`` can be imported), the data set is obtained via
       ``load_and_preprocess_dataset`` and the function returns immediately.
    2. Otherwise the option ``dataset_format`` in section ``input`` selects
       between ``'npz'``, ``'jpg'`` / ``'png'`` and ``'aedat'``. Any other
       value leaves the test set empty.

    Parameters
    ----------

    config: configparser.ConfigParser
        Settings.

    Returns
    -------

    normset: dict
        Used to normalize the network parameters. May already contain the
        key ``scale_facs`` if scale factors were found on disk.

    testset: dict
        Used to test the networks.

    """

    testset = {}
    normset = try_get_normset_from_scalefacs(config)
    dataset_path = config.get('paths', 'dataset_path')
    dataset_format = config.get('input', 'dataset_format')
    normalize_thresholds = config.getboolean('loihi', 'normalize_thresholds',
                                             fallback=False)
    is_testset_needed = config.getboolean('tools', 'evaluate_ann') or \
        config.getboolean('tools', 'simulate') or normalize_thresholds
    # A normalization set is only loaded if no scale factors were restored
    # from disk (or if Loihi threshold normalization asks for it).
    is_normset_needed = normalize_thresholds or (
            config.getboolean('tools', 'normalize') and normset == {})
    batch_size = config.getint('simulation', 'batch_size')

    # _______________________________ Keras __________________________________#
    try:
        keras_dataset = config.get('input', 'keras_dataset')
        if keras_dataset:
            from keras_rewiring.utilities.load_dataset \
                import load_and_preprocess_dataset
            num_to_test = config.getint('simulation', 'num_to_test')
            data = load_and_preprocess_dataset(keras_dataset)
            x_test, y_test = data['test']
            testset = {
                'x_test': x_test[:num_to_test],
                'y_test': y_test[:num_to_test]}
            if is_normset_needed:
                # The complete (untruncated) test data serves as
                # normalization set here.
                normset['x_norm'] = x_test
            return normset, testset
    except (NoOptionError, ImportError) as e:
        # Missing option or missing optional package: fall through to the
        # file-based loaders below.
        print("Warning:", e)

    # ________________________________ npz ___________________________________#
    if dataset_format == 'npz':
        print("Loading data set from '.npz' files in {}.\n".format(
            dataset_path))
        if is_testset_needed:
            num_to_test = config.getint('simulation', 'num_to_test')
            x_test = load_npz(dataset_path, 'x_test.npz')[:num_to_test]
            y_test = load_npz(dataset_path, 'y_test.npz')[:num_to_test]
            testset = {'x_test': x_test, 'y_test': y_test}
        if is_normset_needed:
            x_norm = load_npz(dataset_path, 'x_norm.npz')
            normset['x_norm'] = x_norm

    # ________________________________ jpg ___________________________________#
    elif dataset_format in {'jpg', 'png'}:
        print("Loading data set from ImageDataGenerator, using images in "
              "{}.\n".format(dataset_path))
        # Transform str to dict.
        # NOTE(security): ``eval`` executes arbitrary code contained in the
        # config file. It is kept because config values may hold expressions
        # that a literal parser would reject; only use trusted config files.
        datagen_kwargs = eval(config.get('input', 'datagen_kwargs'))
        dataflow_kwargs = eval(config.get('input', 'dataflow_kwargs'))

        # Get class labels
        class_idx_path = config.get('paths', 'class_idx_path')
        if class_idx_path != '':
            # Use a context manager so the file handle is closed right away.
            with open(os.path.abspath(class_idx_path)) as class_idx_file:
                class_idx = json.load(class_idx_file)
            dataflow_kwargs['classes'] = \
                [class_idx[str(idx)][0] for idx in range(len(class_idx))]

        # Get preprocessing function
        if 'preprocessing_function' in datagen_kwargs:
            helpers = import_helpers(datagen_kwargs['preprocessing_function'],
                                     config)
            datagen_kwargs['preprocessing_function'] = \
                helpers.preprocessing_function

        dataflow_kwargs['directory'] = dataset_path
        if 'batch_size' not in dataflow_kwargs:
            dataflow_kwargs['batch_size'] = batch_size
        datagen = ImageDataGenerator(**datagen_kwargs)
        if (datagen.featurewise_center or datagen.featurewise_std_normalization
                or datagen.zca_whitening):
            # Compute quantities required for featurewise normalization
            # (std, mean, and principal components if ZCA whitening is applied)
            # from the first batch of (only rescaled) images.
            rs = datagen_kwargs.get('rescale', None)
            x_orig = ImageDataGenerator(rescale=rs).flow_from_directory(
                **dataflow_kwargs).next()[0]
            datagen.fit(x_orig)
        if is_normset_needed:
            # Force shuffling for the normalization flow, then put back
            # whatever value the user had configured (``None`` if unset).
            shuffle = dataflow_kwargs.get('shuffle')
            dataflow_kwargs['shuffle'] = True
            normset['dataflow'] = \
                datagen.flow_from_directory(**dataflow_kwargs)
            dataflow_kwargs['shuffle'] = shuffle
        if is_testset_needed:
            testset = {
                'dataflow': datagen.flow_from_directory(**dataflow_kwargs)}

    # _______________________________ aedat __________________________________#
    elif dataset_format == 'aedat':
        if is_normset_needed:
            print("Loading normalization dataset from '.npz' file in {}.\n"
                  "".format(dataset_path))
            x_norm = load_npz(dataset_path, 'x_norm.npz')
            normset['x_norm'] = x_norm
            # For Loihi threshold normalization we need to pass the
            # normalization data in the testset dict.
            testset = {'x_norm': x_norm}

    return normset, testset


def try_get_normset_from_scalefacs(config):
    """
    Instead of loading a normalization data set to calculate scale-factors, try
    to get the scale-factors stored on disk during a previous run.

    The scale factors are looked up in the file
    ``<log_dir_of_current_run>/normalization/<percentile>.json``. The
    ``normalization`` directory is created if it does not exist yet.

    Parameters
    ----------

    config: configparser.ConfigParser
        Settings.

    Returns
    -------

    : dict
        A dictionary with single key 'scale_facs'. The corresponding value is
        the content of the JSON file holding the scale factors.
        Returns an empty dict if no scale factors were found.
    """

    newpath = os.path.join(config.get('paths', 'log_dir_of_current_run'),
                           'normalization')
    # ``exist_ok`` avoids the check-then-create race of a separate
    # ``os.path.exists`` test.
    os.makedirs(newpath, exist_ok=True)
    filepath = os.path.join(newpath, config.get('normalization',
                                                'percentile') + '.json')
    if os.path.isfile(filepath):
        print("Loading scale factors from disk instead of recalculating.")
        with open(filepath) as f:
            return {'scale_facs': json.load(f)}

    return {}


def to_categorical(y, nb_classes):
    """Convert class vector to binary class matrix.

    If the input ``y`` has shape (``nb_samples``,) and contains integers from 0
    to ``nb_classes - 1``, the output array will be of dimension
    (``nb_samples``, ``nb_classes``).

    Parameters
    ----------

    y:
        Class labels; converted to an ``int32`` array.
    nb_classes: int
        Number of columns of the returned matrix.

    Returns
    -------

    : np.ndarray
        Array of zeros with a ``1.`` at position ``[i, y[i]]`` for every
        sample ``i``.
    """

    y = np.asarray(y, dtype='int32')
    y_cat = np.zeros((len(y), nb_classes))
    # Row indices shaped to broadcast against ``y`` along its first axis, so
    # that a single fancy-indexing assignment sets ``y_cat[i, y[i]]`` for all
    # samples (also when ``y`` has trailing axes, e.g. shape (nb_samples, 1)).
    rows = np.arange(len(y)).reshape((-1,) + (1,) * (y.ndim - 1))
    y_cat[rows, y] = 1.
    return y_cat


def load_npz(path, filename):
    """Load dataset from an ``.npz`` file.

    The array is read from the archive entry ``arr_0``, i.e. the name
    ``np.savez`` gives to the first positional array.

    Parameters
    ----------

    path: string
        Location of dataset to load.
    filename : string
        Name of file.

    Returns
    -------

    : np.ndarray
        The dataset as a numpy array containing samples.
    """

    # ``np.load`` keeps the archive open until it is closed; the context
    # manager releases the file handle once the array has been read.
    with np.load(os.path.join(path, filename)) as archive:
        return archive['arr_0']