"""Data preprocessing.""" import os import pickle from typing import Any, Dict, Tuple import numpy as np def load_pickle(f: str) -> Any: """Load a pickle file. Parameters: f: the pickle filename Returns: the pickled data """ return pickle.load(f, encoding="latin1") def load_CIFAR_batch(filename: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Load a single batch of cifar data. Parameters: filename: the pickle filename Returns: the data the labels """ with open(filename, "rb") as f: datadict = load_pickle(f) X = datadict["data"] Y = datadict["labels"] X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float") Y = np.array(Y) return X, Y def load_CIFAR10(ROOT: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Load all of cifar data. Parameters: ROOT: the root directory containing the data Returns: training data training labels testing data testing labels """ xs = [] ys = [] for b in range(1, 6): f = os.path.join(ROOT, "data_batch_{}".format(b)) X, Y = load_CIFAR_batch(f) xs.append(X) ys.append(Y) Xtr = np.concatenate(xs) Ytr = np.concatenate(ys) Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, "test_batch")) return Xtr, Ytr, Xte, Yte def get_CIFAR10_data( num_training: int = 49000, num_validation: int = 1000, num_test: int = 10000, subtract_mean: bool = True, ) -> Dict[str, np.ndarray]: """Load the CIFAR-10 dataset from disk and perform preprocessing to prepare it for classifiers. These are the same steps as we used for the SVM, but condensed to a single function. Parameters: num_training: number of training images num_validation: number of validation images num_test: number of test images subtract_mean: whether or not to normalize the data Returns: the train/val/test data and labels """ # Load the raw CIFAR-10 data cifar10_dir = os.path.join("cifar10", "cifar-10-batches-py") X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir) # Subsample the data mask = list(range(num_training, num_training + num_validation)) X_val = X_train[mask] y_val = y_train[mask] mask = list(range(num_training)) X_train = X_train[mask] y_train = y_train[mask] mask = list(range(num_test)) X_test = X_test[mask] y_test = y_test[mask] # Subtract mean image mean_image = np.mean(X_train, axis=0) X_train -= mean_image X_val -= mean_image X_test -= mean_image # Normalize pixel distributions over channels mean_channels = np.ones((32, 32, 3)) * np.mean(X_train, axis=(0, 1, 2)) std_channels = np.ones((32, 32, 3)) * np.std(X_train, axis=(0, 1, 2)) X_train -= mean_channels X_val -= mean_channels X_test -= mean_channels X_train /= std_channels X_val /= std_channels X_test /= std_channels # Reshape data to rows X_train = X_train.reshape(num_training, -1) X_val = X_val.reshape(num_validation, -1) X_test = X_test.reshape(num_test, -1) # Package data into a dictionary return { "X_train": X_train, "y_train": y_train, "X_val": X_val, "y_val": y_val, "X_test": X_test, "y_test": y_test, }