"""Train and evaluate an AttrEncoder-based latency regressor on query plans."""

import json
import pickle

import numpy as np
import torch
import torch.optim
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

from attn_encoder import AttrEncoder

CUDA = torch.cuda.is_available()
CHANNELS = 9
EPOCHS = 100


def same_seed(seed):
    """Fix random number generator seeds for reproducibility.

    Seeds NumPy and PyTorch (including every CUDA device when CUDA is
    available) and switches cuDNN to deterministic, non-benchmark mode.
    """
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def collate(x):
    """Collate ``(tree, knob, target)`` samples into one batch.

    Args:
        x: Iterable of 3-tuples ``(tree, knob, target)``.

    Returns:
        ``(trees, knobs, targets)`` where ``trees`` and ``knobs`` are plain
        lists in sample order and ``targets`` is a single ``torch.Tensor``
        built from all targets.
    """
    trees = []
    knobs = []
    targets = []
    for tree, knob, target in x:
        trees.append(tree)
        knobs.append(knob)
        targets.append(target)

    targets = torch.tensor(targets)
    return trees, knobs, targets


class TCNNRegression:
    """Regressor wrapping an ``AttrEncoder`` network trained with MSE loss."""

    def __init__(self):
        # Set to True once fit() has completed; predict() refuses to run
        # before that.
        self.trained = False

    def fit(self, X, K, Y):
        """Train a fresh ``AttrEncoder`` on the given samples.

        Args:
            X: Sequence of plan inputs, passed to the network unchanged.
            K: Sequence of knob inputs, aligned with ``X``.
            Y: Targets aligned with ``X``. A ``list`` is converted to a
                float32 column vector of shape ``(n, 1)``; any other type is
                used as given.

        Raises:
            ValueError: If there are no training samples.
        """
        if isinstance(Y, list):
            Y = np.array(Y).reshape(-1, 1).astype(np.float32)

        pairs = list(zip(X, K, Y))
        if not pairs:
            # Without this guard the per-epoch average below would divide
            # by zero (or the sampler would fail) with an unhelpful message.
            raise ValueError("fit() requires at least one training sample")

        loader = DataLoader(
            pairs, batch_size=32, shuffle=True, collate_fn=collate
        )

        # NOTE(review): the meaning of the second constructor argument (11)
        # is defined by AttrEncoder and is not visible here — confirm.
        self._net = AttrEncoder(CHANNELS, 11)
        self._net.train()

        optimizer = torch.optim.Adam(self._net.parameters())
        loss_fn = torch.nn.MSELoss()

        for epoch in range(1, 1 + EPOCHS):
            accum = 0.
            for x, k, y in loader:
                y_pred = self._net(x, k)
                loss = loss_fn(y_pred, y)
                accum += loss.item()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # len(loader) is the number of batches, so this is the mean
            # per-batch loss for the epoch.
            accum /= len(loader)
            print("epoch {} loss {}".format(epoch, accum))

        self.trained = True

    def predict(self, X, K):
        """Run the trained network on ``X`` and ``K``.

        Args:
            X: Plan inputs, passed to the network unchanged.
            K: Knob inputs, aligned with ``X``.

        Returns:
            The network output (callers in this file treat it as a
            ``torch.Tensor``).

        Raises:
            AssertionError: If ``fit`` has not been called yet.
        """
        assert self.trained, "fit() must be called before predict()"
        self._net.eval()
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            return self._net(X, K)


def main():
    """Load the sample dataset, train a model, save it and report errors."""
    same_seed(1145141101)

    X, K, Y = [], [], []
    # Each non-blank line holds three tab-separated fields: a JSON object
    # with a 'plan' key, a Python literal for the knobs, and a JSON object
    # with an 'elapsed' key.
    with open("../dataset/sample_test_0618") as f:
        for line in f:
            if not line.strip():
                continue
            fields = line.split('\t')
            X.append(json.loads(fields[0])['plan'])
            # WARNING(security): eval() executes arbitrary code from the
            # dataset file. Only run this on trusted data; consider
            # ast.literal_eval if the knob field is always a plain literal.
            K.append(eval(fields[1]))
            Y.append(json.loads(fields[2])['elapsed'])

    X_train, X_test, K_train, K_test, y_train, y_test = train_test_split(
        X, K, Y, test_size=0.2, random_state=42
    )

    model = TCNNRegression()
    model.fit(X_train, K_train, y_train)

    # To reuse a saved model instead of retraining, pickle.load() this file.
    with open("attn_model.pkl", "wb") as fw:
        pickle.dump(model, fw)

    preds = model.predict(X_test, K_test).detach().numpy().flatten()
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    print("mean squared error: {}".format(mse))
    print("mean absolute error: {}".format(mae))


if __name__ == "__main__":
    main()