import ast
import json

import numpy as np
import shap
import xgboost
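# Reads sampled plan statistics, fits one small XGBoost model per plan-node
# type to predict its output wait time, and writes the indices of features
# with a non-zero mean |SHAP| value to the target file (one line per node type).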
# Normalization constants for plan statistics and profile timings.
RowsNorm = 1e7
CostNorm = 1e8
TimeNorm = 1e4  # currently unused below
NodeTypes = [
    "Projection",
    "MergingAggregated",
    "Exchange",
    "Aggregating",
    "Join",
    "Filter",
    "TableScan",
    "Limit",
    "Sorting",
    "CTERef",
    "Buffer",
    "Union",
    "EnforceSingleRow",
    "Window",
    "Values",
    "PartitionTopN",
    "",
]
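# Each input line is assumed to be tab-separated: column 0 holds the JSON plan
# (under the "plan" key) and column 2 holds the per-query base feature vector
# as a list literal. This layout is inferred from the parsing below.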
def get_analyzed_data(path):
    """Collect per-node feature vectors and output wait times, grouped by node type."""
    xs, ys = {}, {}
    for tp in NodeTypes:
        xs[tp], ys[tp] = [], []
    with open(path) as f:
        lines = f.read().splitlines()
    for line in lines:
        items = line.split('\t')
        root = json.loads(items[0])['plan']
        # Base feature vector shared by every node of this plan; assumed to be a
        # plain list literal, so ast.literal_eval replaces the original eval.
        base = ast.literal_eval(items[2])
        stack = [root]
        while stack:
            node = stack.pop()
            # Per-node features: the base vector plus normalized row count and cost.
            arr = base + [node['Statistic']['RowCount'] / RowsNorm,
                          node['Statistic']['Cost'] / CostNorm]
            xs[node['NodeType']].append(arr)
            # Target: output wait time in seconds.
            ys[node['NodeType']].append(node['Profiles']['OutputWaitTimeMs'] / 1e3)
            if 'Children' in node:
                stack = node['Children'] + stack
    return xs, ys
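# For each node type with samples, fit a single unregularized depth-5 tree and
# use mean absolute SHAP values to select the feature indices that influence
# the predicted output wait time.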
def main(target_file):
    xs, ys = get_analyzed_data("data/stats_samples")
    with open(target_file, 'w') as f:
        for tp in NodeTypes:
            if not xs[tp]:
                continue
            X = np.array(xs[tp])
            y = np.reshape(np.array(ys[tp]), (-1, 1))
            Xd = xgboost.DMatrix(X, label=y)
            # A single depth-5 tree with full learning rate and no regularization.
            model = xgboost.train(
                {"eta": 1, "max_depth": 5, "base_score": 0, "lambda": 0}, Xd, 1)
            explainer = shap.TreeExplainer(model)
            explanation = explainer(Xd)
            # Mean absolute SHAP value per feature across all samples of this type.
            mean_abs_shap = np.abs(explanation.values).mean(0)
            print(tp)
            # Output format: node type followed by the indices of features with
            # non-zero attribution.
            f.write(tp + ' ')
            for i, v in enumerate(mean_abs_shap.tolist()):
                if v > 0.:
                    f.write(str(i) + ' ')
            f.write('\n')
if __name__ == "__main__":
    main('data/correlation.txt')