import ast
import json

import numpy as np
import shap
import xgboost
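# Reads sampled plan statistics, fits one small XGBoost model per plan-node
# type to predict its output wait time, and writes the indices of features
# with a non-zero mean |SHAP| value to the target file (one line per node type).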
# Normalization constants for plan statistics and profile timings.
RowsNorm = 1e7
CostNorm = 1e8
TimeNorm = 1e4  # currently unused below
NodeTypes = [
    "Projection",
    "MergingAggregated",
    "Exchange",
    "Aggregating",
    "Join",
    "Filter",
    "TableScan",
    "Limit",
    "Sorting",
    "CTERef",
    "Buffer",
    "Union",
    "EnforceSingleRow",
    "Window",
    "Values",
    "PartitionTopN",
    "",
]
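# Each input line is assumed to be tab-separated: column 0 holds the JSON plan
# (under the "plan" key) and column 2 holds the per-query base feature vector
# as a list literal. This layout is inferred from the parsing below.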
def get_analyzed_data(path):
    """Collect per-node feature vectors and output wait times, grouped by node type."""
    xs, ys = {}, {}
    for tp in NodeTypes:
        xs[tp], ys[tp] = [], []
    with open(path) as f:
        lines = f.read().splitlines()
    for line in lines:
        items = line.split('\t')
        root = json.loads(items[0])['plan']
        # Base feature vector shared by every node of this plan; assumed to be a
        # plain list literal, so ast.literal_eval replaces the original eval.
        base = ast.literal_eval(items[2])
        stack = [root]
        while stack:
            node = stack.pop()
            # Per-node features: the base vector plus normalized row count and cost.
            arr = base + [node['Statistic']['RowCount'] / RowsNorm,
                          node['Statistic']['Cost'] / CostNorm]
            xs[node['NodeType']].append(arr)
            # Target: output wait time in seconds.
            ys[node['NodeType']].append(node['Profiles']['OutputWaitTimeMs'] / 1e3)
            if 'Children' in node:
                stack = node['Children'] + stack
    return xs, ys
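# For each node type with samples, fit a single unregularized depth-5 tree and
# use mean absolute SHAP values to select the feature indices that influence
# the predicted output wait time.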
def main(target_file):
    xs, ys = get_analyzed_data("data/stats_samples")
    with open(target_file, 'w') as f:
        for tp in NodeTypes:
            if not xs[tp]:
                continue
            X = np.array(xs[tp])
            y = np.reshape(np.array(ys[tp]), (-1, 1))
            Xd = xgboost.DMatrix(X, label=y)
            # A single depth-5 tree with full learning rate and no regularization.
            model = xgboost.train(
                {"eta": 1, "max_depth": 5, "base_score": 0, "lambda": 0}, Xd, 1)
            explainer = shap.TreeExplainer(model)
            explanation = explainer(Xd)
            # Mean absolute SHAP value per feature across all samples of this type.
            mean_abs_shap = np.abs(explanation.values).mean(0)
            print(tp)
            # Output format: node type followed by the indices of features with
            # non-zero attribution.
            f.write(tp + ' ')
            for i, v in enumerate(mean_abs_shap.tolist()):
                if v > 0.:
                    f.write(str(i) + ' ')
            f.write('\n')
if __name__ == "__main__":
    main('data/correlation.txt')