import os
import pickle
import random
import time

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import tqdm

from helper.config import *
from helper.queries import *
from helper.cache import *
from helper.plot import *
from helper.comparsion_plot import plot_comparison

# **************************CONFIGURATIONS*****************************
THRESHOLD = 0.75
EXPERIMENT_NAME = 'attribute_precision'
DATASET = 'smallerReal'
DATABASE = 'smaller_real'
# *********************************************************************
SAVE_RESULT_AS = EXPERIMENT_NAME + '_' + DATASET + '_' + str(THRESHOLD)
SPARQL = connect_to_stardog(db=DATABASE)
# *********************************************************************


def load_cache(load_as='cache'):
    """Load a pickled result file from disk."""
    with open(load_as, 'rb') as handle:
        return pickle.load(handle)


def load_ground_truth():
    """Load the ground-truth column pairs for the configured dataset."""
    print('Loading ground-truth', end=' ')
    if DATASET == 'smallerReal':
        file = '../gt_files/attr_gt.csv'
        print(file, end=' ')
        df = dd.read_csv(file)
        # Table names in this ground-truth file lack the '.csv' suffix used
        # elsewhere, so append it to both table-name columns.
        df[df.columns[0]] = df[df.columns[0]] + '.csv'
        df[df.columns[2]] = df[df.columns[2]] + '.csv'
    else:
        file = '../gt_files/att_groundtruth.csv'
        print(file, end=' ')
        df = dd.read_csv(file)
    print('\tdone.')
    return df


def get_n_random_tables(df, n: int):
    """Sample n distinct query tables from the ground truth."""
    print('Getting {} random tables '.format(n), end='')
    t1 = time.time()
    random_samples = random.sample(list(df[df.columns[0]].unique().compute()), n)
    print('\tdone, time taken: {}.'.format(time.time() - t1))
    return random_samples


def attribute_precision(ground_truth: set, query_table: str, k_related_tables: list):
    """Compute the mean attribute precision of the predicted column pairs
    between the query table and each of its top-k related tables, both with
    and without considering join paths."""

    def calculate_attribute_precision_without_join(predicted_pairs: list,
                                                   ground_truth_pairs: set):
        tp = fp = 0
        for pair in predicted_pairs:
            if pair in ground_truth_pairs:
                tp += 1
            else:
                fp += 1
        # Guard against division by zero when nothing was predicted.
        return tp / (tp + fp) if (tp + fp) else 0.0

    def calculate_attribute_precision_with_join(predicted_pairs: list,
                                                join_paths: list,
                                                ground_truth_pairs: set):
        tp = fp = 0
        for pair in predicted_pairs:
            if pair in ground_truth_pairs:
                tp += 1
            else:
                # A pair missing from the ground truth still counts as a true
                # positive if a join path over the same candidate column is
                # present in the ground truth.
                if any(j[1] == pair[1] and j in ground_truth_pairs
                       for j in join_paths):
                    tp += 1
                else:
                    fp += 1
        return tp / (tp + fp) if (tp + fp) else 0.0

    precision = []
    precision_j = []
    for table in k_related_tables:
        pred_without_joins = get_related_columns_between_2_tables_attribute_precision(
            SPARQL, query_table, table[1], THRESHOLD)
        attr_precision = calculate_attribute_precision_without_join(
            pred_without_joins, ground_truth)
        if attr_precision == 1.0:
            # A perfect score cannot be improved by join paths, so skip the
            # (expensive) join query.
            attr_precision_with_join = attr_precision
        else:
            pred_with_joins = get_related_columns_between_2_tables_j_attribute_precision(
                SPARQL, query_table, table[1], THRESHOLD)
            attr_precision_with_join = calculate_attribute_precision_with_join(
                pred_without_joins, pred_with_joins, ground_truth)
        precision.append(attr_precision)
        precision_j.append(attr_precision_with_join)
    return np.mean(precision), np.mean(precision_j)


def run_experiment(df):
    ground_truth_per_query_table = {}

    def get_ground_truth_for_query_table(table: str):
        # Memoize the ground-truth pairs per query table so the dask filter
        # is not recomputed for every value of k.
        if table not in ground_truth_per_query_table:
            gd = df.loc[df[df.columns[0]] == table].compute()
            ground_truth_per_query_table[table] = {tuple(x) for x in gd.values}
        return ground_truth_per_query_table[table]

    # Start from a clean results file.
    if os.path.exists('../cache/' + SAVE_RESULT_AS + '.txt'):
        os.remove('../cache/' + SAVE_RESULT_AS + '.txt')

    random_100_tables = get_n_random_tables(df, 100)
    top_k = []
    if DATASET == 'smallerReal':
        top_k = [5, 20, 50, 80, 110, 140, 170, 200, 230, 260]
    elif DATASET == 'synthetic':
        top_k = [5, 20, 50, 80, 110, 140, 170, 200, 230, 260, 290, 320, 350]

    df = df[['query_table', 'query_col_name', 'candidate_table', 'candidate_col_name']]
    print("\nRunning '{}' experiment on '{}' dataset. Top-k values = {}".format(
        EXPERIMENT_NAME.replace('_', ' ').upper(), DATASET.upper(), top_k))

    res = {}
    for k in top_k:
        print('\nComputing for K =', k)
        ap_per_k = []
        ap_j_per_k = []
        for query_table in tqdm.tqdm(random_100_tables):  # average over 100 random tables
            k_related_tables = get_top_k_related_tables(SPARQL, query_table, k, THRESHOLD)
            ground_truth = get_ground_truth_for_query_table(query_table)
            if len(k_related_tables):
                ap, ap_j = attribute_precision(ground_truth, query_table, k_related_tables)
                ap_per_k.append(ap)
                ap_j_per_k.append(ap_j)
                # debug:
                # print('\n• Mean Attribute precision : ', np.mean(ap_per_k))
                # print('• Mean Attribute precision + Join : ', np.mean(ap_j_per_k))
            else:
                print('bad table ', query_table)
        print("Attribute precision for k: {} = {}\n"
              "Attribute precision +J for k: {} = {}".format(
                  k, np.mean(ap_per_k), k, np.mean(ap_j_per_k)))
        with open('../cache/' + SAVE_RESULT_AS + '.txt', 'a') as f:
            f.write("K:{}\n\tattribute precision: {}\n\tattribute precision +J:{}\n\n".format(
                k, np.mean(ap_per_k), np.mean(ap_j_per_k)))
        res[k] = {"attribute precision": np.mean(ap_per_k),
                  "attribute precision + J": np.mean(ap_j_per_k)}
        cache_score(res, k, top_k, SAVE_RESULT_AS)


def main():
    df = load_ground_truth()
    t1 = time.time()
    run_experiment(df)
    print('\nTotal time taken: ', time.time() - t1)
    # Scores cached by cache_score() for the final k of the smallerReal run.
    exp_res = load_cache('../cache/attribute_precision_smallerReal_k-260.pkl')
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    attribute_precision_plot = visualize(
        exp_res, EXPERIMENT_NAME.replace('_', ' ').capitalize(), DATASET)
    plt.subplot(1, 2, 2)
    comparison_plot = plot_comparison()
    plt.tight_layout()
    plt.savefig('../plots/{}.pdf'.format(EXPERIMENT_NAME), dpi=300)
    print('done.')


if __name__ == '__main__':
    main()
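
# Minimal sanity check of the precision definition used above. The column
# pairs here are hypothetical (not from either dataset); they only mirror the
# 4-tuple layout (query_table, query_col_name, candidate_table,
# candidate_col_name) produced by get_ground_truth_for_query_table. With two
# of three predicted pairs in the ground truth, attribute precision is 2/3.
def _demo_attribute_precision():
    gt = {('q.csv', 'a', 'c1.csv', 'x'),
          ('q.csv', 'b', 'c1.csv', 'y')}
    pred = [('q.csv', 'a', 'c1.csv', 'x'),   # in ground truth -> TP
            ('q.csv', 'b', 'c1.csv', 'y'),   # in ground truth -> TP
            ('q.csv', 'c', 'c1.csv', 'z')]   # not in ground truth -> FP
    tp = sum(1 for p in pred if p in gt)
    fp = len(pred) - tp
    assert abs(tp / (tp + fp) - 2 / 3) < 1e-9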
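
# ---------------------------------------------------------------------
# Usage sketch (assumptions: a running Stardog instance serving the
# 'smaller_real' database, the helper/ package on PYTHONPATH, and the
# ground-truth CSVs under ../gt_files/; the script name below is
# hypothetical):
#
#   $ python attribute_precision_experiment.py
#
# Per-k scores are appended to
# ../cache/attribute_precision_smallerReal_0.75.txt, intermediate score
# pickles are written by cache_score(), and the final figure is saved to
# ../plots/attribute_precision.pdf.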