import itertools import pandas as pd import zlib import re import numpy as np import seaborn as sns from graphviz import Digraph from camelsplit import camelsplit from matplotlib import pyplot as plt from data_items.knowledge_graph.src.label import Label from api.helpers.helper import execute_query PREFIXES = """ PREFIX kglids: PREFIX data: PREFIX schema: PREFIX rdf: PREFIX rdfs: PREFIX pipeline: PREFIX lib: """ def query_kglids(config, rdf_query): return execute_query(config, PREFIXES + rdf_query) def get_datasets_info(config, show_query): query = PREFIXES + """ SELECT ?Dataset (count(?table_id) as ?Number_of_tables) WHERE { ?dataset_id rdf:type kglids:Dataset . ?dataset_id schema:name ?Dataset . ?table_id kglids:isPartOf ?dataset_id . } group by ?Dataset """ if show_query: print(query) return execute_query(config, query) def get_tables_info(config, dataset: str, show_query): if dataset: dataset = '?dataset_id schema:name "{}" .'.format(dataset) query = PREFIXES + """ SELECT ?Table ?Dataset ?Path_to_table WHERE { ?table_id rdf:type kglids:Table . %s ?table_id kglids:isPartOf ?dataset_id . ?table_id schema:name ?Table . ?table_id data:hasFilePath ?Path_to_table . ?dataset_id schema:name ?Dataset . }""" % dataset if show_query: print(query) return execute_query(config, query) def get_top_k_tables(pairs: list): top_k = {} dataset = {} path = {} for p in pairs: if p[0] not in top_k: top_k[p[0]] = p[1] dataset[p[0]] = p[2] path[p[0]] = p[3] else: updated_score = top_k.get(p[0]) + p[1] top_k[p[0]] = updated_score scores = top_k top_k = list(dict(sorted(top_k.items(), key=lambda item: item[1], reverse=True)).keys()) top_k = [list(ele) for ele in top_k] for pair in top_k: c1 = pair[0] c2 = pair[1] pair = pair.extend([scores.get((c1, c2)), dataset.get((c1, c2)), path.get((c1, c2))]) return top_k def recommend_tables(config, dataset: str, table: str, k: int, relation: str, show_query: bool): query = PREFIXES + """ SELECT ?table_name1 ?table_name2 ?certainty ?dataset2_n ?path WHERE { ?table_id schema:name "%s" . ?table_id schema:name ?table_name1 . ?dataset_id schema:name "%s" . ?table_id kglids:isPartOf ?dataset_id . ?column_id kglids:isPartOf ?table_id . <> data:withCertainty ?certainty . ?column_id2 kglids:isPartOf ?table_id2 . ?table_id2 schema:name ?table_name2 . ?table_id2 data:hasFilePath ?path . ?table_id2 kglids:isPartOf ?dataset2 . ?dataset2 schema:name ?dataset2_n . } """ % (table, dataset, relation) if show_query: print(query) res = execute_query(config, query, return_type='json') result = [] for r in res: table1 = r["table_name1"]["value"] table2 = r["table_name2"]["value"] certainty = float(r["certainty"]["value"]) dataset = r["dataset2_n"]["value"] path = r["path"]["value"] result.append([(table1, table2), certainty, dataset, path]) result = get_top_k_tables(result)[:k] table = list(map(lambda x: x[1], result)) scores = list(map(lambda x: x[2], result)) dataset = list(map(lambda x: x[3], result)) path = list(map(lambda x: x[4], result)) return pd.DataFrame({'Dataset': dataset, 'Recommended_table': table, 'Score': scores, 'Path_to_table': path}) def show_graph_info(config, show_query): query1 = PREFIXES + """ SELECT (COUNT(?Dataset) as ?Datasets) WHERE { ?Dataset rdf:type kglids:Dataset . } """ query2 = PREFIXES + """ SELECT (COUNT(?Table) as ?Tables) WHERE { ?Table rdf:type kglids:Table ; kglids:isPartOf ?Dataset . ?Dataset rdf:type kglids:Dataset . } """ query3 = PREFIXES + """ SELECT (COUNT(?Pipeline) as ?Pipelines) WHERE { ?Pipeline rdf:type kglids:Pipeline ; kglids:isPartOf ?Dataset . ?Dataset rdf:type kglids:Dataset . } """ query4 = PREFIXES + """ SELECT (COUNT(?Column) as ?Columns) WHERE { ?Column rdf:type kglids:Column ; kglids:isPartOf ?Table . ?Table rdf:type kglids:Table . } """ if show_query: print(query1, '\n', query3, '\n', query2, '\n', query4) dataset = execute_query(config, query1) tables = execute_query(config, query2) pipelines = execute_query(config, query3) columns = execute_query(config, query4) return pd.concat([dataset, pipelines, tables, columns], axis=1) def get_table_path(config, dataset, table): query = PREFIXES + """ SELECT ?table_path WHERE { ?dataset schema:name "%s" ; rdf:type kglids:Dataset . ?table schema:name "%s" ; rdf:type kglids:Table ; data:hasFilePath ?table_path . } """ % (dataset, table) res = execute_query(config, query)["results"]["bindings"][0] return res["table_path"]["value"] def get_table_info(config, dataset, table, show_query): query = PREFIXES + """ SELECT (max(?rows) as ?number_of_rows) (COUNT(?col) as ?number_of_columns) WHERE { ?dataset schema:name "%s" ; rdf:type kglids:Dataset . ?table schema:name "%s" ; rdf:type kglids:Table ; data:hasFilePath ?table_path . ?col kglids:isPartOf ?table ; rdf:type kglids:Column ; data:hasTotalValueCount ?rows . } """ % (dataset, table) if show_query: print(query) res = execute_query(config, query)["results"]["bindings"][0] rows = res["number_of_rows"]["value"] columns = res["number_of_columns"]["value"] return pd.DataFrame( {'Dataset': [dataset], 'Table': [table], 'Path_to_table': [get_table_path(config, dataset, table)], 'Number_of_columns': [columns], 'Number_of_rows': [rows]}) def _create_tables_df_row(results): return { 'Dataset': results['dataset_name']['value'], 'Table': results['name']['value'], 'Number_of_columns': float(results['number_of_columns']['value']), 'Number_of_rows': float(results['number_of_rows']['value']), 'Path_to_table': results['path']['value'] } def search_tables_on(config, all_conditions: tuple, show_query: bool): def search(conditions: tuple): return PREFIXES + \ '\nselect ?name ?dataset_name ?path (' \ 'count(distinct ?cols) as ?number_of_columns) (max (?total) as ?number_of_rows)' \ '\nwhere {' \ '\n?table schema:name ?name.' \ '\n?table data:hasFilePath ?path.' \ '\n?table kglids:isPartOf ?dataset.' \ '\n?dataset schema:name ?dataset_name.' \ '\n?cols kglids:isPartOf ?table.' \ '\n?cols data:hasTotalValueCount ?total.\n' \ + conditions[0] + \ '\nfilter( ' + conditions[1] + ')}' \ '\n group by ?name ?dataset_name ?path' query = search(all_conditions) if show_query: print(query) res = execute_query(config, query, return_type='json') for result in res: yield _create_tables_df_row(result) def _get_iri(config, dataset_name: str, table_name: str = None, show_query: bool = False): if table_name is None: query = PREFIXES + \ '\nselect ?id' \ '\nwhere {' \ '\n?id a kglids:Dataset.' \ '\n?id rdfs:label %s }' % dataset_name else: query = PREFIXES + \ '\nselect ?id where{' \ '\n?id a kglids:Table.' \ '\n?id rdfs:label %s.' \ '\n?id kglids:isPartOf ?dataset.' \ '\n?dataset rdfs:label %s.' \ '\n?dataset a kglids:Dataset.}' % (table_name, dataset_name) if show_query: print(query) results = execute_query(config, query, return_type='json') bindings = results if not bindings: return None return str(bindings[0]['id']['value']) def get_iri_of_table(config, dataset_name: str, table_name: str, show_query: bool = False): dataset_label = generate_label(dataset_name, 'en') table_label = generate_label(table_name, 'en') return _get_iri(config, dataset_label, table_label, show_query) def generate_label(col_name: str, lan: str) -> Label: if '.csv' in col_name: col_name = re.sub('.csv', '', col_name) col_name = re.sub('[^0-9a-zA-Z]+', ' ', col_name) text = " ".join(camelsplit(col_name.strip())) text = re.sub(r'\s+', ' ', text.strip()) return Label(text.lower(), lan) def _create_path_row(result, hops): data = {'starting_column': result['c1name']['value'], 'starting_table': result['t1name']['value'], 'starting_table_path': result['t1path']['value'], 'starting_dataset': result['d1name']['value']} intermediate = {} for i in range(2, hops + 1): intermediate.update({'intermediate_column_land_in' + str(i): result['c' + str(i) + 'name']['value'], 'intermediate_table' + str(i): result['t' + str(i) + 'name']['value'], 'intermediate_table_path' + str(i): result['t' + str(i) + 'path']['value'], 'intermediate_column_take_off' + str(i): result['cc' + str(i) + 'name']['value'], 'intermediate_dataset' + str(i): result['d' + str(i) + 'name']['value']}) data.update(intermediate) data.update({'target_column': result['c' + str(hops + 1) + 'name']['value'], 'target_table': result['t' + str(hops + 1) + 'name']['value'], 'target_table_path': result['t' + str(hops + 1) + 'path']['value'], 'target_dataset': result['d' + str(hops + 1) + 'name']['value']}) return data def get_path_between(config, start_iri: str, target_iri: str, predicate: str, hops: int, show_query: bool = False): def _generate_starting_nodes() -> str: return '\n ?c1 schema:name ?c1name.' \ '\n ?c1 kglids:isPartOf ?t1.' \ '\n ?t1 schema:name ?t1name.' \ '\n ?t1 data:hasFilePath ?t1path.' \ '\n ?t1 kglids:isPartOf ?d1.' \ '\n ?d1 schema:name ?d1name.' def _generate_intermediate_nodes(h: int) -> str: inters = '' for i in range(2, h + 1): inter = '\n ?c' + str(i) + ' a kglids:Column.' \ '\n ?c' + str(i) + ' schema:name ?c' + str(i) + 'name.' \ '\n ?c' + str( i) + ' kglids:isPartOf ?t' + str(i) + '.' \ '\n ?t' + str(i) + ' schema:name ?t' + str(i) + 'name.' \ '\n ?t' + str( i) + ' data:hasFilePath ?t' + str(i) + 'path.' \ '\n ?cc' + str(i) + ' kglids:isPartOf ?t' + str(i) + '.' \ '\n ?cc' + str( i) + ' schema:name ?cc' + str(i) + 'name.' \ '\n ?t' + str(i) + ' kglids:isPartOf ?d' + str(i) + '.' \ '\n ?d' + str( i) + ' schema:name ?d' + str(i) + 'name.' inters += inter return inters def _generate_target_nodes(h: int) -> str: return '\n ?c' + str(h + 1) + ' schema:name ?c' + str(h + 1) + 'name.' \ '\n ?c' + str( h + 1) + ' kglids:isPartOf ?t' + str(h + 1) + '.' \ '\n ?t' + str(h + 1) + ' schema:name ?t' + str( h + 1) + 'name.' \ '\n ?t' + str(h + 1) + ' data:hasFilePath ?t' + str(h + 1) + 'path.' \ '\n ?t' + str( h + 1) + ' kglids:isPartOf ?d' + str(h + 1) + '.' \ '\n ?d' + str(h + 1) + ' schema:name ?d' + str( h + 1) + 'name.' def _generate_relationships(h: int, pred: str) -> str: relations = '\n ?c1 ' + pred + ' ?c2.' for i in range(2, h + 1): relation = '\n ?cc' + str(i) + ' ' + pred + ' ?c' + str(i + 1) + '.' relations += relation return relations def _generate_select(h: int) -> str: selects = '\n ?c1name ?t1name ?t1path ?d1name' for i in range(2, h + 1): selects += '\n ?c' + str(i) + 'name ?t' + str(i) + 'name ?t' + str(i) + 'path' \ ' ?cc' + str(i) + 'name ?d' + str( i) + 'name' selects += '\n ?c' + str(h + 1) + 'name ?t' + str(h + 1) + 'name ?t' + str(h + 1) + 'path ?d' + str( h + 1) + 'name' return selects starting_nodes = _generate_starting_nodes() intermediate_nodes = _generate_intermediate_nodes(hops) target_nodes = _generate_target_nodes(hops) relationships = _generate_relationships(hops, predicate) select = _generate_select(hops) all_nodes = starting_nodes + intermediate_nodes + target_nodes query = PREFIXES + \ '\n select' + select + \ '\nwhere {' + \ all_nodes + relationships + \ '\n values ?t1 {' + start_iri + '}' + \ '\n values ?t' + str(hops + 1) + ' {' + target_iri + '}}' if show_query: print(query) results = execute_query(config, query, return_type='json') bindings = results if not bindings: return [] for result in bindings: yield _create_path_row(result, hops) def generate_component_id(dataset_name: str, table_name: str = '', column_name: str = ''): return zlib.crc32(bytes(dataset_name + table_name + column_name, 'utf-8')) def generate_graphviz(df: pd.DataFrame, predicate: str): def parse_starting_or_target_nodes(dot, row, column_ids: list, table_ids: list, dataset_ids: list, start: bool) -> str: relation_name = 'partOf' if start: dataset_name = row[0] table_name = row[1] column_name = row[3] color = 'lightblue2' else: dataset_name = row[-4] table_name = row[-3] column_name = row[-1] color = 'darkorange3' dataset_id = str(generate_component_id(dataset_name)) table_id = str(generate_component_id(dataset_name, table_name)) column_id = str(generate_component_id(dataset_name, table_name, column_name)) if column_id in column_ids: return column_id dot.node(column_id, column_name, style='filled', fillcolor=color) column_ids.append(column_id) if table_id in table_ids: dot.edge(column_id, table_id, relation_name) return column_id dot.node(table_id, table_name, style='filled', fillcolor=color) table_ids.append(table_id) if dataset_id in dataset_ids: dot.edge(column_id, table_id, relation_name) dot.edge(table_id, dataset_id, relation_name) return column_id dot.node(dataset_id, dataset_name, style='filled', fillcolor=color) dataset_ids.append(dataset_id) dot.edge(column_id, table_id, relation_name) dot.edge(table_id, dataset_id, relation_name) return column_id def parse_intermediate_nodes(dot, row, column_ids: list, table_ids: list, dataset_ids: list) -> list: ids = [] relation_name = 'partOf' for i in range(4, len(row) - 4, 5): dataset_name = row[i] table_name = row[i + 1] land_in_column_name = row[i + 2] take_off_column_name = row[i + 4] dataset_id = str(generate_component_id(dataset_name)) table_id = str(generate_component_id(dataset_name, table_name)) land_in_column_id = str(generate_component_id(dataset_name, table_name, land_in_column_name)) take_off_column_id = str(generate_component_id(dataset_name, table_name, take_off_column_name)) ids.extend([land_in_column_id, take_off_column_id]) land_in_column_exist = False take_off_column_exist = False if land_in_column_id in column_ids: land_in_column_exist = True dot.node(land_in_column_id, land_in_column_name) column_ids.append(land_in_column_id) if take_off_column_id in column_ids: take_off_column_exist = True if land_in_column_exist and take_off_column_exist: continue dot.node(take_off_column_id, take_off_column_name) column_ids.append(take_off_column_id) if table_id in table_ids: if land_in_column_id == take_off_column_id: dot.edge(land_in_column_id, table_id, relation_name) else: dot.edge(land_in_column_id, table_id, relation_name) dot.edge(take_off_column_id, table_id, relation_name) continue dot.node(table_id, table_name) table_ids.append(table_id) if dataset_id in dataset_ids: if land_in_column_id == take_off_column_id: dot.edge(land_in_column_id, table_id, relation_name) else: dot.edge(land_in_column_id, table_id, relation_name) dot.edge(take_off_column_id, table_id, relation_name) dot.edge(table_id, dataset_id, relation_name) continue dot.node(dataset_id, dataset_name) dataset_ids.append(dataset_id) if land_in_column_id == take_off_column_id: dot.edge(land_in_column_id, table_id, relation_name) else: dot.edge(land_in_column_id, table_id, relation_name) dot.edge(take_off_column_id, table_id, relation_name) dot.edge(table_id, dataset_id, relation_name) return ids def establish_relationships(dot, row_ids: list, relationships: list): for j in range(0, len(row_ids) - 1, 2): pair = (row_ids[j], row_ids[j + 1]) if pair[0] == pair[1]: continue if not pair in relationships: relationships.append(pair) dot.edge(pair[0], pair[1], 'similar', dir='none') col_ids = [] tab_ids = [] data_ids = [] relations = [] dot_graph = Digraph(strict=True) for i in range(len(df)): r = df.iloc[i] row_col_ids = [] starting_column_id = parse_starting_or_target_nodes(dot_graph, r, col_ids, tab_ids, data_ids, True) intermediate_col_ids = parse_intermediate_nodes(dot_graph, r, col_ids, tab_ids, data_ids) target_col_id = parse_starting_or_target_nodes(dot_graph, r, col_ids, tab_ids, data_ids, False) row_col_ids.append(starting_column_id) row_col_ids.extend(intermediate_col_ids) row_col_ids.append(target_col_id) establish_relationships(dot_graph, row_col_ids, relations) dot_graph.attr(label='Paths between starting nodes in blue and target nodes in orange', size='8,75,10') return dot_graph def get_path_between_tables(config, source_table_info, target_table_info, hops, relation, show_query): source_table_name = source_table_info["Table"] source_dataset_name = source_table_info["Dataset"] target_dataset_name = target_table_info["Dataset"] if 'Recommended_table' in target_table_info.keys(): target_table_name = target_table_info["Recommended_table"] else: target_table_name = target_table_info["Table"] starting_table_iri = get_iri_of_table(config, dataset_name=generate_label(source_dataset_name, 'en').get_text(), table_name=generate_label(source_table_name, 'en').get_text()) target_table_iri = get_iri_of_table(config, dataset_name=generate_label(target_dataset_name, 'en').get_text(), table_name=generate_label(target_table_name, 'en').get_text()) if starting_table_iri is None: raise ValueError(str(source_table_info) + ' does not exist') if target_table_iri is None: raise ValueError(str(target_table_info) + ' does not exist') data = get_path_between(config, '<' + starting_table_iri + '>', '<' + target_table_iri + '>', relation, hops, show_query) path_row = ['starting_dataset', 'starting_table', 'starting_table_path', 'starting_column'] for i in range(2, hops + 1): intermediate = ['intermediate_dataset' + str(i), 'intermediate_table' + str(i), 'intermediate_column_land_in' + str(i), 'intermediate_table_path' + str(i), 'intermediate_column_take_off' + str(i)] path_row.extend(intermediate) path_row.extend(['target_dataset', 'target_table', 'target_table_path', 'target_column']) df = pd.DataFrame(list(data), columns=path_row) dot = generate_graphviz(df, relation) return dot def get_top_scoring_ml_model(config, dataset, show_query): query = """ PREFIX kglids: SELECT (count(?x) as ?count) WHERE { ?x rdf:type kglids:Pipeline . } """ return execute_query(config, query) def get_pipelines_info(config, author, show_query): if author != '': author = "FILTER (?Author = '{}') .".format(author) query = PREFIXES + """ SELECT ?Pipeline ?Dataset ?Author ?Written_on ?Number_of_votes ?Score WHERE { ?pipeline_id rdf:type kglids:Pipeline ; pipeline:hasVotes ?Number_of_votes ; rdfs:label ?Pipeline ; pipeline:isWrittenOn ?Written_on ; pipeline:isWrittenBy ?Author ; pipeline:hasScore ?Score ; kglids:isPartOf ?Dataset_id . ?Dataset_id schema:name ?Dataset . %s } ORDER BY DESC(?Number_of_votes) """ % author if show_query: print(query) return execute_query(config, query) def get_most_recent_pipeline(config, dataset, show_query): if dataset != '': dataset = "FILTER (?Dataset = '{}') .".format(dataset) query = PREFIXES + """ SELECT ?Pipeline ?Dataset ?Author ?Written_on ?Number_of_votes ?Score WHERE { ?pipeline_id rdf:type kglids:Pipeline ; pipeline:hasVotes ?Number_of_votes ; rdfs:label ?Pipeline ; pipeline:isWrittenOn ?Written_on ; pipeline:isWrittenBy ?Author ; pipeline:hasScore ?Score ; kglids:isPartOf ?Dataset_id . ?Dataset_id schema:name ?Dataset . %s } ORDER BY DESC(?Written_on) LIMIT 1 """ % dataset if show_query: print(query) return execute_query(config, query) def get_top_k_scoring_pipelines_for_dataset(config, dataset, k, show_query): if k is not None: k = 'LIMIT ' + str(k) if k is None: k = '' if dataset != '': dataset = "FILTER (?Dataset = '{}') .".format(dataset) query = PREFIXES + """ SELECT ?Pipeline ?Dataset ?Author ?Written_on ?Number_of_votes ?Score WHERE { ?pipeline_id rdf:type kglids:Pipeline ; pipeline:hasVotes ?Number_of_votes ; rdfs:label ?Pipeline ; pipeline:isWrittenOn ?Written_on ; pipeline:isWrittenBy ?Author ; pipeline:hasScore ?Score ; kglids:isPartOf ?Dataset_id . ?Dataset_id schema:name ?Dataset . %s } ORDER BY DESC(?Score) %s """ % (dataset, k) if show_query: print(query) return execute_query(config, query) CLASSIFIERS = {'RandomForestClassifier': '', 'SVC': '', 'KNeighborsClassifier': '', 'GradientBoostingClassifier': '', 'LogisticRegression': '', 'DecisionTreeClassifier': '', 'AdaBoostClassifier': '', 'SGDClassifier': '', 'MLPClassifier': '', 'XGBClassifier': '', 'VotingClassifier': '', 'PassiveAggressiveClassifier': '', 'BaggingClassifier': '', 'RidgeClassifier': '', 'RadiusNeighborsClassifier': '', 'ExtraTreesClassifier': '', 'TFDistilBertForSequenceClassification': ''} def search_classifier(config, dataset, show_query): sub_graph_query = """ """ for classifier, classifier_url in CLASSIFIERS.items(): if classifier != 'RandomForestClassifier': query = """ UNION { ?Statement_number pipeline:callsLibrary %s. BIND('%s' as ?Classifier) } """ % (classifier_url, classifier) sub_graph_query = sub_graph_query + query filter_for_dataset = '' if dataset != '': filter_for_dataset = '?Dataset_id schema:name "{}".'.format(dataset) query = PREFIXES + """ SELECT DISTINCT ?Dataset ?Pipeline ?Classifier ?Score WHERE { ?Dataset_id rdf:type kglids:Dataset ; schema:name ?Dataset . ?Pipeline_id kglids:isPartOf ?Dataset_id ; rdfs:label ?Pipeline ; pipeline:hasScore ?Score . graph ?Pipeline_id { ?x pipeline:callsLibrary ?y { ?Statement_number pipeline:callsLibrary . BIND('RandomForestClassifier' as ?Classifier) }""" + sub_graph_query + """ } %s } ORDER BY DESC(?Score) """ % filter_for_dataset if show_query: print(query) return execute_query(config, query) def get_hyperparameters(config, pipeline, classifier, show_query): classifier_url = CLASSIFIERS.get(classifier) parameter_heading = '?{}_hyperparameter'.format(classifier) query = PREFIXES + """ SELECT DISTINCT %s ?Value WHERE { ?Pipeline_id rdfs:label '%s' ; pipeline:hasScore ?Score . graph ?Pipeline_id { ?Statement_number pipeline:callsLibrary %s . << ?Statement_number pipeline:hasParameter %s >> pipeline:withParameterValue ?Value . } } ORDER BY DESC(?Score)""" % (parameter_heading, pipeline, classifier_url, parameter_heading) if show_query: print(query) df = execute_query(config, query) if np.shape(df)[0] == 0: return 'Using default configurations' else: return df def get_library_usage(config, dataset, k, show_query): if dataset != '': dataset = '?Dataset schema:name "{}" .\n\t\t' \ '?Pipeline kglids:isPartOf ?Dataset .'.format(dataset) query = PREFIXES + """ SELECT ?Library (COUNT(distinct ?Pipeline) as ?Usage) WHERE { %s ?Pipeline rdf:type kglids:Pipeline . GRAPH ?Pipeline { ?Statement pipeline:callsLibrary ?l . BIND(STRAFTER(str(?l), str(lib:)) as ?l1) . BIND(STRBEFORE(str(?l1), str('/')) as ?Library) . } FILTER (?Library != "") . FILTER (?Library != "builtin") . } GROUP BY ?Library ORDER BY DESC(?Usage) """ % dataset if show_query: print(query) df = execute_query(config, query) df['Usage (in %)'] = list(map(lambda x: x * 100, [int(i) / sum(df['Usage']. tolist()) for i in (df['Usage'].tolist())])) if len(df) == 0: print("No library found") return df = df.head(k) fig, ax = plt.subplots(figsize=(6, 4)) ax.bar(range(len(df)), df['Usage'].tolist(), color='mediumseagreen') ax.set_xticks(range(len(df))) ax.set_xticklabels(df['Library'].tolist(), rotation=20) ax.set_ylabel('Number of Pipelines') for i, usage in enumerate(df['Usage']): ax.text(i-0.35, usage + 200, str(usage), color='k', fontweight='bold') ax.set_ylim(0, df['Usage'].max() + 2000) plt.grid(axis='y') plt.tight_layout() plt.savefig('library_usage_stats.pdf') plt.show() def get_top_used_libraries(config, task, show_query): if task == 'classification': task = 'classifi' elif task == 'clustering': task = 'cluster' elif task == 'visualization': task = 'plot' else: task = 'regress' query = PREFIXES + """ SELECT DISTINCT ?Library ?Module ?Pipeline ?Dataset WHERE { ?Pipeline_id rdf:type kglids:Pipeline ; rdfs:label ?Pipeline ; kglids:isPartOf ?dataset_id . ?dataset_id schema:name ?Dataset . GRAPH ?Pipeline_id { ?statement pipeline:callsLibrary ?l . BIND(STRAFTER(str(?l), str(lib:)) as ?l1) . BIND(STRBEFORE(str(?l1), str('/')) as ?Library) . BIND(STRAFTER(str(?l), str(?Library)) as ?m) . BIND(STRAFTER(str(?m), str('/')) as ?Module) . } FILTER(regex(?l, "%s", "i")) }""" % task if show_query: print(query) return execute_query(config, query) def get_pipelines_calling_libraries(config, components, show_query): sub_query = '' for i in range(len(components)): sub_query = sub_query + \ '?Statement_{} pipeline:callsLibrary .\n ' \ .format(i + 1, components[i].replace('.', '/')) query = PREFIXES + """ SELECT DISTINCT ?Pipeline ?Dataset ?Author ?Score ?Number_of_votes WHERE { ?Pipeline_id rdf:type kglids:Pipeline ; pipeline:hasVotes ?Number_of_votes ; rdfs:label ?Pipeline ; pipeline:isWrittenOn ?Written_on ; pipeline:isWrittenBy ?Author ; pipeline:hasScore ?Score ; kglids:isPartOf ?Dataset_id ; graph ?Pipeline_id { %s } ?Dataset_id schema:name ?Dataset . } ORDER BY DESC(?Score)""" % sub_query if show_query: print(query) return execute_query(config, query) def get_pipelines_for_deep_learning(config, show_query): query = PREFIXES + """ SELECT DISTINCT ?Pipeline ?Dataset ?Author ?Written_on ?Score ?Number_of_votes WHERE { ?Pipeline_id rdf:type kglids:Pipeline ; pipeline:hasVotes ?Number_of_votes ; rdfs:label ?Pipeline ; pipeline:isWrittenOn ?Written_on ; pipeline:isWrittenBy ?Author ; pipeline:hasScore ?Score ; pipeline:hasTag ?Tag ; kglids:isPartOf ?Dataset_id . ?Dataset_id schema:name ?Dataset . FILTER(regex(?Tag, "deep learning", "i")) } ORDER BY DESC(?Score)""" if show_query: print(query) return execute_query(config, query) def recommend_transformations(config, show_query): query = PREFIXES + """ SELECT DISTINCT ?Transformation ?Pipeline (?Table_id as ?Table) ?Dataset WHERE { ?Pipeline_id rdf:type kglids:Pipeline ; pipeline:hasVotes ?Number_of_votes ; rdfs:label ?Pipeline ; pipeline:isWrittenOn ?Written_on ; pipeline:isWrittenBy ?Author ; pipeline:hasScore ?Score ; pipeline:hasTag ?Tag ; kglids:isPartOf ?Dataset_id . ?Dataset_id schema:name ?Dataset . graph ?Pipeline_id { ?s pipeline:callsLibrary ?l . BIND(STRAFTER(str(?l), str(lib:)) as ?Transformation) . ?Table_id rdf:type kglids:Table . } FILTER(regex(?l, "preprocessing", "i")) } ORDER BY DESC(?Score) """ if show_query: print(query) df = execute_query(config, query) df['Table'] = df['Table'].apply(lambda x: x.rsplit('/', 1)[-1]) df['Transformation'] = df['Transformation'].apply(lambda x: x.replace('/', '.')) return df def get_pipelines_by_tags(config, tag, show_query): if tag != '': tag = 'FILTER(regex(?Tag, "{}", "i"))'.format(tag) query = PREFIXES + """ SELECT DISTINCT ?Tag (COUNT (?Pipeline_id) AS ?Number_of_pipelines) WHERE { ?Pipeline_id rdf:type kglids:Pipeline ; pipeline:hasTag ?Tag . %s } GROUP BY ?Tag ORDER BY DESC(?Number_of_pipelines) """ % tag if show_query: print(query) return execute_query(config, query) def plot_top_k_classifiers(config, k, show_query): query = PREFIXES + """ SELECT DISTINCT ?Module (COUNT (?Module) as ?Usage) WHERE { ?Pipeline_id rdf:type kglids:Pipeline ; rdfs:label ?Pipeline ; kglids:isPartOf ?dataset_id . ?dataset_id schema:name ?Dataset . GRAPH ?Pipeline_id { ?statement pipeline:callsLibrary ?l . BIND(STRAFTER(str(?l), str(lib:)) as ?l1) . BIND(STRBEFORE(str(?l1), str('/')) as ?Library) . BIND(STRAFTER(str(?l), str(?Library)) as ?m) . BIND(STRAFTER(str(?m), str('/')) as ?Module) . } FILTER(regex(?l, "classifier", "i")) FILTER (!regex(?l, "report", "i")) FILTER (!regex(?l, "ROCAUC", "i")) FILTER (!regex(?l, "threshold", "i")) } GROUP BY ?Module ORDER BY DESC(?Usage)""" if show_query: print(query) df = execute_query(config, query) df['Usage (in %)'] = list(map(lambda x: x * 100, [int(i) / sum(df['Usage']. tolist()) for i in (df['Usage'].tolist())])) df['Classifier'] = df['Module'].apply(lambda x: x.rsplit('/', 1)[-1]) if len(df) == 0: print("No classifier found") return if len(df) < k: print('Maximum {} classifier(s) were found'.format(len(df))) print('Showing top-{} classifiers'.format(len(df))) k = len(df) df = df.head(k) plt.rcParams['figure.figsize'] = 10, 5 plt.rcParams['figure.dpi'] = 300 plt.rcParams['savefig.dpi'] = 300 sns.set_theme(style='darkgrid') ax = sns.barplot(x="Classifier", y="Usage (in %)", data=df, palette='viridis') ax = ax.set_xticklabels(ax.get_xticklabels(), rotation=40) def plot_top_k_regressors(config, k , show_query): query = PREFIXES + """ SELECT DISTINCT ?Module (COUNT (?Module) as ?Usage) WHERE { ?Pipeline_id rdf:type kglids:Pipeline ; rdfs:label ?Pipeline ; kglids:isPartOf ?dataset_id . ?dataset_id schema:name ?Dataset . GRAPH ?Pipeline_id { ?statement pipeline:callsLibrary ?l . BIND(STRAFTER(str(?l), str(lib:)) as ?l1) . BIND(STRBEFORE(str(?l1), str('/')) as ?Library) . BIND(STRAFTER(str(?l), str(?Library)) as ?m) . BIND(STRAFTER(str(?m), str('/')) as ?Module) . } FILTER(regex(?l, "regres", "i")) } GROUP BY ?Module ORDER BY DESC(?Usage)""" if show_query: print(query) df = execute_query(config, query) df['Usage (in %)'] = list(map(lambda x: x * 100, [int(i) / sum(df['Usage']. tolist()) for i in (df['Usage'].tolist())])) df['Regressor'] = df['Module'].apply(lambda x: x.rsplit('/', 1)[-1]) if len(df) == 0: print("No classifier found") return if len(df) < k: print('Maximum {} regressor(s) were found'.format(len(df))) print('Showing top-{} regressors'.format(len(df))) k = len(df) df = df.head(k) plt.rcParams['figure.figsize'] = 10, 5 plt.rcParams['figure.dpi'] = 300 plt.rcParams['savefig.dpi'] = 300 sns.set_theme(style='darkgrid') ax = sns.barplot(x="Regressor", y="Usage (in %)", data=df, palette='viridis') ax = ax.set_xticklabels(ax.get_xticklabels(), rotation=40)