import itertools
import pandas as pd
import zlib
import re
import numpy as np
import seaborn as sns
from graphviz import Digraph
from camelsplit import camelsplit
from matplotlib import pyplot as plt
from data_items.knowledge_graph.src.label import Label
from api.helpers.helper import execute_query
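# SPARQL namespace prefixes prepended to every query issued by this module.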
PREFIXES = """
PREFIX kglids:
PREFIX data:
PREFIX schema:
PREFIX rdf:
PREFIX rdfs:
PREFIX pipeline:
PREFIX lib:
"""
def query_kglids(config, rdf_query):
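    """Run a raw SPARQL query against the KGLiDS graph, with the standard prefixes prepended.

    Illustrative call: query_kglids(config, 'SELECT (COUNT(*) as ?n) WHERE { ?s ?p ?o }')
    """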
return execute_query(config, PREFIXES + rdf_query)
def get_datasets_info(config, show_query):
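    """Return each dataset in the graph with its number of tables."""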
query = PREFIXES + """
SELECT ?Dataset (count(?table_id) as ?Number_of_tables)
WHERE
{
?dataset_id rdf:type kglids:Dataset .
?dataset_id schema:name ?Dataset .
?table_id kglids:isPartOf ?dataset_id .
}
group by ?Dataset """
if show_query:
print(query)
return execute_query(config, query)
def get_tables_info(config, dataset: str, show_query):
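    """List tables with their dataset and file path; `dataset` optionally restricts the scope."""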
    if dataset:
        dataset = '?dataset_id schema:name "{}" .'.format(dataset)
    else:
        dataset = ''
query = PREFIXES + """
SELECT ?Table ?Dataset ?Path_to_table
WHERE
{
?table_id rdf:type kglids:Table .
%s
?table_id kglids:isPartOf ?dataset_id .
?table_id schema:name ?Table .
?table_id data:hasFilePath ?Path_to_table .
?dataset_id schema:name ?Dataset .
}""" % dataset
if show_query:
print(query)
return execute_query(config, query)
def get_top_k_tables(pairs: list):
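    """Aggregate pairwise scores into ranked table pairs.

    `pairs` holds [(table1, table2), certainty, dataset, path] rows; certainties of
    duplicate table pairs are summed, and rows come back as
    [table1, table2, score, dataset, path], sorted by score descending.
    """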
top_k = {}
dataset = {}
path = {}
for p in pairs:
if p[0] not in top_k:
top_k[p[0]] = p[1]
dataset[p[0]] = p[2]
path[p[0]] = p[3]
else:
updated_score = top_k.get(p[0]) + p[1]
top_k[p[0]] = updated_score
    scores = top_k
    # rank table pairs by their accumulated similarity score, highest first
    top_k = sorted(top_k, key=top_k.get, reverse=True)
    top_k = [list(ele) for ele in top_k]
    for pair in top_k:
        c1, c2 = pair[0], pair[1]
        # list.extend() mutates the row in place and returns None, so no reassignment
        pair.extend([scores.get((c1, c2)), dataset.get((c1, c2)), path.get((c1, c2))])
return top_k
def recommend_tables(config, dataset: str, table: str, k: int, relation: str, show_query: bool):
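    """Recommend top-k tables whose columns relate to `table`'s columns via `relation`
    (a column-level predicate, e.g. data:hasSemanticSimilarity; illustrative), ranked
    by accumulated certainty.
    """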
query = PREFIXES + """
SELECT ?table_name1 ?table_name2 ?certainty ?dataset2_n ?path
WHERE
{
?table_id schema:name "%s" .
?table_id schema:name ?table_name1 .
?dataset_id schema:name "%s" .
?table_id kglids:isPartOf ?dataset_id .
?column_id kglids:isPartOf ?table_id .
    << ?column_id %s ?column_id2 >> data:withCertainty ?certainty .
?column_id2 kglids:isPartOf ?table_id2 .
?table_id2 schema:name ?table_name2 .
?table_id2 data:hasFilePath ?path .
?table_id2 kglids:isPartOf ?dataset2 .
?dataset2 schema:name ?dataset2_n .
}
""" % (table, dataset, relation)
if show_query:
print(query)
res = execute_query(config, query, return_type='json')
result = []
for r in res:
table1 = r["table_name1"]["value"]
table2 = r["table_name2"]["value"]
certainty = float(r["certainty"]["value"])
dataset = r["dataset2_n"]["value"]
path = r["path"]["value"]
result.append([(table1, table2), certainty, dataset, path])
result = get_top_k_tables(result)[:k]
    tables = [r[1] for r in result]
    scores = [r[2] for r in result]
    datasets = [r[3] for r in result]
    paths = [r[4] for r in result]
    return pd.DataFrame({'Dataset': datasets, 'Recommended_table': tables,
                         'Score': scores, 'Path_to_table': paths})
def show_graph_info(config, show_query):
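    """Summarize the graph: counts of datasets, tables, pipelines, and columns."""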
query1 = PREFIXES + """
SELECT (COUNT(?Dataset) as ?Datasets)
WHERE
{
?Dataset rdf:type kglids:Dataset .
}
"""
query2 = PREFIXES + """
SELECT (COUNT(?Table) as ?Tables)
WHERE
{
?Table rdf:type kglids:Table ;
kglids:isPartOf ?Dataset .
?Dataset rdf:type kglids:Dataset .
}
"""
query3 = PREFIXES + """
SELECT (COUNT(?Pipeline) as ?Pipelines)
WHERE
{
?Pipeline rdf:type kglids:Pipeline ;
kglids:isPartOf ?Dataset .
?Dataset rdf:type kglids:Dataset .
}
"""
query4 = PREFIXES + """
SELECT (COUNT(?Column) as ?Columns)
WHERE
{
?Column rdf:type kglids:Column ;
kglids:isPartOf ?Table .
?Table rdf:type kglids:Table .
}
"""
if show_query:
        print(query1, '\n', query2, '\n', query3, '\n', query4)
dataset = execute_query(config, query1)
tables = execute_query(config, query2)
pipelines = execute_query(config, query3)
columns = execute_query(config, query4)
return pd.concat([dataset, pipelines, tables, columns], axis=1)
def get_table_path(config, dataset, table):
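    """Return the file path recorded for `table` within `dataset`."""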
query = PREFIXES + """
SELECT ?table_path
WHERE
{
?dataset schema:name "%s" ;
rdf:type kglids:Dataset .
?table schema:name "%s" ;
           rdf:type kglids:Table ;
           kglids:isPartOf ?dataset ;
data:hasFilePath ?table_path .
}
""" % (dataset, table)
res = execute_query(config, query)["results"]["bindings"][0]
return res["table_path"]["value"]
def get_table_info(config, dataset, table, show_query):
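    """Return a one-row summary of a table: its path, column count, and row count
    (the maximum data:hasTotalValueCount over its columns).
    """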
query = PREFIXES + """
SELECT (max(?rows) as ?number_of_rows) (COUNT(?col) as ?number_of_columns)
WHERE
{
?dataset schema:name "%s" ;
rdf:type kglids:Dataset .
?table schema:name "%s" ;
           rdf:type kglids:Table ;
           kglids:isPartOf ?dataset ;
data:hasFilePath ?table_path .
?col kglids:isPartOf ?table ;
rdf:type kglids:Column ;
data:hasTotalValueCount ?rows .
}
""" % (dataset, table)
if show_query:
print(query)
res = execute_query(config, query)["results"]["bindings"][0]
rows = res["number_of_rows"]["value"]
columns = res["number_of_columns"]["value"]
return pd.DataFrame(
{'Dataset': [dataset], 'Table': [table], 'Path_to_table':
[get_table_path(config, dataset, table)], 'Number_of_columns': [columns],
'Number_of_rows': [rows]})
def _create_tables_df_row(results):
return {
'Dataset': results['dataset_name']['value'],
'Table': results['name']['value'],
'Number_of_columns': float(results['number_of_columns']['value']),
'Number_of_rows': float(results['number_of_rows']['value']),
'Path_to_table': results['path']['value']
}
def search_tables_on(config, all_conditions: tuple, show_query: bool):
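    """Yield table rows satisfying `all_conditions`, a pair of (extra graph patterns,
    filter expression) spliced into the query. Illustrative:

        conditions = ('?cols schema:name ?col_name.', 'regex(?col_name, "price", "i")')
        rows = list(search_tables_on(config, conditions, show_query=False))
    """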
def search(conditions: tuple):
return PREFIXES + \
'\nselect ?name ?dataset_name ?path (' \
'count(distinct ?cols) as ?number_of_columns) (max (?total) as ?number_of_rows)' \
'\nwhere {' \
'\n?table schema:name ?name.' \
'\n?table data:hasFilePath ?path.' \
'\n?table kglids:isPartOf ?dataset.' \
'\n?dataset schema:name ?dataset_name.' \
'\n?cols kglids:isPartOf ?table.' \
'\n?cols data:hasTotalValueCount ?total.\n' \
+ conditions[0] + \
'\nfilter( ' + conditions[1] + ')}' \
'\n group by ?name ?dataset_name ?path'
query = search(all_conditions)
if show_query:
print(query)
res = execute_query(config, query, return_type='json')
for result in res:
yield _create_tables_df_row(result)
def _get_iri(config, dataset_name: str, table_name: str = None, show_query: bool = False):
if table_name is None:
query = PREFIXES + \
'\nselect ?id' \
'\nwhere {' \
'\n?id a kglids:Dataset.' \
'\n?id rdfs:label %s }' % dataset_name
else:
query = PREFIXES + \
'\nselect ?id where{' \
'\n?id a kglids:Table.' \
'\n?id rdfs:label %s.' \
'\n?id kglids:isPartOf ?dataset.' \
'\n?dataset rdfs:label %s.' \
'\n?dataset a kglids:Dataset.}' % (table_name, dataset_name)
if show_query:
print(query)
results = execute_query(config, query, return_type='json')
bindings = results
if not bindings:
return None
return str(bindings[0]['id']['value'])
def get_iri_of_table(config, dataset_name: str, table_name: str, show_query: bool = False):
dataset_label = generate_label(dataset_name, 'en')
table_label = generate_label(table_name, 'en')
return _get_iri(config, dataset_label, table_label, show_query)
def generate_label(col_name: str, lan: str) -> Label:
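    """Normalize a dataset/table/column name to a lower-cased, space-separated label,
    e.g. 'TitanicTrain.csv' -> 'titanic train'.
    """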
    if col_name.endswith('.csv'):
        col_name = col_name[:-len('.csv')]
col_name = re.sub('[^0-9a-zA-Z]+', ' ', col_name)
text = " ".join(camelsplit(col_name.strip()))
text = re.sub(r'\s+', ' ', text.strip())
return Label(text.lower(), lan)
def _create_path_row(result, hops):
data = {'starting_column': result['c1name']['value'],
'starting_table': result['t1name']['value'],
'starting_table_path': result['t1path']['value'],
'starting_dataset': result['d1name']['value']}
intermediate = {}
for i in range(2, hops + 1):
intermediate.update({'intermediate_column_land_in' + str(i): result['c' + str(i) + 'name']['value'],
'intermediate_table' + str(i): result['t' + str(i) + 'name']['value'],
'intermediate_table_path' + str(i): result['t' + str(i) + 'path']['value'],
'intermediate_column_take_off' + str(i): result['cc' + str(i) + 'name']['value'],
'intermediate_dataset' + str(i): result['d' + str(i) + 'name']['value']})
data.update(intermediate)
data.update({'target_column': result['c' + str(hops + 1) + 'name']['value'],
'target_table': result['t' + str(hops + 1) + 'name']['value'],
'target_table_path': result['t' + str(hops + 1) + 'path']['value'],
'target_dataset': result['d' + str(hops + 1) + 'name']['value']})
return data
def get_path_between(config, start_iri: str, target_iri: str, predicate: str, hops: int,
show_query: bool = False):
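    """Yield rows describing column-level paths between two table IRIs, where consecutive
    columns are linked by `predicate`; `hops` is the number of links (hops=1 is a direct
    link between the two tables).
    """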
def _generate_starting_nodes() -> str:
return '\n ?c1 schema:name ?c1name.' \
'\n ?c1 kglids:isPartOf ?t1.' \
'\n ?t1 schema:name ?t1name.' \
'\n ?t1 data:hasFilePath ?t1path.' \
'\n ?t1 kglids:isPartOf ?d1.' \
'\n ?d1 schema:name ?d1name.'
    def _generate_intermediate_nodes(h: int) -> str:
        inters = ''
        for i in range(2, h + 1):
            # each intermediate table i has a "land in" column ?c<i> and a "take off" column ?cc<i>
            inters += ('\n ?c{i} a kglids:Column.'
                       '\n ?c{i} schema:name ?c{i}name.'
                       '\n ?c{i} kglids:isPartOf ?t{i}.'
                       '\n ?t{i} schema:name ?t{i}name.'
                       '\n ?t{i} data:hasFilePath ?t{i}path.'
                       '\n ?cc{i} kglids:isPartOf ?t{i}.'
                       '\n ?cc{i} schema:name ?cc{i}name.'
                       '\n ?t{i} kglids:isPartOf ?d{i}.'
                       '\n ?d{i} schema:name ?d{i}name.').format(i=i)
        return inters
    def _generate_target_nodes(h: int) -> str:
        t = h + 1
        return ('\n ?c{t} schema:name ?c{t}name.'
                '\n ?c{t} kglids:isPartOf ?t{t}.'
                '\n ?t{t} schema:name ?t{t}name.'
                '\n ?t{t} data:hasFilePath ?t{t}path.'
                '\n ?t{t} kglids:isPartOf ?d{t}.'
                '\n ?d{t} schema:name ?d{t}name.').format(t=t)
def _generate_relationships(h: int, pred: str) -> str:
relations = '\n ?c1 ' + pred + ' ?c2.'
for i in range(2, h + 1):
relation = '\n ?cc' + str(i) + ' ' + pred + ' ?c' + str(i + 1) + '.'
relations += relation
return relations
    def _generate_select(h: int) -> str:
        selects = '\n ?c1name ?t1name ?t1path ?d1name'
        for i in range(2, h + 1):
            selects += '\n ?c{i}name ?t{i}name ?t{i}path ?cc{i}name ?d{i}name'.format(i=i)
        selects += '\n ?c{t}name ?t{t}name ?t{t}path ?d{t}name'.format(t=h + 1)
        return selects
starting_nodes = _generate_starting_nodes()
intermediate_nodes = _generate_intermediate_nodes(hops)
target_nodes = _generate_target_nodes(hops)
relationships = _generate_relationships(hops, predicate)
select = _generate_select(hops)
all_nodes = starting_nodes + intermediate_nodes + target_nodes
query = PREFIXES + \
'\n select' + select + \
'\nwhere {' + \
all_nodes + relationships + \
'\n values ?t1 {' + start_iri + '}' + \
'\n values ?t' + str(hops + 1) + ' {' + target_iri + '}}'
if show_query:
print(query)
results = execute_query(config, query, return_type='json')
bindings = results
if not bindings:
return []
for result in bindings:
yield _create_path_row(result, hops)
def generate_component_id(dataset_name: str, table_name: str = '', column_name: str = ''):
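    """Derive a deterministic CRC32 id for a dataset/table/column component."""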
return zlib.crc32(bytes(dataset_name + table_name + column_name, 'utf-8'))
def generate_graphviz(df: pd.DataFrame, predicate: str):
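    """Draw the paths DataFrame with graphviz: columns attach to tables, tables to
    datasets ('partOf' edges), and related columns are joined by undirected 'similar'
    edges. Starting nodes are blue, target nodes orange.
    """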
def parse_starting_or_target_nodes(dot, row, column_ids: list, table_ids: list, dataset_ids: list,
start: bool) -> str:
relation_name = 'partOf'
if start:
dataset_name = row[0]
table_name = row[1]
column_name = row[3]
color = 'lightblue2'
else:
dataset_name = row[-4]
table_name = row[-3]
column_name = row[-1]
color = 'darkorange3'
dataset_id = str(generate_component_id(dataset_name))
table_id = str(generate_component_id(dataset_name, table_name))
column_id = str(generate_component_id(dataset_name, table_name, column_name))
if column_id in column_ids:
return column_id
dot.node(column_id, column_name, style='filled', fillcolor=color)
column_ids.append(column_id)
if table_id in table_ids:
dot.edge(column_id, table_id, relation_name)
return column_id
dot.node(table_id, table_name, style='filled', fillcolor=color)
table_ids.append(table_id)
if dataset_id in dataset_ids:
dot.edge(column_id, table_id, relation_name)
dot.edge(table_id, dataset_id, relation_name)
return column_id
dot.node(dataset_id, dataset_name, style='filled', fillcolor=color)
dataset_ids.append(dataset_id)
dot.edge(column_id, table_id, relation_name)
dot.edge(table_id, dataset_id, relation_name)
return column_id
def parse_intermediate_nodes(dot, row, column_ids: list, table_ids: list, dataset_ids: list) -> list:
ids = []
relation_name = 'partOf'
for i in range(4, len(row) - 4, 5):
dataset_name = row[i]
table_name = row[i + 1]
land_in_column_name = row[i + 2]
take_off_column_name = row[i + 4]
dataset_id = str(generate_component_id(dataset_name))
table_id = str(generate_component_id(dataset_name, table_name))
land_in_column_id = str(generate_component_id(dataset_name, table_name, land_in_column_name))
take_off_column_id = str(generate_component_id(dataset_name, table_name, take_off_column_name))
ids.extend([land_in_column_id, take_off_column_id])
            # create each column node only once; skip the rest of this hop if both exist
            land_in_column_exist = land_in_column_id in column_ids
            take_off_column_exist = take_off_column_id in column_ids
            if not land_in_column_exist:
                dot.node(land_in_column_id, land_in_column_name)
                column_ids.append(land_in_column_id)
            if land_in_column_exist and take_off_column_exist:
                continue
            if not take_off_column_exist:
                dot.node(take_off_column_id, take_off_column_name)
                column_ids.append(take_off_column_id)
if table_id in table_ids:
if land_in_column_id == take_off_column_id:
dot.edge(land_in_column_id, table_id, relation_name)
else:
dot.edge(land_in_column_id, table_id, relation_name)
dot.edge(take_off_column_id, table_id, relation_name)
continue
dot.node(table_id, table_name)
table_ids.append(table_id)
if dataset_id in dataset_ids:
if land_in_column_id == take_off_column_id:
dot.edge(land_in_column_id, table_id, relation_name)
else:
dot.edge(land_in_column_id, table_id, relation_name)
dot.edge(take_off_column_id, table_id, relation_name)
dot.edge(table_id, dataset_id, relation_name)
continue
dot.node(dataset_id, dataset_name)
dataset_ids.append(dataset_id)
if land_in_column_id == take_off_column_id:
dot.edge(land_in_column_id, table_id, relation_name)
else:
dot.edge(land_in_column_id, table_id, relation_name)
dot.edge(take_off_column_id, table_id, relation_name)
dot.edge(table_id, dataset_id, relation_name)
return ids
def establish_relationships(dot, row_ids: list, relationships: list):
for j in range(0, len(row_ids) - 1, 2):
pair = (row_ids[j], row_ids[j + 1])
if pair[0] == pair[1]:
continue
if not pair in relationships:
relationships.append(pair)
dot.edge(pair[0], pair[1], 'similar', dir='none')
col_ids = []
tab_ids = []
data_ids = []
relations = []
dot_graph = Digraph(strict=True)
for i in range(len(df)):
r = df.iloc[i]
row_col_ids = []
starting_column_id = parse_starting_or_target_nodes(dot_graph, r, col_ids, tab_ids, data_ids, True)
intermediate_col_ids = parse_intermediate_nodes(dot_graph, r, col_ids, tab_ids, data_ids)
target_col_id = parse_starting_or_target_nodes(dot_graph, r, col_ids, tab_ids, data_ids, False)
row_col_ids.append(starting_column_id)
row_col_ids.extend(intermediate_col_ids)
row_col_ids.append(target_col_id)
establish_relationships(dot_graph, row_col_ids, relations)
    dot_graph.attr(label='Paths between starting nodes (blue) and target nodes (orange)', size='8.75,10')
return dot_graph
def get_path_between_tables(config, source_table_info, target_table_info, hops, relation, show_query):
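    """Resolve both table IRIs, query paths of length `hops` between them, and render
    the result with graphviz. Illustrative call:

        dot = get_path_between_tables(config, {'Dataset': 'd1', 'Table': 't1.csv'},
                                      {'Dataset': 'd2', 'Table': 't2.csv'},
                                      hops=2, relation='data:hasContentSimilarity',
                                      show_query=False)
    """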
source_table_name = source_table_info["Table"]
source_dataset_name = source_table_info["Dataset"]
target_dataset_name = target_table_info["Dataset"]
if 'Recommended_table' in target_table_info.keys():
target_table_name = target_table_info["Recommended_table"]
else:
target_table_name = target_table_info["Table"]
starting_table_iri = get_iri_of_table(config,
dataset_name=generate_label(source_dataset_name, 'en').get_text(),
table_name=generate_label(source_table_name, 'en').get_text())
target_table_iri = get_iri_of_table(config,
dataset_name=generate_label(target_dataset_name, 'en').get_text(),
table_name=generate_label(target_table_name, 'en').get_text())
if starting_table_iri is None:
raise ValueError(str(source_table_info) + ' does not exist')
if target_table_iri is None:
raise ValueError(str(target_table_info) + ' does not exist')
data = get_path_between(config, '<' + starting_table_iri + '>', '<' + target_table_iri + '>',
relation, hops, show_query)
path_row = ['starting_dataset', 'starting_table', 'starting_table_path', 'starting_column']
for i in range(2, hops + 1):
intermediate = ['intermediate_dataset' + str(i), 'intermediate_table' + str(i),
'intermediate_column_land_in' + str(i), 'intermediate_table_path' + str(i),
'intermediate_column_take_off' + str(i)]
path_row.extend(intermediate)
path_row.extend(['target_dataset', 'target_table', 'target_table_path', 'target_column'])
df = pd.DataFrame(list(data), columns=path_row)
dot = generate_graphviz(df, relation)
return dot
def get_top_scoring_ml_model(config, dataset, show_query):
    # NOTE: counts the pipelines in the graph; the dataset argument is currently unused.
    query = PREFIXES + """
    SELECT (count(?x) as ?count)
    WHERE
    {
        ?x rdf:type kglids:Pipeline .
    }
    """
    if show_query:
        print(query)
    return execute_query(config, query)
def get_pipelines_info(config, author, show_query):
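    """List pipelines (title, dataset, author, date, votes, score), optionally filtered
    by author, ordered by number of votes.
    """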
if author != '':
author = "FILTER (?Author = '{}') .".format(author)
query = PREFIXES + """
SELECT ?Pipeline ?Dataset ?Author ?Written_on ?Number_of_votes ?Score
WHERE
{
?pipeline_id rdf:type kglids:Pipeline ;
pipeline:hasVotes ?Number_of_votes ;
rdfs:label ?Pipeline ;
pipeline:isWrittenOn ?Written_on ;
pipeline:isWrittenBy ?Author ;
pipeline:hasScore ?Score ;
kglids:isPartOf ?Dataset_id .
?Dataset_id schema:name ?Dataset .
%s
} ORDER BY DESC(?Number_of_votes)
""" % author
if show_query:
print(query)
return execute_query(config, query)
def get_most_recent_pipeline(config, dataset, show_query):
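    """Return the most recently written pipeline, optionally filtered by dataset."""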
if dataset != '':
dataset = "FILTER (?Dataset = '{}') .".format(dataset)
query = PREFIXES + """
SELECT ?Pipeline ?Dataset ?Author ?Written_on ?Number_of_votes ?Score
WHERE
{
?pipeline_id rdf:type kglids:Pipeline ;
pipeline:hasVotes ?Number_of_votes ;
rdfs:label ?Pipeline ;
pipeline:isWrittenOn ?Written_on ;
pipeline:isWrittenBy ?Author ;
pipeline:hasScore ?Score ;
kglids:isPartOf ?Dataset_id .
?Dataset_id schema:name ?Dataset .
%s
} ORDER BY DESC(?Written_on) LIMIT 1
""" % dataset
if show_query:
print(query)
return execute_query(config, query)
def get_top_k_scoring_pipelines_for_dataset(config, dataset, k, show_query):
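    """Return the k highest-scoring pipelines, optionally filtered by dataset."""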
    k = 'LIMIT ' + str(k) if k is not None else ''
if dataset != '':
dataset = "FILTER (?Dataset = '{}') .".format(dataset)
query = PREFIXES + """
SELECT ?Pipeline ?Dataset ?Author ?Written_on ?Number_of_votes ?Score
WHERE
{
?pipeline_id rdf:type kglids:Pipeline ;
pipeline:hasVotes ?Number_of_votes ;
rdfs:label ?Pipeline ;
pipeline:isWrittenOn ?Written_on ;
pipeline:isWrittenBy ?Author ;
pipeline:hasScore ?Score ;
kglids:isPartOf ?Dataset_id .
?Dataset_id schema:name ?Dataset .
%s
} ORDER BY DESC(?Score) %s
""" % (dataset, k)
if show_query:
print(query)
return execute_query(config, query)
# Classifier IRIs in the lib: namespace; assumes <http://kglids.org/resource/library/>
# as the base, with dotted module paths written using '/' (matching how callsLibrary
# objects are parsed elsewhere in this module).
CLASSIFIERS = {'RandomForestClassifier': '<http://kglids.org/resource/library/sklearn/ensemble/RandomForestClassifier>',
               'SVC': '<http://kglids.org/resource/library/sklearn/svm/SVC>',
               'KNeighborsClassifier': '<http://kglids.org/resource/library/sklearn/neighbors/KNeighborsClassifier>',
               'GradientBoostingClassifier': '<http://kglids.org/resource/library/sklearn/ensemble/GradientBoostingClassifier>',
               'LogisticRegression': '<http://kglids.org/resource/library/sklearn/linear_model/LogisticRegression>',
               'DecisionTreeClassifier': '<http://kglids.org/resource/library/sklearn/tree/DecisionTreeClassifier>',
               'AdaBoostClassifier': '<http://kglids.org/resource/library/sklearn/ensemble/AdaBoostClassifier>',
               'SGDClassifier': '<http://kglids.org/resource/library/sklearn/linear_model/SGDClassifier>',
               'MLPClassifier': '<http://kglids.org/resource/library/sklearn/neural_network/MLPClassifier>',
               'XGBClassifier': '<http://kglids.org/resource/library/xgboost/XGBClassifier>',
               'VotingClassifier': '<http://kglids.org/resource/library/sklearn/ensemble/VotingClassifier>',
               'PassiveAggressiveClassifier': '<http://kglids.org/resource/library/sklearn/linear_model/PassiveAggressiveClassifier>',
               'BaggingClassifier': '<http://kglids.org/resource/library/sklearn/ensemble/BaggingClassifier>',
               'RidgeClassifier': '<http://kglids.org/resource/library/sklearn/linear_model/RidgeClassifier>',
               'RadiusNeighborsClassifier': '<http://kglids.org/resource/library/sklearn/neighbors/RadiusNeighborsClassifier>',
               'ExtraTreesClassifier': '<http://kglids.org/resource/library/sklearn/ensemble/ExtraTreesClassifier>',
               'TFDistilBertForSequenceClassification': '<http://kglids.org/resource/library/transformers/TFDistilBertForSequenceClassification>'}
def search_classifier(config, dataset, show_query):
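    """Find pipelines that call any classifier in CLASSIFIERS, with dataset and score,
    built as one UNION block per classifier.
    """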
sub_graph_query = """
"""
for classifier, classifier_url in CLASSIFIERS.items():
if classifier != 'RandomForestClassifier':
query = """
UNION
{
?Statement_number pipeline:callsLibrary %s.
BIND('%s' as ?Classifier)
}
""" % (classifier_url, classifier)
sub_graph_query = sub_graph_query + query
filter_for_dataset = ''
if dataset != '':
filter_for_dataset = '?Dataset_id schema:name "{}".'.format(dataset)
query = PREFIXES + """
SELECT DISTINCT ?Dataset ?Pipeline ?Classifier ?Score
WHERE
{
?Dataset_id rdf:type kglids:Dataset ;
schema:name ?Dataset .
?Pipeline_id kglids:isPartOf ?Dataset_id ;
rdfs:label ?Pipeline ;
pipeline:hasScore ?Score .
graph ?Pipeline_id
{
?x pipeline:callsLibrary ?y
{
?Statement_number pipeline:callsLibrary .
BIND('RandomForestClassifier' as ?Classifier)
}""" + sub_graph_query + """
}
%s
} ORDER BY DESC(?Score) """ % filter_for_dataset
if show_query:
print(query)
return execute_query(config, query)
def get_hyperparameters(config, pipeline, classifier, show_query):
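    """Return the hyperparameter settings recorded (as RDF-star annotations) for
    `classifier` in the given pipeline, or a default-configuration message if none.
    """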
classifier_url = CLASSIFIERS.get(classifier)
parameter_heading = '?{}_hyperparameter'.format(classifier)
query = PREFIXES + """
SELECT DISTINCT %s ?Value
WHERE
{
?Pipeline_id rdfs:label '%s' ;
pipeline:hasScore ?Score .
graph ?Pipeline_id
{
?Statement_number pipeline:callsLibrary %s .
<< ?Statement_number pipeline:hasParameter %s >> pipeline:withParameterValue ?Value .
}
} ORDER BY DESC(?Score)""" % (parameter_heading, pipeline, classifier_url, parameter_heading)
if show_query:
print(query)
df = execute_query(config, query)
if np.shape(df)[0] == 0:
return 'Using default configurations'
else:
return df
def get_library_usage(config, dataset, k, show_query):
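    """Plot the number of pipelines calling each library (top-k) and save the chart
    to library_usage_stats.pdf; `dataset` optionally restricts the scope.
    """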
if dataset != '':
dataset = '?Dataset schema:name "{}" .\n\t\t' \
'?Pipeline kglids:isPartOf ?Dataset .'.format(dataset)
query = PREFIXES + """
SELECT ?Library (COUNT(distinct ?Pipeline) as ?Usage)
WHERE
{
%s
?Pipeline rdf:type kglids:Pipeline .
GRAPH ?Pipeline
{
?Statement pipeline:callsLibrary ?l .
BIND(STRAFTER(str(?l), str(lib:)) as ?l1) .
BIND(STRBEFORE(str(?l1), str('/')) as ?Library) .
}
FILTER (?Library != "") .
FILTER (?Library != "builtin") .
} GROUP BY ?Library ORDER BY DESC(?Usage)
""" % dataset
if show_query:
print(query)
    df = execute_query(config, query)
    if len(df) == 0:
        print("No library found")
        return
    # SPARQL bindings come back as strings; cast before arithmetic and plotting
    df['Usage'] = df['Usage'].astype(int)
    df['Usage (in %)'] = df['Usage'] / df['Usage'].sum() * 100
    df = df.head(k)
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(range(len(df)), df['Usage'].tolist(), color='mediumseagreen')
ax.set_xticks(range(len(df)))
ax.set_xticklabels(df['Library'].tolist(), rotation=20)
ax.set_ylabel('Number of Pipelines')
for i, usage in enumerate(df['Usage']):
ax.text(i-0.35, usage + 200, str(usage), color='k', fontweight='bold')
ax.set_ylim(0, df['Usage'].max() + 2000)
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('library_usage_stats.pdf')
plt.show()
def get_top_used_libraries(config, task, show_query):
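    """List distinct library modules used for a task ('classification', 'clustering',
    'visualization'; anything else maps to regression), with the pipeline and dataset.
    """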
if task == 'classification':
task = 'classifi'
elif task == 'clustering':
task = 'cluster'
elif task == 'visualization':
task = 'plot'
else:
task = 'regress'
query = PREFIXES + """
SELECT DISTINCT ?Library ?Module ?Pipeline ?Dataset
WHERE
{
?Pipeline_id rdf:type kglids:Pipeline ;
rdfs:label ?Pipeline ;
kglids:isPartOf ?dataset_id .
?dataset_id schema:name ?Dataset .
GRAPH ?Pipeline_id
{
?statement pipeline:callsLibrary ?l .
BIND(STRAFTER(str(?l), str(lib:)) as ?l1) .
BIND(STRBEFORE(str(?l1), str('/')) as ?Library) .
BIND(STRAFTER(str(?l), str(?Library)) as ?m) .
BIND(STRAFTER(str(?m), str('/')) as ?Module) .
}
        FILTER(regex(str(?l), "%s", "i"))
}""" % task
if show_query:
print(query)
return execute_query(config, query)
def get_pipelines_calling_libraries(config, components, show_query):
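    """Find pipelines that call all of the given components, e.g.
    components=['sklearn.ensemble.RandomForestClassifier'] (illustrative).
    """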
sub_query = ''
for i in range(len(components)):
        # assumes component IRIs live under the lib: namespace, '/'-separated
        sub_query = sub_query + \
                    '?Statement_{} pipeline:callsLibrary <http://kglids.org/resource/library/{}> .\n' \
                    .format(i + 1, components[i].replace('.', '/'))
query = PREFIXES + """
SELECT DISTINCT ?Pipeline ?Dataset ?Author ?Score ?Number_of_votes
WHERE
{
?Pipeline_id rdf:type kglids:Pipeline ;
pipeline:hasVotes ?Number_of_votes ;
rdfs:label ?Pipeline ;
pipeline:isWrittenOn ?Written_on ;
pipeline:isWrittenBy ?Author ;
pipeline:hasScore ?Score ;
                     kglids:isPartOf ?Dataset_id .
        graph ?Pipeline_id
{
%s
}
?Dataset_id schema:name ?Dataset .
} ORDER BY DESC(?Score)""" % sub_query
if show_query:
print(query)
return execute_query(config, query)
def get_pipelines_for_deep_learning(config, show_query):
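    """List pipelines tagged with 'deep learning', ordered by score."""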
query = PREFIXES + """
SELECT DISTINCT ?Pipeline ?Dataset ?Author ?Written_on ?Score ?Number_of_votes
WHERE
{
?Pipeline_id rdf:type kglids:Pipeline ;
pipeline:hasVotes ?Number_of_votes ;
rdfs:label ?Pipeline ;
pipeline:isWrittenOn ?Written_on ;
pipeline:isWrittenBy ?Author ;
pipeline:hasScore ?Score ;
pipeline:hasTag ?Tag ;
kglids:isPartOf ?Dataset_id .
?Dataset_id schema:name ?Dataset .
FILTER(regex(?Tag, "deep learning", "i"))
} ORDER BY DESC(?Score)"""
if show_query:
print(query)
return execute_query(config, query)
def recommend_transformations(config, show_query):
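    """Recommend preprocessing transformations seen in pipelines, with the table and
    dataset they were applied to, ordered by pipeline score.
    """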
query = PREFIXES + """
SELECT DISTINCT ?Transformation ?Pipeline (?Table_id as ?Table) ?Dataset
WHERE
{
?Pipeline_id rdf:type kglids:Pipeline ;
pipeline:hasVotes ?Number_of_votes ;
rdfs:label ?Pipeline ;
pipeline:isWrittenOn ?Written_on ;
pipeline:isWrittenBy ?Author ;
pipeline:hasScore ?Score ;
pipeline:hasTag ?Tag ;
kglids:isPartOf ?Dataset_id .
?Dataset_id schema:name ?Dataset .
graph ?Pipeline_id
{
?s pipeline:callsLibrary ?l .
BIND(STRAFTER(str(?l), str(lib:)) as ?Transformation) .
?Table_id rdf:type kglids:Table .
}
        FILTER(regex(str(?l), "preprocessing", "i"))
} ORDER BY DESC(?Score) """
if show_query:
print(query)
df = execute_query(config, query)
df['Table'] = df['Table'].apply(lambda x: x.rsplit('/', 1)[-1])
df['Transformation'] = df['Transformation'].apply(lambda x: x.replace('/', '.'))
return df
def get_pipelines_by_tags(config, tag, show_query):
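    """Count pipelines per tag; `tag` optionally filters tags by regex."""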
if tag != '':
tag = 'FILTER(regex(?Tag, "{}", "i"))'.format(tag)
query = PREFIXES + """
SELECT DISTINCT ?Tag (COUNT (?Pipeline_id) AS ?Number_of_pipelines)
WHERE
{
?Pipeline_id rdf:type kglids:Pipeline ;
pipeline:hasTag ?Tag .
%s
} GROUP BY ?Tag ORDER BY DESC(?Number_of_pipelines)
""" % tag
if show_query:
print(query)
return execute_query(config, query)
def plot_top_k_classifiers(config, k, show_query):
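    """Plot the usage share of the k most-used classifier modules."""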
query = PREFIXES + """
SELECT DISTINCT ?Module (COUNT (?Module) as ?Usage)
WHERE
{
?Pipeline_id rdf:type kglids:Pipeline ;
rdfs:label ?Pipeline ;
kglids:isPartOf ?dataset_id .
?dataset_id schema:name ?Dataset .
GRAPH ?Pipeline_id
{
?statement pipeline:callsLibrary ?l .
BIND(STRAFTER(str(?l), str(lib:)) as ?l1) .
BIND(STRBEFORE(str(?l1), str('/')) as ?Library) .
BIND(STRAFTER(str(?l), str(?Library)) as ?m) .
BIND(STRAFTER(str(?m), str('/')) as ?Module) .
}
        FILTER(regex(str(?l), "classifier", "i"))
        FILTER (!regex(str(?l), "report", "i"))
        FILTER (!regex(str(?l), "ROCAUC", "i"))
        FILTER (!regex(str(?l), "threshold", "i"))
} GROUP BY ?Module ORDER BY DESC(?Usage)"""
if show_query:
print(query)
    df = execute_query(config, query)
    if len(df) == 0:
        print("No classifier found")
        return
    # SPARQL bindings come back as strings; cast before computing shares
    df['Usage'] = df['Usage'].astype(int)
    df['Usage (in %)'] = df['Usage'] / df['Usage'].sum() * 100
    df['Classifier'] = df['Module'].apply(lambda x: x.rsplit('/', 1)[-1])
    if len(df) < k:
        print('Maximum {} classifier(s) were found'.format(len(df)))
        print('Showing top-{} classifiers'.format(len(df)))
        k = len(df)
    df = df.head(k)
plt.rcParams['figure.figsize'] = 10, 5
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
sns.set_theme(style='darkgrid')
ax = sns.barplot(x="Classifier", y="Usage (in %)", data=df, palette='viridis')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40)
def plot_top_k_regressors(config, k, show_query):
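    """Plot the usage share of the k most-used regressor modules."""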
query = PREFIXES + """
SELECT DISTINCT ?Module (COUNT (?Module) as ?Usage)
WHERE
{
?Pipeline_id rdf:type kglids:Pipeline ;
rdfs:label ?Pipeline ;
kglids:isPartOf ?dataset_id .
?dataset_id schema:name ?Dataset .
GRAPH ?Pipeline_id
{
?statement pipeline:callsLibrary ?l .
BIND(STRAFTER(str(?l), str(lib:)) as ?l1) .
BIND(STRBEFORE(str(?l1), str('/')) as ?Library) .
BIND(STRAFTER(str(?l), str(?Library)) as ?m) .
BIND(STRAFTER(str(?m), str('/')) as ?Module) .
}
        FILTER(regex(str(?l), "regres", "i"))
} GROUP BY ?Module ORDER BY DESC(?Usage)"""
if show_query:
print(query)
    df = execute_query(config, query)
    if len(df) == 0:
        print("No regressor found")
        return
    # SPARQL bindings come back as strings; cast before computing shares
    df['Usage'] = df['Usage'].astype(int)
    df['Usage (in %)'] = df['Usage'] / df['Usage'].sum() * 100
    df['Regressor'] = df['Module'].apply(lambda x: x.rsplit('/', 1)[-1])
    if len(df) < k:
        print('Maximum {} regressor(s) were found'.format(len(df)))
        print('Showing top-{} regressors'.format(len(df)))
        k = len(df)
    df = df.head(k)
plt.rcParams['figure.figsize'] = 10, 5
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
sns.set_theme(style='darkgrid')
ax = sns.barplot(x="Regressor", y="Usage (in %)", data=df, palette='viridis')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40)