# KGTOSA / DatasetTransformer / Yago_TSV_TO_Hetro_OGB.py
import pandas as pd
import gzip
import datetime
import os
import shutil


def compress_gz(f_path):
    """Write a gzip-compressed copy of f_path alongside the original (f_path + ".gz")."""
    with open(f_path, 'rb') as f_in, gzip.open(f_path + ".gz", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
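# Example: compress_gz("mapping/relidx2relname.csv") leaves both relidx2relname.csv
# and relidx2relname.csv.gz on disk; the OGB raw readers consume the .gz copies.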


###################### Zip Folder to OGB Format
# zip -r mag_ComputerProgramming_papers_venue_QM3.zip mag_ComputerProgramming_papers_venue_QM3/ -i '*.gz'
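# Output layout produced below, per dataset, following the OGB heterogeneous format:
#   <dataset_name>/mapping/    relidx2relname.csv, labelidx2labelname.csv, <type>_entidx2name.csv
#   <dataset_name>/raw/        nodetype-has-feat.csv, nodetype-has-label.csv, num-node-dict.csv,
#                              node-label/<type>/node-label.csv, relations/<s>___<rel>___<o>/edge.csv
#   <dataset_name>/split/time/ train.csv, valid.csv, test.csv, nodetype-has-split.csv
# Every CSV is also written as a .gz copy before the folder is zipped.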

if __name__ == '__main__':
    csv_path = "/shared_mnt/DBLP/dblp-2022-03-01_URI_Only.tsv"
    split_rel = "https://dblp.org/rdf/schema#yearOfEvent"
    target_rel = "https://dblp.org/rdf/schema#publishedIn"
    label_node = "rec"
    # prefix_uri="https://dblp.org"
    label_type = "conf"
    classification_usecase = "conf"
    filter_publication_years = 2015
    affaliations_Coverage_df = pd.read_csv("/shared_mnt/DBLP/Sparql_Sampling_conf/BDLP_Papers_Per_Affaliation_conf.csv")
    affaliations_Coverage_df = affaliations_Coverage_df[affaliations_Coverage_df["do_train"] == 1].reset_index(
        drop=True)
    sampledQueries = {
        "StarQuery": "",
        "BStarQuery": "",
        "PathQuery": "",
        "BPathQuery": ""
    }
    dic_results = {}
    use_FM = True
    split_by = {"folder_name": "time", "split_data_type": "int", "train": 2019, "valid": 2020, "test": 2021}
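    # split_by drives the time-based split below: yearOfEvent <= train (2019) -> train,
    # <= valid (2020) -> validation, <= test (2021) -> test.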
    for i, aff_row in affaliations_Coverage_df.iterrows():
        if i >= 0:  # always true as written; raise the threshold to resume from a given row
            for sample_key in sampledQueries.keys():

                start_t = datetime.datetime.now()
                if use_FM:
                    dataset_name = "dblp-2022-03-01_URI_Only"
                    g_tsv_df = pd.read_csv("/shared_mnt/DBLP/" + dataset_name + ".tsv", sep="\t", header=None)
                    try:
                        g_tsv_df = g_tsv_df.rename(columns={0: "s", 1: "p", 2: "o"})
                    except KeyError:
                        print("g_tsv_df columns=", g_tsv_df.columns)
                else:
                    dataset_name = "OBGN_QM_DBLP_" + classification_usecase + "_" + sample_key + "Usecase_" + str(
                        int(aff_row["Q_idx"])) + "_" + str(
                        str(aff_row["affiliation"]).strip().replace(" ", "_").replace("/", "_").replace(",", "_"))
                    g_tsv_df = pd.read_csv("/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + ".csv")
                    # print("g_tsv_df columns=",g_tsv_df)
                    try:
                        g_tsv_df = g_tsv_df.rename(columns={"subject": "s", "predicate": "p", "object": "o"})
                    except KeyError:
                        print("g_tsv_df columns=", g_tsv_df.columns)
                print("dataset_name=", dataset_name)
                # print("g_tsv_df columns=",g_tsv_df)
                dic_results[dataset_name] = {}
                dic_results[dataset_name]["q_idx"] = int(aff_row["Q_idx"])
                dic_results[dataset_name]["usecase"] = dataset_name
                dic_results[dataset_name]["sample_key"] = sample_key
                #################################Filter by years ######################
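                # Drop all triples of publications older than filter_publication_years (2015).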
                yearOfPublication_df = g_tsv_df[g_tsv_df["p"] == "https://dblp.org/rdf/schema#yearOfPublication"].copy()
                yearOfPublication_df["o"] = yearOfPublication_df["o"].astype("int64")
                remove_rec_years_lst = yearOfPublication_df[yearOfPublication_df["o"] < filter_publication_years][
                    "s"].unique().tolist()
                g_tsv_df = g_tsv_df[~g_tsv_df["s"].isin(remove_rec_years_lst)]
                ########################filter conf Only #############################
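                # Remove record subjects/objects of every non-target type (journals, reference,
                # books), keeping only the classification_usecase ("conf") records.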
                rec_types = ['journals', 'conf', 'reference', 'books']
                rec_types.remove(classification_usecase)
                for rec_type in rec_types:
                    remove_recs = g_tsv_df[g_tsv_df["s"].str.startswith(
                        "https://dblp.org/" + label_node + "/" + rec_type)]["s"].unique().tolist()
                    g_tsv_df = g_tsv_df[~g_tsv_df["s"].isin(remove_recs)]
                    g_tsv_df = g_tsv_df.dropna()
                    remove_recs = g_tsv_df[g_tsv_df["o"].str.startswith(
                        "https://dblp.org/" + label_node + "/" + rec_type)]["o"].unique().tolist()
                    g_tsv_df = g_tsv_df[~g_tsv_df["o"].isin(remove_recs)]
                ########################delete non target papers #####################
                rec_rels = ["https://dblp.org/rdf/schema#yearOfPublication",
                            "https://dblp.org/rdf/rdf-schema#label",
                            "http://www.w3.org/2000/01/rdf-schema#label",
                            "https://dblp.org/rdf/schema#title",
                            "https://dblp.org/rdf/schema#primaryElectronicEdition",
                            "https://dblp.org/rdf/schema#orderedCreators",
                            "http://www.w3.org/2002/07/owl#sameAs",
                            "https://dblp.org/rdf/schema#publishedInBook",
                            "https://dblp.org/rdf/schema#publishedInJournalVolume",
                            "https://dblp.org/rdf/schema#publishedInJournalVolumeIssue",
                            "https://dblp.org/rdf/schema#pagination",
                            "https://dblp.org/rdf/schema#numberOfCreators",
                            "https://dblp.org/rdf/schema#publishedInJournal",
                            "https://dblp.org/rdf/schema#otherElectronicEdition",
                            "https://dblp.org/rdf/schema#publicationNote",
                            "https://dblp.org/rdf/schema#isbn",
                            "https://dblp.org/rdf/schema#thesisAcceptedBySchool",
                            "https://dblp.org/rdf/schema#archivedElectronicEdition",
                            "https://dblp.org/rdf/schema#publishedBy",
                            'https://dblp.org/rdf/schema#authoredBy',
                            "https://dblp.org/rdf/schema#monthOfPublication",
                            "https://dblp.org/rdf/schema#publishedInSeries",
                            "https://dblp.org/rdf/schema#publishedInSeriesVolume",
                            "https://dblp.org/rdf/schema#publishedInBookChapter",
                            'https://dblp.org/rdf/schema#doi',
                            'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
                            'http://purl.org/spar/datacite/hasIdentifier',
                            'https://dblp.org/rdf/schema#listedOnTocPage',
                            'https://dblp.org/rdf/schema#publishedAsPartOf',
                            'https://dblp.org/rdf/schema#bibtexType',
                            'https://dblp.org/rdf/schema#editedBy',
                            'https://dblp.org/rdf/schema#wikidata',
                            split_rel]
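                # A record subject that carries any of rec_rels but has no target_rel
                # (publishedIn) triple is treated as a non-target paper and removed.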

                lst_targets = list(set(g_tsv_df[g_tsv_df["p"] == target_rel]["s"].tolist()))
                for rel in rec_rels:
                    rel_df = g_tsv_df[(g_tsv_df["p"] == rel) & (g_tsv_df["s"].str.startswith("https://dblp.org/rec/"))]
                    to_delete_papers = rel_df[~rel_df["s"].isin(lst_targets)]["s"].unique().tolist()
                    g_tsv_df = g_tsv_df[~g_tsv_df["s"].isin(to_delete_papers)]
                lst_targets = list(set(g_tsv_df[g_tsv_df["p"] == target_rel]["s"].tolist()))
                lst_targets2 = list(set(g_tsv_df[g_tsv_df["s"].str.startswith("https://dblp.org/rec/")]["s"].tolist()))
                targets_diff = list(set(lst_targets2).difference(lst_targets))
                # Predicates still attached to non-target rec subjects (kept for inspection).
                g_tsv_df_diff = g_tsv_df[g_tsv_df["s"].isin(targets_diff)]["p"].unique()
                ################################filter Objects ######################
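                # Also drop triples whose object is a rec/ URI that is not a target paper.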
                rel_df = g_tsv_df[g_tsv_df["o"].str.startswith("https://dblp.org/rec/")]
                to_delete_papers = rel_df[~rel_df["o"].isin(lst_targets)]["o"].unique().tolist()
                g_tsv_df = g_tsv_df[~g_tsv_df["o"].isin(to_delete_papers)]
                #####################################################################
                relations_lst = g_tsv_df["p"].unique().tolist()
                relations_lst.remove(split_rel)
                relations_lst.remove(target_rel)
                ################################write relations index ########################
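                # relidx2relname.csv: (rel idx, rel name), where rel name is the last
                # segment of the predicate URI.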
                relations_df = pd.DataFrame(relations_lst, columns=["rel name"])
                relations_df["rel name"] = relations_df["rel name"].apply(lambda x: str(x).split("/")[-1])
                relations_df["rel idx"] = relations_df.index
                relations_df = relations_df[["rel idx", "rel name"]]
                map_folder = "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/mapping"
                os.makedirs(map_folder, exist_ok=True)
                relations_df.to_csv(map_folder + "/relidx2relname.csv", index=None)
                compress_gz(map_folder + "/relidx2relname.csv")
                ############################### create target label index ########################
                label_idx_df = pd.DataFrame(
                    g_tsv_df[g_tsv_df["p"] == target_rel]["o"].apply(lambda x: str(x).strip()).unique().tolist(),
                    columns=["label name"])
                try:
                    label_idx_df["label name"] = label_idx_df["label name"].astype("int64")
                except (ValueError, TypeError):
                    label_idx_df["label name"] = label_idx_df["label name"].astype("str")
                label_idx_df = label_idx_df.sort_values(by=["label name"]).reset_index(drop=True)

                label_idx_df["label idx"] = label_idx_df.index
                label_idx_df = label_idx_df[["label idx", "label name"]]
                label_idx_df.to_csv(map_folder + "/labelidx2labelname.csv", index=None)
                compress_gz(map_folder + "/labelidx2labelname.csv")
                ###########################################prepare relations mapping#################################
                relations_entites_map = {}
                relations_dic = {}
                entites_dic = {}

                for rel in relations_lst:
                    rel_df = g_tsv_df[g_tsv_df["p"] == rel].reset_index(drop=True)
                    rel_df["s_type"] = rel_df["s"].apply(
                        lambda x: str(x).split("/")[3] if str(x).startswith("http") and len(str(x).split(
                            "/")) > 3 else "literal")  # get third parts from url i.e https://dblp.org/rec/journals/corr/abs-2111-03922 to be "rec"

                    rel_df = rel_df[~rel_df["s_type"].isin(["literal"])]
                    ##########################filter O types by uri pattern ################################
                    rel_df["o_type"] = rel_df["o"].apply(
                        lambda x: str(x).split("/")[3] if str(x).startswith("http") and len(
                            str(x).split("/")) > 3 else "literal")
                    if len(rel_df["o_type"].unique()) > 3:
                        if rel.startswith("http") and '#' in rel:  ## DOI special case
                            rel_df["o_type"] = rel.split("#")[-1]
                        else:
                            rel_df["o_type"] = "literal"

                    rel_df = rel_df[~rel_df["o_type"].isin(["literal"])]
                    rel_entity_types = rel_df[["s_type", "o_type"]].drop_duplicates()
                    list_rel_types = []
                    for idx, row in rel_entity_types.iterrows():
                        list_rel_types.append((row["s_type"], rel, row["o_type"]))

                    relations_entites_map[rel] = list_rel_types
                    if len(list_rel_types) > 2:
                        print(len(list_rel_types))
                    relations_dic[rel] = rel_df
                    for rel_pair in list_rel_types:
                        e1, _, e2 = rel_pair  # the tuple's rel equals the outer loop's rel; avoid shadowing it
                        if e1 != "literal" and e1 in entites_dic:
                            entites_dic[e1] = entites_dic[e1].union(set(rel_df[rel_df["s_type"] == e1]["s"].apply(
                                lambda x: str(x).split("/" + e1 + "/")[-1]).unique()))
                        elif e1 != "literal":
                            entites_dic[e1] = set(rel_df[rel_df["s_type"] == e1]["s"].apply(
                                lambda x: str(x).split("/" + e1 + "/")[-1]).unique())

                        if e2 != "literal" and e2 in entites_dic:
                            entites_dic[e2] = entites_dic[e2].union(set(rel_df[rel_df["o_type"] == e2]["o"].apply(
                                lambda x: str(x).split("/" + e2 + "/")[-1]).unique()))
                        elif e2 != "literal":
                            entites_dic[e2] = set(rel_df[rel_df["o_type"] == e2]["o"].apply(
                                lambda x: str(x).split("/" + e2 + "/")[-1]).unique())
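                # entites_dic now maps each node type to the set of entity-name suffixes
                # (the URI tail after "/<type>/") seen as subjects or objects.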

                ############################ write entities index #################################
                for key in list(entites_dic.keys()):
                    entites_dic[key] = pd.DataFrame(list(entites_dic[key]), columns=['ent name']).astype(
                        'str').sort_values(by="ent name").reset_index(drop=True)
                    entites_dic[key] = entites_dic[key].drop_duplicates()
                    entites_dic[key]["ent idx"] = entites_dic[key].index
                    entites_dic[key] = entites_dic[key][["ent idx", "ent name"]]
                    entites_dic[key + "_dic"] = pd.Series(entites_dic[key]["ent idx"].values,
                                                          index=entites_dic[key]["ent name"]).to_dict()
                    # print("key=",entites_dic[key+"_dic"])
                    map_folder = "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/mapping"
                    os.makedirs(map_folder, exist_ok=True)
                    entites_dic[key].to_csv(map_folder + "/" + key + "_entidx2name.csv", index=None)
                    compress_gz(map_folder + "/" + key + "_entidx2name.csv")
                #################### write nodes statistics ######################
                # The three stats lists share their first row (the node-type names) via
                # shallow copies; that header row is never mutated afterwards.
                lst_node_has_feat = [
                    [ent for ent in entites_dic.keys() if not str(ent).endswith("_dic")]]
                lst_node_has_label = lst_node_has_feat.copy()
                lst_num_node_dict = lst_node_has_feat.copy()
                lst_has_feat = []
                lst_has_label = []
                lst_num_node = []

                for entity in lst_node_has_feat[0]:
                    if str(entity) == str(label_node):
                        lst_has_label.append("True")
                        lst_has_feat.append("True")
                    else:
                        lst_has_label.append("False")
                        lst_has_feat.append("False")

                    # lst_has_feat.append("False")
                    lst_num_node.append(len(entites_dic[entity + "_dic"]))

                lst_node_has_feat.append(lst_has_feat)
                lst_node_has_label.append(lst_has_label)
                lst_num_node_dict.append(lst_num_node)

                lst_relations = []

                for key in list(relations_entites_map.keys()):
                    for elem in relations_entites_map[key]:
                        (e1, rel, e2) = elem
                        lst_relations.append([e1, str(rel).split("/")[-1], e2])

                map_folder = "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw"
                print("map_folder=", map_folder)
                os.makedirs(map_folder, exist_ok=True)

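                # Each of the three stats CSVs has two rows: node-type names, then the
                # per-type flag or count.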
                pd.DataFrame(lst_node_has_feat).to_csv(
                    "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/nodetype-has-feat.csv", header=None,
                    index=None)
                compress_gz("/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/nodetype-has-feat.csv")

                pd.DataFrame(lst_node_has_label).to_csv(
                    "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/nodetype-has-label.csv",
                    header=None, index=None)
                compress_gz("/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/nodetype-has-label.csv")

                pd.DataFrame(lst_num_node_dict).to_csv(
                    "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/num-node-dict.csv", header=None,
                    index=None)
                compress_gz("/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/num-node-dict.csv")

                ############################### create label (target) relation index ######################
                label_idx_dic = pd.Series(label_idx_df["label idx"].values, index=label_idx_df["label name"]).to_dict()
                labels_rel_df = g_tsv_df[g_tsv_df["p"] == target_rel].copy()
                label_type = labels_rel_df["s"].values[0]
                s_label_type = (label_type.split("/")[3] if label_type.startswith("http") and len(
                    label_type.split("/")) > 3 else "literal")
                labels_rel_df["s_idx"] = labels_rel_df["s"].apply(lambda x: str(x).split("/" + s_label_type + "/")[-1])
                labels_rel_df["s_idx"] = labels_rel_df["s_idx"].astype("str")
                labels_rel_df["s_idx"] = labels_rel_df["s_idx"].apply(
                    lambda x: entites_dic[s_label_type + "_dic"][x] if x in entites_dic[
                        s_label_type + "_dic"].keys() else -1)
                labels_rel_df_notfound = labels_rel_df[labels_rel_df["s_idx"] == -1]
                labels_rel_df = labels_rel_df[labels_rel_df["s_idx"] != -1]
                labels_rel_df = labels_rel_df.sort_values(by=["s_idx"]).reset_index(drop=True)
                o_label_type = labels_rel_df["o"].values[0]
                o_label_type = (o_label_type.split("/")[3] if o_label_type.startswith("http") and len(
                    o_label_type.split("/")) > 3 else "literal")
                labels_rel_df["o_idx"] = labels_rel_df["o"].apply(lambda x: str(x).split("/" + o_label_type + "/")[-1])
                labels_rel_df["o_idx"] = labels_rel_df["o_idx"].apply(lambda x: label_idx_dic[x])
                out_labels_df = labels_rel_df[["o_idx"]]
                map_folder = "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/node-label/" + s_label_type
                os.makedirs(map_folder, exist_ok=True)
                out_labels_df.to_csv(map_folder + "/node-label.csv", header=None, index=None)
                compress_gz(map_folder + "/node-label.csv")
                ###########################################split parts (train/test/validate)#########################
                split_df = g_tsv_df[g_tsv_df["p"] == split_rel].copy()
                label_type = str(split_df["s"].values[0])
                label_type = (label_type.split("/")[3] if label_type.startswith("http") and len(
                    label_type.split("/")) > 3 else "literal")
                try:
                    # NOTE: the entity dictionaries are keyed by strings, so this int64 branch
                    # maps everything to -1 unless the entity names are numeric; the string
                    # branch below is the usual path for DBLP rec names.
                    split_df["s"] = split_df["s"].apply(lambda x: str(x).split("/" + label_type + "/")[-1]).astype(
                        "int64").apply(
                        lambda x: entites_dic[label_type + "_dic"][x] if x in entites_dic[label_type + "_dic"] else -1)
                except (ValueError, TypeError):
                    split_df["s"] = split_df["s"].apply(lambda x: str(x).split("/" + label_type + "/")[-1]).astype(
                        "str").apply(
                        lambda x: entites_dic[label_type + "_dic"][x] if x in entites_dic[label_type + "_dic"] else -1)

                split_df = split_df[split_df["s"] != -1]
                split_df["o"] = split_df["o"].astype(split_by["split_data_type"])
                label_type_values_lst = list(entites_dic[label_type + "_dic"].values())
                split_df = split_df[split_df["s"].isin(label_type_values_lst)]
                split_df = split_df.sort_values(by=["s"]).reset_index(drop=True)

                train_df = split_df[split_df["o"] <= split_by["train"]]["s"]
                valid_df = split_df[(split_df["o"] > split_by["train"]) & (split_df["o"] <= split_by["valid"])]["s"]
                test_df = split_df[(split_df["o"] > split_by["valid"]) & (split_df["o"] <= split_by["test"])]["s"]

                map_folder = "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/split/" + split_by[
                    "folder_name"] + "/" + label_type
                os.makedirs(map_folder, exist_ok=True)
                train_df.to_csv(map_folder + "/train.csv", index=None, header=None)
                compress_gz(map_folder + "/train.csv")
                valid_df.to_csv(map_folder + "/valid.csv", index=None, header=None)
                compress_gz(map_folder + "/valid.csv")
                test_df.to_csv(map_folder + "/test.csv", index=None, header=None)
                compress_gz(map_folder + "/test.csv")
                ###################### create nodetype-has-split.csv#####################
                lst_node_has_split = [
                    [ent for ent in entites_dic.keys() if not str(ent).endswith("_dic")]]
                lst_has_split = []
                for node_type in lst_node_has_split[0]:
                    if node_type == label_type:
                        lst_has_split.append("True")
                    else:
                        lst_has_split.append("False")
                lst_node_has_split.append(lst_has_split)
                pd.DataFrame(lst_node_has_split).to_csv(
                    "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/split/" + split_by[
                        "folder_name"] + "/nodetype-has-split.csv", header=None, index=None)
                compress_gz("/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/split/" + split_by[
                    "folder_name"] + "/nodetype-has-split.csv")
                ############################ write entities relations #################################
                for rel in relations_dic:
                    for rel_list in relations_entites_map[rel]:
                        e1, rel, e2 = rel_list  # re-binds rel to the same predicate string as the outer key
                        ############
                        relations_dic[rel]["s_idx"] = relations_dic[rel]["s"].apply(
                            lambda x: str(x).split("/" + e1 + "/")[-1])
                        relations_dic[rel]["s_idx"] = relations_dic[rel]["s_idx"].apply(
                            lambda x: entites_dic[e1 + "_dic"][x] if x in entites_dic[e1 + "_dic"].keys() else -1)
                        relations_dic[rel] = relations_dic[rel][relations_dic[rel]["s_idx"] != -1]
                        ################
                        # relations_dic[rel]["o_keys"]=relations_dic[rel]["o"].apply(lambda x:x.split("/")[3] if x.startswith("http") and len(x.split("/")) > 3 else x)
                        relations_dic[rel]["o_idx"] = relations_dic[rel]["o"].apply(
                            lambda x: str(x).split("/" + e2 + "/")[-1])
                        relations_dic[rel]["o_idx"] = relations_dic[rel]["o_idx"].apply(
                            lambda x: entites_dic[e2 + "_dic"][x] if x in entites_dic[e2 + "_dic"].keys() else -1)
                        relations_dic[rel] = relations_dic[rel][relations_dic[rel]["o_idx"] != -1]

                        relations_dic[rel] = relations_dic[rel].sort_values(by="s_idx").reset_index(drop=True)
                        rel_out = relations_dic[rel][["s_idx", "o_idx"]].copy()
                        if len(rel_out) > 0:
                            map_folder = "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/relations/" + e1 + "___" + \
                                         rel.split("/")[-1] + "___" + e2
                            os.makedirs(map_folder, exist_ok=True)
                            rel_out.to_csv(map_folder + "/edge.csv", index=None, header=None)
                            compress_gz(map_folder + "/edge.csv")
                            ########## write relations num #################
                            f = open(map_folder + "/num-edge-list.csv", "w")
                            f.write(str(len(relations_dic[rel])))
                            f.close()
                            compress_gz(map_folder + "/num-edge-list.csv")
                            ##################### write relations idx #######################
                            rel_idx = relations_df[relations_df["rel name"] == rel.split("/")[-1]]["rel idx"].values[0]
                            rel_out["rel_idx"] = rel_idx
                            rel_idx_df = rel_out["rel_idx"]
                            rel_idx_df.to_csv(map_folder + "/edge_reltype.csv", header=None, index=None)
                            compress_gz(map_folder + "/edge_reltype.csv")
                        else:
                            lst_relations.remove([e1, str(rel).split("/")[-1], e2])

                # Write the final triplet-type list once, after all relations are processed.
                pd.DataFrame(lst_relations).to_csv(
                    "/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/triplet-type-list.csv",
                    header=None, index=None)
                compress_gz("/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name + "/raw/triplet-type-list.csv")
                ##################### Zip Folder ###############
                shutil.make_archive("/shared_mnt/DBLP/Sparql_Sampling_conf/" + dataset_name, 'zip',
                                    root_dir="/shared_mnt/DBLP/Sparql_Sampling_conf/", base_dir=dataset_name)
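                # Equivalent to the manual `zip -r ...` command noted at the top, except that
                # make_archive also includes the uncompressed CSVs (not only '*.gz').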
                # shutil.rmtree("/shared_mnt/DBLP/Sparql_Sampling_conf/"+dataset_name)
                end_t = datetime.datetime.now()
                print("DBLP_QM_conf_csv_to_Hetrog_time=", end_t - start_t, " sec.")
                if use_FM:
                    dic_results[dataset_name]["csv_to_Hetrog_time"] = (end_t - start_t).total_seconds()
                    pd.DataFrame(dic_results).transpose().to_csv(
                        "/shared_mnt/DBLP/Sparql_Sampling_conf/OGBN_BDLP_conf_FM_Uscases_CSVtoHetrog_times.csv",
                        index=False)
                    break
                else:
                    dic_results[dataset_name]["csv_to_Hetrog_time"] = (end_t - start_t).total_seconds()
                    pd.DataFrame(dic_results).transpose().to_csv(
                        "/shared_mnt/DBLP/Sparql_Sampling_conf/OGBN_BDLP_conf_QM_Uscases_CSVtoHetrog_times.csv",
                        index=False)
            if use_FM:
                break