# node2vec-LMIA.py
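# Link-membership inference attack (LMIA) against node2vec: train node2vec on a
# partial graph, build similarity features for train (member) and test
# (non-member) edges, then attack with KMeans clustering and MLP/RF/SVM
# classifiers, logging per-edge degrees and gender attributes to CSV.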
from __future__ import division
import networkx as nx
import numpy as np
import pickle as pk
import os
from gae.preprocessing import mask_test_edges
import link_prediction_scores
import pandas as pd
import sys
import random  # used for negative-edge sampling in other_edge_generate3

from bi_samplingv3 import *

sys.setrecursionlimit(1000000)


def get_edge_types(edges):
    """Collect the set of (gender, gender) pair types present in `edges`.

    Relies on the module-level graph `g`; each pair is ordered so (t1, t2)
    and (t2, t1) count as the same type.
    """
    typelist = set()
    for edge in edges:
        e1, e2 = edge[0], edge[1]
        t1 = g.nodes[e1]['gender']
        t2 = g.nodes[e2]['gender']
        etype = (t1, t2) if t1 <= t2 else (t2, t1)
        typelist.add(etype)
    return typelist


def qda_test(train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false, val_preds,
             test_preds, f):
    """Dump per-edge records for the train/val/test splits to CSV.

    Row format: node1, node2, gender1, gender2, grt (ground truth),
    score (prediction), biscore (binarized prediction).
    """
    train_format = []
    val_format = []
    test_format = []

    # Stack true and false edges; the first len(*_edges) rows are true edges.
    train_all = np.vstack((train_edges, train_edges_false))
    val_all = np.vstack((val_edges, val_edges_false))
    test_all = np.vstack((test_edges, test_edges_false))

    for i in range(np.shape(train_all)[0]):
        # Rows with index >= len(train_edges) are the stacked false edges.
        grt = 1
        if i >= np.shape(train_edges)[0]:
            grt = 0
        train_f = [train_all[i][0], train_all[i][1], g.nodes[train_all[i][0]]['gender'],
                   g.nodes[train_all[i][1]]['gender'], grt, grt, grt]
        train_format.append(train_f)

    name = ['node1', 'node2', 'gender1', 'gender2', 'grt', 'score', 'biscore']
    result = pd.DataFrame(columns=name, data=train_format)
    result.to_csv("E:\\python\\banlance\\code\\{2}\\train{0}_{1}.csv".format(ego_user, f, DATASET))
    # result.to_csv("/Users/xiulingwang/Downloads/google+-raw-data/result/train{0}_{1}.csv".format(ego_user, f))

    for i in range(np.shape(val_all)[0]):
        grt = 1
        if i >= np.shape(val_edges)[0]:
            grt = 0
        if val_preds[i] >= 0.5:
            bi_score = 1
        else:
            bi_score = 0
        val_f = [val_all[i][0], val_all[i][1], g.nodes[val_all[i][0]]['gender'], g.nodes[val_all[i][1]]['gender'], grt,
                 val_preds[i], bi_score]
        val_format.append(val_f)

    name = ['node1', 'node2', 'gender1', 'gender2', 'grt', 'score', 'biscore']
    result = pd.DataFrame(columns=name, data=val_format)
    result.to_csv("E:\\python\\banlance\\code\\{2}\\val{0}_{1}.csv".format(ego_user, f, DATASET, METHOD))
    # result.to_csv("/Users/xiulingwang/Downloads/google+-raw-data/result/val{0}_{1}.csv".format(ego_user, f))

    for i in range(np.shape(test_all)[0]):
        grt = 1
        if i >= np.shape(test_edges)[0]:
            grt = 0
        if test_preds[i] >= 0.5:
            bi_score = 1
        else:
            bi_score = 0
        test_f = [test_all[i][0], test_all[i][1], g.nodes[test_all[i][0]]['gender'], g.nodes[test_all[i][1]]['gender'],
                  grt, test_preds[i], bi_score]
        test_format.append(test_f)

    name = ['node1', 'node2', 'gender1', 'gender2', 'grt', 'score', 'biscore']
    result = pd.DataFrame(columns=name, data=test_format)
    result.to_csv("E:\\python\\banlance\\code\\{2}\\test{0}_{1}.csv".format(ego_user, f, DATASET, METHOD))
    # result.to_csv("/Users/xiulingwang/Downloads/google+-raw-data/result/{1}/test{0}_{1}.csv".format(ego_user, f))

    # each row: node1, node2, gender1, gender2, grt, score, biscore
    return train_format, val_format, test_format
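
# NOTE: qda_test is not called in this script; a hypothetical invocation, given
# predictions from the link-prediction step, would be:
#   qda_test(train_edges, train_edges_false, val_edges, val_edges_false,
#            test_edges, test_edges_false, val_preds, test_preds, F)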

def other_edge_generate3(other_edge_len, adj, train_edges_list, test_edges):
    """Sample `other_edge_len` node pairs that are neither train nor test edges."""

    num_nodes = np.shape(adj)[0]
    print(num_nodes)

    # Enumerate all unordered node pairs (i, j) with i < j.
    edgeall = set()
    for i in range(num_nodes):
        for j in range(i + 1, num_nodes):
            edgeall.add((i, j))
    print(len(edgeall))
    print(np.shape(train_edges_list))
    # Remove train edges; normalize each pair to (min, max) so it matches edgeall.
    for edge in train_edges_list:
        if edge[0] == edge[1]:
            continue
        eg = (min(edge[0], edge[1]), max(edge[0], edge[1]))
        if eg in edgeall:
            edgeall.remove(eg)

    print(len(edgeall))

    # Remove test edges the same way.
    for edge in test_edges:
        if edge[0] == edge[1]:
            continue
        eg = (min(edge[0], edge[1]), max(edge[0], edge[1]))
        if eg in edgeall:
            edgeall.remove(eg)


    edgeall = [list(edge_tuple) for edge_tuple in edgeall]
    print(np.shape(edgeall))
    print(other_edge_len)

    # Sample the requested number of non-edges uniformly at random.
    other_edges = random.sample(edgeall, other_edge_len)
    other_edges_false = np.array(other_edges)

    return other_edges_false
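
# NOTE: other_edge_generate3 is not invoked below; a hypothetical call to sample
# negatives disjoint from both edge sets would be:
#   other_edges_false = other_edge_generate3(len(test_edges), adj,
#                                            train_edges_sampled, test_edges)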




seds = [1]  # random seeds for the train/test split

G_EGO_USERS = ['dblp', 'fb', 'google+', 'cora', 'citeseer', 'pubmed', 'lastfm']

combs=[1,2,3,4,5,6,7]
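# comb IDs map to similarity-feature sets: 1 dot, 2 cos, 3 euclidean,
# 4 dot+cos, 5 dot+eu, 6 cos+eu, 7 dot+cos+eu (see the branches below).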
for sed in seds:
    METHOD = 'graphleaks%s'%(sed)
    for ego_user in G_EGO_USERS:

        feat_dir = './data/' + str(ego_user) + '-adj-feat.pkl'
        DATASET = ego_user  # NOTE: DATASET was undefined in the original; assumed to name the current dataset

        f2 = open(feat_dir, 'rb')

        adj, ft = pk.load(f2, encoding='latin1')

        g = nx.Graph(adj)


        if ego_user == 'dblp' or ego_user == 'google+':

            gindex = 0
            for i, n in enumerate(g.nodes()):
                if ft[n][gindex] == 0:
                    ginfo = 1  # male
                elif ft[n][gindex] == 1:
                    ginfo = 2  # female
                else:
                    print('***')
                    ginfo = 0  # unknown gender

                g.nodes[n]['gender'] = ginfo


        elif ego_user == 'fb':

            gindex = 77
            for i, n in enumerate(g.nodes()):
                if ft[n][gindex] == 1 and ft[n][gindex + 1] != 1:
                    ginfo = 1  # male
                elif ft[n][gindex + 1] == 1 and ft[n][gindex] != 1:
                    ginfo = 2  # female
                else:
                    print('***')
                    ginfo = 0  # unknown gender

                g.nodes[n]['gender'] = ginfo

        else:

            with open('./data/' + str(ego_user) + '-target.txt') as tfile:
                Lines = tfile.readlines()
                target = []
                for line in Lines:
                    arr = line.strip().split(',')
                    target.append(int(arr[1]))

            for i, n in enumerate(g.nodes()):
                g.nodes[n]['gender'] = target[n]

        np.random.seed(sed)
        adj_sparse = nx.to_scipy_sparse_matrix(g)

        # Perform train-test split
        train_test_split = mask_test_edges(adj_sparse, test_frac=.3, val_frac=0)
        adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = train_test_split  # Unpack train-test split
        g_train = nx.from_scipy_sparse_matrix(
            adj_train)  # new graph object with only non-hidden edges, keep all the original nodes

        dp = 0  # dp=0: original, dp=1: differential, dp=5: adversarial
        sigma = 48
        if dp==1:
            F = 'n2v-'+str(ego_user)+str(dp)+str(sigma)+str(sed)

        else:
            F = 'n2v-'+str(ego_user) + str(dp)+str(sed)


        res_dir = './data/'

        out = open('%s/%s-fair-%s-train.txt' % (res_dir, str(ego_user), F), 'w')
        for item in train_edges:
            for jtem in item:
                out.write(str(jtem) + '\t')
            out.write('\n')
        out.close()

        train_edge_labels, test_edge_labels, emb_matrix, train_sim_matrix, test_sim_matrix, train_edge_embs, test_edge_embs, train_embs_1, train_embs_2, test_embs_1, test_embs_2, train_edges_sampled= link_prediction_scores.node2vec_scores8(
            g_train, train_test_split, DATASET, METHOD, F,dp,res_dir,ego_user,sigma,
            P=0.25,  # Return hyperparameter
            Q=4,  # In-out hyperparameter
            WINDOW_SIZE=10,  # Context size for optimization
            NUM_WALKS=10,  # Number of walks per source
            WALK_LENGTH=80,  # Length of walk per source
            DIMENSIONS=256,  # Embedding dimension
            DIRECTED=False,  # Graph directed/undirected
            WORKERS=1,  # Num. parallel workers
            ITER=3,  # SGD epochs
            edge_score_mode="edge-emb",  # Whether to use bootstrapped edge embeddings + LogReg (like in node2vec paper),
            # or simple dot-product (like in GAE paper) for edge scoring
            verbose=1,
            Ego_user=ego_user,
        )
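        # Judging from the comb branches below, train_sim_matrix/test_sim_matrix
        # hold one row per (sampled) train/test edge with columns
        # [dot, cos, euclidean] similarity of the two endpoint embeddings.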
        # print(n2v_scores)

        train_edges_list = np.array(train_edges_sampled)
        # print(train_edges_list)
        test_edges_list = test_edges
        # print(test_edges_list)

        edges_list = np.concatenate((train_edges_list, test_edges_list), axis=0)

        ylabel = [1] * train_sim_matrix.shape[0] + [0] * test_sim_matrix.shape[0]

        for comb in combs:
            if comb == 1:  # dot
                sim_matrix_train = train_sim_matrix[:, 0].reshape(-1, 1)
                sim_matrix_test = test_sim_matrix[:, 0].reshape(-1, 1)
            elif comb == 2:  # cos
                sim_matrix_train = train_sim_matrix[:, 1].reshape(-1, 1)
                sim_matrix_test = test_sim_matrix[:, 1].reshape(-1, 1)
            elif comb == 3:  # eu
                sim_matrix_train = train_sim_matrix[:, 2].reshape(-1, 1)
                sim_matrix_test = test_sim_matrix[:, 2].reshape(-1, 1)
            elif comb == 4:  # dot+cos
                sim_matrix_train = train_sim_matrix[:, 0:2]
                sim_matrix_test = test_sim_matrix[:, 0:2]
            elif comb == 5:  # dot+eu
                sim_matrix_train = train_sim_matrix[:, [0, 2]]
                sim_matrix_test = test_sim_matrix[:, [0, 2]]
            elif comb == 6:  # cos+eu
                sim_matrix_train = train_sim_matrix[:, 1:3]
                sim_matrix_test = test_sim_matrix[:, 1:3]
            else:  # comb == 7: dot+cos+eu
                sim_matrix_train = train_sim_matrix[:, 0:3]
                sim_matrix_test = test_sim_matrix[:, 0:3]

            sim_matrix = np.concatenate((sim_matrix_train, sim_matrix_test), axis=0)


            from sklearn.cluster import KMeans
            from sklearn.metrics import accuracy_score
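            # Unsupervised attack: 2-means clustering over the similarity features,
            # treating the clusters as member vs. non-member edges. KMeans is
            # restarted with 500 seeds and the best-scoring run is kept, since
            # cluster IDs are arbitrary.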

            accuracy = []
            best_kmeans = None
            for i in range(500):
                kmeans = KMeans(n_clusters=2, random_state=i).fit(sim_matrix)
                acc = accuracy_score(ylabel, kmeans.labels_)
                if not accuracy or acc > max(accuracy):
                    best_kmeans = kmeans
                accuracy.append(acc)

            acc_kmeans_sim = max(accuracy)
            print(acc_kmeans_sim)
            # Analyze the best-scoring clustering rather than the last restart.
            kmeans = best_kmeans

            tsts=[]
            print(len(kmeans.labels_))
            for i in range(len(kmeans.labels_)):
                node1=edges_list[i][0]
                node2=edges_list[i][1]
                dgr1=g.degree(node1)
                dgr2 = g.degree(node2)
                gender1 = g.nodes[node1]['gender']
                gender2 = g.nodes[node2]['gender']

                tst = [kmeans.labels_[i], ylabel[i], node1, node2, dgr1, dgr2, gender1, gender2]
                tsts.append(tst)
            name = ['y_score', 'y_test_grd', 'node1', 'node2', 'dgr1', 'dgr2', 'gender1', 'gender2']
            result = pd.DataFrame(columns=name, data=tsts)
            result.to_csv("{}{}-kmeans_sim_{}.csv".format(res_dir, F,comb))

            cents = kmeans.cluster_centers_

            # Map cluster labels to centroids: KMeans assigns each point to its
            # nearest centroid, so the first label-0 point settles which centroid
            # belongs to cluster 0.
            cent0, cent1 = cents[0], cents[1]
            for l in range(len(kmeans.labels_)):
                dist0 = np.sqrt(np.sum(np.square(sim_matrix[l] - cents[0])))
                dist1 = np.sqrt(np.sum(np.square(sim_matrix[l] - cents[1])))
                if kmeans.labels_[l] == 0 and (dist0 < dist1):
                    cent0, cent1 = cents[0], cents[1]
                    break
                elif kmeans.labels_[l] == 0 and (dist0 > dist1):
                    cent0, cent1 = cents[1], cents[0]
                    break

            # Mean distance of each cluster's points to its own centroid.
            dis0 = []
            dis1 = []
            for l in range(len(kmeans.labels_)):
                if kmeans.labels_[l] == 0:
                    dis0.append(np.sqrt(np.sum(np.square(sim_matrix[l] - cent0))))
                else:
                    dis1.append(np.sqrt(np.sum(np.square(sim_matrix[l] - cent1))))

            dist0 = sum(dis0) / len(dis0)
            dist1 = sum(dis1) / len(dis1)
            from sklearn.model_selection import train_test_split as tt_split
            # (aliased so it does not clobber the `train_test_split` tuple
            # returned by mask_test_edges above)

            # Build (node1, node2, member-label) rows; integer dtype so node ids
            # can index the graph directly.
            y_label = np.zeros((np.shape(edges_list)[0], 3), dtype=int)
            for i in range(np.shape(edges_list)[0]):
                y_label[i][0] = edges_list[i][0]
                y_label[i][1] = edges_list[i][1]
                y_label[i][2] = ylabel[i]
            print(np.shape(y_label))

            y_label_train = np.zeros((np.shape(train_edges_list)[0], 3), dtype=int)
            for i in range(np.shape(train_edges_list)[0]):
                y_label_train[i][0] = train_edges_list[i][0]
                y_label_train[i][1] = train_edges_list[i][1]
                y_label_train[i][2] = 1
            print(np.shape(y_label_train))

            y_label_test = np.zeros((np.shape(test_edges_list)[0], 3), dtype=int)
            for i in range(np.shape(test_edges_list)[0]):
                y_label_test[i][0] = test_edges_list[i][0]
                y_label_test[i][1] = test_edges_list[i][1]
                y_label_test[i][2] = 0
            print(np.shape(y_label_test))



            # Split members and non-members separately so both classes appear in
            # the attack's train and test folds, then recombine.
            X_train_train, X_train_test, y_train_train, y_train_test = tt_split(
                sim_matrix_train, y_label_train, test_size=0.3, random_state=42)
            X_test_train, X_test_test, y_test_train, y_test_test = tt_split(
                sim_matrix_test, y_label_test, test_size=0.3, random_state=42)

            X_train = np.concatenate((X_train_train, X_test_train), axis=0)
            X_test = np.concatenate((X_train_test, X_test_test), axis=0)
            y_train = np.concatenate((y_train_train, y_test_train), axis=0)
            y_test = np.concatenate((y_train_test, y_test_test), axis=0)


            # X_train, X_test, y_train, y_test = tt_split(sim_matrix, y_label, test_size=0.3, random_state=42)
            #
            # # ######################################################################
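            # Supervised attacks: train classifiers to tell member (train) edges
            # from non-member (test) edges using the similarity features.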

            from sklearn import metrics
            from sklearn.neural_network import MLPClassifier

            mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16, 18), random_state=1,
                                max_iter=500)

            mlp.fit(X_train, y_train[:,2])

            print("Training set score: %f" % mlp.score(X_train, y_train[:,2]))
            print("Test set score: %f" % mlp.score(X_test, y_test[:,2]))

            y_score = mlp.predict(X_test)
            print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
            print(metrics.classification_report(y_test[:, 2], y_score, labels=[0, 1]))

            acc_mlp_sim = accuracy_score(y_score, y_test[:,2])

            tsts=[]
            for i in range(len(y_score)):
                node1=y_test[i][0]
                node2=y_test[i][1]
                dgr1=g.degree(node1)
                dgr2 = g.degree(node2)

                gender1 = g.nodes[node1]['gender']
                gender2 = g.nodes[node2]['gender']

                tst = [y_score[i], y_test[i][2], y_test[i][0], y_test[i][1], dgr1, dgr2, gender1, gender2]
                tsts.append(tst)
            name = ['y_score', 'y_test_grd', 'node1', 'node2', 'dgr1', 'dgr2', 'gender1', 'gender2']
            result = pd.DataFrame(columns=name, data=tsts)
            result.to_csv("{}{}-mlp_sim_{}.csv".format(res_dir, F,comb))

            # # ######################################################################
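            # Random-forest attack on the same similarity features.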

            from sklearn.ensemble import RandomForestClassifier

            rf = RandomForestClassifier(max_depth=150, random_state=0)
            rf.fit(X_train, y_train[:,2])

            print("Training set score: %f" % rf.score(X_train, y_train[:,2]))
            print("Test set score: %f" % rf.score(X_test, y_test[:,2]))

            y_score = rf.predict(X_test)
            print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
            print(metrics.classification_report(y_test[:, 2], y_score, labels=[0, 1]))

            acc_rf_sim = accuracy_score(y_score, y_test[:,2])


            tsts=[]
            for i in range(len(y_score)):
                node1=y_test[i][0]
                node2=y_test[i][1]
                dgr1=g.degree(node1)
                dgr2 = g.degree(node2)

                gender1 = g.nodes[node1]['gender']
                gender2 = g.nodes[node2]['gender']

                tst = [y_score[i], y_test[i][2], y_test[i][0], y_test[i][1], dgr1, dgr2, gender1, gender2]
                tsts.append(tst)
            name = ['y_score', 'y_test_grd', 'node1', 'node2', 'dgr1', 'dgr2', 'gender1', 'gender2']

            result = pd.DataFrame(columns=name, data=tsts)
            result.to_csv("{}{}-rf_sim_{}.csv".format(res_dir, F,comb))

            # # ######################################################################
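            # SVM attack on the same features (wrapped in OneVsRestClassifier,
            # though the membership label here is binary).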

            from sklearn.multiclass import OneVsRestClassifier
            from sklearn.svm import SVC

            svm = OneVsRestClassifier(SVC())
            svm.fit(X_train, y_train[:,2])

            print("Training set score: %f" % svm.score(X_train, y_train[:,2]))
            print("Test set score: %f" % svm.score(X_test, y_test[:,2]))

            y_score = svm.predict(X_test)
            print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
            print(metrics.classification_report(y_test[:, 2], y_score, labels=[0, 1]))

            acc_svm_sim = accuracy_score(y_score, y_test[:,2])


            tsts=[]
            for i in range(len(y_score)):
                node1=y_test[i][0]
                node2=y_test[i][1]
                dgr1=g.degree(node1)
                dgr2 = g.degree(node2)
                gender1 = g.nodes[node1]['gender']
                gender2 = g.nodes[node2]['gender']
                tst = [y_score[i], y_test[i][2], y_test[i][0], y_test[i][1], dgr1, dgr2, gender1, gender2]
                tsts.append(tst)
            name = ['y_score', 'y_test_grd', 'node1', 'node2', 'dgr1', 'dgr2', 'gender1', 'gender2']
            result = pd.DataFrame(columns=name, data=tsts)
            result.to_csv("{}{}-svm_sim_{}.csv".format(res_dir, F,comb))