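# Source path: LMIA / deepwalk / deepwalk-LMIA.py
# File: deepwalk-LMIA.py
# (A stray "Raw" token followed here; it was web-page residue from extraction, not code.)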
import networkx as nx
import numpy as np
import pickle as pk
import os
from gae.preprocessing import mask_test_edges
import deepwalk.deepwalk as DW
import pandas as pd
import sys
from bi_samplingv3 import *
import math

# Raise the interpreter recursion limit far above the default.
# NOTE(review): presumably required by a recursive routine in one of the
# imported project modules -- confirm before lowering.
sys.setrecursionlimit(1000000)

# Random seeds; the whole pipeline below runs once per seed.
seds=[1]
# Dataset names; each one is loaded from ./data/<name>-adj-feat.pkl.
G_EGO_USERS=['dblp','fb','google+','cora','citeseer','pubmed','lastfm']
# Identifiers of the similarity-feature combinations evaluated per dataset.
combs=[1,2,3,4,5,6,7]

for sed in seds:
    METHOD = 'bi-line%s' % (sed)

    for ego_user in G_EGO_USERS:

        feat_dir = './data/' + str(ego_user) + '-adj-feat.pkl'

        # The pickle holds an (adjacency, features) pair.  Opened through a
        # context manager so the handle is closed even if unpickling fails.
        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file -- only use data files from a trusted source.
        with open(feat_dir, 'rb') as f2:
            adj, ft = pk.load(f2, encoding='latin1')

        g = nx.Graph(adj)


        # Attach a 'gender' attribute to every node of g.  Where the value
        # comes from depends on the dataset being processed.
        if ego_user in ('dblp', 'google+'):

            # Feature column 0 carries the flag: 0 -> 1, 1 -> 2, otherwise 0.
            gender_col = 0
            for node in g.nodes():
                flag = ft[node][gender_col]
                if flag == 0:
                    label = 1  # male
                elif flag == 1:
                    label = 2  # female
                else:
                    print('***')
                    label = 0  # unknown gender

                g.nodes[node]['gender'] = label

        elif ego_user == 'fb':

            # Two adjacent indicator columns; exactly one of them must be set
            # for the node to receive a known label.
            male_col = 77
            female_col = male_col + 1
            for node in g.nodes():
                is_male = ft[node][male_col] == 1
                is_female = ft[node][female_col] == 1
                if is_male and not is_female:
                    label = 1  # male
                elif is_female and not is_male:
                    label = 2  # female
                else:
                    print('***')
                    label = 0  # unknown gender

                print(label)

                g.nodes[node]['gender'] = label

        else:

            # Remaining datasets: the label is the second comma-separated
            # field of each line in ./data/<name>-target.txt, and the line
            # position is used as the node id.
            with open('./data/' + str(ego_user) + '-target.txt') as tfile:
                target = [int(row.strip().split(',')[1]) for row in tfile]

            for node in g.nodes():
                g.nodes[node]['gender'] = target[node]

        # Seed NumPy's global RNG with the current seed before splitting.
        # NOTE(review): presumably mask_test_edges draws from np.random, which
        # would make the split reproducible per seed -- confirm in gae.preprocessing.
        np.random.seed(sed)

        # NOTE(review): nx.to_scipy_sparse_matrix / nx.from_scipy_sparse_matrix
        # were removed in NetworkX 3.0; this script needs NetworkX < 3.
        adj_sparse = nx.to_scipy_sparse_matrix(g)

        # Hold out 30% of the edges as test edges, no validation edges.
        # The tuple is kept under this name because it is later passed whole
        # to DW.deepwalk8.
        train_test_split = mask_test_edges(adj_sparse, test_frac=0.3, val_frac=0)
        (adj_train,
         train_edges, train_edges_false,
         val_edges, val_edges_false,
         test_edges, test_edges_false) = train_test_split

        # Graph rebuilt from the training adjacency returned by the split.
        g_train = nx.from_scipy_sparse_matrix(adj_train)

        dp=0 ###dp=0:original, dp=1:differential, dp=5:adversarial

        # sigma is passed to DW.deepwalk8 further down regardless of dp, so it
        # must be bound on every path.  Previously it was assigned only inside
        # the dp == 1 branch, leaving the name undefined for the configured
        # dp = 0.  It still only enters the run tag F when dp == 1.
        # NOTE(review): how deepwalk8 uses sigma when dp != 1 is not visible
        # here -- confirm that it is ignored in that case.
        sigma = 48

        # F tags every output file written for this dataset/configuration.
        if dp == 1:
            F = 'dw-' + str(ego_user) + '-' + str(dp) + '-' + str(sigma)
        else:
            F = 'dw-' + str(ego_user) + '-' + str(dp)

        # Directory that receives all result files.
        res_dir = './data/'

        node_list=g.nodes()

        # Normalise each training edge to (smaller id, larger id) and
        # de-duplicate through a set.
        edge_tuples0 = [(min(edge[0], edge[1]), max(edge[0], edge[1])) for edge in g_train.edges()]

        train_edges0 = set(edge_tuples0)  # initialize train_edges to have all edges
        train_edges1 = np.array([list(edge_tuple) for edge_tuple in train_edges0])

        # Write one edge per line, every value followed by a tab (trailing tab
        # included, exactly as before).  The context manager guarantees the
        # file is flushed and closed even if a write raises.
        with open('%s/%s-fair-%s-train.txt' % (res_dir, str(ego_user), F), 'w') as out:
            for item in train_edges1:
                out.write(''.join(str(jtem) + '\t' for jtem in item) + '\n')


        # Train the embedding and collect per-edge similarity features for
        # training edges and held-out test edges.
        # NOTE(review): DATASET is not defined anywhere in the visible file; it
        # must come from `from bi_samplingv3 import *` or this call raises
        # NameError -- confirm.
        (train_edge_labels, test_edge_labels, emb_matrix,
         train_sim_matrix, test_sim_matrix,
         train_edge_embs, test_edge_embs,
         train_embs_1, train_embs_2,
         test_embs_1, test_embs_2,
         train_edges_sampled) = DW.deepwalk8(
            g_train, train_test_split, DATASET, METHOD, res_dir, ego_user, F, dp, sigma)

        # Node pairs in the same order as the rows of the similarity matrices:
        # sampled training edges first, then the test edges.
        train_edges_list = np.array(train_edges_sampled)
        test_edges_list = test_edges
        edges_list = np.concatenate((train_edges_list, test_edges_list), axis=0)

        # Membership ground truth: 1 for every training-edge row, 0 for every
        # test-edge row.
        n_train_rows = train_sim_matrix.shape[0]
        n_test_rows = test_sim_matrix.shape[0]
        ylabel = [1] * n_train_rows + [0] * n_test_rows


        # Columns of the similarity matrices used by each combination id.
        # Per the labels in the original code: column 0 = dot, 1 = cos, 2 = eu.
        feature_columns = {
            1: [0],        # dot
            2: [1],        # cos
            3: [2],        # eu
            4: [0, 1],     # dot+cos
            5: [0, 2],     # dot+eu
            6: [1, 2],     # cos+eu
            7: [0, 1, 2],  # dot+cos+eu
        }

        for comb in combs:
            # An id outside the table leaves the three matrices untouched.
            if comb in feature_columns:
                cols = feature_columns[comb]

                # Indexing with a list keeps the result two-dimensional, so a
                # single feature comes out as an (n, 1) column.
                sim_matrix_train = train_sim_matrix[:, cols]
                sim_matrix_test = test_sim_matrix[:, cols]

                # Training rows first, then test rows.
                sim_matrix = np.concatenate((sim_matrix_train, sim_matrix_test), axis=0)


            from sklearn.cluster import KMeans
            from sklearn.metrics import accuracy_score

            # Unsupervised attack: split the edges into two clusters and score
            # the cluster ids against the membership labels.  Cluster ids are
            # arbitrary, so the fit is repeated over 500 seeds and the
            # best-scoring one is kept.
            # The ground truth does not change between seeds; build it once.
            ylabel = [1] * train_sim_matrix.shape[0] + [0] * test_sim_matrix.shape[0]

            accuracy = []
            best_acc = -1.0
            best_kmeans = None
            for seed in range(500):
                candidate = KMeans(n_clusters=2, random_state=seed).fit(sim_matrix)
                acc = accuracy_score(ylabel, candidate.labels_)
                accuracy.append(acc)
                # Strict '>' keeps the earliest seed on ties.
                if acc > best_acc:
                    best_acc = acc
                    best_kmeans = candidate

            # Bind `kmeans` to the fit that produced the reported accuracy.
            # Previously it stayed bound to the last seed, so the per-edge
            # predictions and centroid statistics computed from it afterwards
            # did not match the printed score.
            kmeans = best_kmeans
            print(max(accuracy))

            acc_kmeans_sim = max(accuracy)

            # One CSV row per edge: cluster id, membership label, both
            # endpoints, and their degree and gender in the full graph g.
            print(len(kmeans.labels_))
            tsts = []
            for idx, cluster_id in enumerate(kmeans.labels_):
                node1 = edges_list[idx][0]
                node2 = edges_list[idx][1]
                tsts.append([
                    cluster_id,
                    ylabel[idx],
                    node1,
                    node2,
                    g.degree(node1),
                    g.degree(node2),
                    g.nodes[node1]['gender'],
                    g.nodes[node2]['gender'],
                ])

            name = ['y_score', 'y_test_grd', 'node1', 'node2', 'dgr1', 'dgr2', 'gender1', 'gender2']
            result = pd.DataFrame(columns=name, data=tsts)
            result.to_csv("{}{}-kmeans_sim_{}.csv".format(res_dir, F,comb))

            # Average distance of each cluster's points to its centre.
            cents = kmeans.cluster_centers_

            def _euclidean(point, centre):
                # Plain Euclidean distance between two feature vectors.
                return np.sqrt(np.sum(np.square(point - centre)))

            # Decide which centre belongs to label 0: take the first label-0
            # point that is strictly closer to one centre than to the other.
            # Points labelled 1, and exact ties, are skipped.
            for idx, lbl in enumerate(kmeans.labels_):
                if lbl != 0:
                    continue
                to_first = _euclidean(sim_matrix[idx], cents[0])
                to_second = _euclidean(sim_matrix[idx], cents[1])
                if to_first < to_second:
                    cent0, cent1 = cents[0], cents[1]
                    break
                if to_first > to_second:
                    cent0, cent1 = cents[1], cents[0]
                    break

            # Distances of label-0 points to cent0 and of all others to cent1.
            dis0 = [_euclidean(sim_matrix[idx], cent0)
                    for idx, lbl in enumerate(kmeans.labels_) if lbl == 0]
            dis1 = [_euclidean(sim_matrix[idx], cent1)
                    for idx, lbl in enumerate(kmeans.labels_) if lbl != 0]

            dist0 = sum(dis0) / len(dis0)
            dist1 = sum(dis1) / len(dis1)

            print(dist0, dist1)

            print(np.shape(ylabel))

            #
            # Imported under an alias: the bare name `train_test_split` already
            # holds the edge-split tuple that is handed to DW.deepwalk8, and
            # importing the sklearn function under the same name rebound it.
            from sklearn.model_selection import train_test_split as sk_train_test_split

            # Each label array has rows (node1, node2, membership) stored as
            # floats, so an edge stays attached to its label through the
            # shuffling below.
            # (Removed: a statement that reshaped len(ylabel) into a misspelled,
            # never-read variable.)
            n_edges = np.shape(edges_list)[0]
            y_label = np.column_stack(
                (edges_list[:, 0], edges_list[:, 1], ylabel[:n_edges])).astype(float)
            print(np.shape(y_label))

            # Training edges are members (label 1).
            y_label_train = np.column_stack(
                (train_edges_list[:, 0], train_edges_list[:, 1],
                 np.ones(len(train_edges_list)))).astype(float)
            print(np.shape(y_label_train))

            # Held-out test edges are non-members (label 0).
            test_edges_arr = np.asarray(test_edges_list)
            y_label_test = np.column_stack(
                (test_edges_arr[:, 0], test_edges_arr[:, 1],
                 np.zeros(len(test_edges_arr)))).astype(float)
            print(np.shape(y_label_test))

            # Split members and non-members separately (70/30, fixed seed) so
            # both classes appear in the attack's training and test portions.
            X_train_train, X_train_test, y_train_train, y_train_test = sk_train_test_split(
                sim_matrix_train, y_label_train, test_size=0.3, random_state=42)

            X_test_train, X_test_test, y_test_train, y_test_test = sk_train_test_split(
                sim_matrix_test, y_label_test, test_size=0.3, random_state=42)

            # Attack training set = 70% of members + 70% of non-members;
            # attack test set = the remaining 30% of each.
            X_train = np.concatenate((X_train_train, X_test_train), axis=0)
            X_test = np.concatenate((X_train_test, X_test_test), axis=0)
            y_train = np.concatenate((y_train_train, y_test_train), axis=0)
            y_test = np.concatenate((y_train_test, y_test_test), axis=0)


            #X_train, X_test, y_train, y_test = train_test_split(sim_matrix, y_label, test_size=0.3, random_state=42)
            #
            # # ######################################################################

            from sklearn import metrics
            from sklearn.neural_network import MLPClassifier

            # Supervised attack 1: multi-layer perceptron on the similarity
            # features.  Column 2 of the y arrays is the membership label;
            # columns 0 and 1 are the edge endpoints.
            mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16, 18), random_state=1,
                                max_iter=500)

            mlp.fit(X_train, y_train[:,2])

            print("Training set score: %f" % mlp.score(X_train, y_train[:,2]))
            print("Test set score: %f" % mlp.score(X_test, y_test[:,2]))

            y_score = mlp.predict(X_test)
            print(metrics.f1_score(y_test[:,2], y_score, average='micro'))
            # The task is binary (labels 0 and 1).  The report previously asked
            # for labels 0..2, which added an empty class-2 row, pulled the
            # macro average down and replaced the accuracy row by a micro average.
            print(metrics.classification_report(y_test[:,2], y_score, labels=[0, 1]))

            acc_mlp_sim = accuracy_score(y_score, y_test[:,2])

            # One CSV row per test edge: prediction, ground truth, both
            # endpoints, and their degree and gender in the full graph g.
            tsts = []
            for pred, (node1, node2, truth) in zip(y_score, y_test):
                tsts.append([
                    pred, truth, node1, node2,
                    g.degree(node1), g.degree(node2),
                    g.nodes[node1]['gender'], g.nodes[node2]['gender'],
                ])
            name = ['y_score', 'y_test_grd', 'node1', 'node2', 'dgr1', 'dgr2', 'gender1', 'gender2']
            result = pd.DataFrame(columns=name, data=tsts)
            result.to_csv("{}{}-mlp_sim_{}.csv".format(res_dir, F,comb))

            # # ######################################################################

            from sklearn.ensemble import RandomForestClassifier

            # Supervised attack 2: random forest on the similarity features.
            # Column 2 of the y arrays is the membership label; columns 0 and
            # 1 are the edge endpoints.
            rf = RandomForestClassifier(max_depth=150, random_state=0)
            rf.fit(X_train, y_train[:,2])

            print("Training set score: %f" % rf.score(X_train, y_train[:,2]))
            print("Test set score: %f" % rf.score(X_test, y_test[:,2]))

            y_score = rf.predict(X_test)
            print(metrics.f1_score(y_test[:,2], y_score, average='micro'))
            # The task is binary (labels 0 and 1).  The report previously asked
            # for labels 0..2, which added an empty class-2 row, pulled the
            # macro average down and replaced the accuracy row by a micro average.
            print(metrics.classification_report(y_test[:,2], y_score, labels=[0, 1]))

            acc_rf_sim = accuracy_score(y_score, y_test[:,2])

            # One CSV row per test edge: prediction, ground truth, both
            # endpoints, and their degree and gender in the full graph g.
            tsts = []
            for pred, (node1, node2, truth) in zip(y_score, y_test):
                tsts.append([
                    pred, truth, node1, node2,
                    g.degree(node1), g.degree(node2),
                    g.nodes[node1]['gender'], g.nodes[node2]['gender'],
                ])
            name = ['y_score', 'y_test_grd', 'node1', 'node2', 'dgr1', 'dgr2', 'gender1', 'gender2']

            result = pd.DataFrame(columns=name, data=tsts)
            result.to_csv("{}{}-rf_sim_{}.csv".format(res_dir, F,comb))

            # # ######################################################################

            from sklearn.multiclass import OneVsRestClassifier
            from sklearn.svm import SVC

            # Supervised attack 3: SVC wrapped in a one-vs-rest classifier.
            # Column 2 of the y arrays is the membership label; columns 0 and
            # 1 are the edge endpoints.
            svm = OneVsRestClassifier(SVC())
            svm.fit(X_train, y_train[:,2])

            print("Training set score: %f" % svm.score(X_train, y_train[:,2]))
            print("Test set score: %f" % svm.score(X_test, y_test[:,2]))

            y_score = svm.predict(X_test)
            print(metrics.f1_score(y_test[:,2], y_score, average='micro'))
            # The task is binary (labels 0 and 1).  The report previously asked
            # for labels 0..2, which added an empty class-2 row, pulled the
            # macro average down and replaced the accuracy row by a micro average.
            print(metrics.classification_report(y_test[:,2], y_score, labels=[0, 1]))

            acc_svm_sim = accuracy_score(y_score, y_test[:,2])

            # One CSV row per test edge: prediction, ground truth, both
            # endpoints, and their degree and gender in the full graph g.
            tsts = []
            for pred, (node1, node2, truth) in zip(y_score, y_test):
                tsts.append([
                    pred, truth, node1, node2,
                    g.degree(node1), g.degree(node2),
                    g.nodes[node1]['gender'], g.nodes[node2]['gender'],
                ])
            name = ['y_score', 'y_test_grd', 'node1', 'node2', 'dgr1', 'dgr2', 'gender1', 'gender2']
            result = pd.DataFrame(columns=name, data=tsts)
            result.to_csv("{}{}-svm_sim_{}.csv".format(res_dir, F,comb))