# LINE embedding training and link-prediction evaluation.
# The TF1-style graph code below (tf.Session, tf.global_variables_initializer, ...)
# requires the v1 compat API when running under TensorFlow 2.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import numpy as np
import argparse
from model import LINEModel
from utils import DBLPDataLoader, DBLPDataLoader1
import pickle
import time
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve
import pandas as pd
import copy


def LINE(g_train, train_test_split, graph_file, DATASET, METHOD, ego_user, F):
    parser = argparse.ArgumentParser()
    parser.add_argument('--embedding_dim', default=128)
    parser.add_argument('--batch_size', default=128)
    parser.add_argument('--K', default=5)
    parser.add_argument('--proximity', default='second-order', help='first-order or second-order')
    parser.add_argument('--learning_rate', default=0.025)
    parser.add_argument('--mode', default='train')
    parser.add_argument('--num_batches', default=2000)
    parser.add_argument('--total_graph', default=True)
    parser.add_argument('--graph_file', default='/Users/xiulingwang/Downloads/line-master/data/0-adj-feat.pkl')
    parser.add_argument('--edge_score_mode', default='edge-emb')
    parser.add_argument('--uid', default='0')
    parser.add_argument('--flag', default='weighted')
    args = parser.parse_args()
    # args.proximity = 'first-order'
    args.graph_file = graph_file
    args.uid = str(ego_user)
    args.flag = str(F)
    print(args.graph_file)

    if args.mode == 'train':
        normalized_embedding = train(args)
        data_loader = DBLPDataLoader(graph_file=args.graph_file)
        emb_list = []
        print(np.shape(g_train)[0])
        for node_index in range(0, np.shape(g_train)[0]):
            node_emb = normalized_embedding[node_index]
            emb_list.append(node_emb)
        emb_matrix = np.vstack(emb_list)
        print(emb_list)
        print(np.shape(emb_list))

        # Dump embeddings in word2vec-style text format: header line, then one
        # "node_id v1 ... vd" row per node.
        with open('/Users/xiulingwang/Downloads/' + DATASET + '/' + METHOD + '/embeds/' + F + '-' + str(ego_user), 'w') as f:
            f.write('%d %d\n' % (np.shape(g_train)[0], args.embedding_dim))
            for i in range(np.shape(g_train)[0]):
                e = ' '.join(map(lambda x: str(x), emb_list[i]))
                f.write('%s %s\n' % (str(i), e))

        n2v_scores, val_edge_labels, val_preds, test_edge_labels, test_preds = linkpre_scores(
            args, emb_matrix, train_test_split, ego_user, DATASET, METHOD, F)
        return n2v_scores, val_edge_labels, val_preds, test_edge_labels, test_preds

    elif args.mode == 'test':
        test(args)
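# --- Usage sketch (hypothetical, not part of the original pipeline) ---
# A minimal example of how LINE() might be invoked, assuming `g_train` is a
# dense adjacency matrix and `train_test_split` is the 7-tuple
# (adj_train, train_edges, train_edges_false, val_edges, val_edges_false,
#  test_edges, test_edges_false) produced by a node2vec-style splitter.
# `make_train_test_split` and all paths below are illustrative assumptions.
#
# if __name__ == '__main__':
#     with open('data/0-adj-feat.pkl', 'rb') as fh:
#         adj, feat = pickle.load(fh)
#     split = make_train_test_split(adj)  # hypothetical helper
#     scores, *_ = LINE(adj, split, 'data/0-adj-feat.pkl',
#                       'fb', 'line', ego_user=0, F='weighted')
#     print(scores['test_roc'], scores['test_ap'])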
def train_adj_defense(args, sigma):
    # Same training loop as train(), but the data loader perturbs the
    # adjacency structure with noise level `sigma` (DBLPDataLoader1).
    data_loader = DBLPDataLoader1(graph_file=args.graph_file, b=sigma)
    suffix = args.proximity
    args.num_of_nodes = data_loader.num_of_nodes
    model = LINEModel(args)
    with tf.Session() as sess:
        print(args)
        print('batches\tloss\tsampling time\ttraining_time\tdatetime')
        tf.global_variables_initializer().run()
        initial_embedding = sess.run(model.embedding)
        learning_rate = args.learning_rate
        sampling_time, training_time = 0, 0
        for b in range(args.num_batches):
            t1 = time.time()
            u_i, u_j, label = data_loader.fetch_batch(batch_size=args.batch_size, K=args.K)
            feed_dict = {model.u_i: u_i, model.u_j: u_j, model.label: label, model.learning_rate: learning_rate}
            t2 = time.time()
            sampling_time += t2 - t1
            if b % 100 != 0:
                sess.run(model.train_op, feed_dict=feed_dict)
                training_time += time.time() - t2
                if learning_rate > args.learning_rate * 0.0001:
                    learning_rate = args.learning_rate * (1 - b / args.num_batches)
                else:
                    learning_rate = args.learning_rate * 0.0001
            else:
                loss = sess.run(model.loss, feed_dict=feed_dict)
                print('%d\t%f\t%0.2f\t%0.2f\t%s' % (b, loss, sampling_time, training_time,
                                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                sampling_time, training_time = 0, 0
            if b % 1000 == 0 or b == (args.num_batches - 1):
                embedding = sess.run(model.embedding)
                normalized_embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
                pickle.dump(data_loader.embedding_mapping(normalized_embedding),
                            open('data/embedding_%s_%s.pkl' % (args.uid, args.flag), 'wb'))
        # NB: returns the raw embedding fetched at the last dump, not the
        # normalized one.
        return embedding


def train(args):
    data_loader = DBLPDataLoader(graph_file=args.graph_file)
    suffix = args.proximity
    args.num_of_nodes = data_loader.num_of_nodes
    model = LINEModel(args)
    with tf.Session() as sess:
        print(args)
        print('batches\tloss\tsampling time\ttraining_time\tdatetime')
        tf.global_variables_initializer().run()
        initial_embedding = sess.run(model.embedding)
        learning_rate = args.learning_rate
        sampling_time, training_time = 0, 0
        for b in range(args.num_batches):
            t1 = time.time()
            u_i, u_j, label = data_loader.fetch_batch(batch_size=args.batch_size, K=args.K)
            feed_dict = {model.u_i: u_i, model.u_j: u_j, model.label: label, model.learning_rate: learning_rate}
            t2 = time.time()
            sampling_time += t2 - t1
            if b % 100 != 0:
                sess.run(model.train_op, feed_dict=feed_dict)
                training_time += time.time() - t2
                if learning_rate > args.learning_rate * 0.0001:
                    learning_rate = args.learning_rate * (1 - b / args.num_batches)
                else:
                    learning_rate = args.learning_rate * 0.0001
            else:
                loss = sess.run(model.loss, feed_dict=feed_dict)
                print('%d\t%f\t%0.2f\t%0.2f\t%s' % (b, loss, sampling_time, training_time,
                                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                sampling_time, training_time = 0, 0
            if b % 1000 == 0 or b == (args.num_batches - 1):
                embedding = sess.run(model.embedding)
                normalized_embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
                pickle.dump(data_loader.embedding_mapping(normalized_embedding),
                            open('data/embedding_%s_%s.pkl' % (args.uid, args.flag), 'wb'))
        return embedding


def sigmoid(x):
    return 1 / (1 + np.exp(-x))
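# The training loops above decay the learning rate linearly with the batch
# index and clamp it at 0.01% of the initial rate (the real loops skip the
# update on logging batches). A small self-contained sketch of that schedule;
# all names here are local to the demo:
def _demo_lr_schedule(lr0=0.025, num_batches=2000):
    """Return the clamped linear-decay schedule used by train()."""
    lrs = []
    lr = lr0
    for b in range(num_batches):
        if lr > lr0 * 0.0001:
            lr = lr0 * (1 - b / num_batches)
        else:
            lr = lr0 * 0.0001
        lrs.append(lr)
    return lrs
# e.g. _demo_lr_schedule()[0] -> 0.025, _demo_lr_schedule()[-1] -> 1.25e-05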
def linkpre_scores(args, emb_matrix, train_test_split, ego_user, DATASET, METHOD, Flag):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split
    start_time = time.time()
    # Generate bootstrapped edge embeddings (as is done in the node2vec paper):
    # edge embedding for (v1, v2) = Hadamard product of the node embeddings of v1 and v2.
    if args.edge_score_mode == "edge-emb":

        def get_edge_embeddings(edge_list, ego_user, DATASET, Flag, flag):
            embs = []
            for edge in edge_list:
                node1 = edge[0]
                node2 = edge[1]
                emb1 = emb_matrix[node1]
                emb2 = emb_matrix[node2]
                edge_emb = np.multiply(emb1, emb2)
                # edge_emb = np.array(emb1) + np.array(emb2)
                embs.append(list(edge_emb))
            embs = np.array(embs)
            with open('./' + DATASET + '/' + METHOD + '/embeds/' + Flag + '-' + str(ego_user) + flag, 'w') as f:
                f.write('%d %d\n' % (edge_list.shape[0], args.embedding_dim))
                for i in range(edge_list.shape[0]):
                    e = ' '.join(map(lambda x: str(x), embs[i]))
                    f.write('%s %s %s\n' % (str(edge_list[i][0]), str(edge_list[i][1]), e))
            return embs

        # Train-set edge embeddings
        pos_train_edge_embs = get_edge_embeddings(train_edges, ego_user, DATASET, Flag, flag='pos-train')
        neg_train_edge_embs = get_edge_embeddings(train_edges_false, ego_user, DATASET, Flag, flag='neg-train')
        train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])
        # Create train-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

        # Val-set edge embeddings, labels
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs = get_edge_embeddings(val_edges, ego_user, DATASET, Flag, flag='pos-val')
            neg_val_edge_embs = get_edge_embeddings(val_edges_false, ego_user, DATASET, Flag, flag='neg-val')
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate([np.ones(len(val_edges)), np.zeros(len(val_edges_false))])

        # Test-set edge embeddings, labels
        pos_test_edge_embs = get_edge_embeddings(test_edges, ego_user, DATASET, Flag, flag='pos-test')
        neg_test_edge_embs = get_edge_embeddings(test_edges_false, ego_user, DATASET, Flag, flag='neg-test')
        test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])
        # Create test-set edge labels: 1 = real edge, 0 = false edge
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]
        print(test_preds)
        print(np.shape(test_preds))
        runtime = time.time() - start_time

        # Calculate scores
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            n2v_val_roc = roc_auc_score(val_edge_labels, val_preds)
            # n2v_val_roc_curve = roc_curve(val_edge_labels, val_preds)
            n2v_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            n2v_val_roc = None
            n2v_val_roc_curve = None
            n2v_val_ap = None
        n2v_test_roc = roc_auc_score(test_edge_labels, test_preds)
        # n2v_test_roc_curve = roc_curve(test_edge_labels, test_preds)
        n2v_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Generate edge scores using simple dot product of node embeddings (as in the GAE paper)
    elif args.edge_score_mode == "dot-product":
        # NB: val_edge_labels/val_preds and test_edge_labels/test_preds are
        # only defined on the edge-emb path; the final return assumes it.
        score_matrix = np.dot(emb_matrix, emb_matrix.T)
        runtime = time.time() - start_time
        # Val set scores
        if len(val_edges) > 0:
            n2v_val_roc, n2v_val_ap = get_roc_score(val_edges, val_edges_false, score_matrix, apply_sigmoid=True)
        else:
            n2v_val_roc = None
            n2v_val_roc_curve = None
            n2v_val_ap = None
        # Test set scores
        n2v_test_roc, n2v_test_ap = get_roc_score(test_edges, test_edges_false, score_matrix, apply_sigmoid=True)
    else:
        print("Invalid edge_score_mode! Either use edge-emb or dot-product.")

    # Record scores
    n2v_scores = {}
    n2v_scores['test_roc'] = n2v_test_roc
    # n2v_scores['test_roc_curve'] = n2v_test_roc_curve
    n2v_scores['test_ap'] = n2v_test_ap
    n2v_scores['val_roc'] = n2v_val_roc
    # n2v_scores['val_roc_curve'] = n2v_val_roc_curve
    n2v_scores['val_ap'] = n2v_val_ap
    n2v_scores['runtime'] = runtime
    return n2v_scores, val_edge_labels, val_preds, test_edge_labels, test_preds
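# A toy, self-contained illustration of the edge-emb scoring path above:
# Hadamard products of endpoint embeddings fed to a logistic-regression
# classifier. Random data only; it exists to show shapes and API usage.
def _demo_edge_emb_classifier():
    rng = np.random.RandomState(0)
    emb = rng.randn(10, 8)                        # 10 nodes, 8-dim embeddings
    pos = [(0, 1), (2, 3), (4, 5)]                # "real" edges
    neg = [(0, 9), (1, 7), (6, 8)]                # "false" edges
    feats = np.array([emb[u] * emb[v] for u, v in pos + neg])  # Hadamard
    labels = np.array([1] * len(pos) + [0] * len(neg))
    clf = LogisticRegression(random_state=0).fit(feats, labels)
    return clf.predict_proba(feats)[:, 1]         # P(edge is real)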
# Input: positive test/val edges, negative test/val edges, edge score matrix
# Output: ROC AUC score and AP score
def get_roc_score(edges_pos, edges_neg, score_matrix, apply_sigmoid=False):
    # Edge case
    if len(edges_pos) == 0 or len(edges_neg) == 0:
        return (None, None)

    # Store positive edge predictions, actual values
    preds_pos = []
    pos = []
    for edge in edges_pos:
        if apply_sigmoid:
            preds_pos.append(sigmoid(score_matrix[edge[0], edge[1]]))
        else:
            preds_pos.append(score_matrix[edge[0], edge[1]])
        pos.append(1)  # actual value (1 for positive)

    # Store negative edge predictions, actual values
    preds_neg = []
    neg = []
    for edge in edges_neg:
        if apply_sigmoid:
            preds_neg.append(sigmoid(score_matrix[edge[0], edge[1]]))
        else:
            preds_neg.append(score_matrix[edge[0], edge[1]])
        neg.append(0)  # actual value (0 for negative)

    # Calculate scores
    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])
    roc_score = roc_auc_score(labels_all, preds_all)
    # roc_curve_tuple = roc_curve(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    # return roc_score, roc_curve_tuple, ap_score
    return roc_score, ap_score


# Return a list of tuples (node1, node2) for networkx link prediction evaluation
def get_ebunch(train_test_split):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split

    test_edges_list = test_edges.tolist()  # convert to nested list
    test_edges_list = [tuple(node_pair) for node_pair in test_edges_list]  # convert node-pairs to tuples
    test_edges_false_list = test_edges_false.tolist()
    test_edges_false_list = [tuple(node_pair) for node_pair in test_edges_false_list]
    return (test_edges_list + test_edges_false_list)


def test(args):
    pass

# if __name__ == '__main__':
#     main()
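# Quick sanity check for get_roc_score() on a tiny hand-built score matrix:
# the positive pair scores higher than both negative pairs, so ROC-AUC and AP
# both come out as 1.0. Data is illustrative only.
def _demo_get_roc_score():
    score_matrix = np.array([[0.0, 0.9, 0.1],
                             [0.9, 0.0, 0.2],
                             [0.1, 0.2, 0.0]])
    roc, ap = get_roc_score([(0, 1)], [(0, 2), (1, 2)], score_matrix,
                            apply_sigmoid=True)
    return roc, ap  # -> (1.0, 1.0)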
def LINE1(g_train, train_test_split, graph_file, DATASET, METHOD, ego_user, F):
    parser = argparse.ArgumentParser()
    parser.add_argument('--embedding_dim', default=128)
    parser.add_argument('--batch_size', default=128)
    parser.add_argument('--K', default=5)
    parser.add_argument('--proximity', default='second-order', help='first-order or second-order')
    parser.add_argument('--learning_rate', default=0.025)
    parser.add_argument('--mode', default='train')
    parser.add_argument('--num_batches', default=8000)
    parser.add_argument('--total_graph', default=True)
    parser.add_argument('--graph_file', default='/Users/xiulingwang/Downloads/line-master/data/0-adj-feat.pkl')
    parser.add_argument('--edge_score_mode', default='edge-emb')
    parser.add_argument('--uid', default='0')
    parser.add_argument('--flag', default='weighted')
    args = parser.parse_args()
    # args.proximity = 'first-order'
    args.graph_file = graph_file
    args.uid = str(ego_user)
    args.flag = str(F)
    print(args.graph_file)

    if args.mode == 'train':
        normalized_embedding = train(args)
        data_loader = DBLPDataLoader(graph_file=args.graph_file)
        emb_list = []
        print(np.shape(g_train)[0])
        for node_index in range(0, np.shape(g_train)[0]):
            node_emb = normalized_embedding[node_index]
            emb_list.append(node_emb)
        emb_matrix = np.vstack(emb_list)
        print(emb_list)
        print(np.shape(emb_list))

        with open('E:\\python\\banlance\\code\\' + DATASET + '\\' + METHOD + '-embeds-' + F + '-' + str(ego_user), 'w') as f:
            f.write('%d %d\n' % (np.shape(g_train)[0], args.embedding_dim))
            for i in range(np.shape(g_train)[0]):
                e = ' '.join(map(lambda x: str(x), emb_list[i]))
                f.write('%s %s\n' % (str(i), e))

        n2v_scores, train_edge_labels, test_edge_labels, test_preds, train_sim_matrix, test_sim_matrix, \
            train_edge_embs, test_edge_embs, train_embs_1, train_embs_2, test_embs_1, test_embs_2 = linkpre_scores1(
                args, emb_matrix, train_test_split, ego_user, DATASET, METHOD, F)
        return n2v_scores, train_edge_labels, test_edge_labels, test_preds, emb_matrix, train_sim_matrix, \
            test_sim_matrix, train_edge_embs, test_edge_embs, train_embs_1, train_embs_2, test_embs_1, test_embs_2

    elif args.mode == 'test':
        test(args)
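# LINE() and LINE1() write embeddings as plain text: a "num_nodes dim" header
# followed by one "node_id v1 ... vd" row per node (word2vec text format with
# integer ids). A small reader for that format, for reference; `path` is
# whatever was passed when writing:
def _demo_read_embeds(path):
    with open(path) as f:
        n, dim = map(int, f.readline().split())
        emb = np.zeros((n, dim))
        for line in f:
            parts = line.split()
            emb[int(parts[0])] = [float(x) for x in parts[1:]]
    return emb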
def linkpre_scores1(args, emb_matrix, train_test_split, ego_user, DATASET, METHOD, Flag):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split
    start_time = time.time()
    # Generate bootstrapped edge embeddings (as is done in the node2vec paper):
    # edge embedding for (v1, v2) = Hadamard product of the node embeddings of v1 and v2.
    if args.edge_score_mode == "edge-emb":

        def get_edge_embeddings(edge_list, ego_user, DATASET, Flag, flag):
            embs = []
            sim_matrix = []
            embs_1 = []
            embs_2 = []
            for edge in edge_list:
                node1 = edge[0]
                node2 = edge[1]
                emb1 = emb_matrix[node1]
                emb2 = emb_matrix[node2]
                edge_emb = np.multiply(emb1, emb2)
                # cosine similarity of the two endpoint embeddings
                sim = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
                # edge_emb = np.array(emb1) + np.array(emb2)
                embs.append(edge_emb)
                embs_1.append(emb1)
                embs_2.append(emb2)
                sim_matrix.append(sim)
            return np.array(embs), np.array(sim_matrix), np.array(embs_1), np.array(embs_2)

        # Train-set edge embeddings
        pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(
            train_edges, ego_user, DATASET, Flag, flag='pos-train')
        neg_train_edge_embs, neg_train_sim_matrix, neg_embs_1_train, neg_embs_2_train = get_edge_embeddings(
            train_edges_false, ego_user, DATASET, Flag, flag='neg-train')
        train_edge_embs = np.concatenate((pos_train_edge_embs, neg_train_edge_embs), axis=0)
        train_sim_matrix = np.concatenate((pos_train_sim_matrix, neg_train_sim_matrix), axis=0)
        train_embs_1 = np.concatenate((pos_embs_1_train, neg_embs_1_train), axis=0)
        train_embs_2 = np.concatenate((pos_embs_2_train, neg_embs_2_train), axis=0)
        # Create train-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.concatenate((np.ones(len(train_edges)), np.zeros(len(train_edges_false))), axis=0)

        # Val-set edge embeddings, labels
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs, pos_val_sim_matrix, pos_embs_1_val, pos_embs_2_val = get_edge_embeddings(
                val_edges, ego_user, DATASET, Flag, flag='pos-val')
            neg_val_edge_embs, neg_val_sim_matrix, neg_embs_1_val, neg_embs_2_val = get_edge_embeddings(
                val_edges_false, ego_user, DATASET, Flag, flag='neg-val')
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate((np.ones(len(val_edges)), np.zeros(len(val_edges_false))), axis=0)
            val_sim_matrix = np.concatenate((pos_val_sim_matrix, neg_val_sim_matrix), axis=0)
            val_embs_1 = np.concatenate((pos_embs_1_val, neg_embs_1_val), axis=0)
            val_embs_2 = np.concatenate((pos_embs_2_val, neg_embs_2_val), axis=0)

        # Test-set edge embeddings, labels
        pos_test_edge_embs, pos_test_sim_matrix, pos_embs_1_test, pos_embs_2_test = get_edge_embeddings(
            test_edges, ego_user, DATASET, Flag, flag='pos-test')
        neg_test_edge_embs, neg_test_sim_matrix, neg_embs_1_test, neg_embs_2_test = get_edge_embeddings(
            test_edges_false, ego_user, DATASET, Flag, flag='neg-test')
        test_edge_embs = np.concatenate((pos_test_edge_embs, neg_test_edge_embs), axis=0)
        test_sim_matrix = np.concatenate((pos_test_sim_matrix, neg_test_sim_matrix), axis=0)
        test_embs_1 = np.concatenate((pos_embs_1_test, neg_embs_1_test), axis=0)
        test_embs_2 = np.concatenate((pos_embs_2_test, neg_embs_2_test), axis=0)
        # Create test-set edge labels: 1 = real edge, 0 = false edge
        test_edge_labels = np.concatenate((np.ones(len(test_edges)), np.zeros(len(test_edges_false))), axis=0)

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]
        print(test_preds)
        print(np.shape(test_preds))
        runtime = time.time() - start_time

        # Calculate scores
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            n2v_val_roc = roc_auc_score(val_edge_labels, val_preds)
            # n2v_val_roc_curve = roc_curve(val_edge_labels, val_preds)
            n2v_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            n2v_val_roc = None
            n2v_val_roc_curve = None
            n2v_val_ap = None
        n2v_test_roc = roc_auc_score(test_edge_labels, test_preds)
        # n2v_test_roc_curve = roc_curve(test_edge_labels, test_preds)
        n2v_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Generate edge scores using simple dot product of node embeddings (as in the GAE paper)
    elif args.edge_score_mode == "dot-product":
        score_matrix = np.dot(emb_matrix, emb_matrix.T)
        runtime = time.time() - start_time
        # Val set scores
        if len(val_edges) > 0:
            n2v_val_roc, n2v_val_ap = get_roc_score(val_edges, val_edges_false, score_matrix, apply_sigmoid=True)
        else:
            n2v_val_roc = None
            n2v_val_roc_curve = None
            n2v_val_ap = None
        # Test set scores
        n2v_test_roc, n2v_test_ap = get_roc_score(test_edges, test_edges_false, score_matrix, apply_sigmoid=True)
    else:
        print("Invalid edge_score_mode! Either use edge-emb or dot-product.")

    # Record scores
    n2v_scores = {}
    n2v_scores['test_roc'] = n2v_test_roc
    # n2v_scores['test_roc_curve'] = n2v_test_roc_curve
    n2v_scores['test_ap'] = n2v_test_ap
    n2v_scores['val_roc'] = n2v_val_roc
    # n2v_scores['val_roc_curve'] = n2v_val_roc_curve
    n2v_scores['val_ap'] = n2v_val_ap
    n2v_scores['runtime'] = runtime
    return n2v_scores, train_edge_labels, test_edge_labels, test_preds, train_sim_matrix, test_sim_matrix, \
        train_edge_embs, test_edge_embs, train_embs_1, train_embs_2, test_embs_1, test_embs_2
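# linkpre_scores1() additionally returns the cosine similarity of every
# edge's endpoint embeddings. The formula in isolation:
def _demo_cosine(emb1, emb2):
    return np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
# e.g. _demo_cosine(np.array([1., 0.]), np.array([1., 1.])) -> ~0.7071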
def LINE2(g_train, train_test_split, graph_file, DATASET, METHOD, ego_user, F, dp, res, sigma, ord):
    parser = argparse.ArgumentParser()
    parser.add_argument('--embedding_dim', default=128)
    parser.add_argument('--batch_size', default=1000)
    parser.add_argument('--K', default=5)
    parser.add_argument('--proximity', default='second-order', help='first-order or second-order')
    parser.add_argument('--learning_rate', default=0.025)
    parser.add_argument('--mode', default='train')
    parser.add_argument('--num_batches', default=2000)
    parser.add_argument('--total_graph', default=True)
    parser.add_argument('--graph_file', default='/Users/xiulingwang/Downloads/line-master/data/0-adj-feat.pkl')
    parser.add_argument('--edge_score_mode', default='edge-emb')
    parser.add_argument('--uid', default='0')
    parser.add_argument('--flag', default='weighted')
    args = parser.parse_args()
    args.graph_file = graph_file
    args.uid = str(ego_user)
    args.flag = str(F)
    if ord == 's':
        args.proximity = 'second-order'
    if ord == 'f':
        args.proximity = 'first-order'
    print(args.graph_file)
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split

    if args.mode == 'train':
        # Select the trainer by the `dp` code (see the summary table below).
        if dp == 1:
            normalized_embedding = train_dp1(args, sigma)
        elif dp == 0:
            normalized_embedding = train(args)
        elif dp == 2:
            normalized_embedding = train_defense(args, train_edges, test_edges, g_train)
        elif dp == 3:
            normalized_embedding = train_defense2(args, train_edges, test_edges, g_train)
        elif dp == 4:
            normalized_embedding = train_defense3(args, train_edges, test_edges, g_train, F, res)
        elif dp == 5:
            normalized_embedding = train(args)
        elif dp == 6:
            normalized_embedding = train_adj_defense(args, sigma)

        emb_list = []
        for node_index in range(0, np.shape(g_train)[0]):
            node_emb = normalized_embedding[node_index]
            emb_list.append(node_emb)
        emb_matrix = np.vstack(emb_list)

        train_edge_labels, test_edge_labels, train_sim_matrix, test_sim_matrix, train_edge_embs, test_edge_embs, \
            train_embs_1, train_embs_2, test_embs_1, test_embs_2, train_edges_sampled = linkpre_scores2(
                args, emb_matrix, train_test_split, ego_user, DATASET, METHOD, F)
        return train_edge_labels, test_edge_labels, emb_matrix, train_sim_matrix, test_sim_matrix, \
            train_edge_embs, test_edge_embs, train_embs_1, train_embs_2, test_embs_1, test_embs_2, \
            train_edges_sampled

    elif args.mode == 'test':
        test(args)
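# For reference, the `dp` codes accepted by LINE2() map to these trainers
# (as read from the branches above):
#   dp == 0 or 5 -> train()               plain LINE training
#   dp == 1      -> train_dp1(sigma)      gradient clipping + Gaussian noise
#   dp == 2      -> train_defense()       loss - 100 * discriminator loss
#   dp == 3      -> train_defense2()      loss - 10 * discriminator loss
#   dp == 4      -> train_defense3()      discriminator gain + early stopping
#   dp == 6      -> train_adj_defense()   noise on the adjacency side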
def train_dp(args, sigma):
    data_loader = DBLPDataLoader(graph_file=args.graph_file)
    suffix = args.proximity
    args.num_of_nodes = data_loader.num_of_nodes
    model = LINEModel(args)
    C = 1  # gradient clipping norm
    with tf.Session() as sess:
        print(args)
        print('batches\tloss\tsampling time\ttraining_time\tdatetime')
        tf.global_variables_initializer().run()
        initial_embedding = sess.run(model.embedding)
        learning_rate = args.learning_rate
        sampling_time, training_time = 0, 0
        loss_dp = []
        for b in range(args.num_batches):
            t1 = time.time()
            u_i, u_j, label = data_loader.fetch_batch(batch_size=args.batch_size, K=args.K)
            feed_dict = {model.u_i: u_i, model.u_j: u_j, model.label: label, model.learning_rate: learning_rate}
            t2 = time.time()
            sampling_time += t2 - t1
            if b % 100 != 0:
                sess.run(model.train_op, feed_dict=feed_dict)
                training_time += time.time() - t2
                if learning_rate > args.learning_rate * 0.0001:
                    learning_rate = args.learning_rate * (1 - b / args.num_batches)
                else:
                    learning_rate = args.learning_rate * 0.0001
                gradients_list, loss = sess.run([model.gradients_list, model.loss], feed_dict=feed_dict)
                print(loss)
            else:
                gradients_list, loss = sess.run([model.gradients_list, model.loss], feed_dict=feed_dict)
                print('%d\t%f\t%0.2f\t%0.2f\t%s' % (b, loss, sampling_time, training_time,
                                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                sampling_time, training_time = 0, 0
                variables_list = tf.trainable_variables()
                print(loss)
                print('****')
                grads = [tf.zeros(p.shape) for p in tf.trainable_variables()]
                l2_norm = tf.constant(0.000)
                for gd_list in gradients_list:
                    # tf.rsqrt returns 1/sqrt(x), so l2_norm accumulates
                    # reciprocal gradient norms (with a small stabilizer).
                    l2_norm = tf.add(l2_norm, tf.rsqrt(tf.reduce_sum(gd_list * gd_list) + 0.000001))
                    # divisor = max(tf.convert_to_tensor(1.00000001), l2_norm / C)
                    divisor = tf.maximum(tf.convert_to_tensor(1.00000001), l2_norm / C)
                    for gd in gd_list:
                        # rebinding gd does not modify gd_list in place
                        gd += gd / divisor
                    # Gaussian noise of scale sigma, averaged over the batch
                    grads_noisy = (gd_list + tf.random_normal(tf.shape(gd_list), stddev=sigma)) / args.batch_size
                    grads_noisy = grads_noisy.eval(session=sess)
                model.optimizer.apply_gradients(zip(list(grads_noisy), variables_list))
                model.gradients_list = tf.convert_to_tensor(grads_noisy)
                gradients_list, loss = sess.run([model.gradients_list, model.loss], feed_dict=feed_dict)
                print(loss)
            if b % 1000 == 0 or b == (args.num_batches - 1):
                embedding = sess.run(model.embedding)
                normalized_embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
                pickle.dump(data_loader.embedding_mapping(normalized_embedding),
                            open('data/embedding_%s_%s.pkl' % (args.uid, args.flag), 'wb'))
        return embedding
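# train_dp()/train_dp1() approximate a DP-SGD step: clip each gradient to
# norm C, add Gaussian noise of scale sigma, and average over the batch. A
# minimal NumPy sketch of that clip-and-noise step (illustrative only; it is
# not wired into the TF graph above, and all names are local to the demo):
def _demo_clip_and_noise(grad, C=1.0, sigma=1.0, batch_size=128, seed=0):
    rng = np.random.RandomState(seed)
    norm = np.linalg.norm(grad)
    clipped = grad / max(1.0, norm / C)           # scale down if ||g|| > C
    noisy = clipped + rng.normal(scale=sigma, size=grad.shape)
    return noisy / batch_size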
def linkpre_scores2(args, emb_matrix, train_test_split, ego_user, DATASET, METHOD, Flag):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split
    start_time = time.time()
    # Generate bootstrapped edge embeddings (as is done in the node2vec paper):
    # edge embedding for (v1, v2) = Hadamard product of the node embeddings of v1 and v2.
    if args.edge_score_mode == "edge-emb":

        def get_edge_embeddings(edge_list, ego_user, DATASET, Flag, flag):
            tsts = []
            embs = []
            sim_matrix = []
            embs_1 = []
            embs_2 = []
            for edge in edge_list:
                node1 = edge[0]
                node2 = edge[1]
                emb1 = emb_matrix[node1]
                emb2 = emb_matrix[node2]
                edge_emb = np.multiply(emb1, emb2)
                sim = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))  # cosine
                sim2 = np.dot(emb1, emb2)                               # dot product
                sim3 = np.linalg.norm(np.array(emb1) - np.array(emb2))  # Euclidean distance
                sim4 = 1 / (1 + sim3)                                   # inverse-distance kernel
                embs.append(edge_emb)
                embs_1.append(emb1)
                embs_2.append(emb2)
                sim_matrix.append([sim, sim2, sim3, sim4])
                tsts.append([node1, node2, sim, sim2, sim3, sim4])
            embs = np.array(embs)
            sim_matrix = np.array(sim_matrix)
            embs_1 = np.array(embs_1)
            embs_2 = np.array(embs_2)
            name = ['node1', 'node2', 'sim1', 'sim2', 'sim3', 'sim4']
            result = pd.DataFrame(columns=name, data=tsts)
            result.to_csv("{}{}-similarity.csv".format(Flag, flag))
            return embs, sim_matrix, embs_1, embs_2

        edgeall = list([list(edge_tuple) for edge_tuple in train_edges])
        # subsample the train edges so both classes of the probe are balanced
        train_edges_sampled = random.sample(edgeall, np.shape(test_edges)[0])

        # Train-set edge embeddings (positive edges only)
        pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(
            train_edges_sampled, ego_user, DATASET, Flag, flag='pos-train')
        neg_train_edge_embs, neg_train_sim_matrix, neg_embs_1_train, neg_embs_2_train = get_edge_embeddings(
            train_edges_false, ego_user, DATASET, Flag, flag='neg-train')
        train_edge_embs = pos_train_edge_embs
        train_sim_matrix = pos_train_sim_matrix
        train_embs_1 = pos_embs_1_train
        train_embs_2 = pos_embs_2_train
        # Create train-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.ones(len(train_edges_sampled))

        # Val-set edge embeddings, labels
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs, pos_val_sim_matrix, pos_embs_1_val, pos_embs_2_val = get_edge_embeddings(
                val_edges, ego_user, DATASET, Flag, flag='pos-val')
            neg_val_edge_embs, neg_val_sim_matrix, neg_embs_1_val, neg_embs_2_val = get_edge_embeddings(
                val_edges_false, ego_user, DATASET, Flag, flag='neg-val')
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate((np.ones(len(val_edges)), np.zeros(len(val_edges_false))), axis=0)
            val_sim_matrix = np.concatenate((pos_val_sim_matrix, neg_val_sim_matrix), axis=0)
            val_embs_1 = np.concatenate((pos_embs_1_val, neg_embs_1_val), axis=0)
            val_embs_2 = np.concatenate((pos_embs_2_val, neg_embs_2_val), axis=0)

        # Test-set edge embeddings, labels (positive edges only)
        pos_test_edge_embs, pos_test_sim_matrix, pos_embs_1_test, pos_embs_2_test = get_edge_embeddings(
            test_edges, ego_user, DATASET, Flag, flag='pos-test')
        neg_test_edge_embs, neg_test_sim_matrix, neg_embs_1_test, neg_embs_2_test = get_edge_embeddings(
            test_edges_false, ego_user, DATASET, Flag, flag='neg-test')
        test_edge_embs = pos_test_edge_embs
        test_sim_matrix = pos_test_sim_matrix
        test_embs_1 = pos_embs_1_test
        test_embs_2 = pos_embs_2_test
        test_edge_labels = np.ones(len(test_edges))

    # (The classifier/scoring section that linkpre_scores1 runs at this point
    # is intentionally disabled in this variant; only labels, similarities,
    # and embeddings are returned.)
    return train_edge_labels, test_edge_labels, train_sim_matrix, test_sim_matrix, train_edge_embs, \
        test_edge_embs, train_embs_1, train_embs_2, test_embs_1, test_embs_2, train_edges_sampled
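# linkpre_scores2() records four pairwise similarities per edge: cosine, dot
# product, Euclidean distance, and the inverse-distance kernel 1/(1+d).
# In isolation, for two 1-D embedding vectors:
def _demo_edge_similarities(emb1, emb2):
    cos = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
    dot = np.dot(emb1, emb2)
    dist = np.linalg.norm(emb1 - emb2)
    return cos, dot, dist, 1.0 / (1.0 + dist)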
def train_dp1(args, sigma):
    data_loader = DBLPDataLoader(graph_file=args.graph_file)
    suffix = args.proximity
    args.num_of_nodes = data_loader.num_of_nodes
    model = LINEModel(args)
    C = 1  # gradient clipping norm
    with tf.Session() as sess:
        print(args)
        print('batches\tloss\tsampling time\ttraining_time\tdatetime')
        tf.global_variables_initializer().run()
        initial_embedding = sess.run(model.embedding)
        learning_rate = args.learning_rate
        sampling_time, training_time = 0, 0
        loss_dp = []
        for b in range(args.num_batches):
            t1 = time.time()
            u_i, u_j, label = data_loader.fetch_batch(batch_size=args.batch_size, K=args.K)
            feed_dict = {model.u_i: u_i, model.u_j: u_j, model.label: label, model.learning_rate: learning_rate}
            t2 = time.time()
            sampling_time += t2 - t1
            if b % 100 != 0:
                sess.run(model.train_op, feed_dict=feed_dict)
                training_time += time.time() - t2
                if learning_rate > args.learning_rate * 0.0001:
                    learning_rate = args.learning_rate * (1 - b / args.num_batches)
                else:
                    learning_rate = args.learning_rate * 0.0001
                gradients_list, loss = sess.run([model.gradients_list, model.loss], feed_dict=feed_dict)
            else:
                gradients_list, loss = sess.run([model.gradients_list, model.loss], feed_dict=feed_dict)
                print('%d\t%f\t%0.2f\t%0.2f\t%s' % (b, loss, sampling_time, training_time,
                                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                sampling_time, training_time = 0, 0
                variables_list = tf.trainable_variables()
                grads = [tf.zeros(p.shape) for p in tf.trainable_variables()]
                l2_norm = tf.constant(0.000)
                for gd_list in gradients_list:
                    # tf.rsqrt returns 1/sqrt(x), so l2_norm accumulates
                    # reciprocal gradient norms (with a small stabilizer).
                    l2_norm = tf.add(l2_norm, tf.rsqrt(tf.reduce_sum(gd_list * gd_list) + 0.000001))
                    divisor = tf.maximum(tf.convert_to_tensor(1.00000001), l2_norm / C)
                    for gd in gd_list:
                        # rebinding gd does not modify gd_list in place
                        gd += gd / divisor
                    # Gaussian noise of scale sigma, averaged over the batch
                    grads_noisy = (gd_list + tf.random_normal(tf.shape(gd_list), stddev=sigma)) / args.batch_size
                    grads_noisy = grads_noisy.eval(session=sess)
                model.optimizer.apply_gradients(zip(list(grads_noisy), variables_list))
                model.gradients_list = tf.convert_to_tensor(grads_noisy)
                embedding = sess.run(model.embedding)
                gradients_list, loss = sess.run([model.gradients_list, model.loss], feed_dict=feed_dict)
                # Manually apply the (noisy) embedding gradient, picking the
                # variable that matches the chosen proximity.
                for i in range(np.shape(embedding)[0]):
                    for j in range(np.shape(embedding)[1]):
                        if args.proximity == 'first-order':
                            embedding[i][j] += gradients_list[0][i][j] * learning_rate
                        if args.proximity == 'second-order':
                            embedding[i][j] += gradients_list[1][i][j] * learning_rate
                model.embedding = tf.convert_to_tensor(embedding)
            if b % 1000 == 0 or b == (args.num_batches - 1):
                embedding = sess.run(model.embedding)
                normalized_embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
                pickle.dump(data_loader.embedding_mapping(normalized_embedding),
                            open('data/embedding_%s_%s.pkl' % (args.uid, args.flag), 'wb'))
        return embedding
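# The elementwise double loop in train_dp1() is equivalent to this vectorized
# update (a sketch; the original adds the gradient, i.e. a gradient-ascent
# step, and that sign is kept here):
def _demo_manual_embedding_step(embedding, grad, learning_rate):
    return embedding + learning_rate * grad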
def train_defense(args, train_edges, test_edges, g_train):
    data_loader = DBLPDataLoader(graph_file=args.graph_file)
    suffix = args.proximity
    args.num_of_nodes = data_loader.num_of_nodes
    model = LINEModel(args)
    with tf.Session() as sess:
        print(args)
        print('batches\tloss\tsampling time\ttraining_time\tdatetime')
        tf.global_variables_initializer().run()
        initial_embedding = sess.run(model.embedding)
        learning_rate = args.learning_rate
        sampling_time, training_time = 0, 0
        for b in range(args.num_batches):
            print(sess.run(model.embedding))
            t1 = time.time()
            u_i, u_j, label = data_loader.fetch_batch(batch_size=args.batch_size, K=args.K)
            feed_dict = {model.u_i: u_i, model.u_j: u_j, model.label: label, model.learning_rate: learning_rate}
            t2 = time.time()
            sampling_time += t2 - t1
            if b % 100 != 0:
                sess.run(model.train_op, feed_dict=feed_dict)
                training_time += time.time() - t2
                if learning_rate > args.learning_rate * 0.0001:
                    learning_rate = args.learning_rate * (1 - b / args.num_batches)
                else:
                    learning_rate = args.learning_rate * 0.0001
            else:
                loss = sess.run(model.loss, feed_dict=feed_dict)
                print('%d\t%f\t%0.2f\t%0.2f\t%s' % (b, loss, sampling_time, training_time,
                                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                sampling_time, training_time = 0, 0
                loss_dis = discriminator(train_edges, test_edges, embedding=sess.run(model.embedding))
                print(loss, loss_dis)
                # Penalize embeddings the membership discriminator separates easily.
                loss_new = loss - 100 * tf.to_double(loss_dis)
                model.loss = tf.convert_to_tensor(loss_new)
            if b % 1000 == 0 or b == (args.num_batches - 1):
                embedding = sess.run(model.embedding)
                normalized_embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
                pickle.dump(data_loader.embedding_mapping(normalized_embedding),
                            open('data/embedding_%s_%s.pkl' % (args.uid, args.flag), 'wb'))
        return embedding


def train_defense2(args, train_edges, test_edges, g_train):
    # Same as train_defense() with a weaker discriminator weight (10 vs 100).
    data_loader = DBLPDataLoader(graph_file=args.graph_file)
    suffix = args.proximity
    args.num_of_nodes = data_loader.num_of_nodes
    model = LINEModel(args)
    with tf.Session() as sess:
        print(args)
        print('batches\tloss\tsampling time\ttraining_time\tdatetime')
        tf.global_variables_initializer().run()
        initial_embedding = sess.run(model.embedding)
        learning_rate = args.learning_rate
        sampling_time, training_time = 0, 0
        for b in range(args.num_batches):
            t1 = time.time()
            u_i, u_j, label = data_loader.fetch_batch(batch_size=args.batch_size, K=args.K)
            feed_dict = {model.u_i: u_i, model.u_j: u_j, model.label: label, model.learning_rate: learning_rate}
            t2 = time.time()
            sampling_time += t2 - t1
            if b % 100 != 0:
                sess.run(model.train_op, feed_dict=feed_dict)
                training_time += time.time() - t2
                if learning_rate > args.learning_rate * 0.0001:
                    learning_rate = args.learning_rate * (1 - b / args.num_batches)
                else:
                    learning_rate = args.learning_rate * 0.0001
            else:
                loss = sess.run(model.loss, feed_dict=feed_dict)
                print('%d\t%f\t%0.2f\t%0.2f\t%s' % (b, loss, sampling_time, training_time,
                                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                sampling_time, training_time = 0, 0
                loss_dis = discriminator(train_edges, test_edges, embedding=sess.run(model.embedding))
                print(loss, loss_dis)
                loss_new = loss - 10 * tf.to_double(loss_dis)
                model.loss = tf.convert_to_tensor(loss_new)
            if b % 1000 == 0 or b == (args.num_batches - 1):
                embedding = sess.run(model.embedding)
                normalized_embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
                pickle.dump(data_loader.embedding_mapping(normalized_embedding),
                            open('data/embedding_%s_%s.pkl' % (args.uid, args.flag), 'wb'))
        return embedding


def train_defense3(args, train_edges, test_edges, g_train, F, res_dir):
    data_loader = DBLPDataLoader(graph_file=args.graph_file)
    suffix = args.proximity
    args.num_of_nodes = data_loader.num_of_nodes
    model = LINEModel(args)
    with tf.Session() as sess:
        print(args)
        print('batches\tloss\tsampling time\ttraining_time\tdatetime')
        tf.global_variables_initializer().run()
        initial_embedding = sess.run(model.embedding)
        learning_rate = args.learning_rate
        sampling_time, training_time = 0, 0
        cnt_it = 0
        min_loss = 1000000  # best combined loss so far ('min' shadowed a builtin)
        edgeall = list([list(edge_tuple) for edge_tuple in train_edges])
        train_edges_sampled = random.sample(edgeall, np.shape(test_edges)[0])
        for b in range(args.num_batches):
            t1 = time.time()
            u_i, u_j, label = data_loader.fetch_batch(batch_size=args.batch_size, K=args.K)
            feed_dict = {model.u_i: u_i, model.u_j: u_j, model.label: label, model.learning_rate: learning_rate}
            t2 = time.time()
            sampling_time += t2 - t1
            if b % 100 != 0:
                sess.run(model.train_op, feed_dict=feed_dict)
                training_time += time.time() - t2
                # learning-rate decay is disabled in this variant
            else:
                loss = sess.run(model.loss, feed_dict=feed_dict)
                print('%d\t%f\t%0.2f\t%0.2f\t%s' % (b, loss, sampling_time, training_time,
                                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                sampling_time, training_time = 0, 0
                if cnt_it == 0:
                    loss_dis, acc_kmeans_sim, acc_mlp_sim, acc_rf_sim, acc_svm_sim = discriminator_gain(
                        train_edges_sampled, test_edges, F, res_dir, cnt_it,
                        embedding=sess.run(model.embedding))
                else:
                    loss_dis = discriminator_gain(train_edges_sampled, test_edges, F, res_dir, cnt_it,
                                                  embedding=sess.run(model.embedding))
                print(loss, loss_dis)
                loss_new = loss + 0.001 * tf.to_double(loss_dis)
                model.loss = tf.convert_to_tensor(loss_new)
                print('loss:', model.loss)
                loss_new_val = loss_new.eval()
                if min_loss > loss_new_val:
                    print('~~~~~~')
                    # keep a copy of the best embedding seen so far
                    idx2vec = copy.deepcopy(sess.run(model.embedding))
                    min_loss = loss
                    cnt_it = 0
                if min_loss < loss_new_val:
                    print('@@@@@@@@')
                    print(cnt_it)
                    cnt_it += 1
                    # early stopping after 10 consecutive non-improving checks
                    if cnt_it == 10:
                        if b >= 1:
                            break
        print(acc_kmeans_sim, acc_mlp_sim, acc_rf_sim, acc_svm_sim)
        normalized_embedding = idx2vec / np.linalg.norm(idx2vec, axis=1, keepdims=True)
        pickle.dump(data_loader.embedding_mapping(normalized_embedding),
                    open('data/embedding_%s_%s.pkl' % (args.uid, args.flag), 'wb'))
        return idx2vec
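# The train_defense* variants regularize the LINE loss with a membership
# discriminator: loss_new = loss - lambda * loss_dis (lambda = 100 or 10), or
# loss + 0.001 * gain in train_defense3. A toy illustration of the combined
# objective with plain floats (no TF):
def _demo_defense_objective(line_loss, dis_loss, lam=100.0):
    # Subtracting the discriminator's loss rewards embeddings that *raise*
    # it, i.e. embeddings that confuse the train-vs-test attacker.
    return line_loss - lam * dis_loss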
def discriminator(train_edges, test_edges, embedding):
    # Membership probe: can an MLP tell train edges from test edges using
    # only the cosine similarity of their endpoint embeddings?
    emb_matrix = embedding

    def get_edge_embeddings(edge_list):
        embs = []
        sim_matrix = []
        embs_1 = []
        embs_2 = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)
            sim = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
            embs.append(edge_emb)
            embs_1.append(emb1)
            embs_2.append(emb2)
            sim_matrix.append(sim)
        return np.array(embs), np.array(sim_matrix), np.array(embs_1), np.array(embs_2)

    edgeall = list([list(edge_tuple) for edge_tuple in train_edges])
    # subsample the train edges so both classes of the probe are balanced
    train_edges_sampled = random.sample(edgeall, np.shape(test_edges)[0])

    # Train-set edge embeddings (positive edges only)
    pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(
        train_edges_sampled)
    train_edge_embs = pos_train_edge_embs
    train_sim_matrix = pos_train_sim_matrix
    train_embs_1 = pos_embs_1_train
    train_embs_2 = pos_embs_2_train
    train_edge_labels = np.ones(len(train_edges_sampled))

    # Test-set edge embeddings (positive edges only)
    pos_test_edge_embs, pos_test_sim_matrix, pos_embs_1_test, pos_embs_2_test = get_edge_embeddings(test_edges)
    test_edge_embs = pos_test_edge_embs
    test_sim_matrix = pos_test_sim_matrix
    test_embs_1 = pos_embs_1_test
    test_embs_2 = pos_embs_2_test
    test_edge_labels = np.ones(len(test_edges))

    # ########## sim_svm
    train_edges_list = np.array(train_edges_sampled)
    test_edges_list = test_edges
    edges_list = np.concatenate((train_edges_list, test_edges_list), axis=0)
    # member edges are labeled 1, non-member (test) edges 0
    ylabel = [1] * train_sim_matrix.shape[0] + [0] * test_sim_matrix.shape[0]
    sim_matrix = np.concatenate((train_sim_matrix, test_sim_matrix), axis=0)
    sim_matrix = sim_matrix.reshape(-1, 1)
    sim_matrix_train = train_sim_matrix.reshape(-1, 1)
    sim_matrix_test = test_sim_matrix.reshape(-1, 1)

    from sklearn.model_selection import train_test_split

    # rows of (node1, node2, membership label)
    y_label = np.zeros((np.shape(edges_list)[0], 3))
    for i in range(np.shape(edges_list)[0]):
        y_label[i][0] = edges_list[i][0]
        y_label[i][1] = edges_list[i][1]
        y_label[i][2] = ylabel[i]
    y_label_train = np.zeros((np.shape(train_edges_list)[0], 3))
    for i in range(np.shape(train_edges_list)[0]):
        y_label_train[i][0] = train_edges_list[i][0]
        y_label_train[i][1] = train_edges_list[i][1]
        y_label_train[i][2] = 1
    y_label_test = np.zeros((np.shape(test_edges_list)[0], 3))
    for i in range(np.shape(test_edges_list)[0]):
        y_label_test[i][0] = test_edges_list[i][0]
        y_label_test[i][1] = test_edges_list[i][1]
        y_label_test[i][2] = 0

    X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
        sim_matrix_train, y_label_train, test_size=0.1, random_state=42)
    X_test_train, X_test_test, y_test_train, y_test_test = train_test_split(
        sim_matrix_test, y_label_test, test_size=0.1, random_state=42)
    X_train = np.concatenate((X_train_train, X_test_train), axis=0)
    X_test = np.concatenate((X_train_test, X_test_test), axis=0)
    y_train = np.concatenate((y_train_train, y_test_train), axis=0)
    y_test = np.concatenate((y_train_test, y_test_test), axis=0)

    from sklearn.neural_network import MLPClassifier
    mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16, 18),
                        random_state=1, max_iter=500)
    mlp.fit(X_train, y_train[:, 2])
    loss = mlp.loss_
    return loss
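# A self-contained toy run of the same membership probe on synthetic cosine
# similarities: member edges drawn around 0.8, non-members around 0.4. All
# numbers and names are illustrative.
def _demo_membership_probe(seed=1):
    from sklearn.neural_network import MLPClassifier
    rng = np.random.RandomState(seed)
    train_sims = rng.normal(0.8, 0.1, 100)        # member edges: higher cosine
    test_sims = rng.normal(0.4, 0.1, 100)         # non-member edges
    X = np.concatenate([train_sims, test_sims]).reshape(-1, 1)
    y = np.array([1] * 100 + [0] * 100)
    mlp = MLPClassifier(hidden_layer_sizes=(16,), random_state=1,
                        max_iter=500).fit(X, y)
    return mlp.loss_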
def discriminator_gain(train_edges_sampled, test_edges, F, res_dir, cnt_it, embedding):
    # SVM-based membership probe; returns the attacker's accuracy as a "gain"
    # term for train_defense3(), plus baseline accuracies on the first call.
    emb_matrix = embedding

    def get_edge_embeddings(edge_list):
        embs = []
        sim_matrix = []
        embs_1 = []
        embs_2 = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)
            sim = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
            embs.append(edge_emb)
            embs_1.append(emb1)
            embs_2.append(emb2)
            sim_matrix.append(sim)
        return np.array(embs), np.array(sim_matrix), np.array(embs_1), np.array(embs_2)

    # Train-set edge embeddings (train_edges_sampled is passed in, already balanced)
    pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(
        train_edges_sampled)
    train_edge_embs = pos_train_edge_embs
    train_sim_matrix = pos_train_sim_matrix
    train_embs_1 = pos_embs_1_train
    train_embs_2 = pos_embs_2_train
    train_edge_labels = np.ones(len(train_edges_sampled))

    # Test-set edge embeddings
    pos_test_edge_embs, pos_test_sim_matrix, pos_embs_1_test, pos_embs_2_test = get_edge_embeddings(test_edges)
    test_edge_embs = pos_test_edge_embs
    test_sim_matrix = pos_test_sim_matrix
    test_embs_1 = pos_embs_1_test
    test_embs_2 = pos_embs_2_test
    test_edge_labels = np.ones(len(test_edges))

    # ########## sim_svm
    train_edges_list = np.array(train_edges_sampled)
    test_edges_list = test_edges
    edges_list = np.concatenate((train_edges_list, test_edges_list), axis=0)
    ylabel = [1] * train_sim_matrix.shape[0] + [0] * test_sim_matrix.shape[0]
    sim_matrix = np.concatenate((train_sim_matrix, test_sim_matrix), axis=0)
    sim_matrix = sim_matrix.reshape(-1, 1)
    sim_matrix_train = train_sim_matrix.reshape(-1, 1)
    sim_matrix_test = test_sim_matrix.reshape(-1, 1)

    from sklearn.model_selection import train_test_split

    y_label = np.zeros((np.shape(edges_list)[0], 3))
    for i in range(np.shape(edges_list)[0]):
        y_label[i][0] = edges_list[i][0]
        y_label[i][1] = edges_list[i][1]
        y_label[i][2] = ylabel[i]
    y_label_train = np.zeros((np.shape(train_edges_list)[0], 3))
    for i in range(np.shape(train_edges_list)[0]):
        y_label_train[i][0] = train_edges_list[i][0]
        y_label_train[i][1] = train_edges_list[i][1]
        y_label_train[i][2] = 1
    y_label_test = np.zeros((np.shape(test_edges_list)[0], 3))
    for i in range(np.shape(test_edges_list)[0]):
        y_label_test[i][0] = test_edges_list[i][0]
        y_label_test[i][1] = test_edges_list[i][1]
        y_label_test[i][2] = 0

    X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
        sim_matrix_train, y_label_train, test_size=0.3, random_state=42)
    X_test_train, X_test_test, y_test_train, y_test_test = train_test_split(
        sim_matrix_test, y_label_test, test_size=0.3, random_state=42)
    X_train = np.concatenate((X_train_train, X_test_train), axis=0)
    X_test = np.concatenate((X_train_test, X_test_test), axis=0)
    y_train = np.concatenate((y_train_train, y_test_train), axis=0)
    y_test = np.concatenate((y_train_test, y_test_test), axis=0)

    from sklearn.metrics import accuracy_score
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import SVC

    svm = OneVsRestClassifier(SVC())
    svm.fit(X_train, y_train[:, 2])
    gain = 0
    cnt_true = 0
    pree = svm.predict(X_test)
    # X_test stacks the held-out member rows first, then the non-member rows,
    # so the index threshold below recovers the ground-truth class.
    for i in range(len(pree)):
        if i < np.shape(X_train_test)[0]:
            if pree[i] == 1:
                cnt_true += 1
        else:
            if pree[i] == 0:
                cnt_true += 1
    acc = cnt_true / np.shape(X_test)[0]
    gain = acc
    print('acc', acc)
    acc_sim = accuracy_score(pree, y_test[:, 2])
    print(acc_sim)

    if cnt_it == 0:
        acc_kmeans_sim, acc_mlp_sim, acc_rf_sim, acc_svm_sim = discriminator_gain2(
            train_edges_sampled, test_edges, embedding, F, res_dir)
        return gain, acc_kmeans_sim, acc_mlp_sim, acc_rf_sim, acc_svm_sim
    else:
        return gain
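# Note on the classifier choice above: with binary labels,
# OneVsRestClassifier(SVC()) fits a single underlying SVC, so it should
# behave like a plain SVC here. A minimal equivalence check on toy data:
def _demo_ovr_svc_equivalence(seed=0):
    from sklearn.svm import SVC
    from sklearn.multiclass import OneVsRestClassifier
    rng = np.random.RandomState(seed)
    X = rng.randn(40, 1)
    y = (X[:, 0] > 0).astype(int)
    a = OneVsRestClassifier(SVC()).fit(X, y).predict(X)
    b = SVC().fit(X, y).predict(X)
    return (a == b).all()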
def discriminator_gain2(train_edges_sampled, test_edges, embedding, F, res_dir):
    # idx2vec = embedding.ivectors.weight.data.cpu().numpy()
    emb_matrix = embedding

    # Generate bootstrapped edge embeddings (as in the node2vec paper):
    # edge embedding for (v1, v2) = Hadamard product of the node embeddings
    # of v1 and v2, plus the cosine similarity of the two endpoint embeddings.
    def get_edge_embeddings(edge_list):
        embs = []
        sim_matrix = []
        embs_1 = []
        embs_2 = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)
            sim = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
            # edge_emb = np.array(emb1) + np.array(emb2)
            embs.append(edge_emb)
            embs_1.append(emb1)
            embs_2.append(emb2)
            sim_matrix.append(sim)
        embs = np.array(embs)
        sim_matrix = np.array(sim_matrix)
        embs_1 = np.array(embs_1)
        embs_2 = np.array(embs_2)
        # with open('/Users/xiulingwang/Downloads/line-master/data/embds/' + str(ego_user) + flag + '-' + Flag, 'w') as f:
        # with open('E:/python/banlance/code/' + DATASET + '/' + METHOD + '/embeds/' + Flag + '-' + str(ego_user) + flag, 'w') as f:
        # #with open('/Users/xiulingwang/Downloads/'+DATASET+'/line/3-split/' + str(ego_user)+ '-' + flag + '-' + Flag+'-' +'embds','w') as f:
        #     f.write('%d %d\n' % (edge_list.shape[0], args.representation_size))
        #     for i in range(edge_list.shape[0]):
        #         e = ' '.join(map(lambda x: str(x), embs[i]))
        #         f.write('%s %s %s\n' % (str(edge_list[i][0]), str(edge_list[i][1]), e))
        return embs, sim_matrix, embs_1, embs_2

    # edgeall = list([list(edge_tuple) for edge_tuple in train_edges])
    # train_edges_sampled = random.sample(edgeall, np.shape(test_edges)[0])

    # Train-set (member) edge embeddings
    pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(train_edges_sampled)
    # neg_train_edge_embs, neg_train_sim_matrix, neg_embs_1_train, neg_embs_2_train = get_edge_embeddings(train_edges_false, ego_user, DATASET, Flag, flag='neg-train')
    train_edge_embs = pos_train_edge_embs
    train_sim_matrix = pos_train_sim_matrix
    train_embs_1 = pos_embs_1_train
    train_embs_2 = pos_embs_2_train

    # Train-set edge labels: 1 = real edge seen during embedding training
    train_edge_labels = np.ones(len(train_edges_sampled))

    # Test-set edge embeddings and labels
    pos_test_edge_embs, pos_test_sim_matrix, pos_embs_1_test, pos_embs_2_test = get_edge_embeddings(test_edges)
    # neg_test_edge_embs, neg_test_sim_matrix, neg_embs_1_test, neg_embs_2_test = get_edge_embeddings(test_edges_false, ego_user, DATASET, Flag, flag='neg-test')
    test_edge_embs = pos_test_edge_embs
    test_sim_matrix = pos_test_sim_matrix
    test_embs_1 = pos_embs_1_test
    test_embs_2 = pos_embs_2_test

    # Test-set edge labels (real edges held out from embedding training)
    test_edge_labels = np.ones(len(test_edges))

    ########### sim_svm features: one cosine similarity per edge
    train_edges_list = np.array(train_edges_sampled)
    test_edges_list = test_edges
    edges_list = np.concatenate((train_edges_list, test_edges_list), axis=0)

    # Membership ground truth: 1 = member (training) edge, 0 = non-member (test) edge
    ylabel = [1] * train_sim_matrix.shape[0] + [0] * test_sim_matrix.shape[0]
    sim_matrix = np.concatenate((train_sim_matrix, test_sim_matrix), axis=0)
    sim_matrix = sim_matrix.reshape(-1, 1)

    sim_matrix_train = train_sim_matrix.reshape(-1, 1)
    sim_matrix_test = test_sim_matrix.reshape(-1, 1)

    from sklearn.model_selection import train_test_split

    # Fixed: the original reshaped len(ylabel1) (an int) instead of the list itself.
    ylabel1 = np.reshape(ylabel, (len(ylabel), 1))

    # y_label rows carry (node1, node2, member_flag)
    y_label = np.zeros((np.shape(edges_list)[0], 3))
    for i in range(np.shape(edges_list)[0]):
        y_label[i][0] = edges_list[i][0]
        y_label[i][1] = edges_list[i][1]
        y_label[i][2] = ylabel[i]

    y_label_train = np.zeros((np.shape(train_edges_list)[0], 3))
    for i in range(np.shape(train_edges_list)[0]):
        y_label_train[i][0] = train_edges_list[i][0]
        y_label_train[i][1] = train_edges_list[i][1]
        y_label_train[i][2] = 1
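    # Note: below, the member and non-member similarity features are each
    # split 70/30 with the same random seed and then recombined, so the
    # attack's train and test splits both stay balanced across the two
    # classes; the four classifiers that follow all consume the same split.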
    y_label_test = np.zeros((np.shape(test_edges_list)[0], 3))
    for i in range(np.shape(test_edges_list)[0]):
        y_label_test[i][0] = test_edges_list[i][0]
        y_label_test[i][1] = test_edges_list[i][1]
        y_label_test[i][2] = 0

    X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
        sim_matrix_train, y_label_train, test_size=0.3, random_state=42)
    X_test_train, X_test_test, y_test_train, y_test_test = train_test_split(
        sim_matrix_test, y_label_test, test_size=0.3, random_state=42)

    X_train = np.concatenate((X_train_train, X_test_train), axis=0)
    X_test = np.concatenate((X_train_test, X_test_test), axis=0)
    y_train = np.concatenate((y_train_train, y_test_train), axis=0)
    y_test = np.concatenate((y_train_test, y_test_test), axis=0)

    from sklearn.cluster import KMeans
    from sklearn.metrics import accuracy_score

    # KMeans assigns arbitrary cluster ids, so rerun with many seeds and keep
    # the labelling that best matches the membership ground truth. Fixed: the
    # original exported labels from the final seed, not the best-scoring one.
    accuracy = []
    best_labels = None
    ylabel = [1] * train_sim_matrix.shape[0] + [0] * test_sim_matrix.shape[0]
    for i in range(500):
        kmeans = KMeans(n_clusters=2, random_state=i).fit(sim_matrix)
        acc = accuracy_score(ylabel, kmeans.labels_)
        accuracy.append(acc)
        if acc == max(accuracy):
            best_labels = kmeans.labels_
    acc_kmeans_sim = max(accuracy)

    tsts = []
    print(len(best_labels))
    for i in range(len(best_labels)):
        node1 = edges_list[i][0]
        node2 = edges_list[i][1]
        tst = [best_labels[i], ylabel[i], node1, node2]
        tsts.append(tst)
    name = ['y_score', 'y_test_grd', 'node1', 'node2']
    result = pd.DataFrame(columns=name, data=tsts)
    result.to_csv("{}{}-kmeans_sim.csv".format(res_dir, F))

    from sklearn import metrics
    from sklearn.neural_network import MLPClassifier

    mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16, 18), random_state=1,
                        max_iter=500)
    mlp.fit(X_train, y_train[:, 2])
    print("Training set score: %f" % mlp.score(X_train, y_train[:, 2]))
    print("Test set score: %f" % mlp.score(X_test, y_test[:, 2]))
    y_score = mlp.predict(X_test)
    print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
    # Fixed: labels=range(3) was wrong for this binary task.
    print(metrics.classification_report(y_test[:, 2], y_score, labels=[0, 1]))
    acc_mlp_sim = accuracy_score(y_test[:, 2], y_score)

    tsts = []
    for i in range(len(y_score)):
        tst = [y_score[i], y_test[i][2], y_test[i][0], y_test[i][1]]
        tsts.append(tst)
    name = ['y_score', 'y_test_grd', 'node1', 'node2']
    result = pd.DataFrame(columns=name, data=tsts)
    result.to_csv("{}{}-mlp_sim.csv".format(res_dir, F))

    from sklearn.ensemble import RandomForestClassifier

    rf = RandomForestClassifier(max_depth=150, random_state=0)
    rf.fit(X_train, y_train[:, 2])
    print("Training set score: %f" % rf.score(X_train, y_train[:, 2]))
    print("Test set score: %f" % rf.score(X_test, y_test[:, 2]))
    y_score = rf.predict(X_test)
    print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
    print(metrics.classification_report(y_test[:, 2], y_score, labels=[0, 1]))
    acc_rf_sim = accuracy_score(y_test[:, 2], y_score)
    tsts = []
    for i in range(len(y_score)):
        tst = [y_score[i], y_test[i][2], y_test[i][0], y_test[i][1]]
        tsts.append(tst)
    name = ['y_score', 'y_test_grd', 'node1', 'node2']
    result = pd.DataFrame(columns=name, data=tsts)
    result.to_csv("{}{}-rf_sim.csv".format(res_dir, F))

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import SVC

    svm = OneVsRestClassifier(SVC())
    svm.fit(X_train, y_train[:, 2])
    print("Training set score: %f" % svm.score(X_train, y_train[:, 2]))
    print("Test set score: %f" % svm.score(X_test, y_test[:, 2]))
    y_score = svm.predict(X_test)
    print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
    print(metrics.classification_report(y_test[:, 2], y_score, labels=[0, 1]))
    acc_svm_sim = accuracy_score(y_test[:, 2], y_score)

    tsts = []
    for i in range(len(y_score)):
        tst = [y_score[i], y_test[i][2], y_test[i][0], y_test[i][1]]
        tsts.append(tst)
    name = ['y_score', 'y_test_grd', 'node1', 'node2']
    result = pd.DataFrame(columns=name, data=tsts)
    result.to_csv("{}{}-svm_sim.csv".format(res_dir, F))

    print(acc_kmeans_sim, acc_mlp_sim, acc_rf_sim, acc_svm_sim)
    return (acc_kmeans_sim, acc_mlp_sim, acc_rf_sim, acc_svm_sim)
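# ---------------------------------------------------------------------------
# Hypothetical smoke test for discriminator_gain2 (a sketch, assuming node
# ids index rows of the embedding matrix, that res_dir is any writable
# directory ending with a path separator, and that this module's own imports
# resolve). The toy graph, edge sets, and 'toy' tag are illustrative only.
# Kept under __main__ so importing the module stays side-effect free.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import os
    import tempfile

    rng = np.random.RandomState(0)
    toy_embedding = rng.normal(size=(20, 8))  # 20 nodes, 8-dim embeddings
    member_edges = [(i, (i + 1) % 20) for i in range(10)]
    heldout_edges = np.array([(i, (i + 5) % 20) for i in range(10, 20)])
    res_dir = tempfile.mkdtemp() + os.sep
    scores = discriminator_gain2(member_edges, heldout_edges, toy_embedding, 'toy', res_dir)
    print('kmeans/mlp/rf/svm accuracies:', scores)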