#!/usr/bin/env python
# coding: utf-8
# https://github.com/DaehanKim/vgae_pytorch

import os
import sys
import time
import random
import itertools
import pickle as pkl

import numpy as np
import pandas as pd
import scipy.sparse as sp
import networkx as nx

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

from sklearn.metrics import (roc_auc_score, average_precision_score, recall_score,
                             precision_score, f1_score, accuracy_score, confusion_matrix)

# from preprocessing import *  # the preprocessing helpers are inlined below


def get_edge_embeddings(edge_list, emb_matrixs):
    """Build a feature vector for every edge from the node embedding matrix.

    For each edge (u, v) the features are:
      - cosine similarity of the two node embeddings,
      - their dot product,
      - the Euclidean distance between them.
    """
    embs = []
    for edge in edge_list:
        node1 = int(edge[0])
        node2 = int(edge[1])
        emb1 = emb_matrixs[node1]
        emb2 = emb_matrixs[node2]
        # A tiny epsilon keeps the cosine similarity finite for zero vectors.
        sim1 = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2) + 1e-31)
        sim2 = np.dot(emb1, emb2)
        sim3 = np.linalg.norm(np.array(emb1) - np.array(emb2))
        embs.append([sim1, sim2, sim3])
    return np.array(embs)
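
# Illustrative only (not part of the original pipeline): a tiny, made-up embedding
# matrix showing the shape of the features get_edge_embeddings() produces. With three
# nodes, two-dimensional embeddings and two edges, the result is a (2, 3) array whose
# rows are [cosine similarity, dot product, Euclidean distance].
_demo_embs = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
_demo_edges = [(0, 1), (0, 2)]
_demo_feats = get_edge_embeddings(_demo_edges, _demo_embs)
assert _demo_feats.shape == (2, 3)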

class Args:
    dataset = 'cora'
    model = 'GAE'          # 'GAE' (deterministic) or 'VGAE' (variational)
    input_dim = 1433       # node feature dimensionality (must match the loaded features)
    hidden1_dim = 32
    hidden2_dim = 16
    use_feature = True
    num_epoch = 200
    learning_rate = 0.01


args = Args()

'''
****************NOTE*****************
CREDITS: Thomas Kipf
Since the datasets are the same as those in Kipf's implementation,
his preprocessing source was used as-is.
*************************************
'''


def parse_index_file(filename):
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def load_data(dataset):
    # load the data: x, tx, allx, graph
    names = ['x', 'tx', 'allx', 'graph']
    objects = []
    for i in range(len(names)):
        with open('./data/' + "ind.{}.{}".format(dataset, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, tx, allx, graph = tuple(objects)
    test_idx_reorder = parse_index_file('./data/' + "ind.{}.test.index".format(dataset))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]

    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    return adj, features


'''
****************NOTE*****************
CREDITS: Thomas Kipf
Since the datasets are the same as those in Kipf's implementation,
his preprocessing source was used as-is.
*************************************
'''


def sparse_to_tuple(sparse_mx):
    if not sp.isspmatrix_coo(sparse_mx):
        sparse_mx = sparse_mx.tocoo()
    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
    values = sparse_mx.data
    shape = sparse_mx.shape
    return coords, values, shape


def preprocess_graph(adj):
    # Symmetric normalisation with self-loops: D^-1/2 (A + I) D^-1/2
    adj = sp.coo_matrix(adj)
    adj_ = adj + sp.eye(adj.shape[0])
    rowsum = np.array(adj_.sum(1))
    degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
    adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
    return sparse_to_tuple(adj_normalized)


def mask_test_edges(adj):
    # Function to build a test set with held-out positive links
    # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper.
    # TODO: Clean up.

    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()
    # Check that diag is zero:
    assert np.diag(adj.todense()).sum() == 0

    adj_triu = sp.triu(adj)
    adj_tuple = sparse_to_tuple(adj_triu)
    edges = adj_tuple[0]
    edges_all = sparse_to_tuple(adj)[0]
    # num_test = int(np.floor(edges.shape[0] / 60.))
    # num_val = int(np.floor(edges.shape[0] / 10.))
    num_val = int(edges.shape[0] * 0.1)   # 10% of edges for validation
    num_test = int(edges.shape[0] * 0.6)  # 60% of edges for testing

    all_edge_idx = list(range(edges.shape[0]))
    np.random.shuffle(all_edge_idx)
    val_edge_idx = all_edge_idx[:num_val]
    test_edge_idx = all_edge_idx[num_val:(num_val + num_test)]
    test_edges = edges[test_edge_idx]
    val_edges = edges[val_edge_idx]
    train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0)

    def ismember(a, b, tol=5):
        rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
        return np.any(rows_close)

    # Sample negative (non-existent) edges for the test set
    test_edges_false = []
    while len(test_edges_false) < len(test_edges):
        idx_i = np.random.randint(0, adj.shape[0])
        idx_j = np.random.randint(0, adj.shape[0])
        if idx_i == idx_j:
            continue
        if ismember([idx_i, idx_j], edges_all):
            continue
        if test_edges_false:
            if ismember([idx_j, idx_i], np.array(test_edges_false)):
                continue
            if ismember([idx_i, idx_j], np.array(test_edges_false)):
                continue
        test_edges_false.append([idx_i, idx_j])

    # Sample negative edges for the validation set
    val_edges_false = []
    while len(val_edges_false) < len(val_edges):
        idx_i = np.random.randint(0, adj.shape[0])
        idx_j = np.random.randint(0, adj.shape[0])
        if idx_i == idx_j:
            continue
        if ismember([idx_i, idx_j], train_edges):
            continue
        if ismember([idx_j, idx_i], train_edges):
            continue
        if ismember([idx_i, idx_j], val_edges):
            continue
        if ismember([idx_j, idx_i], val_edges):
            continue
        if val_edges_false:
            if ismember([idx_j, idx_i], np.array(val_edges_false)):
                continue
            if ismember([idx_i, idx_j], np.array(val_edges_false)):
                continue
        val_edges_false.append([idx_i, idx_j])

    assert ~ismember(test_edges_false, edges_all)
    assert ~ismember(val_edges_false, edges_all)
    assert ~ismember(val_edges, train_edges)
    assert ~ismember(test_edges, train_edges)
    assert ~ismember(val_edges, test_edges)

    data = np.ones(train_edges.shape[0])

    # Re-build adj matrix from the training edges only
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T

    # NOTE: these edge lists only contain a single direction of each edge!
    return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false
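
# A small, self-contained sanity check (not used by the pipeline below): normalise a
# toy 3-node path graph and inspect the (coords, values, shape) tuple returned by
# preprocess_graph(). The toy adjacency matrix is an assumption purely for illustration.
_toy_adj = sp.csr_matrix(np.array([[0, 1, 0],
                                   [1, 0, 1],
                                   [0, 1, 0]]))
_toy_coords, _toy_values, _toy_shape = preprocess_graph(_toy_adj)
assert _toy_shape == (3, 3)  # D^-1/2 (A + I) D^-1/2 keeps the original shape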

class GraphConvSparse(nn.Module):
    """A single graph-convolution layer: activation(A_hat @ X @ W)."""

    def __init__(self, input_dim, output_dim, adj, activation=F.relu, **kwargs):
        super(GraphConvSparse, self).__init__(**kwargs)
        self.weight = glorot_init(input_dim, output_dim)
        self.adj = adj
        self.activation = activation

    def forward(self, inputs):
        x = torch.mm(inputs, self.weight)
        x = torch.mm(self.adj, x)
        return self.activation(x)


def dot_product_decode(Z):
    """Reconstruct the adjacency matrix as sigmoid(Z Z^T)."""
    A_pred = torch.sigmoid(torch.matmul(Z, Z.t()))
    return A_pred


def glorot_init(input_dim, output_dim):
    init_range = np.sqrt(6.0 / (input_dim + output_dim))
    initial = torch.rand(input_dim, output_dim) * 2 * init_range - init_range
    return nn.Parameter(initial)


class GAE(nn.Module):
    """Two-layer GCN encoder with an inner-product decoder (non-variational GAE)."""

    def __init__(self, adj):
        super(GAE, self).__init__()
        self.base_gcn = GraphConvSparse(args.input_dim, args.hidden1_dim, adj)
        self.base_gcn2 = GraphConvSparse(args.hidden1_dim, args.hidden1_dim, adj)
        self.gcn_mean = GraphConvSparse(args.hidden1_dim, args.hidden2_dim, adj,
                                        activation=lambda x: x)

    def encode(self, X):
        hidden = self.base_gcn(X)
        hidden = self.base_gcn2(hidden)
        z = self.mean = self.gcn_mean(hidden)
        return z

    def forward(self, X):
        Z = self.encode(X)
        A_pred = dot_product_decode(Z)
        return A_pred, Z


os.environ['CUDA_VISIBLE_DEVICES'] = ""  # run on CPU

# Load the precomputed adjacency/feature pickle and the node attribute file.
# adj, features = load_data_popets(args.dataset)
ego_user = args.dataset
dt = args.dataset
feat_dir = './data/' + str(ego_user) + '-adj-feat.pkl'
with open(feat_dir, 'rb') as f2:
    adj, ft = pkl.load(f2, encoding='latin1')
g = nx.Graph(adj)
features = sp.coo_matrix(ft).tolil()

with open('./data/' + str(ego_user) + '-target.txt') as tfile:
    Lines = tfile.readlines()
target = []
for line in Lines:
    arr = line.strip().split(',')
    target.append(int(arr[1]))

for n in range(g.number_of_nodes()):
    g.nodes[n]['gender'] = target[n]

# np.random.seed(sed)  # make sure train-test split is consistent between notebooks

adj_sparse = nx.to_scipy_sparse_matrix(g)
adj = nx.adjacency_matrix(g)

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)

# res_dir0 = '/Wang-ds/xwang193/deepwalk-master/%s/' % (dt)
# f2 = open('%s/%s-train_test_split' % (res_dir0, dt), 'rb')
# train_test_split = pkl.load(f2, encoding='latin1')
# # Unpack train-test split
# adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = train_test_split

adj = adj_train

# Some preprocessing
adj_norm = preprocess_graph(adj)
num_nodes = adj.shape[0]

features = sparse_to_tuple(features.tocoo())
num_features = features[2][1]  # should match args.input_dim
features_nonzero = features[1].shape[0]

# Create model: weight positive entries to counter the sparsity of the adjacency matrix
pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

adj_label = adj_train + sp.eye(adj_train.shape[0])
adj_label = sparse_to_tuple(adj_label)

adj_norm = torch.sparse.FloatTensor(torch.LongTensor(adj_norm[0].T),
                                    torch.FloatTensor(adj_norm[1]),
                                    torch.Size(adj_norm[2]))
adj_label = torch.sparse.FloatTensor(torch.LongTensor(adj_label[0].T),
                                     torch.FloatTensor(adj_label[1]),
                                     torch.Size(adj_label[2]))
features = torch.sparse.FloatTensor(torch.LongTensor(features[0].T),
                                    torch.FloatTensor(features[1]),
                                    torch.Size(features[2]))

weight_mask = adj_label.to_dense().view(-1) == 1
weight_tensor = torch.ones(weight_mask.size(0))
weight_tensor[weight_mask] = pos_weight

# init model and optimizer
# model = getattr(model, "GAE")(adj_norm)
# model = VGAE(adj_norm)
model = GAE(adj_norm)
optimizer = Adam(model.parameters(), lr=args.learning_rate)


def get_scores(edges_pos, edges_neg, adj_rec):
    """ROC-AUC and average precision of the reconstructed adjacency on held-out edges."""

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Predict on the positive edges
    preds = []
    pos = []
    for e in edges_pos:
        preds.append(sigmoid(adj_rec[e[0], e[1]].item()))
        pos.append(adj_orig[e[0], e[1]])

    # Predict on the sampled negative edges
    preds_neg = []
    neg = []
    for e in edges_neg:
        preds_neg.append(sigmoid(adj_rec[e[0], e[1]].item()))
        neg.append(adj_orig[e[0], e[1]])

    preds_all = np.hstack([preds, preds_neg])
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score


def get_acc(adj_rec, adj_label):
    """Element-wise accuracy of the thresholded reconstruction against the label matrix."""
    labels_all = adj_label.to_dense().view(-1).long()
    preds_all = (adj_rec > 0.5).view(-1).long()
    accuracy = (preds_all == labels_all).sum().float() / labels_all.size(0)
    return accuracy


# Train model (NOTE: the epoch count is hard-coded here and does not use args.num_epoch)
for epoch in range(1500):
    t = time.time()
    A_pred, embed = model(features)
    optimizer.zero_grad()
    loss = log_lik = norm * F.binary_cross_entropy(A_pred.view(-1),
                                                   adj_label.to_dense().view(-1),
                                                   weight=weight_tensor)
    if args.model == 'VGAE':
        # Only applies when a variational model exposing `mean` and `logstd` is used.
        kl_divergence = 0.5 / A_pred.size(0) * (
            1 + 2 * model.logstd - model.mean ** 2 - torch.exp(model.logstd)).sum(1).mean()
        loss -= kl_divergence
    loss.backward()
    optimizer.step()

# Save the learned node embeddings
res_dir = '%s/' % (dt)
if not os.path.exists(res_dir):
    os.makedirs(res_dir)
emb_matrix = embed.detach().numpy()
savepath = res_dir + 'embeds-' + str(ego_user)
print('Saving embeddings to %s.npy' % savepath)
np.save(savepath, emb_matrix)
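
# --- Optional evaluation sketch (not part of the original script) ---
# A minimal, hedged example of how the helpers defined above could be used once
# training has finished: score the held-out edges with the reconstructed adjacency and
# build classifier-ready edge features from the saved embeddings. The variable names
# below (test_roc, test_ap, edge_feats, edge_labels) are illustrative choices only.
test_roc, test_ap = get_scores(test_edges, test_edges_false, A_pred.detach())
print('Test ROC-AUC: %.4f, AP: %.4f, reconstruction accuracy: %.4f'
      % (test_roc, test_ap, get_acc(A_pred, adj_label).item()))

# Edge features (cosine similarity, dot product, Euclidean distance) that a downstream
# link-prediction classifier, e.g. a logistic regression, could be trained on.
edge_feats = get_edge_embeddings(np.vstack([test_edges, np.array(test_edges_false)]), emb_matrix)
edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])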