# LMIA / node2vec / link_prediction_scores.py
# Link prediction scoring methods: heuristics, spectral clustering, node2vec, GAE.
from __future__ import division
import networkx as nx

import scipy.sparse as sp
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve
from sklearn.manifold import spectral_embedding
import node2vec
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
import time
import os
import tensorflow as tf
from gae.optimizer import OptimizerAE, OptimizerVAE
from gae.model import GCNModelAE, GCNModelVAE
from gae.preprocessing import preprocess_graph, construct_feed_dict, sparse_to_tuple, mask_test_edges
import pickle
from copy import deepcopy
from word2vec import data_reader, model, trainer
import random

from deepwalk_pytorch import word2vec
import pandas as pd


def sigmoid(x):
    """Numerically stable logistic sigmoid.

    Equivalent to 1 / (1 + exp(-x)), but always exponentiates a
    non-positive value so exp() never overflows (the naive form emits a
    RuntimeWarning and produces inf for large-magnitude negative x).
    """
    z = np.exp(-np.abs(x))  # in (0, 1], cannot overflow
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))


# Input: positive test/val edges, negative test/val edges, edge score matrix
# Output: ROC AUC score, AP score
def get_roc_score(edges_pos, edges_neg, score_matrix, apply_sigmoid=False):
    """Score predicted edges against ground truth.

    Args:
        edges_pos: iterable of (u, v) node-index pairs that are true edges.
        edges_neg: iterable of (u, v) node-index pairs that are non-edges.
        score_matrix: 2-D array-like; entry [u, v] is the predicted score
            for the pair (u, v).
        apply_sigmoid: if True, squash raw scores through a sigmoid first
            (useful when scores are unbounded dot products).

    Returns:
        (roc_auc, average_precision), or (None, None) when either edge
        list is empty.
    """
    # Edge case: metrics are undefined when only one class is present.
    # BUG FIX: this used to return (None, None, None), which broke every
    # caller that unpacks "roc, ap = get_roc_score(...)".
    if len(edges_pos) == 0 or len(edges_neg) == 0:
        return (None, None)

    def _edge_scores(edges):
        # Look up each pair's score; optionally map it into (0, 1).
        scores = [score_matrix[u, v] for u, v in edges]
        if apply_sigmoid:
            scores = [sigmoid(s) for s in scores]
        return scores

    preds_pos = _edge_scores(edges_pos)
    preds_neg = _edge_scores(edges_neg)

    # Stack predictions with labels: 1 = real edge, 0 = false edge
    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])

    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score


# Return a list of tuples (node1, node2) for networkx link prediction evaluation
def get_ebunch(train_test_split):
    """Collect the test-set positive and negative node pairs as (u, v) tuples."""
    _, _, _, _, _, test_edges, test_edges_false = train_test_split

    # Convert each row of the edge arrays into a plain tuple
    pos_pairs = [tuple(pair) for pair in test_edges.tolist()]
    neg_pairs = [tuple(pair) for pair in test_edges_false.tolist()]
    return pos_pairs + neg_pairs


# Input: NetworkX training graph, train_test_split (from mask_test_edges)
# Output: dictionary with ROC AUC, AP, Runtime
def adamic_adar_scores(g_train, train_test_split):
    """Link prediction with the Adamic-Adar index over the training graph."""
    # Adamic-Adar is only defined for undirected graphs
    if g_train.is_directed():
        g_train = g_train.to_undirected()

    adj_train = train_test_split[0]
    test_edges, test_edges_false = train_test_split[5], train_test_split[6]

    start_time = time.time()

    # Fill a dense symmetric score matrix for the evaluated node pairs
    aa_matrix = np.zeros(adj_train.shape)
    ebunch = get_ebunch(train_test_split)
    for u, v, p in nx.adamic_adar_index(g_train, ebunch=ebunch):
        # (u, v) = node indices, p = Adamic-Adar index
        aa_matrix[u][v] = p
        aa_matrix[v][u] = p  # keep the matrix symmetric
    aa_matrix = aa_matrix / aa_matrix.max()  # scale scores into [0, 1]

    runtime = time.time() - start_time
    aa_roc, aa_ap = get_roc_score(test_edges, test_edges_false, aa_matrix)

    return {
        'test_roc': aa_roc,
        'test_ap': aa_ap,
        'runtime': runtime,
    }


# Input: NetworkX training graph, train_test_split (from mask_test_edges)
# Output: dictionary with ROC AUC, AP, Runtime
def jaccard_coefficient_scores(g_train, train_test_split):
    """Link prediction with the Jaccard coefficient over the training graph."""
    # Jaccard coefficient is only defined for undirected graphs
    if g_train.is_directed():
        g_train = g_train.to_undirected()

    adj_train = train_test_split[0]
    test_edges, test_edges_false = train_test_split[5], train_test_split[6]

    start_time = time.time()

    # Fill a dense symmetric score matrix for the evaluated node pairs
    jc_matrix = np.zeros(adj_train.shape)
    ebunch = get_ebunch(train_test_split)
    for u, v, p in nx.jaccard_coefficient(g_train, ebunch=ebunch):
        # (u, v) = node indices, p = Jaccard coefficient
        jc_matrix[u][v] = p
        jc_matrix[v][u] = p  # keep the matrix symmetric
    jc_matrix = jc_matrix / jc_matrix.max()  # scale scores into [0, 1]

    runtime = time.time() - start_time
    jc_roc, jc_ap = get_roc_score(test_edges, test_edges_false, jc_matrix)

    return {
        'test_roc': jc_roc,
        'test_ap': jc_ap,
        'runtime': runtime,
    }


# Input: NetworkX training graph, train_test_split (from mask_test_edges)
# Output: dictionary with ROC AUC, AP, Runtime
def preferential_attachment_scores(g_train, train_test_split):
    """Link prediction with the preferential attachment score over the training graph."""
    # Preferential attachment is only defined for undirected graphs
    if g_train.is_directed():
        g_train = g_train.to_undirected()

    adj_train = train_test_split[0]
    test_edges, test_edges_false = train_test_split[5], train_test_split[6]

    start_time = time.time()

    # Fill a dense symmetric score matrix for the evaluated node pairs
    pa_matrix = np.zeros(adj_train.shape)
    ebunch = get_ebunch(train_test_split)
    for u, v, p in nx.preferential_attachment(g_train, ebunch=ebunch):
        # (u, v) = node indices, p = preferential attachment score
        pa_matrix[u][v] = p
        pa_matrix[v][u] = p  # keep the matrix symmetric
    pa_matrix = pa_matrix / pa_matrix.max()  # scale scores into [0, 1]

    runtime = time.time() - start_time
    pa_roc, pa_ap = get_roc_score(test_edges, test_edges_false, pa_matrix)

    return {
        'test_roc': pa_roc,
        'test_ap': pa_ap,
        'runtime': runtime,
    }


# Input: train_test_split (from mask_test_edges)
# Output: dictionary with ROC AUC, AP, Runtime
def spectral_clustering_scores(train_test_split, random_state=0):
    """Link prediction from a 16-dimensional spectral embedding of the training adjacency."""
    adj_train = train_test_split[0]
    val_edges, val_edges_false = train_test_split[3], train_test_split[4]
    test_edges, test_edges_false = train_test_split[5], train_test_split[6]

    start_time = time.time()

    # Embed nodes spectrally, then score each pair by the dot product of
    # its endpoint embeddings (sigmoid applied inside get_roc_score).
    spectral_emb = spectral_embedding(adj_train, n_components=16, random_state=random_state)
    sc_score_matrix = np.dot(spectral_emb, spectral_emb.T)

    runtime = time.time() - start_time

    sc_test_roc, sc_test_ap = get_roc_score(test_edges, test_edges_false, sc_score_matrix, apply_sigmoid=True)
    sc_val_roc, sc_val_ap = get_roc_score(val_edges, val_edges_false, sc_score_matrix, apply_sigmoid=True)

    return {
        'test_roc': sc_test_roc,
        'test_ap': sc_test_ap,
        'val_roc': sc_val_roc,
        'val_ap': sc_val_ap,
        'runtime': runtime,
    }


# Input: NetworkX training graph, train_test_split (from mask_test_edges), n2v hyperparameters
# Output: (scores dict, val labels, val predictions, test labels, test predictions)
def node2vec_scores(
        g_train, train_test_split, DATASET, METHOD, F,
        P=1,  # Return hyperparameter
        Q=1,  # In-out hyperparameter
        WINDOW_SIZE=10,  # Context size for optimization
        NUM_WALKS=10,  # Number of walks per source
        WALK_LENGTH=80,  # Length of walk per source
        DIMENSIONS=256,  # Embedding dimension
        DIRECTED=False,  # Graph directed/undirected
        WORKERS=8,  # Num. parallel workers
        ITER=1,  # SGD epochs
        edge_score_mode="edge-emb",  # "edge-emb": bootstrapped edge embeddings + LogReg (node2vec paper)
        # "dot-product": dot product of node embeddings (GAE paper)
        verbose=1,
        Ego_user=0,  # ego-network user id; only used to name the output files
):
    """Run node2vec link prediction on the training graph.

    Simulates biased random walks, trains a skip-gram model on them, writes
    the walks and node embeddings to disk, then scores val/test edges either
    via logistic regression over hadamard edge embeddings ("edge-emb") or via
    embedding dot products ("dot-product").

    Returns:
        (n2v_scores dict, val_edge_labels, val_preds, test_edge_labels,
        test_preds). The label/prediction arrays are None when not computed
        (dot-product mode, or an empty validation set).
    """
    if g_train.is_directed():
        DIRECTED = True

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split  # Unpack train-test split

    start_time = time.time()

    # Preprocessing, generate walks
    if verbose >= 1:
        print('Preprocessing graph for node2vec...')
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)  # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))

    # Persist the raw walks for later reuse/inspection
    # NOTE(review): output directory is hard-coded to a Windows path — confirm.
    file_name = str(Ego_user)
    walks_path = 'E:\\python\\banlance\\code\\' + DATASET + '\\' + 'walks-' + F + '-' + file_name
    with open(walks_path, 'w') as walks_file:
        for walk in walks:
            walks_file.write(' '.join(str(node) for node in walk) + ' \n')

    # BUG FIX: map() returns a one-shot iterator in Python 3; Word2Vec
    # iterates the corpus multiple times (vocab build + training epochs),
    # so each walk must be materialized as a list of strings.
    walks = [list(map(str, walk)) for walk in walks]

    # Train skip-gram model
    model = Word2Vec(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1, workers=WORKERS, iter=ITER)

    # Store embeddings mapping
    emb_mappings = model.wv

    # Create node embeddings matrix (rows = nodes, columns = embedding features)
    emb_list = [emb_mappings[str(node_index)] for node_index in range(adj_train.shape[0])]
    emb_matrix = np.vstack(emb_list)

    # Persist node embeddings in word2vec text format: header, then "<node> <vector>"
    embeds_path = 'E:\\python\\banlance\\code\\' + DATASET + '\\' + 'embeds-' + F + '-' + file_name
    with open(embeds_path, 'w') as f:
        f.write('%d %d\n' % (adj_train.shape[0], DIMENSIONS))
        for i in range(adj_train.shape[0]):
            e = ' '.join(map(lambda x: str(x), emb_list[i]))
            f.write('%s %s\n' % (str(i), e))

    # BUG FIX: these were previously unbound in dot-product mode (and when the
    # validation set was empty), causing a NameError at the return statement.
    val_edge_labels = None
    val_preds = None
    test_edge_labels = None
    test_preds = None

    # Generate bootstrapped edge embeddings (as in the node2vec paper):
    # edge embedding for (v1, v2) = hadamard product of its node embeddings
    if edge_score_mode == "edge-emb":

        def get_edge_embeddings(edge_list):
            # Hadamard-product edge embeddings for a list of (u, v) pairs
            return np.array([np.multiply(emb_matrix[u], emb_matrix[v]) for u, v in edge_list])

        # Train-set edge embeddings + labels: 1 = real edge, 0 = false edge
        pos_train_edge_embs = get_edge_embeddings(train_edges)
        neg_train_edge_embs = get_edge_embeddings(train_edges_false)
        train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])
        train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

        # Val-set edge embeddings, labels
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs = get_edge_embeddings(val_edges)
            neg_val_edge_embs = get_edge_embeddings(val_edges_false)
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate([np.ones(len(val_edges)), np.zeros(len(val_edges_false))])

        # Test-set edge embeddings, labels
        pos_test_edge_embs = get_edge_embeddings(test_edges)
        neg_test_edge_embs = get_edge_embeddings(test_edges_false)
        test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if val_edge_labels is not None:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        runtime = time.time() - start_time

        # Calculate scores
        if val_preds is not None:
            n2v_val_roc = roc_auc_score(val_edge_labels, val_preds)
            n2v_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        n2v_test_roc = roc_auc_score(test_edge_labels, test_preds)
        n2v_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Generate edge scores using simple dot product of node embeddings (like in GAE paper)
    elif edge_score_mode == "dot-product":
        score_matrix = np.dot(emb_matrix, emb_matrix.T)
        runtime = time.time() - start_time

        # Val set scores
        if len(val_edges) > 0:
            n2v_val_roc, n2v_val_ap = get_roc_score(val_edges, val_edges_false, score_matrix, apply_sigmoid=True)
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        # Test set scores
        n2v_test_roc, n2v_test_ap = get_roc_score(test_edges, test_edges_false, score_matrix, apply_sigmoid=True)

    else:
        # BUG FIX: previously this only printed a warning and then crashed with
        # a NameError on the score variables below; fail fast instead.
        raise ValueError("Invalid edge_score_mode! Either use edge-emb or dot-product.")

    # Record scores
    n2v_scores = {
        'test_roc': n2v_test_roc,
        'test_ap': n2v_test_ap,
        'val_roc': n2v_val_roc,
        'val_ap': n2v_val_ap,
        'runtime': runtime,
    }

    return n2v_scores, val_edge_labels, val_preds, test_edge_labels, test_preds


# Input: original adj_sparse, train_test_split (from mask_test_edges), features matrix, n2v hyperparameters
# Output: dictionary with ROC AUC, ROC Curve, AP, Runtime
def gae_scores(
        adj_sparse,
        train_test_split,
        features_matrix=None,
        LEARNING_RATE=0.01,
        EPOCHS=200,
        HIDDEN1_DIM=32,
        HIDDEN2_DIM=16,
        DROPOUT=0,
        edge_score_mode="dot-product",
        verbose=1,
        dtype=tf.float32
):
    """Train a (variational) graph autoencoder and score link prediction.

    Args:
        adj_sparse: scipy sparse adjacency matrix of the full original graph.
        train_test_split: 7-tuple from mask_test_edges — (adj_train, train/val/
            test positive and negative edge arrays).
        features_matrix: optional node-features matrix; when None an identity
            matrix is used (featureless setting).
        LEARNING_RATE, EPOCHS, HIDDEN1_DIM, HIDDEN2_DIM, DROPOUT: training
            hyperparameters for the GAE.
        edge_score_mode: "dot-product" scores edges by embedding dot products;
            "edge-emb" trains logistic regression on hadamard edge embeddings.
        verbose: 0 silent, 1 progress messages, 2 per-epoch metrics.
        dtype: TensorFlow float type passed to the model.

    Returns:
        dict with keys 'test_roc', 'test_ap', 'val_roc', 'val_ap',
        'val_roc_per_epoch', 'runtime'.

    NOTE(review): despite the "gae" name this builds the *variational* model
    (GCNModelVAE / OptimizerVAE) — confirm intended.
    """
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split  # Unpack train-test split

    if verbose >= 1:
        print('GAE preprocessing...')

    start_time = time.time()

    # Train on CPU (hide GPU) due to memory constraints
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    # Convert features from normal matrix --> sparse matrix --> tuple
    # features_tuple contains: (list of matrix coordinates, list of values, matrix dimensions)
    if features_matrix is None:
        x = sp.lil_matrix(np.identity(adj_sparse.shape[0]))
    else:
        x = sp.lil_matrix(features_matrix)
    features_tuple = sparse_to_tuple(x)
    features_shape = features_tuple[2]

    # Get graph attributes (to feed into model)
    num_nodes = adj_sparse.shape[0]  # number of nodes in adjacency matrix
    num_features = features_shape[1]  # number of features (columns of features matrix)
    features_nonzero = features_tuple[1].shape[
        0]  # number of non-zero entries in features matrix (or length of values list)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = deepcopy(adj_sparse)
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    # Normalize adjacency matrix
    adj_norm = preprocess_graph(adj_train)

    # Add in diagonals (self-loops) for the reconstruction target
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Define placeholders
    # NOTE(review): placeholders are fixed to tf.float16 even though the
    # dtype parameter defaults to tf.float32 — confirm this mismatch is intended.
    placeholders = {  # TODO: try making these dense from the get-go
        'features': tf.sparse_placeholder(tf.float16),
        'adj': tf.sparse_placeholder(tf.float16),
        'adj_orig': tf.sparse_placeholder(tf.float16),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # How much to weigh positive examples (true edges) in cost print_function
    # Want to weigh less-frequent classes higher, so as to prevent model output bias
    # pos_weight = (num. negative samples / (num. positive samples)
    pos_weight = float(adj_sparse.shape[0] * adj_sparse.shape[0] - adj_sparse.sum()) / adj_sparse.sum()

    # normalize (scale) average weighted cost
    norm = adj_sparse.shape[0] * adj_sparse.shape[0] / float(
        (adj_sparse.shape[0] * adj_sparse.shape[0] - adj_sparse.sum()) * 2)

    if verbose >= 1:
        print('Initializing GAE model...')

    # Create VAE model
    model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero,
                        HIDDEN1_DIM, HIDDEN2_DIM, dtype=dtype, flatten_output=False)

    # NOTE(review): optimizer dtype is hard-coded to tf.float16 regardless of
    # the function's dtype parameter — confirm intended.
    opt = OptimizerVAE(preds=model.reconstructions,
                       labels=tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False),
                       # labels=placeholders['adj_orig'],
                       model=model, num_nodes=num_nodes,
                       pos_weight=pos_weight,
                       norm=norm,
                       learning_rate=LEARNING_RATE,
                       dtype=tf.float16)

    cost_val = []
    acc_val = []
    val_roc_score = []  # per-epoch validation ROC, returned as 'val_roc_per_epoch'

    prev_embs = []  # embedding snapshot per epoch (kept for debugging)

    # Initialize session
    sess = tf.Session()

    if verbose >= 1:
        # Print total # trainable variables
        total_parameters = 0
        for variable in tf.trainable_variables():
            # shape is an array of tf.Dimension
            shape = variable.get_shape()
            print("Variable shape: ", shape)
            variable_parameters = 1
            for dim in shape:
                print("Current dimension: ", dim)
                variable_parameters *= dim.value
            print("Variable params: ", variable_parameters)
            total_parameters += variable_parameters
            print('')
        print("TOTAL TRAINABLE PARAMS: ", total_parameters)

        print('Initializing TF variables...')

    sess.run(tf.global_variables_initializer())

    if verbose >= 1:
        print('Starting GAE training!')

    # Train model
    for epoch in range(EPOCHS):

        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features_tuple, placeholders)
        feed_dict.update({placeholders['dropout']: DROPOUT})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        # Evaluate predictions: re-run the encoder with dropout disabled to get
        # deterministic latent embeddings for this epoch
        feed_dict.update({placeholders['dropout']: 0})
        gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)

        prev_embs.append(gae_emb)

        # Pairwise dot products of embeddings = epoch's edge score matrix
        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        # # TODO: remove this (debugging)
        # if not np.isfinite(gae_score_matrix).all():
        #     print 'Found non-finite value in GAE score matrix! Epoch: {}'.format(epoch)
        #     with open('numpy-nan-debugging.pkl', 'wb') as f:
        #         dump_info = {}
        #         dump_info['gae_emb'] = gae_emb
        #         dump_info['epoch'] = epoch
        #         dump_info['gae_score_matrix'] = gae_score_matrix
        #         dump_info['adj_norm'] = adj_norm
        #         dump_info['adj_label'] = adj_label
        #         dump_info['features_tuple'] = features_tuple
        #         # dump_info['feed_dict'] = feed_dict
        #         dump_info['prev_embs'] = prev_embs
        #         pickle.dump(dump_info, f, protocol=2)
        # # END TODO


        roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false, gae_score_matrix, apply_sigmoid=True)
        val_roc_score.append(roc_curr)

        # Print results for this epoch
        if verbose == 2:
            print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
                  "train_acc=", "{:.5f}".format(avg_accuracy), "val_roc=", "{:.5f}".format(val_roc_score[-1]),
                  "val_ap=", "{:.5f}".format(ap_curr),
                  "time=", "{:.5f}".format(time.time() - t))

    if verbose == 2:
        print("Optimization Finished!")

    # Final embeddings (dropout off) used for the reported scores
    feed_dict.update({placeholders['dropout']: 0})
    gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)

    # Dot product edge scores (default)
    if edge_score_mode == "dot-product":
        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        runtime = time.time() - start_time

        # Calculate final scores
        gae_val_roc, gae_val_ap = get_roc_score(val_edges, val_edges_false, gae_score_matrix)
        gae_test_roc, gae_test_ap = get_roc_score(test_edges, test_edges_false, gae_score_matrix)

    # Take bootstrapped edge embeddings (via hadamard product)
    elif edge_score_mode == "edge-emb":
        def get_edge_embeddings(edge_list):
            # Edge embedding for (v1, v2) = hadamard product of node embeddings
            embs = []
            for edge in edge_list:
                node1 = edge[0]
                node2 = edge[1]
                emb1 = gae_emb[node1]
                emb2 = gae_emb[node2]
                edge_emb = np.multiply(emb1, emb2)
                embs.append(edge_emb)
            embs = np.array(embs)
            return embs

        # Train-set edge embeddings
        pos_train_edge_embs = get_edge_embeddings(train_edges)
        neg_train_edge_embs = get_edge_embeddings(train_edges_false)
        train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])

        # Create train-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

        # Val-set edge embeddings, labels
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs = get_edge_embeddings(val_edges)
            neg_val_edge_embs = get_edge_embeddings(val_edges_false)
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate([np.ones(len(val_edges)), np.zeros(len(val_edges_false))])

        # Test-set edge embeddings, labels
        pos_test_edge_embs = get_edge_embeddings(test_edges)
        neg_test_edge_embs = get_edge_embeddings(test_edges_false)
        test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])

        # Create val-set edge labels: 1 = real edge, 0 = false edge
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        runtime = time.time() - start_time

        # Calculate scores
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            gae_val_roc = roc_auc_score(val_edge_labels, val_preds)
            # gae_val_roc_curve = roc_curve(val_edge_labels, val_preds)
            gae_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            gae_val_roc = None
            gae_val_roc_curve = None
            gae_val_ap = None

        gae_test_roc = roc_auc_score(test_edge_labels, test_preds)
        # gae_test_roc_curve = roc_curve(test_edge_labels, test_preds)
        gae_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Record scores
    # NOTE(review): if edge_score_mode is neither "dot-product" nor "edge-emb",
    # the variables below are unbound and this raises NameError — confirm
    # callers only ever pass the two supported modes.
    gae_scores = {}

    gae_scores['test_roc'] = gae_test_roc
    # gae_scores['test_roc_curve'] = gae_test_roc_curve
    gae_scores['test_ap'] = gae_test_ap

    gae_scores['val_roc'] = gae_val_roc
    # gae_scores['val_roc_curve'] = gae_val_roc_curve
    gae_scores['val_ap'] = gae_val_ap

    gae_scores['val_roc_per_epoch'] = val_roc_score
    gae_scores['runtime'] = runtime
    return gae_scores


# Input: adjacency matrix (in sparse format), features_matrix (normal format), test_frac, val_frac, verbose
# Verbose: 0 - print nothing, 1 - print scores, 2 - print scores + GAE training progress
# Returns: Dictionary of results (ROC AUC, ROC Curve, AP, Runtime) for each link prediction method
def calculate_all_scores(adj_sparse, features_matrix=None, directed=False, \
                         test_frac=.3, val_frac=.1, random_state=0, verbose=1, \
                         train_test_split_file=None,
                         tf_dtype=tf.float32):
    """Run every link prediction method on one graph and collect their scores.

    Generates (or loads) a train/val/test edge split, then evaluates the
    heuristic baselines (Adamic-Adar, Jaccard, preferential attachment),
    spectral clustering, and node2vec; the GAE section is currently
    commented out.

    Returns:
        dict mapping method key ('aa', 'jc', 'pa', 'sc', 'n2v_edge_emb',
        'n2v_dot_prod') to that method's scores object.
    """
    np.random.seed(random_state)  # Guarantee consistent train/test split
    tf.set_random_seed(random_state)  # Consistent GAE training

    # Prepare LP scores dictionary
    lp_scores = {}

    ### ---------- PREPROCESSING ---------- ###
    train_test_split = None
    # NOTE(review): the bare `except:` below deliberately falls back to an
    # on-the-fly split when train_test_split_file is None/missing, but it also
    # swallows every other error (corrupt pickle, keyboard interrupt) — a
    # narrower `except (TypeError, OSError, pickle.UnpicklingError)` would be safer.
    try:  # If found existing train-test split, use that file
        with open(train_test_split_file, 'rb') as f:
            train_test_split = pickle.load(f)
            print('Found existing train-test split!')
    except:  # Else, generate train-test split on the fly
        print('Generating train-test split...')
        if directed == False:
            train_test_split = mask_test_edges(adj_sparse, test_frac=test_frac, val_frac=val_frac)
        else:
            # NOTE(review): mask_test_edges_directed is neither imported nor
            # defined in this module — this branch raises NameError. Confirm
            # the missing import.
            train_test_split = mask_test_edges_directed(adj_sparse, test_frac=test_frac, val_frac=val_frac)

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split  # Unpack tuple

    # g_train: new graph object with only non-hidden edges
    if directed == True:
        g_train = nx.DiGraph(adj_train)
    else:
        g_train = nx.Graph(adj_train)

    # Inspect train/test split
    if verbose >= 1:
        print("Total nodes:", adj_sparse.shape[0])
        print("Total edges:", int(adj_sparse.nnz / 2))  # adj is symmetric, so nnz (num non-zero) = 2*num_edges
        print("Training edges (positive):", len(train_edges))
        print("Training edges (negative):", len(train_edges_false))
        print("Validation edges (positive):", len(val_edges))
        print("Validation edges (negative):", len(val_edges_false))
        print("Test edges (positive):", len(test_edges))
        print("Test edges (negative):", len(test_edges_false))
        print('')
        print("------------------------------------------------------")

    ### ---------- LINK PREDICTION BASELINES ---------- ###
    # Adamic-Adar
    aa_scores = adamic_adar_scores(g_train, train_test_split)
    lp_scores['aa'] = aa_scores
    if verbose >= 1:
        print('')
        print('Adamic-Adar Test ROC score: ', str(aa_scores['test_roc']))
        print('Adamic-Adar Test AP score: ', str(aa_scores['test_ap']))

    # Jaccard Coefficient
    jc_scores = jaccard_coefficient_scores(g_train, train_test_split)
    lp_scores['jc'] = jc_scores
    if verbose >= 1:
        print('')
        print('Jaccard Coefficient Test ROC score: ', str(jc_scores['test_roc']))
        print('Jaccard Coefficient Test AP score: ', str(jc_scores['test_ap']))

    # Preferential Attachment
    pa_scores = preferential_attachment_scores(g_train, train_test_split)
    lp_scores['pa'] = pa_scores
    if verbose >= 1:
        print('')
        print('Preferential Attachment Test ROC score: ', str(pa_scores['test_roc']))
        print('Preferential Attachment Test AP score: ', str(pa_scores['test_ap']))

    ### ---------- SPECTRAL CLUSTERING ---------- ###
    sc_scores = spectral_clustering_scores(train_test_split)
    lp_scores['sc'] = sc_scores
    if verbose >= 1:
        print('')
        print('Spectral Clustering Validation ROC score: ', str(sc_scores['val_roc']))
        print('Spectral Clustering Validation AP score: ', str(sc_scores['val_ap']))
        print('Spectral Clustering Test ROC score: ', str(sc_scores['test_roc']))
        print('Spectral Clustering Test AP score: ', str(sc_scores['test_ap']))

    ## ---------- NODE2VEC ---------- ###
    # node2vec settings
    # NOTE: When p = q = 1, this is equivalent to DeepWalk
    P = 1  # Return hyperparameter
    Q = 1  # In-out hyperparameter
    WINDOW_SIZE = 10  # Context size for optimization
    NUM_WALKS = 10  # Number of walks per source
    WALK_LENGTH = 80  # Length of walk per source
    DIMENSIONS = 128  # Embedding dimension
    DIRECTED = False  # Graph directed/undirected
    WORKERS = 8  # Num. parallel workers
    ITER = 1  # SGD epochs

    # Using bootstrapped edge embeddings + logistic regression
    # NOTE(review): node2vec_scores now takes (g_train, train_test_split,
    # DATASET, METHOD, F, P=..., ...) and returns a 5-tuple; this positional
    # call shifts P/Q/WINDOW_SIZE into DATASET/METHOD/F, and the dict-style
    # indexing below would fail on the tuple result — call site needs updating.
    n2v_edge_emb_scores = node2vec_scores(g_train, train_test_split,
                                          P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS, DIRECTED, WORKERS,
                                          ITER,
                                          "edge-emb",
                                          verbose)
    lp_scores['n2v_edge_emb'] = n2v_edge_emb_scores

    if verbose >= 1:
        print('')
        print('node2vec (Edge Embeddings) Validation ROC score: ', str(n2v_edge_emb_scores['val_roc']))
        print('node2vec (Edge Embeddings) Validation AP score: ', str(n2v_edge_emb_scores['val_ap']))
        print('node2vec (Edge Embeddings) Test ROC score: ', str(n2v_edge_emb_scores['test_roc']))
        print('node2vec (Edge Embeddings) Test AP score: ', str(n2v_edge_emb_scores['test_ap']))

    # Using dot products to calculate edge scores
    # NOTE(review): same stale positional call as above — see note there.
    n2v_dot_prod_scores = node2vec_scores(g_train, train_test_split,
                                          P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS, DIRECTED, WORKERS,
                                          ITER,
                                          "dot-product",
                                          verbose)
    lp_scores['n2v_dot_prod'] = n2v_dot_prod_scores

    if verbose >= 1:
        print('')
        print('node2vec (Dot Product) Validation ROC score: ', str(n2v_dot_prod_scores['val_roc']))
        print('node2vec (Dot Product) Validation AP score: ', str(n2v_dot_prod_scores['val_ap']))
        print('node2vec (Dot Product) Test ROC score: ', str(n2v_dot_prod_scores['test_roc']))
        print('node2vec (Dot Product) Test AP score: ', str(n2v_dot_prod_scores['test_ap']))

    ### ---------- (VARIATIONAL) GRAPH AUTOENCODER ---------- ###
    # # GAE hyperparameters
    # LEARNING_RATE = 0.001 # Default: 0.01
    # EPOCHS = 200
    # HIDDEN1_DIM = 32
    # HIDDEN2_DIM = 16
    # DROPOUT = 0

    # # Use dot product
    # tf.set_random_seed(random_state) # Consistent GAE training
    # gae_results = gae_scores(adj_sparse, train_test_split, features_matrix,
    #     LEARNING_RATE, EPOCHS, HIDDEN1_DIM, HIDDEN2_DIM, DROPOUT,
    #     "dot-product",
    #     verbose,
    #     dtype=tf.float16)
    # lp_scores['gae'] = gae_results

    # if verbose >= 1:
    #     print ''
    #     print 'GAE (Dot Product) Validation ROC score: ', str(gae_results['val_roc'])
    #     print 'GAE (Dot Product) Validation AP score: ', str(gae_results['val_ap'])
    #     print 'GAE (Dot Product) Test ROC score: ', str(gae_results['test_roc'])
    #     print 'GAE (Dot Product) Test AP score: ', str(gae_results['test_ap'])


    # # Use edge embeddings
    # tf.set_random_seed(random_state) # Consistent GAE training
    # gae_edge_emb_results = gae_scores(adj_sparse, train_test_split, features_matrix,
    #     LEARNING_RATE, EPOCHS, HIDDEN1_DIM, HIDDEN2_DIM, DROPOUT,
    #     "edge-emb",
    #     verbose)
    # lp_scores['gae_edge_emb'] = gae_edge_emb_results

    # if verbose >= 1:
    #     print ''
    #     print 'GAE (Edge Embeddings) Validation ROC score: ', str(gae_edge_emb_results['val_roc'])
    #     print 'GAE (Edge Embeddings) Validation AP score: ', str(gae_edge_emb_results['val_ap'])
    #     print 'GAE (Edge Embeddings) Test ROC score: ', str(gae_edge_emb_results['test_roc'])
    #     print 'GAE (Edge Embeddings) Test AP score: ', str(gae_edge_emb_results['test_ap'])


    ### ---------- RETURN RESULTS ---------- ###
    return lp_scores

def node2vec_scores1(
        g_train, train_test_split, DATASET, METHOD, F,
        P=1,  # Return hyperparameter
        Q=1,  # In-out hyperparameter
        WINDOW_SIZE=10,  # Context size for optimization
        NUM_WALKS=10,  # Number of walks per source
        WALK_LENGTH=80,  # Length of walk per source
        DIMENSIONS=256,  # Embedding dimension
        DIRECTED=False,  # Graph directed/undirected
        WORKERS=8,  # Num. parallel workers
        ITER=1,  # SGD epochs
        edge_score_mode="edge-emb",  # "edge-emb": hadamard edge embeddings + LogReg (node2vec paper);
        # "dot-product": dot product of node embeddings (GAE paper)
        verbose=1,
        Ego_user=0,
):
    """Train gensim node2vec embeddings on g_train and score link prediction.

    Random walks are simulated on ``g_train``, dumped to disk, and fed to
    gensim's skip-gram ``Word2Vec``.  Edges are then scored either with a
    logistic regression over hadamard edge embeddings ("edge-emb") or with a
    plain dot product of node embeddings ("dot-product").

    Returns a 13-tuple: (scores dict, train labels, test labels, test
    predictions, node embedding matrix, train/test cosine-similarity vectors,
    train/test edge embeddings, train/test endpoint embeddings).  Entries
    produced only in "edge-emb" mode are ``None`` in "dot-product" mode.

    Raises ``ValueError`` for an unknown ``edge_score_mode``.
    """
    if g_train.is_directed():
        DIRECTED = True

    # Unpack train-test split
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split

    start_time = time.time()

    # Preprocessing, generate walks
    if verbose >= 1:
        print('Preprocessing grpah for node2vec...')
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)  # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))

    # Persist the raw walks for later inspection / reuse.
    file_name = str(Ego_user)
    with open('E:\\python\\banlance\\code\\' + DATASET + '\\' + 'walks-' + F + '-' + file_name, 'w') as walk_file:
        for walk in walks:
            # one walk per line; nodes space-separated (trailing space kept
            # for compatibility with the original file format)
            walk_file.write(' '.join(str(node) for node in walk) + ' \n')

    # BUGFIX: materialize each walk with list().  On Python 3, map() returns a
    # one-shot iterator, and gensim iterates the corpus several times
    # (vocabulary build + training passes), so map objects would be exhausted
    # after the first pass and training would see empty sentences.
    walks = [list(map(str, walk)) for walk in walks]

    # Train skip-gram model
    model = Word2Vec(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1, workers=WORKERS, iter=ITER)

    # Store embeddings mapping (token -> vector)
    emb_mappings = model.wv

    # Create node embeddings matrix (rows = nodes, columns = embedding features)
    emb_list = [emb_mappings[str(node_index)] for node_index in range(adj_train.shape[0])]
    emb_matrix = np.vstack(emb_list)

    # Persist the node embeddings in word2vec text format.
    with open('E:\\python\\banlance\\code\\' + DATASET + '\\' + 'embeds-' + F + '-' + file_name, 'w') as f:
        f.write('%d %d\n' % (adj_train.shape[0], DIMENSIONS))
        for i in range(adj_train.shape[0]):
            e = ' '.join(map(str, emb_list[i]))
            f.write('%s %s\n' % (str(i), e))

    # BUGFIX: outputs that exist only in "edge-emb" mode are pre-initialized
    # so the final return does not raise NameError in "dot-product" mode.
    train_edge_labels = test_edge_labels = test_preds = None
    train_sim_matrix = test_sim_matrix = None
    train_edge_embs = test_edge_embs = None
    train_embs_1 = train_embs_2 = test_embs_1 = test_embs_2 = None

    if edge_score_mode == "edge-emb":
        # Edge embedding for (v1, v2) = hadamard product of node embeddings
        # (bootstrapped edge embeddings, as in the node2vec paper).

        def get_edge_embeddings(edge_list):
            """Return (hadamard edge embs, cosine sims, node1 embs, node2 embs) as arrays."""
            embs, sims, embs_1, embs_2 = [], [], [], []
            for edge in edge_list:
                emb1 = emb_matrix[edge[0]]
                emb2 = emb_matrix[edge[1]]
                embs.append(np.multiply(emb1, emb2))
                sims.append(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
                embs_1.append(emb1)
                embs_2.append(emb2)
            return np.array(embs), np.array(sims), np.array(embs_1), np.array(embs_2)

        # Train-set edge embeddings
        pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(train_edges)
        neg_train_edge_embs, neg_train_sim_matrix, neg_embs_1_train, neg_embs_2_train = get_edge_embeddings(train_edges_false)
        train_edge_embs = np.concatenate((pos_train_edge_embs, neg_train_edge_embs), axis=0)
        train_sim_matrix = np.concatenate((pos_train_sim_matrix, neg_train_sim_matrix), axis=0)
        train_embs_1 = np.concatenate((pos_embs_1_train, neg_embs_1_train), axis=0)
        train_embs_2 = np.concatenate((pos_embs_2_train, neg_embs_2_train), axis=0)

        # Train-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.concatenate((np.ones(len(train_edges)), np.zeros(len(train_edges_false))), axis=0)

        # Val-set edge embeddings, labels (validation split may be empty)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs, pos_val_sim_matrix, pos_embs_1_val, pos_embs_2_val = get_edge_embeddings(val_edges)
            neg_val_edge_embs, neg_val_sim_matrix, neg_embs_1_val, neg_embs_2_val = get_edge_embeddings(val_edges_false)
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate((np.ones(len(val_edges)), np.zeros(len(val_edges_false))), axis=0)

        # Test-set edge embeddings, labels
        pos_test_edge_embs, pos_test_sim_matrix, pos_embs_1_test, pos_embs_2_test = get_edge_embeddings(test_edges)
        neg_test_edge_embs, neg_test_sim_matrix, neg_embs_1_test, neg_embs_2_test = get_edge_embeddings(test_edges_false)
        test_edge_embs = np.concatenate((pos_test_edge_embs, neg_test_edge_embs), axis=0)
        test_sim_matrix = np.concatenate((pos_test_sim_matrix, neg_test_sim_matrix), axis=0)
        test_embs_1 = np.concatenate((pos_embs_1_test, neg_embs_1_test), axis=0)
        test_embs_2 = np.concatenate((pos_embs_2_test, neg_embs_2_test), axis=0)
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        runtime = time.time() - start_time

        # Calculate scores
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            n2v_val_roc = roc_auc_score(val_edge_labels, val_preds)
            n2v_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        n2v_test_roc = roc_auc_score(test_edge_labels, test_preds)
        n2v_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Generate edge scores using simple dot product of node embeddings (like in GAE paper)
    elif edge_score_mode == "dot-product":
        score_matrix = np.dot(emb_matrix, emb_matrix.T)
        runtime = time.time() - start_time

        # BUGFIX: get_roc_score returns ROC AUC first and AP last (a 3-tuple
        # per its (None, None, None) edge case); unpacking into exactly two
        # names raised ValueError.  Index positionally instead.
        if len(val_edges) > 0:
            val_scores = get_roc_score(val_edges, val_edges_false, score_matrix, apply_sigmoid=True)
            n2v_val_roc, n2v_val_ap = val_scores[0], val_scores[-1]
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        # Test set scores
        test_scores = get_roc_score(test_edges, test_edges_false, score_matrix, apply_sigmoid=True)
        n2v_test_roc, n2v_test_ap = test_scores[0], test_scores[-1]

    else:
        # BUGFIX: fail loudly; previously this only printed a message and then
        # crashed with NameError on the undefined score variables below.
        raise ValueError("Invalid edge_score_mode! Either use edge-emb or dot-product.")

    # Record scores
    n2v_scores = {
        'test_roc': n2v_test_roc,
        'test_ap': n2v_test_ap,
        'val_roc': n2v_val_roc,
        'val_ap': n2v_val_ap,
        'runtime': runtime,
    }

    return n2v_scores, train_edge_labels, test_edge_labels, test_preds, emb_matrix, train_sim_matrix, test_sim_matrix, train_edge_embs, test_edge_embs, train_embs_1, train_embs_2, test_embs_1, test_embs_2


def node2vec_scores2(
        g_train, train_test_split, DATASET, METHOD, F,
        P=1,  # Return hyperparameter
        Q=1,  # In-out hyperparameter
        WINDOW_SIZE=10,  # Context size for optimization
        NUM_WALKS=10,  # Number of walks per source
        WALK_LENGTH=80,  # Length of walk per source
        DIMENSIONS=256,  # Embedding dimension
        DIRECTED=False,  # Graph directed/undirected
        WORKERS=8,  # Num. parallel workers
        ITER=1,  # SGD epochs
        edge_score_mode="edge-emb",  # "edge-emb": hadamard edge embeddings + LogReg (node2vec paper);
        # "dot-product": dot product of node embeddings (GAE paper)
        verbose=1,
        Ego_user=0,
):
    """Train gensim node2vec embeddings on g_train and score link prediction.

    Same pipeline as ``node2vec_scores1``: simulate walks, dump them to disk,
    train gensim's skip-gram ``Word2Vec``, then score edges either with a
    logistic regression over hadamard edge embeddings ("edge-emb") or a dot
    product of node embeddings ("dot-product").

    Returns a 13-tuple: (scores dict, train labels, test labels, test
    predictions, node embedding matrix, train/test cosine-similarity vectors,
    train/test edge embeddings, train/test endpoint embeddings).  Entries
    produced only in "edge-emb" mode are ``None`` in "dot-product" mode.

    Raises ``ValueError`` for an unknown ``edge_score_mode``.
    """
    if g_train.is_directed():
        DIRECTED = True

    # Unpack train-test split
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split

    start_time = time.time()

    # Preprocessing, generate walks
    if verbose >= 1:
        print('Preprocessing grpah for node2vec...')
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)  # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))

    # Persist the raw walks for later inspection / reuse.
    file_name = str(Ego_user)
    with open('E:\\python\\banlance\\code\\' + DATASET + '\\' + 'walks-' + F + '-' + file_name, 'w') as walk_file:
        for walk in walks:
            # one walk per line; nodes space-separated (trailing space kept
            # for compatibility with the original file format)
            walk_file.write(' '.join(str(node) for node in walk) + ' \n')

    # BUGFIX: materialize each walk with list().  On Python 3, map() returns a
    # one-shot iterator, and gensim iterates the corpus several times
    # (vocabulary build + training passes), so map objects would be exhausted
    # after the first pass and training would see empty sentences.
    walks = [list(map(str, walk)) for walk in walks]

    # Train skip-gram model
    model = Word2Vec(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1, workers=WORKERS, iter=ITER)

    # Store embeddings mapping (token -> vector)
    emb_mappings = model.wv

    # Create node embeddings matrix (rows = nodes, columns = embedding features)
    emb_list = [emb_mappings[str(node_index)] for node_index in range(adj_train.shape[0])]
    emb_matrix = np.vstack(emb_list)

    # Persist the node embeddings in word2vec text format.
    with open('E:\\python\\banlance\\code\\' + DATASET + '\\' + 'embeds-' + F + '-' + file_name, 'w') as f:
        f.write('%d %d\n' % (adj_train.shape[0], DIMENSIONS))
        for i in range(adj_train.shape[0]):
            e = ' '.join(map(str, emb_list[i]))
            f.write('%s %s\n' % (str(i), e))

    # BUGFIX: outputs that exist only in "edge-emb" mode are pre-initialized
    # so the final return does not raise NameError in "dot-product" mode.
    train_edge_labels = test_edge_labels = test_preds = None
    train_sim_matrix = test_sim_matrix = None
    train_edge_embs = test_edge_embs = None
    train_embs_1 = train_embs_2 = test_embs_1 = test_embs_2 = None

    if edge_score_mode == "edge-emb":
        # Edge embedding for (v1, v2) = hadamard product of node embeddings
        # (bootstrapped edge embeddings, as in the node2vec paper).

        def get_edge_embeddings(edge_list):
            """Return (hadamard edge embs, cosine sims, node1 embs, node2 embs) as arrays."""
            embs, sims, embs_1, embs_2 = [], [], [], []
            for edge in edge_list:
                emb1 = emb_matrix[edge[0]]
                emb2 = emb_matrix[edge[1]]
                embs.append(np.multiply(emb1, emb2))
                sims.append(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
                embs_1.append(emb1)
                embs_2.append(emb2)
            return np.array(embs), np.array(sims), np.array(embs_1), np.array(embs_2)

        # Train-set edge embeddings
        pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(train_edges)
        neg_train_edge_embs, neg_train_sim_matrix, neg_embs_1_train, neg_embs_2_train = get_edge_embeddings(train_edges_false)
        train_edge_embs = np.concatenate((pos_train_edge_embs, neg_train_edge_embs), axis=0)
        train_sim_matrix = np.concatenate((pos_train_sim_matrix, neg_train_sim_matrix), axis=0)
        train_embs_1 = np.concatenate((pos_embs_1_train, neg_embs_1_train), axis=0)
        train_embs_2 = np.concatenate((pos_embs_2_train, neg_embs_2_train), axis=0)

        # Train-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.concatenate((np.ones(len(train_edges)), np.zeros(len(train_edges_false))), axis=0)

        # Val-set edge embeddings, labels (validation split may be empty)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs, pos_val_sim_matrix, pos_embs_1_val, pos_embs_2_val = get_edge_embeddings(val_edges)
            neg_val_edge_embs, neg_val_sim_matrix, neg_embs_1_val, neg_embs_2_val = get_edge_embeddings(val_edges_false)
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate((np.ones(len(val_edges)), np.zeros(len(val_edges_false))), axis=0)

        # Test-set edge embeddings, labels
        pos_test_edge_embs, pos_test_sim_matrix, pos_embs_1_test, pos_embs_2_test = get_edge_embeddings(test_edges)
        neg_test_edge_embs, neg_test_sim_matrix, neg_embs_1_test, neg_embs_2_test = get_edge_embeddings(test_edges_false)
        test_edge_embs = np.concatenate((pos_test_edge_embs, neg_test_edge_embs), axis=0)
        test_sim_matrix = np.concatenate((pos_test_sim_matrix, neg_test_sim_matrix), axis=0)
        test_embs_1 = np.concatenate((pos_embs_1_test, neg_embs_1_test), axis=0)
        test_embs_2 = np.concatenate((pos_embs_2_test, neg_embs_2_test), axis=0)
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        runtime = time.time() - start_time

        # Calculate scores
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            n2v_val_roc = roc_auc_score(val_edge_labels, val_preds)
            n2v_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        n2v_test_roc = roc_auc_score(test_edge_labels, test_preds)
        n2v_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Generate edge scores using simple dot product of node embeddings (like in GAE paper)
    elif edge_score_mode == "dot-product":
        score_matrix = np.dot(emb_matrix, emb_matrix.T)
        runtime = time.time() - start_time

        # BUGFIX: get_roc_score returns ROC AUC first and AP last (a 3-tuple
        # per its (None, None, None) edge case); unpacking into exactly two
        # names raised ValueError.  Index positionally instead.
        if len(val_edges) > 0:
            val_scores = get_roc_score(val_edges, val_edges_false, score_matrix, apply_sigmoid=True)
            n2v_val_roc, n2v_val_ap = val_scores[0], val_scores[-1]
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        # Test set scores
        test_scores = get_roc_score(test_edges, test_edges_false, score_matrix, apply_sigmoid=True)
        n2v_test_roc, n2v_test_ap = test_scores[0], test_scores[-1]

    else:
        # BUGFIX: fail loudly; previously this only printed a message and then
        # crashed with NameError on the undefined score variables below.
        raise ValueError("Invalid edge_score_mode! Either use edge-emb or dot-product.")

    # Record scores
    n2v_scores = {
        'test_roc': n2v_test_roc,
        'test_ap': n2v_test_ap,
        'val_roc': n2v_val_roc,
        'val_ap': n2v_val_ap,
        'runtime': runtime,
    }

    return n2v_scores, train_edge_labels, test_edge_labels, test_preds, emb_matrix, train_sim_matrix, test_sim_matrix, train_edge_embs, test_edge_embs, train_embs_1, train_embs_2, test_embs_1, test_embs_2


def node2vec_scores3(
        g_train, train_test_split, DATASET, METHOD, F, dp,
        P=1,  # Return hyperparameter
        Q=1,  # In-out hyperparameter
        WINDOW_SIZE=10,  # Context size for optimization
        NUM_WALKS=10,  # Number of walks per source
        WALK_LENGTH=80,  # Length of walk per source
        DIMENSIONS=256,  # Embedding dimension
        DIRECTED=False,  # Graph directed/undirected
        WORKERS=8,  # Num. parallel workers
        ITER=1,  # SGD epochs
        edge_score_mode="edge-emb",  # "edge-emb": hadamard edge embeddings + LogReg (node2vec paper);
        # "dot-product": dot product of node embeddings (GAE paper)
        verbose=1,
        Ego_user=0,
):
    """Train PyTorch-based node2vec embeddings and score link prediction.

    Like ``node2vec_scores1`` but the skip-gram model is the PyTorch
    ``trainer.Word2VecTrainer``, which reads the walk file from disk.
    ``dp`` selects the training mode: 0 = standard training, 1 =
    differentially-private training.

    Returns a 13-tuple: (scores dict, train labels, test labels, test
    predictions, node embedding matrix, train/test cosine-similarity vectors,
    train/test edge embeddings, train/test endpoint embeddings).  Entries
    produced only in "edge-emb" mode are ``None`` in "dot-product" mode.

    Raises ``ValueError`` for an unknown ``edge_score_mode`` or ``dp``.
    """
    if g_train.is_directed():
        DIRECTED = True

    # Unpack train-test split
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split

    start_time = time.time()

    # Preprocessing, generate walks
    if verbose >= 1:
        print('Preprocessing grpah for node2vec...')
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)  # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))

    # Persist the walks; the PyTorch trainer consumes this file (not the
    # in-memory walk list, so no string conversion of `walks` is needed).
    file_name = str(Ego_user)
    input_file = 'E:\\python\\banlance\\code\\' + DATASET + '\\' + 'walks-' + F + '-' + file_name
    with open(input_file, 'w') as walk_file:
        for walk in walks:
            # one walk per line; nodes space-separated (trailing space kept
            # for compatibility with the original file format)
            walk_file.write(' '.join(str(node) for node in walk) + ' \n')

    # Train the skip-gram model on the walk file.
    w2v = trainer.Word2VecTrainer(input_file, output_file="out.vec")
    if dp == 0:
        emb_mappings = w2v.train()  # standard training
    elif dp == 1:
        emb_mappings = w2v.train_dp()  # differentially-private training
    else:
        # BUGFIX: previously any other dp value fell through and crashed with
        # NameError on the undefined emb_mappings.
        raise ValueError("dp must be 0 (standard) or 1 (differentially private)")

    # Move embedding tensor to host memory as a numpy array.
    emb_mappings = emb_mappings.cpu().detach().numpy()

    # Create node embeddings matrix (rows = nodes, columns = embedding features)
    emb_list = [emb_mappings[node_index] for node_index in range(adj_train.shape[0])]
    emb_matrix = np.vstack(emb_list)

    # Persist the node embeddings in word2vec text format.
    with open('E:\\python\\banlance\\code\\' + DATASET + '\\' + 'embeds-' + F + '-' + file_name, 'w') as f:
        f.write('%d %d\n' % (adj_train.shape[0], DIMENSIONS))
        for i in range(adj_train.shape[0]):
            e = ' '.join(map(str, emb_list[i]))
            f.write('%s %s\n' % (str(i), e))

    # BUGFIX: outputs that exist only in "edge-emb" mode are pre-initialized
    # so the final return does not raise NameError in "dot-product" mode.
    train_edge_labels = test_edge_labels = test_preds = None
    train_sim_matrix = test_sim_matrix = None
    train_edge_embs = test_edge_embs = None
    train_embs_1 = train_embs_2 = test_embs_1 = test_embs_2 = None

    if edge_score_mode == "edge-emb":
        # Edge embedding for (v1, v2) = hadamard product of node embeddings
        # (bootstrapped edge embeddings, as in the node2vec paper).

        def get_edge_embeddings(edge_list):
            """Return (hadamard edge embs, cosine sims, node1 embs, node2 embs) as arrays."""
            embs, sims, embs_1, embs_2 = [], [], [], []
            for edge in edge_list:
                emb1 = emb_matrix[edge[0]]
                emb2 = emb_matrix[edge[1]]
                embs.append(np.multiply(emb1, emb2))
                sims.append(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
                embs_1.append(emb1)
                embs_2.append(emb2)
            return np.array(embs), np.array(sims), np.array(embs_1), np.array(embs_2)

        # Train-set edge embeddings
        pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(train_edges)
        neg_train_edge_embs, neg_train_sim_matrix, neg_embs_1_train, neg_embs_2_train = get_edge_embeddings(train_edges_false)
        train_edge_embs = np.concatenate((pos_train_edge_embs, neg_train_edge_embs), axis=0)
        train_sim_matrix = np.concatenate((pos_train_sim_matrix, neg_train_sim_matrix), axis=0)
        train_embs_1 = np.concatenate((pos_embs_1_train, neg_embs_1_train), axis=0)
        train_embs_2 = np.concatenate((pos_embs_2_train, neg_embs_2_train), axis=0)

        # Train-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.concatenate((np.ones(len(train_edges)), np.zeros(len(train_edges_false))), axis=0)

        # Val-set edge embeddings, labels (validation split may be empty)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs, pos_val_sim_matrix, pos_embs_1_val, pos_embs_2_val = get_edge_embeddings(val_edges)
            neg_val_edge_embs, neg_val_sim_matrix, neg_embs_1_val, neg_embs_2_val = get_edge_embeddings(val_edges_false)
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate((np.ones(len(val_edges)), np.zeros(len(val_edges_false))), axis=0)

        # Test-set edge embeddings, labels
        pos_test_edge_embs, pos_test_sim_matrix, pos_embs_1_test, pos_embs_2_test = get_edge_embeddings(test_edges)
        neg_test_edge_embs, neg_test_sim_matrix, neg_embs_1_test, neg_embs_2_test = get_edge_embeddings(test_edges_false)
        test_edge_embs = np.concatenate((pos_test_edge_embs, neg_test_edge_embs), axis=0)
        test_sim_matrix = np.concatenate((pos_test_sim_matrix, neg_test_sim_matrix), axis=0)
        test_embs_1 = np.concatenate((pos_embs_1_test, neg_embs_1_test), axis=0)
        test_embs_2 = np.concatenate((pos_embs_2_test, neg_embs_2_test), axis=0)
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        runtime = time.time() - start_time

        # Calculate scores
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            n2v_val_roc = roc_auc_score(val_edge_labels, val_preds)
            n2v_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        n2v_test_roc = roc_auc_score(test_edge_labels, test_preds)
        n2v_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Generate edge scores using simple dot product of node embeddings (like in GAE paper)
    elif edge_score_mode == "dot-product":
        score_matrix = np.dot(emb_matrix, emb_matrix.T)
        runtime = time.time() - start_time

        # BUGFIX: get_roc_score returns ROC AUC first and AP last (a 3-tuple
        # per its (None, None, None) edge case); unpacking into exactly two
        # names raised ValueError.  Index positionally instead.
        if len(val_edges) > 0:
            val_scores = get_roc_score(val_edges, val_edges_false, score_matrix, apply_sigmoid=True)
            n2v_val_roc, n2v_val_ap = val_scores[0], val_scores[-1]
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        # Test set scores
        test_scores = get_roc_score(test_edges, test_edges_false, score_matrix, apply_sigmoid=True)
        n2v_test_roc, n2v_test_ap = test_scores[0], test_scores[-1]

    else:
        # BUGFIX: fail loudly; previously this only printed a message and then
        # crashed with NameError on the undefined score variables below.
        raise ValueError("Invalid edge_score_mode! Either use edge-emb or dot-product.")

    # Record scores
    n2v_scores = {
        'test_roc': n2v_test_roc,
        'test_ap': n2v_test_ap,
        'val_roc': n2v_val_roc,
        'val_ap': n2v_val_ap,
        'runtime': runtime,
    }

    return n2v_scores, train_edge_labels, test_edge_labels, test_preds, emb_matrix, train_sim_matrix, test_sim_matrix, train_edge_embs, test_edge_embs, train_embs_1, train_embs_2, test_embs_1, test_embs_2


def node2vec_scores4(
        g_train, train_test_split, DATASET, METHOD, F, dp, res_dir, ego_user, sigma,
        P=1,  # Return hyperparameter
        Q=1,  # In-out hyperparameter
        WINDOW_SIZE=10,  # Context size for optimization (unused here; kept for signature parity)
        NUM_WALKS=10,  # Number of walks per source
        WALK_LENGTH=80,  # Length of walk per source
        DIMENSIONS=256,  # Embedding dimension
        DIRECTED=False,  # Graph directed/undirected
        WORKERS=8,  # Num. parallel workers (unused here; kept for signature parity)
        ITER=1,  # SGD epochs (unused here; kept for signature parity)
        edge_score_mode="edge-emb",  # unused here; kept for signature parity
        verbose=1,
        Ego_user=0,
):
    """Simulate node2vec walks, train the PyTorch skip-gram model, and return
    the node embedding matrix.

    Walks are simulated on ``g_train`` and written to disk; the PyTorch
    ``trainer.Word2VecTrainer`` then reads that file.  ``dp`` selects the
    training mode: 0 = standard ``train``, 1 = differentially-private
    ``train_dp`` with noise level ``sigma``.  The resulting embeddings are
    also persisted to disk in word2vec text format.

    Returns the (num_nodes x DIMENSIONS) numpy embedding matrix.
    Raises ``ValueError`` when ``dp`` is neither 0 nor 1.
    """
    if g_train.is_directed():
        DIRECTED = True

    # Unpack train-test split (only adj_train is used here, for node count).
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split

    # Preprocessing, generate walks
    if verbose >= 1:
        print('Preprocessing grpah for node2vec...')
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)  # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))

    # Persist the walks; the PyTorch trainer consumes this file (the dead
    # in-memory `map(str, walk)` conversion from the original was removed).
    file_name = str(Ego_user)
    input_file = 'E:\\python\\banlance\\code\\' + DATASET + '\\' + 'walks-' + F + '-' + file_name
    with open(input_file, 'w') as walk_file:
        for walk in walks:
            # one walk per line; nodes space-separated (trailing space kept
            # for compatibility with the original file format)
            walk_file.write(' '.join(str(node) for node in walk) + ' \n')

    # Train the skip-gram model on the walk file.
    w2v = trainer.Word2VecTrainer(input_file, output_file="out.vec")
    if dp == 0:
        emb_mappings = w2v.train(res_dir, DATASET, METHOD, F, ego_user)
    elif dp == 1:
        emb_mappings = w2v.train_dp(res_dir, DATASET, METHOD, F, ego_user, sigma)
    else:
        # BUGFIX: previously any other dp value fell through and crashed with
        # NameError on the undefined emb_mappings.
        raise ValueError("dp must be 0 (standard) or 1 (differentially private)")

    # Move embedding tensor to host memory as a numpy array.
    emb_mappings = emb_mappings.cpu().detach().numpy()

    # Create node embeddings matrix (rows = nodes, columns = embedding features)
    emb_list = [emb_mappings[node_index] for node_index in range(adj_train.shape[0])]
    emb_matrix = np.vstack(emb_list)

    # Persist the node embeddings in word2vec text format.
    with open('E:\\python\\banlance\\code\\' + DATASET + '\\' + 'embeds-' + F + '-' + file_name, 'w') as f:
        f.write('%d %d\n' % (adj_train.shape[0], DIMENSIONS))
        for i in range(adj_train.shape[0]):
            e = ' '.join(map(str, emb_list[i]))
            f.write('%s %s\n' % (str(i), e))

    return emb_matrix


            # Generate bootstrapped edge embeddings (as is done in node2vec paper)
            # Edge embedding for (v1, v2) = hadamard product of node embeddings for v1, v2



def linkpre_scores4(emb_matrix, train_edges_pos,train_edges_neg,test_edges, other_edge):
    """Score link prediction with a logistic-regression classifier.

    Edge features are the Hadamard (element-wise) product of the two endpoint
    embeddings, as in the node2vec paper.  `train_edges_pos`/`train_edges_neg`
    train the classifier; `test_edges` (label 1) and `other_edge` (label 0)
    form the evaluation set.

    Returns a 12-tuple: (scores dict, train labels, test labels, test
    probabilities, train cosine sims, test cosine sims, train edge features,
    test edge features, train endpoint-1 embs, train endpoint-2 embs,
    test endpoint-1 embs, test endpoint-2 embs).
    """
    t0 = time.time()

    def _featurize(pairs):
        # Per edge: Hadamard feature, cosine similarity, and both endpoint embeddings.
        feats, sims, lefts, rights = [], [], [], []
        for pair in pairs:
            eu = emb_matrix[pair[0]]
            ev = emb_matrix[pair[1]]
            feats.append(np.multiply(eu, ev))
            sims.append(np.dot(eu, ev) / (np.linalg.norm(eu) * np.linalg.norm(ev)))
            lefts.append(eu)
            rights.append(ev)
        return np.array(feats), np.array(sims), np.array(lefts), np.array(rights)

    # Train-set features: positives stacked on top of negatives.
    pos_feats, pos_sims, pos_l, pos_r = _featurize(train_edges_pos)
    neg_feats, neg_sims, neg_l, neg_r = _featurize(train_edges_neg)
    train_edge_embs = np.concatenate((pos_feats, neg_feats), axis=0)
    train_sim_matrix = np.concatenate((pos_sims, neg_sims), axis=0)
    train_embs_1 = np.concatenate((pos_l, neg_l), axis=0)
    train_embs_2 = np.concatenate((pos_r, neg_r), axis=0)
    # 1 = real edge, 0 = false edge
    train_edge_labels = np.concatenate((np.ones(len(train_edges_pos)), np.zeros(len(train_edges_neg))), axis=0)

    # Test-set features: test edges are positives, `other_edge` supplies negatives.
    tpos_feats, tpos_sims, tpos_l, tpos_r = _featurize(test_edges)
    tneg_feats, tneg_sims, tneg_l, tneg_r = _featurize(other_edge)
    test_edge_embs = np.concatenate((tpos_feats, tneg_feats), axis=0)
    test_sim_matrix = np.concatenate((tpos_sims, tneg_sims), axis=0)
    test_embs_1 = np.concatenate((tpos_l, tneg_l), axis=0)
    test_embs_2 = np.concatenate((tpos_r, tneg_r), axis=0)
    test_edge_labels = np.concatenate((np.ones(len(test_edges)), np.zeros(len(other_edge))), axis=0)

    # Fit the classifier on the training edge features.
    classifier = LogisticRegression(random_state=0)
    classifier.fit(train_edge_embs, train_edge_labels)
    # Probability of class "1" (real edge) for every evaluation edge.
    test_preds = classifier.predict_proba(test_edge_embs)[:, 1]

    runtime = time.time() - t0

    n2v_scores = {
        'test_roc': roc_auc_score(test_edge_labels, test_preds),
        'test_ap': average_precision_score(test_edge_labels, test_preds),
        'runtime': runtime,
    }

    return (n2v_scores, train_edge_labels, test_edge_labels, test_preds,
            train_sim_matrix, test_sim_matrix, train_edge_embs, test_edge_embs,
            train_embs_1, train_embs_2, test_embs_1, test_embs_2)



def node2vec_scores5(
        g_train, train_test_split,DATASET, METHOD, F,dp,
        P=1,  # Return hyperparameter
        Q=1,  # In-out hyperparameter
        WINDOW_SIZE=10,  # Context size for optimization (kept for interface compat; unused by the pytorch trainer)
        NUM_WALKS=10,  # Number of walks per source
        WALK_LENGTH=80,  # Length of walk per source
        DIMENSIONS=256,  # Embedding dimension
        DIRECTED=False,  # Graph directed/undirected
        WORKERS=8,  # Num. parallel workers (kept for interface compat; unused)
        ITER=1,  # SGD epochs (kept for interface compat; unused)
        edge_score_mode="edge-emb",  # "edge-emb": LogReg on Hadamard edge features (node2vec paper);
        # "dot-product": raw embedding dot product (GAE paper)
        verbose=1,
        Ego_user=0,
        other_edge=None,  # BUGFIX: the original body referenced `other_edge` without it being
        # defined anywhere (guaranteed NameError in edge-emb mode). It is now an optional,
        # backward-compatible parameter: extra edges scored as negatives alongside the test set.
):
    """Train node2vec embeddings on g_train and score link prediction.

    Simulates random walks, trains the pytorch word2vec trainer on them
    (optionally the differentially-private variant when dp == 1), writes the
    walks and embeddings to disk, and evaluates link prediction either with a
    logistic-regression classifier on Hadamard edge features ("edge-emb") or
    with embedding dot products ("dot-product").

    Returns an 18-tuple: (scores dict, train labels, test labels, test preds,
    emb_matrix, train/test cosine sims, train/test edge features, train/test
    endpoint embeddings, other-edge features/sims/embeddings/labels).  Entries
    only produced by the "edge-emb" branch are None in "dot-product" mode.
    Raises ValueError for an unknown edge_score_mode or dp value.
    """
    if g_train.is_directed():
        DIRECTED = True

    # Unpack train-test split
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split

    if other_edge is None:
        other_edge = []

    start_time = time.time()

    # Preprocessing, generate walks
    if verbose >= 1:
        print('Preprocessing graph for node2vec...')
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)  # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))

    # Persist the walks (one walk per line, space-separated node ids); this
    # file is also the training corpus for the word2vec trainer below.
    file_name = str(Ego_user)
    input_file = 'E:\\python\\banlance\\code\\' + DATASET + '\\' + 'walks-' + F + '-' + file_name
    with open(input_file, 'w') as walk_file:
        for walk in walks:
            walk_file.write(''.join(str(node) + ' ' for node in walk) + '\n')
    # NOTE: the original also rebound `walks` to a list of map objects here;
    # that result was never used, so the dead statement was removed.

    w2v = trainer.Word2VecTrainer(input_file, output_file="out.vec")
    if dp == 0:
        emb_mappings = w2v.train()
    elif dp == 1:
        emb_mappings = w2v.train_dp()  # differentially-private variant
    else:
        # The original silently fell through to a NameError here.
        raise ValueError('dp must be 0 or 1, got %r' % (dp,))

    emb_mappings = emb_mappings.cpu().detach().numpy()  # torch tensor -> numpy

    # Create node embeddings matrix (rows = nodes, columns = embedding features)
    emb_list = [emb_mappings[node_index] for node_index in range(adj_train.shape[0])]
    emb_matrix = np.vstack(emb_list)

    # Persist embeddings in word2vec text format: "<count> <dim>" header, then "<id> <vector>" lines.
    with open('E:\\python\\banlance\\code\\' + DATASET + '\\' + 'embeds-' + F + '-' + file_name, 'w') as f:
        f.write('%d %d\n' % (adj_train.shape[0], DIMENSIONS))
        for i in range(adj_train.shape[0]):
            f.write('%s %s\n' % (str(i), ' '.join(map(str, emb_list[i]))))

    # Placeholders for outputs that only the "edge-emb" branch produces, so the
    # single return below is valid in every mode (the original raised NameError
    # when edge_score_mode == "dot-product").
    train_edge_labels = test_edge_labels = test_preds = None
    train_sim_matrix = test_sim_matrix = None
    train_edge_embs = test_edge_embs = None
    train_embs_1 = train_embs_2 = test_embs_1 = test_embs_2 = None
    other_edge_embs = other_sim_matrix = None
    other_embs_1_test = other_embs_2_test = None
    other_edge_labels = None

    if edge_score_mode == "edge-emb":
        # Edge embedding for (v1, v2) = Hadamard product of node embeddings for
        # v1, v2 (bootstrapped edge embeddings, as in the node2vec paper).
        def get_edge_embeddings(edge_list):
            """Return (Hadamard feats, cosine sims, endpoint-1 embs, endpoint-2 embs)."""
            embs, sim_matrix, embs_1, embs_2 = [], [], [], []
            for edge in edge_list:
                emb1 = emb_matrix[edge[0]]
                emb2 = emb_matrix[edge[1]]
                embs.append(np.multiply(emb1, emb2))
                sim_matrix.append(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
                embs_1.append(emb1)
                embs_2.append(emb2)
            return np.array(embs), np.array(sim_matrix), np.array(embs_1), np.array(embs_2)

        # Train-set edge features (labels: 1 = real edge, 0 = false edge)
        pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(train_edges)
        neg_train_edge_embs, neg_train_sim_matrix, neg_embs_1_train, neg_embs_2_train = get_edge_embeddings(train_edges_false)
        train_edge_embs = np.concatenate((pos_train_edge_embs, neg_train_edge_embs), axis=0)
        train_sim_matrix = np.concatenate((pos_train_sim_matrix, neg_train_sim_matrix), axis=0)
        train_embs_1 = np.concatenate((pos_embs_1_train, neg_embs_1_train), axis=0)
        train_embs_2 = np.concatenate((pos_embs_2_train, neg_embs_2_train), axis=0)
        train_edge_labels = np.concatenate((np.ones(len(train_edges)), np.zeros(len(train_edges_false))), axis=0)

        # Val-set edge features, labels (only when a validation split exists)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs, _, _, _ = get_edge_embeddings(val_edges)
            neg_val_edge_embs, _, _, _ = get_edge_embeddings(val_edges_false)
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate((np.ones(len(val_edges)), np.zeros(len(val_edges_false))), axis=0)

        # Test-set edge features, labels
        pos_test_edge_embs, pos_test_sim_matrix, pos_embs_1_test, pos_embs_2_test = get_edge_embeddings(test_edges)
        neg_test_edge_embs, neg_test_sim_matrix, neg_embs_1_test, neg_embs_2_test = get_edge_embeddings(test_edges_false)
        test_edge_embs = np.concatenate((pos_test_edge_embs, neg_test_edge_embs), axis=0)
        test_sim_matrix = np.concatenate((pos_test_sim_matrix, neg_test_sim_matrix), axis=0)
        test_embs_1 = np.concatenate((pos_embs_1_test, neg_embs_1_test), axis=0)
        test_embs_2 = np.concatenate((pos_embs_2_test, neg_embs_2_test), axis=0)
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Extra edges scored as negatives (empty unless the caller passes other_edge).
        other_edge_embs, other_sim_matrix, other_embs_1_test, other_embs_2_test = get_edge_embeddings(other_edge)
        other_edge_labels = np.zeros(len(other_edge))

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]
        print(test_preds)
        print(np.shape(test_preds))

        runtime = time.time() - start_time

        # Calculate scores
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            n2v_val_roc = roc_auc_score(val_edge_labels, val_preds)
            n2v_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        n2v_test_roc = roc_auc_score(test_edge_labels, test_preds)
        n2v_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Generate edge scores using simple dot product of node embeddings (like in GAE paper)
    elif edge_score_mode == "dot-product":
        score_matrix = np.dot(emb_matrix, emb_matrix.T)
        runtime = time.time() - start_time

        # BUGFIX: get_roc_score returns (roc, roc_curve, ap) — the original
        # unpacked only two values, which raised ValueError at runtime.
        if len(val_edges) > 0:
            n2v_val_roc, _, n2v_val_ap = get_roc_score(val_edges, val_edges_false, score_matrix, apply_sigmoid=True)
        else:
            n2v_val_roc = None
            n2v_val_ap = None

        # Test set scores
        n2v_test_roc, _, n2v_test_ap = get_roc_score(test_edges, test_edges_false, score_matrix, apply_sigmoid=True)

    else:
        # The original printed a warning and then crashed on undefined names; fail loudly.
        raise ValueError("Invalid edge_score_mode! Either use edge-emb or dot-product.")

    # Record scores
    n2v_scores = {
        'test_roc': n2v_test_roc,
        'test_ap': n2v_test_ap,
        'val_roc': n2v_val_roc,
        'val_ap': n2v_val_ap,
        'runtime': runtime,
    }

    return (n2v_scores, train_edge_labels, test_edge_labels, test_preds, emb_matrix,
            train_sim_matrix, test_sim_matrix, train_edge_embs, test_edge_embs,
            train_embs_1, train_embs_2, test_embs_1, test_embs_2,
            other_edge_embs, other_sim_matrix, other_embs_1_test, other_embs_2_test,
            other_edge_labels)



def linkpre_scores6(emb_matrix, train_edges_pos,train_edges_neg,test_edges, other_edge):
    """Logistic-regression link prediction on Hadamard edge features.

    Trains on `train_edges_pos` (label 1) and `train_edges_neg` (label 0),
    then scores `test_edges` (label 1) against `other_edge` (label 0).
    Also reports the cosine similarity of each edge's endpoint embeddings.

    Returns a 12-tuple: (scores dict, train labels, test labels, test
    probabilities, train cosine sims, test cosine sims, train edge features,
    test edge features, train endpoint-1 embs, train endpoint-2 embs,
    test endpoint-1 embs, test endpoint-2 embs).
    """
    started = time.time()

    def _features_for(edges):
        # Per edge: Hadamard product, cosine similarity, and the two endpoint embeddings.
        products, cosines, left, right = [], [], [], []
        for pair in edges:
            a = emb_matrix[pair[0]]
            b = emb_matrix[pair[1]]
            products.append(np.multiply(a, b))
            cosines.append(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
            left.append(a)
            right.append(b)
        return np.array(products), np.array(cosines), np.array(left), np.array(right)

    def _stack(pos_parts, neg_parts):
        # Concatenate each positive feature array with its negative counterpart.
        return tuple(np.concatenate((p, n), axis=0) for p, n in zip(pos_parts, neg_parts))

    # Train-set features and labels (1 = real edge, 0 = false edge).
    train_edge_embs, train_sim_matrix, train_embs_1, train_embs_2 = _stack(
        _features_for(train_edges_pos), _features_for(train_edges_neg))
    train_edge_labels = np.concatenate((np.ones(len(train_edges_pos)), np.zeros(len(train_edges_neg))), axis=0)

    # Test-set features and labels; `other_edge` provides the negatives.
    test_edge_embs, test_sim_matrix, test_embs_1, test_embs_2 = _stack(
        _features_for(test_edges), _features_for(other_edge))
    test_edge_labels = np.concatenate((np.ones(len(test_edges)), np.zeros(len(other_edge))), axis=0)

    # Fit the classifier, then score each evaluation edge with P(class == 1).
    model_lr = LogisticRegression(random_state=0)
    model_lr.fit(train_edge_embs, train_edge_labels)
    test_preds = model_lr.predict_proba(test_edge_embs)[:, 1]

    runtime = time.time() - started

    n2v_scores = {}
    n2v_scores['test_roc'] = roc_auc_score(test_edge_labels, test_preds)
    n2v_scores['test_ap'] = average_precision_score(test_edge_labels, test_preds)
    n2v_scores['runtime'] = runtime

    return (n2v_scores, train_edge_labels, test_edge_labels, test_preds,
            train_sim_matrix, test_sim_matrix, train_edge_embs, test_edge_embs,
            train_embs_1, train_embs_2, test_embs_1, test_embs_2)



def linkpre_scores7(emb_matrix, train_edges_pos,train_edges_neg,test_edges):
    """Build positive-only edge features for the train and test edge sets.

    Unlike its sibling scorers, this variant trains no classifier and scores
    no negatives: every returned label is 1, and the per-edge similarity is
    the raw (un-normalized) dot product of the endpoint embeddings.

    Parameters
    ----------
    emb_matrix : array-like indexable by node id; row i = embedding of node i.
    train_edges_pos : positive training edges (pairs of node ids).
    train_edges_neg : unused; kept for call-site compatibility.
    test_edges : positive test edges (pairs of node ids).

    Returns an 11-tuple: (train labels, test labels, train dot products,
    test dot products, train Hadamard features, test Hadamard features,
    train endpoint-1 embs, train endpoint-2 embs, test endpoint-1 embs,
    test endpoint-2 embs, the sampled train edge list).
    """

    def _edge_features(edge_list):
        # Per edge: Hadamard feature, raw dot-product similarity, and both endpoint embeddings.
        embs, sims, embs_1, embs_2 = [], [], [], []
        for edge in edge_list:
            emb1 = emb_matrix[edge[0]]
            emb2 = emb_matrix[edge[1]]
            embs.append(np.multiply(emb1, emb2))
            sims.append(np.dot(emb1, emb2))  # intentionally NOT cosine in this variant
            embs_1.append(emb1)
            embs_2.append(emb2)
        return np.array(embs), np.array(sims), np.array(embs_1), np.array(embs_2)

    # All positive training edges are used (no sub-sampling).
    train_edges_sampled = train_edges_pos

    # FIX: the original also featurized train_edges_neg and built an unused
    # `edgeall` copy of the positive edges; both results were discarded, so
    # that dead work (plus an unused start_time) has been removed.
    train_edge_embs, train_sim_matrix, train_embs_1, train_embs_2 = _edge_features(train_edges_sampled)
    train_edge_labels = np.ones(len(train_edges_sampled))

    test_edge_embs, test_sim_matrix, test_embs_1, test_embs_2 = _edge_features(test_edges)
    test_edge_labels = np.ones(len(test_edges))

    return (train_edge_labels, test_edge_labels, train_sim_matrix, test_sim_matrix,
            train_edge_embs, test_edge_embs, train_embs_1, train_embs_2,
            test_embs_1, test_embs_2, train_edges_sampled)



def linkpre_scores8(emb_matrix, train_edges_pos,train_edges_neg,test_edges, other_edge):
    """Train and evaluate a logistic-regression edge classifier.

    Each edge is represented by the Hadamard (element-wise) product of its
    endpoint embeddings (node2vec-style bootstrapped edge embeddings); the
    cosine similarity of the endpoints is reported alongside.  Training uses
    `train_edges_pos` (label 1) vs `train_edges_neg` (label 0); evaluation
    uses `test_edges` (label 1) vs `other_edge` (label 0).

    Returns a 12-tuple: (scores dict, train labels, test labels, test
    probabilities, train cosine sims, test cosine sims, train edge features,
    test edge features, train endpoint-1 embs, train endpoint-2 embs,
    test endpoint-1 embs, test endpoint-2 embs).
    """
    begin = time.time()

    def _describe_edges(edge_list):
        # (Hadamard features, cosine sims, first-endpoint embs, second-endpoint embs).
        feature_rows = []
        cos_sims = []
        first_embs = []
        second_embs = []
        for e in edge_list:
            v1 = emb_matrix[e[0]]
            v2 = emb_matrix[e[1]]
            feature_rows.append(np.multiply(v1, v2))
            cos_sims.append(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
            first_embs.append(v1)
            second_embs.append(v2)
        return np.array(feature_rows), np.array(cos_sims), np.array(first_embs), np.array(second_embs)

    # Train-set features: positives first, then negatives.
    p_feat, p_sim, p_e1, p_e2 = _describe_edges(train_edges_pos)
    n_feat, n_sim, n_e1, n_e2 = _describe_edges(train_edges_neg)
    train_edge_embs = np.concatenate((p_feat, n_feat), axis=0)
    train_sim_matrix = np.concatenate((p_sim, n_sim), axis=0)
    train_embs_1 = np.concatenate((p_e1, n_e1), axis=0)
    train_embs_2 = np.concatenate((p_e2, n_e2), axis=0)
    # 1 = real edge, 0 = false edge
    train_edge_labels = np.concatenate((np.ones(len(train_edges_pos)), np.zeros(len(train_edges_neg))), axis=0)

    # Evaluation-set features: test edges vs the `other_edge` negatives.
    tp_feat, tp_sim, tp_e1, tp_e2 = _describe_edges(test_edges)
    tn_feat, tn_sim, tn_e1, tn_e2 = _describe_edges(other_edge)
    test_edge_embs = np.concatenate((tp_feat, tn_feat), axis=0)
    test_sim_matrix = np.concatenate((tp_sim, tn_sim), axis=0)
    test_embs_1 = np.concatenate((tp_e1, tn_e1), axis=0)
    test_embs_2 = np.concatenate((tp_e2, tn_e2), axis=0)
    test_edge_labels = np.concatenate((np.ones(len(test_edges)), np.zeros(len(other_edge))), axis=0)

    # Fit on training features; predict P(real edge) for the evaluation set.
    lr = LogisticRegression(random_state=0)
    lr.fit(train_edge_embs, train_edge_labels)
    test_preds = lr.predict_proba(test_edge_embs)[:, 1]

    runtime = time.time() - begin

    n2v_scores = {
        'test_roc': roc_auc_score(test_edge_labels, test_preds),
        'test_ap': average_precision_score(test_edge_labels, test_preds),
        'runtime': runtime,
    }

    return (n2v_scores, train_edge_labels, test_edge_labels, test_preds,
            train_sim_matrix, test_sim_matrix, train_edge_embs, test_edge_embs,
            train_embs_1, train_embs_2, test_embs_1, test_embs_2)



def node2vec_scores8(
        g_train, train_test_split, DATASET, METHOD, F, dp, res_dir, ego_user, sigma,
        P=1,  # Return hyperparameter
        Q=1,  # In-out hyperparameter
        WINDOW_SIZE=10,  # Context size for optimization
        NUM_WALKS=10,  # Number of walks per source
        WALK_LENGTH=80,  # Length of walk per source
        DIMENSIONS=256,  # Embedding dimension
        DIRECTED=False,  # Graph directed/undirected
        WORKERS=8,  # Num. parallel workers
        ITER=1,  # SGD epochs (kept for interface compatibility; the Mod* variants manage epochs internally)
        edge_score_mode="edge-emb",  # Only "edge-emb" is supported by this variant
        verbose=1,
        Ego_user=0,
):
    """Train node2vec embeddings on ``g_train`` and build per-edge features.

    Simulates random walks (perturbed walks when ``dp == 6``), trains one of
    several modified word2vec variants selected by ``dp``, writes the walks,
    node embeddings and per-edge similarity CSVs to disk, and returns
    edge-level features for the train and test splits.

    dp modes (dispatch below): 0/6 plain modified skip-gram (6 additionally
    perturbs the walks with scale ``sigma``); 1 differentially-private
    skip-gram with budget ``sigma``; 2/3/4 defense variants that also see the
    train/test edges; 5 ``ModWord2Vec5``.

    Returns a 12-tuple::

        (train_edge_labels, test_edge_labels, emb_matrix,
         train_sim_matrix, test_sim_matrix,
         train_edge_embs, test_edge_embs,
         train_embs_1, train_embs_2, test_embs_1, test_embs_2,
         train_edges_sampled)

    Note: only POSITIVE edges are returned for the train/test features, so
    both label arrays are all-ones; the negative-edge helper calls are kept
    for their similarity-CSV side effects.

    Raises:
        ValueError: if ``dp`` or ``edge_score_mode`` is unrecognized.
    """
    if g_train.is_directed():
        DIRECTED = True

    # Path prefix used by model.save_emb for the embedding dump
    output = res_dir + METHOD + '-embeds-' + F + '-' + str(ego_user)

    # Unpack train-test split
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split

    # Preprocessing, generate walks
    if verbose >= 1:
        print('Preprocessing grpah for node2vec...')
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)  # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    if dp == 6:
        # Defense mode: the walks themselves are perturbed with scale `sigma`
        walks = g_n2v.simulate_walks_defense(sigma, NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))
    else:
        walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))

    # Persist the raw walks; `with` guarantees the handle is closed even on error
    file_name = str(Ego_user)
    with open(res_dir + 'walks-' + F + '-' + file_name, 'w') as walk_file:
        for walk in walks:
            walk_file.write(''.join(str(node) + ' ' for node in walk) + '\n')

    walks = [list(map(str, walk)) for walk in walks]

    # Rebuild the graph from its adjacency matrix so nodes are 0..n-1 integers
    g_train = nx.adjacency_matrix(g_train)
    G = nx.Graph(g_train)

    # Train skip-gram model; each dp mode maps to a different word2vec variant
    if dp == 1:
        model = word2vec.ModWord2Vec_dp(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0,
                                        sg=1, hs=1, workers=WORKERS, compute_loss=True, budget=sigma)
    elif dp == 0 or dp == 6:
        model = word2vec.ModWord2Vec(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0,
                                     sg=1, hs=1, workers=WORKERS, compute_loss=True)
    elif dp == 2:
        model = word2vec.ModWord2Vec_defense(train_edges, test_edges, len(G.nodes()), output, walks,
                                             size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1,
                                             hs=1, workers=WORKERS, compute_loss=True)
    elif dp == 3:
        model = word2vec.ModWord2Vec_defense2(train_edges, test_edges, len(G.nodes()), output, walks,
                                              size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1,
                                              hs=1, workers=WORKERS, compute_loss=True)
    elif dp == 4:
        model = word2vec.ModWord2Vec_defense3(train_edges, test_edges, len(G.nodes()), output, walks,
                                              size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1,
                                              hs=1, workers=WORKERS, compute_loss=True)
    elif dp == 5:
        model = word2vec.ModWord2Vec5(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0,
                                      sg=1, hs=1, workers=WORKERS, compute_loss=True)
    else:
        # Previously an unknown dp fell through the elif chain, leaving `model`
        # unbound and crashing later with a confusing NameError; fail fast.
        raise ValueError('Invalid dp mode: {}'.format(dp))

    # Store embeddings; save_emb also returns the dense (n_nodes x DIMENSIONS) matrix
    emb_matrix = model.save_emb(output, len(G.nodes()))

    # Dump node embeddings in word2vec text format: "<node_id> v1 v2 ... vD"
    with open('./data/embeds-' + F + '-' + file_name, 'w') as f:
        f.write('%d %d\n' % (adj_train.shape[0], DIMENSIONS))
        for i in range(adj_train.shape[0]):
            e = ' '.join(str(x) for x in emb_matrix[i])
            f.write('%s %s\n' % (str(i), e))

    if edge_score_mode != "edge-emb":
        # Previously any other mode skipped the whole feature block and the
        # return statement then raised NameError on undefined locals.
        raise ValueError('Invalid edge_score_mode: {} (only "edge-emb" is supported)'.format(edge_score_mode))

    # Generate bootstrapped edge embeddings (as in the node2vec paper):
    # edge embedding for (v1, v2) = hadamard product of the node embeddings.
    def get_edge_embeddings(edge_list, ego_user, DATASET, Flag, flag):
        """Per-edge hadamard embeddings plus four similarity scores.

        Side effect: writes "<Flag><flag>-similarity.csv" with one row per edge.
        Returns (embs, sim_matrix, embs_1, embs_2) as numpy arrays.
        """
        rows = []
        embs = []
        sim_matrix = []
        embs_1 = []
        embs_2 = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)  # hadamard product
            sim = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))  # cosine similarity
            sim2 = np.dot(emb1, emb2)  # raw inner product
            sim3 = np.linalg.norm(np.array(emb1) - np.array(emb2))  # euclidean distance
            sim4 = 1 / (1 + sim3)  # distance mapped into (0, 1]
            embs.append(edge_emb)
            embs_1.append(emb1)
            embs_2.append(emb2)
            sim_matrix.append([sim, sim2, sim3, sim4])
            rows.append([node1, node2, sim, sim2, sim3, sim4])

        name = ['node1', 'node2', 'sim1', 'sim2', 'sim3', 'sim4']
        result = pd.DataFrame(columns=name, data=rows)
        result.to_csv("{}{}-similarity.csv".format(Flag, flag))

        return np.array(embs), np.array(sim_matrix), np.array(embs_1), np.array(embs_2)

    edgeall = [list(edge_tuple) for edge_tuple in train_edges]

    # Subsample positive train edges down to the size of the test set
    train_edges_sampled = random.sample(edgeall, np.shape(test_edges)[0])

    # Train-set edge embeddings. The 'pos-train-all' and 'neg-train' calls are
    # kept purely for their similarity-CSV side effects.
    get_edge_embeddings(edgeall, ego_user, DATASET, F, flag='pos-train-all')
    train_edge_embs, train_sim_matrix, train_embs_1, train_embs_2 = get_edge_embeddings(
        train_edges_sampled, ego_user, DATASET, F, flag='pos-train')
    get_edge_embeddings(train_edges_false, ego_user, DATASET, F, flag='neg-train')

    # Positive-only train set, so every label is 1
    train_edge_labels = np.ones(len(train_edges_sampled))

    # Val-set embeddings are computed for their CSV side effects only;
    # nothing from the val split is returned.
    if len(val_edges) > 0 and len(val_edges_false) > 0:
        get_edge_embeddings(val_edges, ego_user, DATASET, F, flag='pos-val')
        get_edge_embeddings(val_edges_false, ego_user, DATASET, F, flag='neg-val')

    # Test-set edge embeddings (positives returned; negatives for CSV only)
    test_edge_embs, test_sim_matrix, test_embs_1, test_embs_2 = get_edge_embeddings(
        test_edges, ego_user, DATASET, F, flag='pos-test')
    get_edge_embeddings(test_edges_false, ego_user, DATASET, F, flag='neg-test')

    # Positive-only test set, so every label is 1
    test_edge_labels = np.ones(len(test_edges))

    return (train_edge_labels, test_edge_labels, emb_matrix,
            train_sim_matrix, test_sim_matrix,
            train_edge_embs, test_edge_embs,
            train_embs_1, train_embs_2, test_embs_1, test_embs_2,
            train_edges_sampled)



def node2vec_scores_inf_debias(
        g_train, train_test_split, DATASET, METHOD, F, dp, res_dir, ego_user, sigma,
        P=1,  # Return hyperparameter
        Q=1,  # In-out hyperparameter
        WINDOW_SIZE=10,  # Context size for optimization
        NUM_WALKS=10,  # Number of walks per source
        WALK_LENGTH=80,  # Length of walk per source
        DIMENSIONS=256,  # Embedding dimension
        DIRECTED=False,  # Graph directed/undirected
        WORKERS=8,  # Num. parallel workers
        ITER=1,  # SGD epochs (kept for interface compatibility; the Mod* variants manage epochs internally)
        edge_score_mode="edge-emb",  # Only "edge-emb" is supported by this variant
        verbose=1,
        Ego_user=0,
):
    """Train node2vec embeddings, fit an edge classifier, and score test edges.

    Same pipeline as the other scorers in this file (walk simulation, dp-mode
    word2vec training, embedding/similarity-CSV dumps), then fits a logistic
    regression on hadamard edge embeddings of the TRAIN split (positive +
    negative edges) and scores the positive TEST edges.

    dp modes: 0/6 plain modified skip-gram (6 also perturbs walks with
    ``sigma``); 1 differentially-private skip-gram with budget ``sigma``;
    2/3/4 defense variants; 5 ``ModWord2Vec5``.

    Returns:
        (test_edge_labels, test_preds) where test_edge_labels is all-ones
        (positive test edges only) and test_preds is the classifier's
        predict_proba output for those edges.

    Raises:
        ValueError: if ``dp`` or ``edge_score_mode`` is unrecognized.
    """
    if g_train.is_directed():
        DIRECTED = True

    # Path prefix used by model.save_emb for the embedding dump
    output = res_dir + METHOD + '-embeds-' + F + '-' + str(ego_user)

    # Unpack train-test split
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = train_test_split

    # Preprocessing, generate walks
    if verbose >= 1:
        print('Preprocessing grpah for node2vec...')
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)  # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    if dp == 6:
        # Defense mode: the walks themselves are perturbed with scale `sigma`
        walks = g_n2v.simulate_walks_defense(sigma, NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))
    else:
        walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH, verbose=(verbose == 2))

    # Persist the raw walks; `with` guarantees the handle is closed even on error
    file_name = str(Ego_user)
    with open(res_dir + 'walks-' + F + '-' + file_name, 'w') as walk_file:
        for walk in walks:
            walk_file.write(''.join(str(node) + ' ' for node in walk) + '\n')

    walks = [list(map(str, walk)) for walk in walks]

    # Rebuild the graph from its adjacency matrix so nodes are 0..n-1 integers
    g_train = nx.adjacency_matrix(g_train)
    G = nx.Graph(g_train)

    # Train skip-gram model; each dp mode maps to a different word2vec variant
    if dp == 1:
        model = word2vec.ModWord2Vec_dp(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0,
                                        sg=1, hs=1, workers=WORKERS, compute_loss=True, budget=sigma)
    elif dp == 0 or dp == 6:
        model = word2vec.ModWord2Vec(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0,
                                     sg=1, hs=1, workers=WORKERS, compute_loss=True)
    elif dp == 2:
        model = word2vec.ModWord2Vec_defense(train_edges, test_edges, len(G.nodes()), output, walks,
                                             size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1,
                                             hs=1, workers=WORKERS, compute_loss=True)
    elif dp == 3:
        model = word2vec.ModWord2Vec_defense2(train_edges, test_edges, len(G.nodes()), output, walks,
                                              size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1,
                                              hs=1, workers=WORKERS, compute_loss=True)
    elif dp == 4:
        model = word2vec.ModWord2Vec_defense3(train_edges, test_edges, len(G.nodes()), output, walks,
                                              size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1,
                                              hs=1, workers=WORKERS, compute_loss=True)
    elif dp == 5:
        model = word2vec.ModWord2Vec5(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0,
                                      sg=1, hs=1, workers=WORKERS, compute_loss=True)
    else:
        # Previously an unknown dp fell through the elif chain, leaving `model`
        # unbound and crashing later with a confusing NameError; fail fast.
        raise ValueError('Invalid dp mode: {}'.format(dp))

    # Store embeddings; save_emb also returns the dense (n_nodes x DIMENSIONS) matrix
    emb_matrix = model.save_emb(output, len(G.nodes()))

    # Dump node embeddings in word2vec text format: "<node_id> v1 v2 ... vD"
    with open('./data/embeds-' + F + '-' + file_name, 'w') as f:
        f.write('%d %d\n' % (adj_train.shape[0], DIMENSIONS))
        for i in range(adj_train.shape[0]):
            e = ' '.join(str(x) for x in emb_matrix[i])
            f.write('%s %s\n' % (str(i), e))

    if edge_score_mode != "edge-emb":
        # Previously any other mode skipped the whole block and the return
        # statement then raised NameError on undefined locals.
        raise ValueError('Invalid edge_score_mode: {} (only "edge-emb" is supported)'.format(edge_score_mode))

    # Generate bootstrapped edge embeddings (as in the node2vec paper):
    # edge embedding for (v1, v2) = hadamard product of the node embeddings.
    def get_edge_embeddings(edge_list, ego_user, DATASET, Flag, flag):
        """Per-edge hadamard embeddings plus four similarity scores.

        Side effect: writes "<Flag><flag>-similarity.csv" with one row per edge.
        Returns (embs, sim_matrix, embs_1, embs_2) as numpy arrays.
        """
        rows = []
        embs = []
        sim_matrix = []
        embs_1 = []
        embs_2 = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)  # hadamard product
            sim = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))  # cosine similarity
            sim2 = np.dot(emb1, emb2)  # raw inner product
            sim3 = np.linalg.norm(np.array(emb1) - np.array(emb2))  # euclidean distance
            sim4 = 1 / (1 + sim3)  # distance mapped into (0, 1]
            embs.append(edge_emb)
            embs_1.append(emb1)
            embs_2.append(emb2)
            sim_matrix.append([sim, sim2, sim3, sim4])
            rows.append([node1, node2, sim, sim2, sim3, sim4])

        name = ['node1', 'node2', 'sim1', 'sim2', 'sim3', 'sim4']
        result = pd.DataFrame(columns=name, data=rows)
        result.to_csv("{}{}-similarity.csv".format(Flag, flag))

        return np.array(embs), np.array(sim_matrix), np.array(embs_1), np.array(embs_2)

    # Train-set edge embeddings: positives and negatives
    pos_train_edge_embs, pos_train_sim_matrix, pos_embs_1_train, pos_embs_2_train = get_edge_embeddings(
        train_edges, ego_user, DATASET, F, flag='pos-train')
    neg_train_edge_embs, neg_train_sim_matrix, neg_embs_1_train, neg_embs_2_train = get_edge_embeddings(
        train_edges_false, ego_user, DATASET, F, flag='neg-train')

    # BUG FIX: the classifier below is trained on labels for BOTH positive and
    # negative train edges, but the original code passed only the positive
    # embeddings, so fit() failed on mismatched sample/label counts whenever
    # train_edges_false was non-empty. Stack pos + neg to match the labels.
    train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])

    # Train-set edge labels: 1 = real edge, 0 = false edge
    train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

    # Val-set embeddings are computed for their CSV side effects only;
    # nothing from the val split is used or returned.
    if len(val_edges) > 0 and len(val_edges_false) > 0:
        get_edge_embeddings(val_edges, ego_user, DATASET, F, flag='pos-val')
        get_edge_embeddings(val_edges_false, ego_user, DATASET, F, flag='neg-val')

    # Test-set edge embeddings (positives scored; negatives for CSV only)
    test_edge_embs, test_sim_matrix, test_embs_1, test_embs_2 = get_edge_embeddings(
        test_edges, ego_user, DATASET, F, flag='pos-test')
    get_edge_embeddings(test_edges_false, ego_user, DATASET, F, flag='neg-test')

    # Positive test edges only, so every test label is 1
    test_edge_labels = np.ones(len(test_edges))

    # Train logistic regression classifier on train-set edge embeddings
    edge_classifier = LogisticRegression(random_state=0)
    edge_classifier.fit(train_edge_embs, train_edge_labels)

    # NOTE(review): unlike the sibling scorers, this returns the FULL
    # predict_proba matrix (columns for class 0 and class 1), not just
    # column 1 — presumably the membership-inference caller consumes both
    # columns; confirm against the caller before changing.
    test_preds = edge_classifier.predict_proba(test_edge_embs)

    return test_edge_labels, test_preds