# MVGRL-mia-white.py: white-box membership inference attack on MVGRL (MIA-GCL).
import os
import os.path as osp
import sys

sys.path.append('./GCL2')  # make the bundled GCL2 package importable before the GCL2 imports below

import torch
from torch import nn
from torch.optim import Adam

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.nn.inits import uniform
from torch_geometric.datasets import Planetoid, Amazon

import GCL2.losses as L
import GCL2.augmentors as A
from GCL2 import preprocessing
from GCL2.eval import get_split, LREvaluator
from GCL2.models import DualBranchContrast

import numpy as np
import pandas as pd
import random
import networkx as nx
import pickle as pk
import itertools
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score


def readedges2(file_name):
    """Read one tab-separated integer list per line; blank lines become []."""
    dataMat = []
    with open(file_name) as file:
        for line in file.readlines():
            curLine = line.strip().split('\t')
            if curLine == ['']:
                dataMat.append([])
            else:
                dataMat.append(list(map(int, curLine)))
    return dataMat
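
# Expected on-disk format for readedges2 (illustrative example, derived from the
# parser above): one tab-separated integer list per line, a blank line reading
# back as an empty list, e.g.
#   "3\t7\t12"  -> [3, 7, 12]
#   ""          -> []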



class GConv(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(GConv, self).__init__()
        self.layers = torch.nn.ModuleList()
        self.activation = nn.PReLU(hidden_dim)
        for i in range(num_layers):
            if i == 0:
                self.layers.append(GCNConv(input_dim, hidden_dim))
            else:
                self.layers.append(GCNConv(hidden_dim, hidden_dim))

    def forward(self, x, edge_index, edge_weight=None):
        z = x
        for conv in self.layers:
            z = conv(z, edge_index, edge_weight)
            z = self.activation(z)
        return z


class Encoder(torch.nn.Module):
    def __init__(self, encoder1, encoder2, augmentor, hidden_dim):
        super(Encoder, self).__init__()
        self.encoder1 = encoder1
        self.encoder2 = encoder2
        self.augmentor = augmentor
        self.project = torch.nn.Linear(hidden_dim, hidden_dim)
        uniform(hidden_dim, self.project.weight)

    @staticmethod
    def corruption(x, edge_index, edge_weight):
        return x[torch.randperm(x.size(0))], edge_index, edge_weight

    def forward(self, x, edge_index, edge_weight=None):
        aug1, aug2 = self.augmentor
        x1, edge_index1, edge_weight1 = aug1(x, edge_index, edge_weight)
        x2, edge_index2, edge_weight2 = aug2(x, edge_index, edge_weight)
        z1 = self.encoder1(x1, edge_index1, edge_weight1)
        z2 = self.encoder2(x2, edge_index2, edge_weight2)
        g1 = self.project(torch.sigmoid(z1.mean(dim=0, keepdim=True)))
        g2 = self.project(torch.sigmoid(z2.mean(dim=0, keepdim=True)))
        z1n = self.encoder1(*self.corruption(x1, edge_index1, edge_weight1))
        z2n = self.encoder2(*self.corruption(x2, edge_index2, edge_weight2))
        return z1, z2, g1, g2, z1n, z2n, edge_index1, edge_index2
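# Shape note (derived from the code above): z1, z2 are node embeddings of shape
# [N, hidden_dim]; g1, g2 are projected graph summaries of shape [1, hidden_dim];
# z1n, z2n come from feature-shuffled (corrupted) inputs and serve as negatives;
# edge_index1 / edge_index2 are the edge indices produced by the two augmentors.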


def train(dt, device, encoder_model, contrast_model, data, edges_train_index, optimizer):
    encoder_model.train()
    optimizer.zero_grad()

    z1, z2, g1, g2, z1n, z2n, edge_index1, edge_index2 = encoder_model(data.x, edges_train_index)

    loss = contrast_model(h1=z1, h2=z2, g1=g1, g2=g2, h3=z1n, h4=z2n)
    loss.backward()
    optimizer.step()
    return loss.item(), z1, z2, edge_index1, edge_index2


def test(encoder_model, data, edges_train_index):
    encoder_model.eval()
    z1, z2, _, _, _, _, _, _ = encoder_model(data.x, edges_train_index)
    z = z1 + z2
    split = get_split(num_samples=z.size()[0], train_ratio=0.1, test_ratio=0.8)
    result = LREvaluator()(z, data.y, split)
    return result,z1, z2,z
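# LREvaluator fits a logistic-regression probe on z = z1 + z2 over the random
# split from get_split; summing the two views' embeddings is the usual MVGRL
# evaluation protocol.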


def main():
    os.environ["CUDA_VISIBLE_DEVICES"] = "4"  # must be set before CUDA is initialised
    device = torch.device('cuda')

    path = osp.join(osp.expanduser('~'), 'datasets')
    dt = 'cora'  # options: cora, citeseer
    dataset = Planetoid(path, name=dt, transform=T.NormalizeFeatures())
    # dataset = Amazon(path, dt, pre_transform=None)  # alternative: Computers, Photo

    data = dataset[0].to(device)

    aug1 = A.Identity()
    alpha = 0.2
    aug2 = A.PPRDiffusion(alpha)
    gconv1 = GConv(input_dim=dataset.num_features, hidden_dim=512, num_layers=2).to(device)
    gconv2 = GConv(input_dim=dataset.num_features, hidden_dim=512, num_layers=2).to(device)
    encoder_model = Encoder(encoder1=gconv1, encoder2=gconv2, augmentor=(aug1, aug2), hidden_dim=512).to(device)
    contrast_model = DualBranchContrast(loss=L.JSD(), mode='G2L').to(device)
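    # G2L mode contrasts each node embedding of one view against the graph-level
    # summary of the other view, scored with the JSD estimator (the MVGRL objective).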

    optimizer = Adam(encoder_model.parameters(), lr=0.001)

    edge_index0 = data.edge_index.detach().cpu().numpy().transpose()
    # unique scalar id per directed edge: row * num_nodes + col
    edges0_idx = edge_index0[:, 0] * np.shape(data.x)[0] + edge_index0[:, 1]
    print(edges0_idx)

    edge_index0_all_oneside = []
    edge_index0_all = []
    for ed in edge_index0:
        edge_index0_all.append([ed[0], ed[1]])
        if ed[0] <= ed[1]:
            # keep each undirected edge once
            edge_index0_all_oneside.append([ed[0], ed[1]])
    edge_index0_all_oneside = np.array(edge_index0_all_oneside)
    edge_index0_all = np.array(edge_index0_all)

    g = nx.Graph()
    g.add_edges_from(edge_index0_all)
    adj_sparse = nx.to_scipy_sparse_matrix(g)
    random.seed(42)
    train_test_split = preprocessing.mask_test_edges(adj_sparse, test_frac=.3, val_frac=0)
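    # mask_test_edges hides 30% of the edges (test_frac=.3, no validation split);
    # the encoder only ever sees the remaining edges, and the hidden ones later
    # serve as the non-member class for the membership inference attack.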


    ratio = alpha

    res_dir = '%s-mvgrl-mia-white-2-%s' % (dt, alpha)
    os.makedirs(res_dir, exist_ok=True)  # output directory for all attack artefacts
    os.makedirs(os.path.join('./checkpoint', 'tmp'), exist_ok=True)  # checkpoint directory used below

    with open('./%s/%s-train_test_split' % (res_dir, dt), 'wb') as f:
        pk.dump(train_test_split, f)

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = train_test_split  # Unpack train-test split
    # print(adj_train)
    g_train0 = nx.from_scipy_sparse_matrix(
        adj_train)  # new graph object with only non-hidden edges, keep all the original nodes

    edge_tuples0 = [(min(edge[0], edge[1]), max(edge[0], edge[1])) for edge in g_train0.edges()]
    # print(edge_tuples0)

    train_edges0 = set(edge_tuples0)  # initialize train_edges to have all edges
    train_edges0 = np.array([list(edge_tuple) for edge_tuple in train_edges0])


    edge_tuples_test0 = [(min(edge[0], edge[1]), max(edge[0], edge[1])) for edge in test_edges]

    edges_test0 = set(edge_tuples_test0)  # initialize test_edges to have all edges
    edges_test0 = np.array([list(edge_tuple) for edge_tuple in edges_test0])

    test_edges0=edges_test0

    with open('%s/%s-edges-train.txt' % (res_dir, dt), 'w') as out:
        for item in train_edges0:
            for jtem in item:
                out.write(str(jtem) + '\t')
            out.write('\n')

    with open('%s/%s-edges-test.txt' % (res_dir, dt), 'w') as out:
        for item in edges_test0:
            for jtem in item:
                out.write(str(jtem) + '\t')
            out.write('\n')

    train_edges_1 = np.transpose(np.concatenate((train_edges0[:, 1].reshape(-1, 1),
                                                 train_edges0[:, 0].reshape(-1, 1)), axis=1))
    train_edges_2 = np.transpose(np.array(train_edges0))
    test_edges_1 = np.transpose(np.concatenate((test_edges0[:, 1].reshape(-1, 1),
                                                test_edges0[:, 0].reshape(-1, 1)), axis=1))
    test_edges_2 = np.transpose(np.array(test_edges0))

    edges_train_index = np.concatenate((train_edges_1, train_edges_2), axis=1)
    edges_test_index = np.concatenate((test_edges_1, test_edges_2), axis=1)

    edges_train_index = torch.from_numpy(np.array(edges_train_index)).long().to(device)
    edges_test_index = torch.from_numpy(np.array(edges_test_index)).long().to(device)
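    # Both tensors stack the two directions of every undirected edge, giving COO
    # edge indices of shape [2, 2 * num_edges] as torch_geometric expects.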



    z1_trains = []
    z2_trains = []

    edge_index1_trains=[]
    edge_index2_trains = []

    best_valid_loss = float('inf')
    patience = 100
    trial_count = 0
    best_epoch = 0

    with tqdm(total=1000, desc='(T)') as pbar:
        for epoch in range(1, 1001):
            loss, z1, z2, edge_index1, edge_index2 = train(
                dt, device, encoder_model, contrast_model, data, edges_train_index, optimizer)
            pbar.set_postfix({'loss': loss})
            pbar.update()

            if loss < best_valid_loss:
                best_valid_loss = loss
                trial_count = 0
                best_epoch = epoch
                torch.save(encoder_model.state_dict(),
                           os.path.join('./checkpoint', 'tmp', f'mvgrl_{dt}_{ratio}_best.pt'))
            else:
                trial_count += 1
                if trial_count > patience:
                    print(f'  Early stop; best epoch {best_epoch}, training loss: {best_valid_loss:.4f}.')
                    break

            # keep the per-epoch embeddings and augmented edge indices for the attack
            z1_trains.append(z1.detach().cpu().numpy())
            z2_trains.append(z2.detach().cpu().numpy())
            edge_index1_trains.append(edge_index1.detach().cpu().numpy())
            edge_index2_trains.append(edge_index2.detach().cpu().numpy())

        encoder_model.load_state_dict(torch.load(os.path.join('./checkpoint', 'tmp',
                                                              f'mvgrl_{dt}_{ratio}_best.pt')))

    test_result,z1, z2,z = test(encoder_model, data,edges_train_index)
    print(f'(E): Best test F1Mi={test_result["micro_f1"]:.4f}, F1Ma={test_result["macro_f1"]:.4f}')

    train_preds = test_result["output_train"]

    # res_dir = '%s-mvgrl-node-mia-mi' % (dt)

    emb_matrix0 = z.detach().cpu().numpy()

    emb_matrix1 = z1.detach().cpu().numpy()
    emb_matrix2 = z2.detach().cpu().numpy()

    with open('./%s/%s-embed0.txt' % (res_dir, dt), 'w') as f:
        f.write('%d %d\n' % (np.shape(emb_matrix0)[0], np.shape(emb_matrix0)[1]))
        for item in emb_matrix0:
            for jtem in item:
                f.write(str(jtem) + '\t')
            f.write('\n')

    with open('./%s/%s-output_train.txt' % (res_dir, dt), 'w') as f:
        for item in train_preds:
            for jtem in item:
                f.write(str(jtem) + '\t')
            f.write('\n')


    edge_index1 = edge_index1.detach().cpu().numpy()
    edge_index2 = edge_index2.detach().cpu().numpy()

    # print(edge_index0, edge_index1, edge_index2, edges_test)

    print(np.shape(edge_index0), np.shape(edge_index1), np.shape(edge_index2))



    print(np.shape(z1_trains),np.shape(z2_trains))


    z1_trains_=np.array(z1_trains)
    z2_trains_=np.array(z2_trains)

    with open('./%s/%s-aug1.pkl' % (res_dir, dt), 'wb') as f:
        pk.dump(edge_index1_trains, f)

    with open('./%s/%s-aug2.pkl' % (res_dir, dt), 'wb') as f:
        pk.dump(edge_index2_trains, f)

    with open('./%s/%s-aug1-embed.pkl' % (res_dir, dt), 'wb') as f:
        pk.dump(z1_trains_, f)

    with open('./%s/%s-aug2-embed.pkl' % (res_dir, dt), 'wb') as f:
        pk.dump(z2_trains_, f)


    return emb_matrix0,train_preds,train_edges0,edges_test0,data.y,res_dir,z1_trains,z2_trains,dt,edge_index1_trains,edge_index2_trains,z1_trains_,z2_trains_



def get_edge_embeddings(edge_list, emb_matrixs, idx_epoches_all):
    """Per-edge attack features: for each selected epoch, the cosine similarity
    and the dot product between the two endpoint embeddings."""
    embs = []
    for i, edge in enumerate(edge_list):
        node1 = int(edge[0])
        node2 = int(edge[1])
        emb = []
        for emb_matrix in emb_matrixs[idx_epoches_all[i, :], :, :]:
            emb1 = emb_matrix[node1]
            emb2 = emb_matrix[node2]
            sim1 = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2) + 1e-30)
            sim2 = np.dot(emb1, emb2)
            emb.append(sim1)
            emb.append(sim2)
        embs.append(emb)
    return np.array(embs)
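# Resulting feature shape (derived from the loops above): with emb_matrixs of
# shape [num_epochs, N, D] and idx_epoches_all of shape [num_edges, k], each
# edge receives 2*k features: one cosine similarity and one dot product per
# selected epoch.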


def get_edge_posts(edge_list, train_preds):
    """Per-edge features from the node classifier's posteriors: each endpoint is
    summarised as (top-1 prob, top-2 prob, correctness flag), then the cosine
    similarity and dot product of the two 3-vectors are returned.
    Relies on the global `label` set in the __main__ section."""
    embs = []
    for edge in edge_list:
        node1 = edge[0]
        node2 = edge[1]
        pre1 = train_preds[node1]
        pre2 = train_preds[node2]

        pre_idx1 = np.argmax(pre1)
        pre_idx2 = np.argmax(pre2)
        train_pres_temp1 = np.sort(pre1)
        train_pres_temp2 = np.sort(pre2)

        corr1 = 1 if pre_idx1 == label[node1] else 0
        train_pres1_ = [train_pres_temp1[-1], train_pres_temp1[-2], corr1]

        corr2 = 1 if pre_idx2 == label[node2] else 0
        train_pres2_ = [train_pres_temp2[-1], train_pres_temp2[-2], corr2]

        sim1 = np.dot(train_pres1_, train_pres2_) / (
            np.linalg.norm(train_pres1_) * np.linalg.norm(train_pres2_))
        sim2 = np.dot(train_pres1_, train_pres2_)

        embs.append([sim1, sim2])
    return np.array(embs)
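

# Shared attack-evaluation helper (a refactor of the three repeated
# MLP / random-forest / SVM blocks in the __main__ section below): fit one
# classifier on the membership labels in column 2 of y_train / y_test, print
# its scores, and dump the per-edge predictions to CSV.
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC


def fit_eval_attack(clf, X_train, y_train, X_test, y_test, csv_path):
    clf.fit(X_train, y_train[:, 2])
    print("Training set score: %f" % clf.score(X_train, y_train[:, 2]))
    print("Test set score: %f" % clf.score(X_test, y_test[:, 2]))
    y_score = clf.predict(X_test)
    print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
    print(metrics.classification_report(y_test[:, 2], y_score))
    acc = accuracy_score(y_test[:, 2], y_score)
    # one row per attack-test edge: prediction, ground truth, endpoint ids
    tsts = [[y_score[i], y_test[i][2], y_test[i][0], y_test[i][1]]
            for i in range(len(y_score))]
    pd.DataFrame(columns=['y_score', 'y_test_grd', 'node1', 'node2'],
                 data=tsts).to_csv(csv_path)
    return acc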







if __name__ == '__main__':
    (emb_matrix, output_train, edges_train_all, edges_test_all, label, res_dir,
     emb_matrix1, emb_matrix2, dt, aug1s, aug2s, aug1s_embed, aug2s_embed) = main()

    edges_train_all = np.array(edges_train_all)
    edges_test_all = np.array(edges_test_all)

    train_preds = output_train

    train_range1 = list(np.arange(np.shape(edges_train_all)[0]))
    # train_range2 = list(np.arange(np.shape(edges_train_inter)[0]))

    # Train-set edge embeddings
    train_preds_sampled_idx1 = np.array(random.sample(train_range1, np.shape(edges_test_all)[0]))
    # train_preds_sampled_idx2 = np.array(random.sample(train_range2, np.shape(edges_test_all)[0]))

    print(train_preds_sampled_idx1)

    # train_preds_sampled1 = np.array(edges_train_all)[train_preds_sampled_idx1]
    train_edges_sampled1 = np.array(edges_train_all)[train_preds_sampled_idx1, :]

    # train_preds_sampled2 = np.array(edges_train_all)[train_preds_sampled_idx2]
    # train_edges_sampled2 = np.array(edges_train_inter)[train_preds_sampled_idx2,:]

    out = open('%s/%s-edges-train_sampled.txt' % (res_dir, dt), 'w')
    for item in train_edges_sampled1:
        for jtem in item:
            out.write(str(jtem) + '\t')
        out.write('\n')
    out.close()

    out = open('%s/%s-edges-test_sampled.txt' % (res_dir, dt), 'w')
    for item in edges_test_all:
        for jtem in item:
            out.write(str(jtem) + '\t')
        out.write('\n')
    out.close()

    print(train_edges_sampled1)
    print(edges_test_all)

    z1_trains = emb_matrix1
    z2_trains = emb_matrix2

    ylabel = [1] * len(train_preds_sampled_idx1) + [0] * len(train_preds_sampled_idx1)


    train_edges_list = train_edges_sampled1
    test_edges_list = np.array(edges_test_all)

    edges_list = np.concatenate((train_edges_list, test_edges_list), axis=0)

    y_label = np.zeros((np.shape(edges_list)[0], 3))
    for i in range(np.shape(edges_list)[0]):
        y_label[i][0] = edges_list[i][0]
        y_label[i][1] = edges_list[i][1]
        y_label[i][2] = ylabel[i]
    print(np.shape(y_label))

    y_label_train = np.zeros((np.shape(train_edges_list)[0], 3))
    for i in range(np.shape(train_edges_list)[0]):
        y_label_train[i][0] = train_edges_list[i][0]
        y_label_train[i][1] = train_edges_list[i][1]
        y_label_train[i][2] = 1
    print(np.shape(y_label_train))

    y_label_test = np.zeros((np.shape(test_edges_list)[0], 3))
    for i in range(np.shape(test_edges_list)[0]):
        y_label_test[i][0] = test_edges_list[i][0]
        y_label_test[i][1] = test_edges_list[i][1]
        y_label_test[i][2] = 0
    print(np.shape(y_label_test))

    sam_list_idx = list(range(np.shape(y_label_train)[0]))

    sam_list_idx_train = np.array(random.sample(sam_list_idx, int(0.3 * len(sam_list_idx))))

    sam_list_idx = list(range(np.shape(y_label_test)[0]))

    sam_list_idx_test = np.array(random.sample(sam_list_idx, int(0.3 * len(sam_list_idx))))

    y_test = np.concatenate((y_label_train[sam_list_idx_train], y_label_test[sam_list_idx_test]), axis=0)

    edges_mia = y_test

    edges_mia0 = np.array(edges_mia)[:, 0:2]

    edges_mia = np.array(edges_mia)
    index_pos = np.where(edges_mia[:, 2] == 1)[0]
    index_neg = np.where(edges_mia[:, 2] == 0)[0]

    print(len(index_pos), len(index_neg))

    edges_mia_pos0 = edges_mia[index_pos]
    edges_mia_neg0 = edges_mia[index_neg]

    # pairing key for an undirected edge: min_id * 99999 + max_id
    # (assumes 99999 exceeds the largest node id, so keys stay unique)
    edges_mia_pos = [[min(edge[0], edge[1]), max(edge[0], edge[1])] for edge in edges_mia_pos0]
    print(np.shape(edges_mia_pos))
    edges_mia_pos_idx = np.array(edges_mia_pos)[:, 0] * 99999 + np.array(edges_mia_pos)[:, 1]

    edges_mia_neg = [[min(edge[0], edge[1]), max(edge[0], edge[1])] for edge in edges_mia_neg0]
    edges_mia_neg_idx = np.array(edges_mia_neg)[:, 0] * 99999 + np.array(edges_mia_neg)[:, 1]

    train_edges_sampled_ = [[min(edge[0], edge[1]), max(edge[0], edge[1])] for edge in train_edges_sampled1]
    test_edges_sampled_ = [[min(edge[0], edge[1]), max(edge[0], edge[1])] for edge in edges_test_all]

    train_edges_sampled_idx = np.array(train_edges_sampled_)[:, 0] * 99999 + np.array(train_edges_sampled_)[:, 1]
    test_edges_sampled_idx = np.array(test_edges_sampled_)[:, 0] * 99999 + np.array(test_edges_sampled_)[:, 1]

    train_edges_pos_idx = np.setdiff1d(train_edges_sampled_idx, edges_mia_pos_idx)
    train_edges_neg_idx = np.setdiff1d(test_edges_sampled_idx, edges_mia_neg_idx)

    print(len(train_edges_sampled_idx), len(test_edges_sampled_idx), len(train_edges_pos_idx), len(train_edges_neg_idx))
    print(len(train_edges_pos_idx), len(train_edges_neg_idx))
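    # The attack train/test pools are made disjoint here: edges sampled into
    # edges_mia form the attack-test set, and setdiff1d removes their keys from
    # the member (train_edges_pos_idx) and non-member (train_edges_neg_idx)
    # pools used to fit the attack classifiers.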

    results = []
    aug1s_idx = []
    for aug in aug1s:
        # print(aug,np.shape(aug))
        aug = aug.T
        aug_ = [[min(edge[0], edge[1]), max(edge[0], edge[1])] for edge in aug]
        aug_idx = np.array(aug_)[:, 0] * 99999 + np.array(aug_)[:, 1]
        # print('$$$$$$$',np.shape(aug_idx))
        aug1s_idx.append(aug_idx)

    aug2s_idx = []
    for aug in aug2s:
        aug = aug.T
        aug_ = [[min(edge[0], edge[1]), max(edge[0], edge[1])] for edge in aug]
        aug_idx = np.array(aug_)[:, 0] * 99999 + np.array(aug_)[:, 1]
        # print('$$$$$$$', np.shape(aug_idx))
        aug2s_idx.append(aug_idx)

    #
    add1s_pos_idx = []
    add2s_pos_idx = []

    for aug_idx in aug1s_idx:
        drop_idx = np.intersect1d(train_edges_neg_idx, aug_idx)
        add1s_pos_idx.append(drop_idx)

    for aug_idx in aug2s_idx:
        drop_idx = np.intersect1d(train_edges_neg_idx, aug_idx)
        add2s_pos_idx.append(drop_idx)
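    # add1s_pos_idx / add2s_pos_idx record, per training epoch, which held-out
    # (non-member) edges appeared in that epoch's augmented view; for view 2
    # these are edges re-introduced by the PPR diffusion.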

    # print(drop1s_pos_idx)
    # print(drop2s_pos_idx)


    with open('./%s/%s-add1s_pos_idx.txt' % (res_dir, dt), 'w') as f:
        for item in add1s_pos_idx:
            for jtem in item:
                f.write(str(jtem) + '\t')
            f.write('\n')

    with open('./%s/%s-add2s_pos_idx.txt' % (res_dir, dt), 'w') as f:
        for item in add2s_pos_idx:
            for jtem in item:
                f.write(str(jtem) + '\t')
            f.write('\n')

    # file_name='./%s/%s-add1s_pos_idx.txt' % (res_dir,dt)
    # add1s_pos_idx0=readedges2(file_name)
    # # print(drop1s_pos_idx)

    file_name = './%s/%s-add2s_pos_idx.txt' % (res_dir, dt)
    add2s_pos_idx0 = readedges2(file_name)

    print('####', add2s_pos_idx0[0])

    # print(drop2s_pos_idx0[0])

    # print(drop2s_pos_idx0[0])

    iterations = len(add2s_pos_idx0)

    # iter_ratios=[0.2,0.4,0.6,0.8,1]
    iter_ratios = [1]

    # drop1s_pos_idx0=add1s_pos_idx0
    drop2s_pos_idx0 = add2s_pos_idx0

    # results=[]
    for iters in iter_ratios:
        iter_ = int(iterations * iters) - 1

        # drop1s_pos_idx=drop1s_pos_idx0[0:iter_]
        drop2s_pos_idx = drop2s_pos_idx0[0:iter_]

        # drop1s_pos_idx_=list(itertools.chain.from_iterable(drop1s_pos_idx))
        drop2s_pos_idx_ = list(itertools.chain.from_iterable(drop2s_pos_idx))

        # print(len(drop1s_pos_idx_),len(drop2s_pos_idx_))
        # set1=list(set(drop1s_pos_idx_))
        set2 = list(set(drop2s_pos_idx_))
        print(len(set2))
        set0 = set2  # already deduplicated
        print(len(set0))
        print(np.shape(edges_test_all)[0])
        idx_dic1 = dict()
        idx_dic2 = dict()
        idx_dic1_ = dict()
        idx_dic2_ = dict()
        for idx in set0:
            idx_dic1[idx] = 0
            idx_dic2[idx] = 0
            idx_dic1_[idx] = []
            idx_dic2_[idx] = []

        # i=0
        # for idx in drop1s_pos_idx:
        #     for j in idx:
        #         idx_dic1[j]+=1
        #         idx_dic1_[j].append(i)
        #     i+=1

        i = 0
        for idx in drop2s_pos_idx:
            for j in idx:
                idx_dic2[j] += 1
                idx_dic2_[j].append(i)
            i += 1

        # print(min(idx_dic1.values()),max(idx_dic1.values()))
        print(min(idx_dic2.values()), max(idx_dic2.values()))

        # print(idx_dic1,idx_dic2)
        # idx_dic0=[]
        # for idx in set0:
        #     idx_dic0.append(idx_dic1[idx]+idx_dic2[idx])
        # # print(idx_dic0)

        idx_dic0 = idx_dic2

        print(min(idx_dic0.values()), max(idx_dic0.values()))

        train_edges_pos = []
        train_edges_neg = []
        for i in train_edges_pos_idx:
            node1 = int(i // 99999)
            node2 = int(i % 99999)
            train_edges_pos.append([node1, node2])

        for i in train_edges_neg_idx:
            node1 = int(i // 99999)
            node2 = int(i % 99999)
            train_edges_neg.append([node1, node2])

        test_edges_pos = np.array(edges_mia_pos)
        test_edges_neg = np.array(edges_mia_neg)

        epoches = np.shape(aug1s_embed)[0]
        idx_epoches = list(range(epoches))

        idx_epoches_all = []
        drop_idx_all = []
        for i in train_edges_neg_idx:
            if i in idx_dic2_:
                drop_idx = idx_dic2_[i]
                idx_epoches_ = list(set(idx_epoches).difference(set(drop_idx)))
                if len(drop_idx) < max(idx_dic2.values()):
                    drop_idx_sample2 = random.sample(idx_epoches_, (epoches - max(idx_dic2.values()) - len(drop_idx)))
                    drop_idx_sample = random.sample(idx_epoches_, (max(idx_dic2.values()) - len(drop_idx)))
                    idx_epoches_ = list(set(idx_epoches_).difference(set(drop_idx_sample)))
                    drop_idx_ = list(drop_idx) + drop_idx_sample2
                else:
                    idx_epoches_ = list(set(idx_epoches_))
                    drop_idx_ = idx_epoches_

            else:
                idx_epoches_ = idx_epoches
                drop_idx_sample = random.sample(idx_epoches_, (max(idx_dic2.values())))

                idx_epoches_ = list(set(idx_epoches).difference(set(drop_idx_sample)))
                drop_idx_ = idx_epoches_

            idx_epoches_all.append(idx_epoches_)
            drop_idx_all.append(drop_idx_)
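        # Length equalisation: every row of idx_epoches_all / drop_idx_all ends
        # up with (epoches - max(idx_dic2.values())) entries, so the per-edge
        # feature vectors built from them below share one dimensionality.
        # idx_epoches_all favours epochs where the edge was absent from the
        # view-2 augmentation; drop_idx_all holds a complementary selection.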

        idx_epoches_all = np.array(idx_epoches_all)
        drop_idx_all = np.array(drop_idx_all)
        train_edges_pos = np.array(train_edges_pos)
        train_edges_neg = np.array(train_edges_neg)

        y_train_train = np.concatenate((train_edges_pos, np.ones(np.shape(train_edges_pos)[0]).reshape(-1, 1)), axis=1)
        y_train_test = np.concatenate((train_edges_neg, np.zeros(np.shape(train_edges_neg)[0]).reshape(-1, 1)), axis=1)
        y_test_train = np.concatenate((test_edges_pos, np.ones(np.shape(test_edges_pos)[0]).reshape(-1, 1)), axis=1)
        y_test_test = np.concatenate((test_edges_neg, np.zeros(np.shape(test_edges_neg)[0]).reshape(-1, 1)), axis=1)

        print(np.shape(train_edges_pos), np.shape(idx_epoches_all), np.shape(aug1s_embed))
        pos_train_edge_embs0 = get_edge_embeddings(train_edges_pos, aug1s_embed, drop_idx_all)
        neg_train_edge_embs0 = get_edge_embeddings(train_edges_neg, aug1s_embed, idx_epoches_all)

        pos_test_edge_embs0 = get_edge_embeddings(test_edges_pos, aug1s_embed, drop_idx_all)
        neg_test_edge_embs0 = get_edge_embeddings(test_edges_neg, aug1s_embed, idx_epoches_all)

        pos_train_edge_embs1 = get_edge_embeddings(train_edges_pos, aug2s_embed, drop_idx_all)
        neg_train_edge_embs1 = get_edge_embeddings(train_edges_neg, aug2s_embed, idx_epoches_all)

        pos_test_edge_embs1 = get_edge_embeddings(test_edges_pos, aug2s_embed, drop_idx_all)
        neg_test_edge_embs1 = get_edge_embeddings(test_edges_neg, aug2s_embed, idx_epoches_all)

        X_train = np.concatenate((pos_train_edge_embs0, neg_train_edge_embs0), axis=0)
        X_test = np.concatenate((pos_test_edge_embs0, neg_test_edge_embs0), axis=0)
        y_train = np.concatenate((y_train_train, y_train_test), axis=0)
        y_test = np.concatenate((y_test_train, y_test_test), axis=0)

        # # ######################################################################
        # Attack classifiers on the view-1 (aug1) features

        acc_mlp_sim_embed0 = fit_eval_attack(
            MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16),
                          random_state=1, max_iter=1000),
            X_train, y_train, X_test, y_test,
            "{}/{}-embed-mlp_sim0.csv".format(res_dir, dt))

        acc_rf_sim_embed0 = fit_eval_attack(
            RandomForestClassifier(max_depth=150, random_state=0),
            X_train, y_train, X_test, y_test,
            "{}/{}-embed-rf_sim0.csv".format(res_dir, dt))

        acc_svm_sim_embed0 = fit_eval_attack(
            OneVsRestClassifier(SVC()),
            X_train, y_train, X_test, y_test,
            "{}/{}-embed-svm_sim0.csv".format(res_dir, dt))

        X_train = np.concatenate((pos_train_edge_embs1, neg_train_edge_embs1), axis=0)
        X_test = np.concatenate((pos_test_edge_embs1, neg_test_edge_embs1), axis=0)
        y_train = np.concatenate((y_train_train, y_train_test), axis=0)
        y_test = np.concatenate((y_test_train, y_test_test), axis=0)

        # # ######################################################################
        # Attack classifiers on the view-2 (aug2) features

        acc_mlp_sim_embed1 = fit_eval_attack(
            MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16),
                          random_state=1, max_iter=1000),
            X_train, y_train, X_test, y_test,
            "{}/{}-embed-mlp_sim1.csv".format(res_dir, dt))

        acc_rf_sim_embed1 = fit_eval_attack(
            RandomForestClassifier(max_depth=150, random_state=0),
            X_train, y_train, X_test, y_test,
            "{}/{}-embed-rf_sim1.csv".format(res_dir, dt))

        acc_svm_sim_embed1 = fit_eval_attack(
            OneVsRestClassifier(SVC()),
            X_train, y_train, X_test, y_test,
            "{}/{}-embed-svm_sim1.csv".format(res_dir, dt))

        pos_train_edge_embs1 = np.concatenate((pos_train_edge_embs0, pos_train_edge_embs1), axis=1)
        neg_train_edge_embs1 = np.concatenate((neg_train_edge_embs0, neg_train_edge_embs1), axis=1)

        pos_test_edge_embs1 = np.concatenate((pos_test_edge_embs0, pos_test_edge_embs1), axis=1)
        neg_test_edge_embs1 = np.concatenate((neg_test_edge_embs0, neg_test_edge_embs1), axis=1)

        X_train = np.concatenate((pos_train_edge_embs1, neg_train_edge_embs1), axis=0)
        X_test = np.concatenate((pos_test_edge_embs1, neg_test_edge_embs1), axis=0)
        y_train = np.concatenate((y_train_train, y_train_test), axis=0)
        y_test = np.concatenate((y_test_train, y_test_test), axis=0)

        # # ######################################################################
        # Attack classifiers on the concatenated features of both views

        acc_mlp_sim_embed2 = fit_eval_attack(
            MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16),
                          random_state=1, max_iter=1000),
            X_train, y_train, X_test, y_test,
            "{}/{}-embed-mlp_sim2.csv".format(res_dir, dt))

        acc_rf_sim_embed2 = fit_eval_attack(
            RandomForestClassifier(max_depth=150, random_state=0),
            X_train, y_train, X_test, y_test,
            "{}/{}-embed-rf_sim2.csv".format(res_dir, dt))

        acc_svm_sim_embed2 = fit_eval_attack(
            OneVsRestClassifier(SVC()),
            X_train, y_train, X_test, y_test,
            "{}/{}-embed-svm_sim2.csv".format(res_dir, dt))

        print(acc_mlp_sim_embed0, acc_rf_sim_embed0, acc_svm_sim_embed0)

        print(acc_mlp_sim_embed1, acc_rf_sim_embed1, acc_svm_sim_embed1)

        print(acc_mlp_sim_embed2, acc_rf_sim_embed2, acc_svm_sim_embed2)

        results.append(
            [acc_mlp_sim_embed0, acc_rf_sim_embed0, acc_svm_sim_embed0, acc_mlp_sim_embed1, acc_rf_sim_embed1,
             acc_svm_sim_embed1, acc_mlp_sim_embed2, acc_rf_sim_embed2, acc_svm_sim_embed2])

    result_all = pd.DataFrame(data=results)
    result_all.to_csv("{}/results_all.csv".format(res_dir))