# LMIA/node2vec/deepwalk_pytorch/word2vec.py
import os
import pickle
import random
import argparse
import torch
import numpy as np

from tqdm import tqdm
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from .model import Word2Vec, SGNS, SGNS_dp
from .preprocess import Preprocess
import math
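
# SGNS (skip-gram with negative sampling) word2vec trainers for node2vec/deepwalk random-walk
# corpora, plus a differentially private variant (ModWord2Vec_dp) and membership-inference
# "defense" variants (ModWord2Vec_defense*) that penalize the training loss with an
# edge-membership discriminator.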

class PermutedSubsampledCorpus(Dataset):
    def __init__(self, data, ws=None):
        if ws is not None:
            self.data = []
            for iword, owords in data:
                if random.random() > ws[iword]:
                    self.data.append((iword, owords))
        else:
            self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        iword, owords = self.data[idx]
        return iword, np.array(owords)


class ModWord2Vec():
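    """Skip-gram negative-sampling word2vec trained with PyTorch.

    The constructor mirrors gensim's Word2Vec signature; most of those arguments
    (sg, hs, cbow_mean, hashfxn, trim_rule, ...) are stored but not used by this trainer.
    """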
    def __init__(self,sentences=None,
            corpus_file=None, 
            size=100, 
            alpha=0.025, 
            window=5, 
            min_count=5, 
            max_vocab_size=None, 
            sample=0.001, 
            seed=1, 
            workers=3, 
            min_alpha=0.0001, 
            sg=0, 
            hs=0, 
            negative=5, 
            ns_exponent=0.75, 
            cbow_mean=1, 
            hashfxn=None, 
            iter=5,
            null_word='<UNK>', 
            trim_rule=None, 
            sorted_vocab=1, 
            batch_words=10000, 
            compute_loss=False, 
            callbacks=(), 
            max_final_vocab=None):
        self.data = sentences
        self.e_dim = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = len(sentences)
        self.ss_t = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.n_negs = negative
        self.ns_exponent = ns_exponent
        self.sg = sg
        self.hs = hs
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epoch = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.compute_loss = compute_loss
        self.batch_words = batch_words
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

        self.data = self.preprocess(sentences)
        self.idx2vec = self.train()

    def preprocess(self, sentences):
        pre = Preprocess(self.data, window = self.window, unk = self.null_word, max_vocab = self.max_vocab_size)
        self.idx2word, self.word2idx, self.vocab, self.wc = pre.build()
        return pre.convert()

    def train(self,cuda=False, weights=False):
        wf = np.array([self.wc[word] for word in self.idx2word])
        wf = wf / wf.sum()
        ws = 1 - np.sqrt(self.ss_t / wf)
        ws = np.clip(ws, 0, 1)
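        # ws follows word2vec's subsampling rule P(drop w) = 1 - sqrt(t / f(w)), but it is not
        # passed to PermutedSubsampledCorpus below, so no subsampling is actually applied here.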
        vocab_size = len(self.idx2word)
        weights = wf if weights else None
        model = Word2Vec(vocab_size=vocab_size, embedding_size=self.e_dim)
        sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=self.n_negs, weights=weights)
        if cuda:
            sgns = sgns.cuda()
        optim = Adam(sgns.parameters())
        for epoch in range(1, self.epoch + 1):
            dataset = PermutedSubsampledCorpus(self.data)
            dataloader = DataLoader(dataset, batch_size=self.batch_words, shuffle=True)
            pbar = tqdm(dataloader)
            pbar.set_description("[Epoch {}]".format(epoch))
            for iword, owords in pbar:
                loss = sgns(iword, owords)
                optim.zero_grad()
                loss.backward()
                optim.step()
                pbar.set_postfix(loss=loss.item())
        idx2vec = model.ivectors.weight.data.cpu().numpy()
        return idx2vec

    def save_emb(self, savepath, num_nodes):
        perm = np.array([self.word2idx[str(word)] for word in range(num_nodes)])
        emb = self.idx2vec[perm]
        np.save(savepath , emb)

        return emb


class ModWord2Vec5():
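    """Duplicate of ModWord2Vec (same preprocessing and SGNS training loop)."""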
    def __init__(self,sentences=None,
            corpus_file=None,
            size=100,
            alpha=0.025,
            window=5,
            min_count=5,
            max_vocab_size=None,
            sample=0.001,
            seed=1,
            workers=3,
            min_alpha=0.0001,
            sg=0,
            hs=0,
            negative=5,
            ns_exponent=0.75,
            cbow_mean=1,
            hashfxn=None,
            iter=5,
            null_word='<UNK>',
            trim_rule=None,
            sorted_vocab=1,
            batch_words=10000,
            compute_loss=False,
            callbacks=(),
            max_final_vocab=None):
        self.data = sentences
        self.e_dim = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = len(sentences)
        self.ss_t = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.n_negs = negative
        self.ns_exponent = ns_exponent
        self.sg = sg
        self.hs = hs
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epoch = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.compute_loss = compute_loss
        self.batch_words = batch_words
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

        self.data = self.preprocess(sentences)
        self.idx2vec = self.train()

    def preprocess(self, sentences):
        pre = Preprocess(self.data, window = self.window, unk = self.null_word, max_vocab = self.max_vocab_size)
        self.idx2word, self.word2idx, self.vocab, self.wc = pre.build()
        return pre.convert()

    def train(self,cuda=False, weights=False):
        wf = np.array([self.wc[word] for word in self.idx2word])
        wf = wf / wf.sum()
        ws = 1 - np.sqrt(self.ss_t / wf)
        ws = np.clip(ws, 0, 1)
        vocab_size = len(self.idx2word)
        weights = wf if weights else None
        model = Word2Vec(vocab_size=vocab_size, embedding_size=self.e_dim)
        sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=self.n_negs, weights=weights)
        if cuda:
            sgns = sgns.cuda()
        optim = Adam(sgns.parameters())
        for epoch in range(1, self.epoch + 1):
            dataset = PermutedSubsampledCorpus(self.data)
            dataloader = DataLoader(dataset, batch_size=self.batch_words, shuffle=True)
            pbar = tqdm(dataloader)
            pbar.set_description("[Epoch {}]".format(epoch))
            for iword, owords in pbar:
                loss = sgns(iword, owords)
                optim.zero_grad()
                loss.backward()
                optim.step()
                pbar.set_postfix(loss=loss.item())
        idx2vec = model.ivectors.weight.data.cpu().numpy()
        return idx2vec

    def save_emb(self, savepath, num_nodes):
        perm = np.array([self.word2idx[str(word)] for word in range(num_nodes)])
        emb = self.idx2vec[perm]
        np.save(savepath , emb)

        return emb



class ModWord2Vec_dp():
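    """DP-SGD-style variant of ModWord2Vec: at each step the batch gradient is clipped to L2
    norm C = 1 and Gaussian noise with standard deviation sigma * C is added before the Adam
    step, where sigma is the ``budget`` constructor argument. Note that clipping is applied to
    the whole-batch gradient rather than per example as in canonical DP-SGD. Per-epoch losses
    are written to ./loss_item<sigma>.
    """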
    def __init__(self,sentences=None,
            corpus_file=None,
            size=100,
            alpha=0.025,
            window=5,
            min_count=5,
            max_vocab_size=None,
            sample=0.001,
            seed=1,
            workers=3,
            min_alpha=0.0001,
            sg=0,
            hs=0,
            negative=5,
            ns_exponent=0.75,
            cbow_mean=1,
            hashfxn=None,
            iter=5,
            null_word='<UNK>',
            trim_rule=None,
            sorted_vocab=1,
            batch_words=10000,
            compute_loss=False,
            callbacks=(),
            max_final_vocab=None,
            budget=400):
        self.data = sentences
        self.e_dim = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = len(sentences)
        self.ss_t = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.n_negs = negative
        self.ns_exponent = ns_exponent
        self.sg = sg
        self.hs = hs
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epoch = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.compute_loss = compute_loss
        self.batch_words = batch_words
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab
        self.sigma = budget

        self.data = self.preprocess(sentences)
        self.idx2vec = self.train()


    def preprocess(self, sentences):
        pre = Preprocess(self.data, window = self.window, unk = self.null_word, max_vocab = self.max_vocab_size)
        self.idx2word, self.word2idx, self.vocab, self.wc = pre.build()
        return pre.convert()

    def train(self,cuda=False, weights=False):
        wf = np.array([self.wc[word] for word in self.idx2word])
        wf = wf / wf.sum()
        ws = 1 - np.sqrt(self.ss_t / wf)
        ws = np.clip(ws, 0, 1)
        vocab_size = len(self.idx2word)
        weights = wf if weights else None
        model = Word2Vec(vocab_size=vocab_size, embedding_size=self.e_dim)
        sgns = SGNS_dp(embedding=model, vocab_size=vocab_size, n_negs=self.n_negs, weights=weights)
        self.device = torch.device("cuda" if cuda else "cpu")
        if cuda:
            sgns = sgns.cuda()
        optim = Adam(sgns.parameters())
        loss_list = {}
        C = 1.0                 # clipping bound for the batch gradient
        sigma = self.sigma      # noise multiplier (the ``budget`` constructor argument)
        for epoch in range(1, self.epoch + 1):
            loss_s = []
            dataset = PermutedSubsampledCorpus(self.data)
            dataloader = DataLoader(dataset, batch_size=self.batch_words, shuffle=True)
            pbar = tqdm(dataloader)
            pbar.set_description("[Epoch {}]".format(epoch))
            for iword, owords in pbar:
                loss = sgns(iword, owords)
                optim.zero_grad()

                # Gradient of the batch loss w.r.t. all parameters.
                igrad = torch.autograd.grad(loss, sgns.parameters())

                # Clip the gradient to L2 norm at most C.
                l2_norm = torch.tensor(0.0).to(self.device)
                for g in igrad:
                    l2_norm += g.norm(2) ** 2
                l2_norm = l2_norm.sqrt()
                divisor = torch.clamp(l2_norm / C, min=1.0)
                grads = [g / divisor for g in igrad]

                # Add Gaussian noise with std sigma * C, then average over the batch.
                batch_size = iword.shape[0]
                for i in range(len(grads)):
                    grads[i] = grads[i] + sigma * C * torch.randn_like(grads[i]).to(self.device)
                    grads[i] = (grads[i] / batch_size).detach()

                # Install the clipped, noised gradients and step. loss.backward() is not called
                # here: it would accumulate the raw gradient on top of the noised one.
                for p, g in zip(sgns.parameters(), grads):
                    p.grad = g
                optim.step()
                pbar.set_postfix(loss=loss.item())
                loss_s.append(loss.item())

            loss_list[epoch] = loss_s
        idx2vec = model.ivectors.weight.data.cpu().numpy()

        with open('./loss_item' + str(self.sigma), 'w') as f:
            for ep in loss_list:
                f.write(' '.join(str(lo) for lo in loss_list[ep]) + '\n')
        return idx2vec

    def save_emb(self, savepath, num_nodes):
        perm = np.array([self.word2idx[str(word)] for word in range(num_nodes)])
        emb = self.idx2vec[perm]
        np.save(savepath , emb)

        return emb


class ModWord2Vec_defense():
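    """Defense variant of ModWord2Vec: at each step, 100x the score returned by
    discriminator_loss() (an MLP membership discriminator fit on the current embeddings)
    is subtracted from the SGNS loss.
    """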
    def __init__(self,train_edges,test_edges,num_nodes,out, sentences=None,
            corpus_file=None,
            size=100,
            alpha=0.025,
            window=5,
            min_count=5,
            max_vocab_size=None,
            sample=0.001,
            seed=1,
            workers=3,
            min_alpha=0.0001,
            sg=0,
            hs=0,
            negative=5,
            ns_exponent=0.75,
            cbow_mean=1,
            hashfxn=None,
            iter=5,
            null_word='<UNK>',
            trim_rule=None,
            sorted_vocab=1,
            batch_words=10000,
            compute_loss=False,
            callbacks=(),
            max_final_vocab=None):
        self.data = sentences
        self.e_dim = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = len(sentences)
        self.ss_t = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.n_negs = negative
        self.ns_exponent = ns_exponent
        self.sg = sg
        self.hs = hs
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epoch = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.compute_loss = compute_loss
        self.batch_words = batch_words
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab
        self.train_edges=train_edges
        self.test_edges=test_edges
        self.num_nodes=num_nodes
        self.out=out

        self.data = self.preprocess(sentences)
        self.idx2vec = self.train()



    def preprocess(self, sentences):
        pre = Preprocess(self.data, window = self.window, unk = self.null_word, max_vocab = self.max_vocab_size)
        self.idx2word, self.word2idx, self.vocab, self.wc = pre.build()
        return pre.convert()

    def train(self,cuda=False, weights=False):
        wf = np.array([self.wc[word] for word in self.idx2word])
        wf = wf / wf.sum()
        ws = 1 - np.sqrt(self.ss_t / wf)
        ws = np.clip(ws, 0, 1)
        vocab_size = len(self.idx2word)
        weights = wf if weights else None
        model = Word2Vec(vocab_size=vocab_size, embedding_size=self.e_dim)
        sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=self.n_negs, weights=weights)
        if cuda:
            sgns = sgns.cuda()
        optim = Adam(sgns.parameters())
        for epoch in range(1, self.epoch + 1):
            dataset = PermutedSubsampledCorpus(self.data)
            dataloader = DataLoader(dataset, batch_size=self.batch_words, shuffle=True)
            pbar = tqdm(dataloader)
            pbar.set_description("[Epoch {}]".format(epoch))
            for iword, owords in pbar:
                loss = sgns(iword, owords)
                # Note: discriminator_loss() works on detached numpy embeddings, so no gradient
                # flows back through this penalty term.
                loss_dis = discriminator_loss(self.train_edges, self.test_edges, self.num_nodes,
                                              self.word2idx, embedding=model)
                loss = loss - 100 * torch.tensor(loss_dis)
                optim.zero_grad()
                loss.backward()
                optim.step()
                pbar.set_postfix(loss=loss.item(), loss_dis=loss_dis)
        idx2vec = model.ivectors.weight.data.cpu().numpy()
        return idx2vec

    def save_emb(self, savepath, num_nodes):
        perm = np.array([self.word2idx[str(word)] for word in range(num_nodes)])
        emb = self.idx2vec[perm]
        np.save(savepath , emb)

        return emb


class ModWord2Vec_defense2():
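    """Like ModWord2Vec_defense, but the penalty comes from discriminator(), which returns the
    held-out error rate of an MLP separating train edges from test edges by embedding cosine
    similarity.
    """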
    def __init__(self,train_edges,test_edges,num_nodes,out, sentences=None,
            corpus_file=None,
            size=100,
            alpha=0.025,
            window=5,
            min_count=5,
            max_vocab_size=None,
            sample=0.001,
            seed=1,
            workers=3,
            min_alpha=0.0001,
            sg=0,
            hs=0,
            negative=5,
            ns_exponent=0.75,
            cbow_mean=1,
            hashfxn=None,
            iter=5,
            null_word='<UNK>',
            trim_rule=None,
            sorted_vocab=1,
            batch_words=10000,
            compute_loss=False,
            callbacks=(),
            max_final_vocab=None):
        self.data = sentences
        self.e_dim = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = len(sentences)
        self.ss_t = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.n_negs = negative
        self.ns_exponent = ns_exponent
        self.sg = sg
        self.hs = hs
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epoch = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.compute_loss = compute_loss
        self.batch_words = batch_words
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab
        self.train_edges=train_edges
        self.test_edges=test_edges
        self.num_nodes=num_nodes
        self.out=out

        self.data = self.preprocess(sentences)
        self.idx2vec = self.train()



    def preprocess(self, sentences):
        pre = Preprocess(self.data, window = self.window, unk = self.null_word, max_vocab = self.max_vocab_size)
        self.idx2word, self.word2idx, self.vocab, self.wc = pre.build()
        return pre.convert()

    def train(self,cuda=False, weights=False):
        wf = np.array([self.wc[word] for word in self.idx2word])
        wf = wf / wf.sum()
        ws = 1 - np.sqrt(self.ss_t / wf)
        ws = np.clip(ws, 0, 1)
        vocab_size = len(self.idx2word)
        weights = wf if weights else None
        model = Word2Vec(vocab_size=vocab_size, embedding_size=self.e_dim)
        sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=self.n_negs, weights=weights)
        if cuda:
            sgns = sgns.cuda()
        optim = Adam(sgns.parameters())
        for epoch in range(1, self.epoch + 1):
            dataset = PermutedSubsampledCorpus(self.data)
            dataloader = DataLoader(dataset, batch_size=self.batch_words, shuffle=True)
            pbar = tqdm(dataloader)
            pbar.set_description("[Epoch {}]".format(epoch))
            for iword, owords in pbar:
                loss = sgns(iword, owords)
                # Note: discriminator() works on detached numpy embeddings, so no gradient
                # flows back through this penalty term.
                loss_dis = discriminator(self.train_edges, self.test_edges, self.num_nodes,
                                         self.word2idx, embedding=model)
                loss = loss - 100 * torch.tensor(loss_dis)
                optim.zero_grad()
                loss.backward()
                optim.step()
                pbar.set_postfix(loss=loss.item(), loss_dis=loss_dis)
        idx2vec = model.ivectors.weight.data.cpu().numpy()
        return idx2vec

    def save_emb(self, savepath, num_nodes):
        perm = np.array([self.word2idx[str(word)] for word in range(num_nodes)])
        emb = self.idx2vec[perm]
        np.save(savepath , emb)

        return emb


class ModWord2Vec_defense3():
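    """Like ModWord2Vec_defense, but 0.001x the discriminator_gain() score (the membership MLP's
    average confidence in the correct class) is added to the SGNS loss; the default number of
    epochs is 500.
    """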
    def __init__(self,train_edges,test_edges,num_nodes,out,sentences=None,
            corpus_file=None,
            size=100,
            alpha=0.025,
            window=5,
            min_count=5,
            max_vocab_size=None,
            sample=0.001,
            seed=1,
            workers=3,
            min_alpha=0.0001,
            sg=0,
            hs=0,
            negative=5,
            ns_exponent=0.75,
            cbow_mean=1,
            hashfxn=None,
            iter=500,
            null_word='<UNK>',
            trim_rule=None,
            sorted_vocab=1,
            batch_words=10000,
            compute_loss=False,
            callbacks=(),
            max_final_vocab=None):
        self.data = sentences
        self.e_dim = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = len(sentences)
        self.ss_t = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.n_negs = negative
        self.ns_exponent = ns_exponent
        self.sg = sg
        self.hs = hs
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epoch = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.compute_loss = compute_loss
        self.batch_words = batch_words
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab
        self.train_edges=train_edges
        self.test_edges=test_edges
        self.num_nodes=num_nodes
        self.out=out

        self.data = self.preprocess(sentences)
        self.idx2vec = self.train()



    def preprocess(self, sentences):
        pre = Preprocess(self.data, window = self.window, unk = self.null_word, max_vocab = self.max_vocab_size)
        self.idx2word, self.word2idx, self.vocab, self.wc = pre.build()
        return pre.convert()

    def train(self,cuda=False, weights=False):
        wf = np.array([self.wc[word] for word in self.idx2word])
        wf = wf / wf.sum()
        ws = 1 - np.sqrt(self.ss_t / wf)
        ws = np.clip(ws, 0, 1)
        vocab_size = len(self.idx2word)
        weights = wf if weights else None
        model = Word2Vec(vocab_size=vocab_size, embedding_size=self.e_dim)
        sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=self.n_negs, weights=weights)
        if cuda:
            sgns = sgns.cuda()
        optim = Adam(sgns.parameters())
        for epoch in range(1, self.epoch + 1):
            dataset = PermutedSubsampledCorpus(self.data)
            dataloader = DataLoader(dataset, batch_size=self.batch_words, shuffle=True)
            pbar = tqdm(dataloader)
            pbar.set_description("[Epoch {}]".format(epoch))
            for iword, owords in pbar:
                loss = sgns(iword, owords)
                # Note: discriminator_gain() works on detached numpy embeddings, so no gradient
                # flows back through this penalty term.
                loss_dis = discriminator_gain(self.train_edges, self.test_edges, self.num_nodes,
                                              self.word2idx, embedding=model)
                loss = loss + 0.001 * torch.tensor(loss_dis)
                optim.zero_grad()
                loss.backward()
                optim.step()
                pbar.set_postfix(loss=loss.item(), loss_dis=loss_dis)
        idx2vec = model.ivectors.weight.data.cpu().numpy()
        return idx2vec

    def save_emb(self, savepath, num_nodes):
        perm = np.array([self.word2idx[str(word)] for word in range(num_nodes)])
        emb = self.idx2vec[perm]
        np.save(savepath , emb)

        return emb




def discriminator(train_edges,test_edges,num_nodes,word2idx,embedding):
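    """Membership discriminator used as a defense signal: an MLP is fit on the cosine similarity
    of edge endpoint embeddings to separate (sampled) train edges from test edges, and the
    misclassification rate on a held-out 10% split of each group is returned.
    """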
    idx2vec = embedding.ivectors.weight.data.cpu().numpy()
    # print(num_nodes)
    # print(word2idx)

    perm = np.array([word2idx[str(word)] for word in range(num_nodes)])
    emb = idx2vec[perm]

    # start_time = time.time()
    # Generate bootstrapped edge embeddings (as is done in node2vec paper)
    # Edge embedding for (v1, v2) = hadamard product of node embeddings for v1, v2
    emb_matrix=emb
    def get_edge_embeddings(edge_list):
        embs = []
        sim_matrix=[]
        embs_1=[]
        embs_2 = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            #print(np.shape(emb1))
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)
            sim = np.dot(emb1, emb2)/(np.linalg.norm(emb1)*np.linalg.norm(emb2))
            #edge_emb = np.array(emb1) + np.array(emb2)
            # print(np.shape(edge_emb))
            embs.append(edge_emb)
            embs_1.append(emb1)
            embs_2.append(emb2)
            sim_matrix.append(sim)
        embs = np.array(embs)
        sim_matrix = np.array(sim_matrix)
        embs_1=np.array(embs_1)
        embs_2 =np.array(embs_2)

        return embs,sim_matrix,embs_1,embs_2

    edgeall = list([list(edge_tuple) for edge_tuple in train_edges])

    train_edges_sampled = random.sample(edgeall, np.shape(test_edges)[0])


    # Train-set edge embeddings
    pos_train_edge_embs ,pos_train_sim_matrix,pos_embs_1_train,pos_embs_2_train= get_edge_embeddings(train_edges_sampled)
    # neg_train_edge_embs,neg_train_sim_matrix,neg_embs_1_train,neg_embs_2_train = get_edge_embeddings(train_edges_false, ego_user,DATASET,Flag, flag='neg-train')
    train_edge_embs = pos_train_edge_embs
    train_sim_matrix= pos_train_sim_matrix
    train_embs_1 = pos_embs_1_train
    train_embs_2 = pos_embs_2_train

    # Create train-set edge labels: 1 = real edge, 0 = false edge
    train_edge_labels = np.ones(len(train_edges_sampled))


    # Test-set edge embeddings, labels
    pos_test_edge_embs,pos_test_sim_matrix,pos_embs_1_test,pos_embs_2_test = get_edge_embeddings(test_edges)
    # neg_test_edge_embs ,neg_test_sim_matrix,neg_embs_1_test,neg_embs_2_test= get_edge_embeddings(test_edges_false,ego_user,DATASET,Flag, flag='neg-test')
    test_edge_embs = pos_test_edge_embs
    test_sim_matrix = pos_test_sim_matrix
    test_embs_1 = pos_embs_1_test
    test_embs_2 = pos_embs_2_test

    # Create val-set edge labels: 1 = real edge, 0 = false edge
    test_edge_labels = np.ones(len(test_edges))


    ########### similarity-feature discriminator (MLP on cosine similarities)

    train_edges_list = np.array(train_edges_sampled)
    test_edges_list = test_edges

    edges_list = np.concatenate((train_edges_list, test_edges_list), axis=0)

    # 1 = member (train) edge, 0 = non-member (test) edge
    ylabel = [1] * train_sim_matrix.shape[0] + [0] * test_sim_matrix.shape[0]

    sim_matrix_train = train_sim_matrix.reshape(-1, 1)
    sim_matrix_test = test_sim_matrix.reshape(-1, 1)

    from sklearn.model_selection import train_test_split

    y_label = np.zeros((np.shape(edges_list)[0], 3))
    for i in range(np.shape(edges_list)[0]):
        y_label[i][0] = edges_list[i][0]
        y_label[i][1] = edges_list[i][1]
        y_label[i][2] = ylabel[i]
    # print(np.shape(y_label))

    y_label_train = np.zeros((np.shape(train_edges_list)[0], 3))
    for i in range(np.shape(train_edges_list)[0]):
        y_label_train[i][0] = train_edges_list[i][0]
        y_label_train[i][1] = train_edges_list[i][1]
        y_label_train[i][2] = 1
    # print(np.shape(y_label_train))

    y_label_test = np.zeros((np.shape(test_edges_list)[0], 3))
    for i in range(np.shape(test_edges_list)[0]):
        y_label_test[i][0] = test_edges_list[i][0]
        y_label_test[i][1] = test_edges_list[i][1]
        y_label_test[i][2] = 0
    # print(np.shape(y_label_test))

    X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(sim_matrix_train, y_label_train,
                                                                                test_size=0.1, random_state=42)

    X_test_train, X_test_test, y_test_train, y_test_test = train_test_split(sim_matrix_test, y_label_test,
                                                                            test_size=0.1, random_state=42)

    X_train = np.concatenate((X_train_train, X_test_train), axis=0)
    X_test = np.concatenate((X_train_test, X_test_test), axis=0)
    y_train = np.concatenate((y_train_train, y_test_train), axis=0)
    y_test = np.concatenate((y_train_test, y_test_test), axis=0)

    from sklearn import metrics
    from sklearn.neural_network import MLPClassifier

    mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16, 18), random_state=1,
                        max_iter=500)

    mlp.fit(X_train, y_train[:, 2])

    # loss=mlp.loss_

    y_score = mlp.predict(X_test)
    ls = 0
    for i in range(len(y_score)):
        if y_score[i] != y_test[i][2]:
            if y_score[i] == 1:
                ls += y_score[i]
            else:
                ls += 1-y_score[i]
    loss = ls / len(y_score)


    # print("Training set score: %f" % mlp.score(X_train, y_train[:, 2]))
    # print("Test set score: %f" % mlp.score(X_test, y_test[:, 2]))
    #
    # y_score = mlp.predict(X_test)
    # print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
    # print(metrics.classification_report(y_test[:, 2], y_score, labels=range(3)))

    return loss

def discriminator_loss(train_edges,test_edges,num_nodes,word2idx,embedding):
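    """Membership discriminator based on per-edge reconstruction loss: an MLP is fit on a
    softmax-style loss feature for sampled train edges vs. test edges, and the MLP's final
    training loss (mlp.loss_) is returned.
    """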
    idx2vec = embedding.ivectors.weight.data.cpu().numpy()
    # print(num_nodes)
    # print(word2idx)

    perm = np.array([word2idx[str(word)] for word in range(num_nodes)])
    emb = idx2vec[perm]

    # start_time = time.time()
    # Generate bootstrapped edge embeddings (as is done in node2vec paper)
    # Edge embedding for (v1, v2) = hadamard product of node embeddings for v1, v2
    emb_matrix=emb
    def get_edge_embeddings(edge_list):
        embs = []
        sim_matrix=[]
        embs_1=[]
        embs_2 = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            #print(np.shape(emb1))
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)
            sim = np.dot(emb1, emb2)/(np.linalg.norm(emb1)*np.linalg.norm(emb2))
            #edge_emb = np.array(emb1) + np.array(emb2)
            # print(np.shape(edge_emb))
            embs.append(edge_emb)
            embs_1.append(emb1)
            embs_2.append(emb2)
            sim_matrix.append(sim)
        embs = np.array(embs)
        sim_matrix = np.array(sim_matrix)
        embs_1=np.array(embs_1)
        embs_2 =np.array(embs_2)

        return embs,sim_matrix,embs_1,embs_2

    edgeall = list([list(edge_tuple) for edge_tuple in train_edges])

    train_edges_sampled = random.sample(edgeall, np.shape(test_edges)[0])


    # Train-set edge embeddings
    pos_train_edge_embs ,pos_train_sim_matrix,pos_embs_1_train,pos_embs_2_train= get_edge_embeddings(train_edges_sampled)
    # neg_train_edge_embs,neg_train_sim_matrix,neg_embs_1_train,neg_embs_2_train = get_edge_embeddings(train_edges_false, ego_user,DATASET,Flag, flag='neg-train')
    train_edge_embs = pos_train_edge_embs
    train_sim_matrix= pos_train_sim_matrix
    train_embs_1 = pos_embs_1_train
    train_embs_2 = pos_embs_2_train

    # Create train-set edge labels: 1 = real edge, 0 = false edge
    train_edge_labels = np.ones(len(train_edges_sampled))


    # Test-set edge embeddings, labels
    pos_test_edge_embs,pos_test_sim_matrix,pos_embs_1_test,pos_embs_2_test = get_edge_embeddings(test_edges)
    # neg_test_edge_embs ,neg_test_sim_matrix,neg_embs_1_test,neg_embs_2_test= get_edge_embeddings(test_edges_false,ego_user,DATASET,Flag, flag='neg-test')
    test_edge_embs = pos_test_edge_embs
    test_sim_matrix = pos_test_sim_matrix
    test_embs_1 = pos_embs_1_test
    test_embs_2 = pos_embs_2_test

    # Create val-set edge labels: 1 = real edge, 0 = false edge
    test_edge_labels = np.ones(len(test_edges))


    ########### loss-feature discriminator (MLP on per-edge reconstruction loss)

    train_edges_list = np.array(train_edges_sampled)
    test_edges_list = test_edges

    edges_list = np.concatenate((train_edges_list, test_edges_list), axis=0)

    # 1 = member (train) edge, 0 = non-member (test) edge
    ylabel = [1] * train_sim_matrix.shape[0] + [0] * test_sim_matrix.shape[0]

    loss_train = []
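    # Per-edge feature: exp(dot(u, v) / sum_j exp(dot(u, w_j)) - label) ** 2, a squared
    # softmax-style reconstruction error, where u, v are the edge endpoints' embeddings and the
    # sum runs over the node embeddings (skipping j == i).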
    for i in range(len(train_embs_1)):
        los_denominator = 0
        for j in range(np.shape(emb_matrix)[0]):
            if i != j:
                los_denominator += math.exp(np.dot(train_embs_1[i], emb_matrix[j]))
        los = math.exp(np.dot(train_embs_1[i], train_embs_2[i]) / los_denominator - train_edge_labels[i]) ** 2
        loss_train.append(los)
        # print(train_embs_1[i], train_embs_2[i],train_edge_labels[i])

    loss_test = []
    for i in range(len(test_embs_1)):
        los_denominator = 0
        for j in range(np.shape(emb_matrix)[0]):
            if i != j:
                los_denominator += math.exp(np.dot(test_embs_1[i], emb_matrix[j]))
        los = math.exp(np.dot(test_embs_1[i], test_embs_2[i]) / los_denominator - test_edge_labels[i]) ** 2

        loss_test.append(los)

    loss_matrix_train = np.array(loss_train).reshape(-1, 1)
    loss_matrix_test = np.array(loss_test).reshape(-1, 1)

    from sklearn.model_selection import train_test_split

    y_label = np.zeros((np.shape(edges_list)[0], 3))
    for i in range(np.shape(edges_list)[0]):
        y_label[i][0] = edges_list[i][0]
        y_label[i][1] = edges_list[i][1]
        y_label[i][2] = ylabel[i]
    # print(np.shape(y_label))

    y_label_train = np.zeros((np.shape(train_edges_list)[0], 3))
    for i in range(np.shape(train_edges_list)[0]):
        y_label_train[i][0] = train_edges_list[i][0]
        y_label_train[i][1] = train_edges_list[i][1]
        y_label_train[i][2] = 1
    # print(np.shape(y_label_train))

    y_label_test = np.zeros((np.shape(test_edges_list)[0], 3))
    for i in range(np.shape(test_edges_list)[0]):
        y_label_test[i][0] = test_edges_list[i][0]
        y_label_test[i][1] = test_edges_list[i][1]
        y_label_test[i][2] = 0
    # print(np.shape(y_label_test))

    X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(loss_matrix_train, y_label_train,
                                                                                test_size=0.3, random_state=42)

    X_test_train, X_test_test, y_test_train, y_test_test = train_test_split(loss_matrix_test, y_label_test,
                                                                            test_size=0.3, random_state=42)

    X_train = np.concatenate((X_train_train, X_test_train), axis=0)
    X_test = np.concatenate((X_train_test, X_test_test), axis=0)
    y_train = np.concatenate((y_train_train, y_test_train), axis=0)
    y_test = np.concatenate((y_train_test, y_test_test), axis=0)


    from sklearn import metrics
    from sklearn.neural_network import MLPClassifier

    mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16, 18), random_state=1,
                        max_iter=500)

    mlp.fit(X_train, y_train[:, 2])

    loss=mlp.loss_

    # print("Training set score: %f" % mlp.score(X_train, y_train[:, 2]))
    # print("Test set score: %f" % mlp.score(X_test, y_test[:, 2]))
    #
    # y_score = mlp.predict(X_test)
    # print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
    # print(metrics.classification_report(y_test[:, 2], y_score, labels=range(3)))

    return loss


def discriminator_gain(train_edges,test_edges,num_nodes,word2idx,embedding):
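    """Membership discriminator like discriminator(), but returns the MLP's average predicted
    probability for the correct membership class over its training split (a confidence "gain").
    """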
    idx2vec = embedding.ivectors.weight.data.cpu().numpy()
    # print(num_nodes)
    # print(word2idx)

    perm = np.array([word2idx[str(word)] for word in range(num_nodes)])
    emb = idx2vec[perm]

    # start_time = time.time()
    # Generate bootstrapped edge embeddings (as is done in node2vec paper)
    # Edge embedding for (v1, v2) = hadamard product of node embeddings for v1, v2
    emb_matrix=emb
    def get_edge_embeddings(edge_list):
        embs = []
        sim_matrix=[]
        embs_1=[]
        embs_2 = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            #print(np.shape(emb1))
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)
            sim = np.dot(emb1, emb2)/(np.linalg.norm(emb1)*np.linalg.norm(emb2))
            #edge_emb = np.array(emb1) + np.array(emb2)
            # print(np.shape(edge_emb))
            embs.append(edge_emb)
            embs_1.append(emb1)
            embs_2.append(emb2)
            sim_matrix.append(sim)
        embs = np.array(embs)
        sim_matrix = np.array(sim_matrix)
        embs_1=np.array(embs_1)
        embs_2 =np.array(embs_2)

        return embs,sim_matrix,embs_1,embs_2

    edgeall = list([list(edge_tuple) for edge_tuple in train_edges])

    train_edges_sampled = random.sample(edgeall, np.shape(test_edges)[0])


    # Train-set edge embeddings
    pos_train_edge_embs ,pos_train_sim_matrix,pos_embs_1_train,pos_embs_2_train= get_edge_embeddings(train_edges_sampled)
    # neg_train_edge_embs,neg_train_sim_matrix,neg_embs_1_train,neg_embs_2_train = get_edge_embeddings(train_edges_false, ego_user,DATASET,Flag, flag='neg-train')
    train_edge_embs = pos_train_edge_embs
    train_sim_matrix= pos_train_sim_matrix
    train_embs_1 = pos_embs_1_train
    train_embs_2 = pos_embs_2_train

    # Create train-set edge labels: 1 = real edge, 0 = false edge
    train_edge_labels = np.ones(len(train_edges_sampled))


    # Test-set edge embeddings, labels
    pos_test_edge_embs,pos_test_sim_matrix,pos_embs_1_test,pos_embs_2_test = get_edge_embeddings(test_edges)
    # neg_test_edge_embs ,neg_test_sim_matrix,neg_embs_1_test,neg_embs_2_test= get_edge_embeddings(test_edges_false,ego_user,DATASET,Flag, flag='neg-test')
    test_edge_embs = pos_test_edge_embs
    test_sim_matrix = pos_test_sim_matrix
    test_embs_1 = pos_embs_1_test
    test_embs_2 = pos_embs_2_test

    # Create val-set edge labels: 1 = real edge, 0 = false edge
    test_edge_labels = np.ones(len(test_edges))


    ########### similarity-feature discriminator (MLP on cosine similarities)

    train_edges_list = np.array(train_edges_sampled)
    test_edges_list = test_edges

    edges_list = np.concatenate((train_edges_list, test_edges_list), axis=0)

    # 1 = member (train) edge, 0 = non-member (test) edge
    ylabel = [1] * train_sim_matrix.shape[0] + [0] * test_sim_matrix.shape[0]

    sim_matrix_train = train_sim_matrix.reshape(-1, 1)
    sim_matrix_test = test_sim_matrix.reshape(-1, 1)

    from sklearn.model_selection import train_test_split

    y_label = np.zeros((np.shape(edges_list)[0], 3))
    for i in range(np.shape(edges_list)[0]):
        y_label[i][0] = edges_list[i][0]
        y_label[i][1] = edges_list[i][1]
        y_label[i][2] = ylabel[i]
    # print(np.shape(y_label))

    y_label_train = np.zeros((np.shape(train_edges_list)[0], 3))
    for i in range(np.shape(train_edges_list)[0]):
        y_label_train[i][0] = train_edges_list[i][0]
        y_label_train[i][1] = train_edges_list[i][1]
        y_label_train[i][2] = 1
    # print(np.shape(y_label_train))

    y_label_test = np.zeros((np.shape(test_edges_list)[0], 3))
    for i in range(np.shape(test_edges_list)[0]):
        y_label_test[i][0] = test_edges_list[i][0]
        y_label_test[i][1] = test_edges_list[i][1]
        y_label_test[i][2] = 0
    # print(np.shape(y_label_test))

    X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(sim_matrix_train, y_label_train,
                                                                                test_size=0.1, random_state=42)

    X_test_train, X_test_test, y_test_train, y_test_test = train_test_split(sim_matrix_test, y_label_test,
                                                                            test_size=0.1, random_state=42)

    X_train = np.concatenate((X_train_train, X_test_train), axis=0)
    X_test = np.concatenate((X_train_test, X_test_test), axis=0)
    y_train = np.concatenate((y_train_train, y_test_train), axis=0)
    y_test = np.concatenate((y_train_test, y_test_test), axis=0)

    from sklearn import metrics
    from sklearn.neural_network import MLPClassifier

    mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64, 32, 16, 18), random_state=1,
                        max_iter=500)

    mlp.fit(X_train, y_train[:, 2])

    # Average probability assigned to the correct membership class over the MLP's training split.
    prob = mlp.predict_proba(X_train)
    gain = 0.0
    for i in range(len(prob)):
        if i < np.shape(X_train_train)[0]:
            gain += prob[i][1]   # rows from train edges: probability of the "member" class
        else:
            gain += prob[i][0]   # rows from test edges: probability of the "non-member" class
    gain = gain / np.shape(X_train)[0]

    loss = gain


    # print("Training set score: %f" % mlp.score(X_train, y_train[:, 2]))
    # print("Test set score: %f" % mlp.score(X_test, y_test[:, 2]))
    #
    # y_score = mlp.predict(X_test)
    # print(metrics.f1_score(y_test[:, 2], y_score, average='micro'))
    # print(metrics.classification_report(y_test[:, 2], y_score, labels=range(3)))

    return loss






if __name__ == "__main__":
        data = np.array(np.random.randint(0,13210, size=(13210, 80)),str)
        w2v = ModWord2Vec(data)
        w2v.save_emb("embedding.npy",13210)
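
    # The private and defense variants follow the same pattern (a sketch, assuming the random
    # corpus above; ``budget`` is the DP noise multiplier, and the defense classes additionally
    # take train/test edge lists, the node count, and an ``out`` argument):
    #     w2v_dp = ModWord2Vec_dp(data, budget=400)
    #     w2v_dp.save_emb("embedding_dp.npy", 13210)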