# MIA-GCL / MERIT / utils / process.py
# Data loading and preprocessing utilities for the MERIT membership-inference
# experiments (Planetoid citation graphs and a Facebook ego-network).
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import sys
import torch
import random
import preprocessing  # local helper module; provides mask_test_edges() for edge splits


def parse_index_file(filename):
    """Parse a Planetoid index file: one integer node index per line."""
    index = []
    with open(filename) as f:
        for line in f:
            index.append(int(line.strip()))
    return index
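
# Hedged example: the Planetoid 'ind.<dataset>.test.index' file holds one node
# index per line, so a file containing "2\n0\n1\n" parses to [2, 0, 1].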


def load_data(dataset_str):
    """Load a Planetoid citation dataset (e.g. 'cora', 'citeseer') from data/."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("data/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Citeseer has isolated test nodes: pad tx/ty with zero rows so the test
        # block covers the full index range.
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    return adj, features, labels, idx_train, idx_val, idx_test
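
# Hedged usage sketch (assumes the standard Planetoid files exist under data/):
#   adj, features, labels, idx_train, idx_val, idx_test = load_data('cora')
#   features_dense, _ = preprocess_features(features)  # row-normalize
#   adj_tuple = preprocess_adj(adj)                     # normalized (coords, values, shape)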


def load_data_mia(dataset_str, drop_edge_rate_1, drop_feature_rate_1):
    """Like load_data, but also loads a fixed train/test edge split for the MIA
    experiments and dumps the split and edge lists under a result directory."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("data/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))


    # Directory prefix used by the precomputed splits; any non-citeseer dataset
    # is assumed to be Cora.
    if dataset_str == 'citeseer':
        dt0 = 'Citeseer'
    else:
        dt0 = 'Cora'

    adj_sparse = adj
    random.seed(42)
    # A fresh split could be built with:
    #   train_test_split = preprocessing.mask_test_edges(adj_sparse, test_frac=.3, val_frac=0)
    # but instead a precomputed split is read from a hard-coded, machine-specific
    # path (presumably so different runs share the same hidden edges).
    split_path = ('/Wang-ds/xwang193/PyGCL-main/examples/'
                  '%s-0.2-grace-mia-mi-white-2-nofeature-perturb/%s-train_test_split' % (dt0, dt0))
    with open(split_path, 'rb') as f2:
        train_test_split = pkl.load(f2, encoding='latin1')


    # Result directory for this configuration; it must already exist on disk.
    res_dir = '%s-merit-mia-white-2-nodiffusion-%s-%s' % (dataset_str, drop_edge_rate_1, drop_feature_rate_1)
    with open('./%s/%s-train_test_split' % (res_dir, dt0), 'wb') as f:
        pkl.dump(train_test_split, f)

    # Unpack the train/test split.
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = train_test_split

    # Graph with only the visible (training) edges; all original nodes are kept.
    # Note: networkx >= 3.0 renames this call to from_scipy_sparse_array.
    g_train0 = nx.from_scipy_sparse_matrix(adj_train)

    # Canonicalize each edge as (min, max) and deduplicate.
    train_edges0 = set((min(u, v), max(u, v)) for u, v in g_train0.edges())
    train_edges0 = np.array([list(edge_tuple) for edge_tuple in train_edges0])


    # Same canonicalization for the held-out test edges.
    edges_test0 = set((min(u, v), max(u, v)) for u, v in test_edges)
    edges_test0 = np.array([list(edge_tuple) for edge_tuple in edges_test0])

    # Dump the edge lists as tab-separated files for the attack scripts.
    with open('%s/%s-edges-train.txt' % (res_dir, dataset_str), 'w') as out:
        for edge in train_edges0:
            out.write('\t'.join(str(v) for v in edge) + '\t\n')

    with open('%s/%s-edges-test.txt' % (res_dir, dataset_str), 'w') as out:
        for edge in edges_test0:
            out.write('\t'.join(str(v) for v in edge) + '\t\n')

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    return adj_train, features, labels, idx_train, idx_val, idx_test, train_edges0, edges_test0, res_dir
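
# Hedged usage sketch: load_data_mia also needs the hard-coded split pickle and
# an existing result directory, e.g. (names abbreviated for illustration):
#   (adj_tr, feats, lbls, i_tr, i_va, i_te,
#    e_tr, e_te, res_dir) = load_data_mia('cora', 0.2, 0.2)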


def load_data_mia2(dataset_str, res_dir):
    """Like load_data_mia, but re-reads the train/test split previously saved
    under res_dir instead of fetching it from the hard-coded path."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("data/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()

    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))


    # Directory prefix used by the saved splits; any non-citeseer dataset is
    # assumed to be Cora.
    if dataset_str == 'citeseer':
        dt0 = 'Citeseer'
    else:
        dt0 = 'Cora'

    random.seed(42)

    # Reload the split that load_data_mia saved under res_dir, so both loaders
    # operate on exactly the same visible/hidden edges.
    with open('./%s/%s-train_test_split' % (res_dir, dt0), 'rb') as f2:
        train_test_split = pkl.load(f2, encoding='latin1')

    # Unpack the train/test split.
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = train_test_split

    # Graph with only the visible (training) edges; all original nodes are kept.
    # Note: networkx >= 3.0 renames this call to from_scipy_sparse_array.
    g_train0 = nx.from_scipy_sparse_matrix(adj_train)

    # Canonicalize each edge as (min, max) and deduplicate.
    train_edges0 = set((min(u, v), max(u, v)) for u, v in g_train0.edges())
    train_edges0 = np.array([list(edge_tuple) for edge_tuple in train_edges0])


    # Same canonicalization for the held-out test edges.
    edges_test0 = set((min(u, v), max(u, v)) for u, v in test_edges)
    edges_test0 = np.array([list(edge_tuple) for edge_tuple in edges_test0])

    # Dump the edge lists as tab-separated files for the attack scripts.
    with open('%s/%s-edges-train.txt' % (res_dir, dataset_str), 'w') as out:
        for edge in train_edges0:
            out.write('\t'.join(str(v) for v in edge) + '\t\n')

    with open('%s/%s-edges-test.txt' % (res_dir, dataset_str), 'w') as out:
        for edge in edges_test0:
            out.write('\t'.join(str(v) for v in edge) + '\t\n')

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    return adj_train, features, labels, idx_train, idx_val, idx_test, train_edges0, edges_test0
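
# Hedged usage sketch: pass the res_dir produced by load_data_mia so this loader
# re-reads exactly the same split:
#   adj_tr, feats, lbls, i_tr, i_va, i_te, e_tr, e_te = load_data_mia2('cora', res_dir)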


def load_data_mia2_fb(dataset_str, res_dir):
    """Facebook ego-network variant: loads a pickled (adjacency, features) pair,
    uses the gender attribute as the label, and builds a fresh edge split."""
    # Pickled (adjacency matrix, feature matrix) pair for ego-network 3980.
    feat_dir = '3980-adj-feat.pkl'

    with open(feat_dir, 'rb') as f2:
        adj, ft = pkl.load(f2, encoding='latin1')

    print(np.shape(ft))

    # Hard-coded column indices of the gender and education attributes in this
    # ego-network's feature matrix.
    gender_idx = 77
    edu_idx = 53

    # Print the class balance of the education attribute (diagnostic only).
    lbs = np.sum(ft[:, edu_idx:edu_idx+1], axis=1)
    classes = int(np.max(lbs))
    for cls in range(classes + 1):
        print('education class', cls, ':', len(np.where(lbs == cls)[0]))

    g = nx.Graph(adj)  # graph built from the full adjacency (not used below)


    featname_dir = '3980.featnames'
    # Facebook feature-name map: each line starts with a column index followed by
    # a semicolon-separated feature name; keep only the leading name.
    featnames = []
    with open(featname_dir) as f:
        for line in f:
            parts = line.strip().split(' ')
            featnames.append(parts[1].split(';')[0])

    # Column index of the gender attribute (columns 77/78 in this ego-network).
    gindex = featnames.index('gender')

    # Row-normalized copy of the features with the gender column removed
    # (computed here but not returned; the un-normalized version below is used).
    x = np.delete(ft, [gindex], axis=1)
    for ii in range(np.shape(x)[0]):
        if np.sum(x[ii]) != 0:
            x[ii] = x[ii] / np.sum(x[ii])

    num_features = np.shape(ft)[1]
    labels = np.sum(ft[:, gindex:gindex+1], axis=1)

    # One-hot encode the binary gender label.
    lbs = []
    for lb in labels:
        if lb == 0:
            lbs.append([1, 0])
        else:
            lbs.append([0, 1])
    lbs = np.array(lbs)




    # Build a fresh edge split here (30% of edges held out for test, none for
    # validation) and save it under res_dir, which must already exist.
    random.seed(42)
    train_test_split = preprocessing.mask_test_edges(adj, test_frac=.3, val_frac=0)

    with open('./%s/3980-train_test_split' % (res_dir), 'wb') as f:
        pkl.dump(train_test_split, f)

    # Unpack the train/test split.
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = train_test_split

    # Graph with only the visible (training) edges; all original nodes are kept.
    # Note: networkx >= 3.0 renames this call to from_scipy_sparse_array.
    g_train0 = nx.from_scipy_sparse_matrix(adj_train)

    # Canonicalize each edge as (min, max) and deduplicate.
    train_edges0 = set((min(u, v), max(u, v)) for u, v in g_train0.edges())
    train_edges0 = np.array([list(edge_tuple) for edge_tuple in train_edges0])


    # Same canonicalization for the held-out test edges.
    edges_test0 = set((min(u, v), max(u, v)) for u, v in test_edges)
    edges_test0 = np.array([list(edge_tuple) for edge_tuple in edges_test0])

    # Dump the edge lists as tab-separated files for the attack scripts.
    with open('%s/%s-edges-train.txt' % (res_dir, dataset_str), 'w') as out:
        for edge in train_edges0:
            out.write('\t'.join(str(v) for v in edge) + '\t\n')

    with open('%s/%s-edges-test.txt' % (res_dir, dataset_str), 'w') as out:
        for edge in edges_test0:
            out.write('\t'.join(str(v) for v in edge) + '\t\n')

    # Features: every column except the gender column, as a sparse LIL matrix.
    idx = np.arange(num_features)
    idx = np.delete(idx, gindex)

    features = ft[:, idx]
    features = sp.coo_matrix(features, dtype=np.float32).tolil()


    # Node splits by position: first 20% test, next 10% validation, and train
    # from the 20% mark to the end (as written, train overlaps the val range).
    idx_test = range(0, int(len(labels)*0.2))
    idx_val = range(int(len(labels)*0.2), int(len(labels)*0.3))
    idx_train = range(int(len(labels)*0.2), len(labels))

    return adj_train, features, lbs, idx_train, idx_val, idx_test, train_edges0, edges_test0
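
# Hedged usage sketch: requires '3980-adj-feat.pkl' and '3980.featnames' in the
# working directory and an existing res_dir; 'fb-3980' below is a hypothetical
# tag used only in the output file names:
#   adj_tr, feats, lbls, i_tr, i_va, i_te, e_tr, e_te = load_data_mia2_fb('fb-3980', res_dir)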

def sparse_to_tuple(sparse_mx, insert_batch=False):
    """Convert a scipy sparse matrix (or a list of them) to (coords, values,
    shape); insert_batch=True prepends a batch dimension of size 1."""
    def to_tuple(mx):
        if not sp.isspmatrix_coo(mx):
            mx = mx.tocoo()
        if insert_batch:
            coords = np.vstack((np.zeros(mx.row.shape[0]), mx.row, mx.col)).transpose()
            values = mx.data
            shape = (1,) + mx.shape
        else:
            coords = np.vstack((mx.row, mx.col)).transpose()
            values = mx.data
            shape = mx.shape
        return coords, values, shape

    if isinstance(sparse_mx, list):
        for i in range(len(sparse_mx)):
            sparse_mx[i] = to_tuple(sparse_mx[i])
    else:
        sparse_mx = to_tuple(sparse_mx)

    return sparse_mx
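
# Hedged example of sparse_to_tuple on a tiny COO matrix:
#   coords, values, shape = sparse_to_tuple(sp.coo_matrix(np.array([[0, 1], [2, 0]])))
#   # coords == [[0, 1], [1, 0]], values == [1, 2], shape == (2, 2)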


def preprocess_features(features):
    """Row-normalize a feature matrix; return it dense and as a sparse tuple."""
    rowsum = np.array(features.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return features.todense(), sparse_to_tuple(features)
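
# Hedged example: rows are scaled to sum to 1; all-zero rows are left at zero.
#   dense, _ = preprocess_features(sp.lil_matrix(np.array([[1., 3.], [0., 0.]])))
#   # dense == [[0.25, 0.75], [0., 0.]]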


def normalize_adj(adj):
    """Symmetrically normalize an adjacency matrix: D^-1/2 A D^-1/2."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()
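
# Hedged example: for an all-ones 2x2 matrix every degree is 2, so
#   normalize_adj(sp.coo_matrix(np.ones((2, 2)))).toarray()
#   # == [[0.5, 0.5], [0.5, 0.5]]   (D^-1/2 A D^-1/2)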


def preprocess_adj(adj):
    """Add self-loops and symmetrically normalize (GCN-style preprocessing)."""
    adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
    return sparse_to_tuple(adj_normalized)


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse COO tensor.
    (torch.sparse.FloatTensor is the legacy constructor; recent PyTorch prefers
    torch.sparse_coo_tensor.)"""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)
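

if __name__ == '__main__':
    # Minimal smoke test of the pure helpers above (a sketch; the load_data*
    # functions are not exercised because they need dataset files on disk).
    demo = sp.coo_matrix(np.array([[0., 1.], [1., 0.]], dtype=np.float32))

    coords, values, shape = sparse_to_tuple(demo)
    print('sparse_to_tuple:', coords.tolist(), values.tolist(), shape)

    dense, _ = preprocess_features(sp.lil_matrix(np.array([[1., 3.], [0., 0.]])))
    print('preprocess_features:\n', dense)

    adj_norm = normalize_adj(demo + sp.eye(2))
    print('normalize_adj:\n', adj_norm.toarray())

    print('torch sparse tensor:', sparse_mx_to_torch_sparse_tensor(adj_norm))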