import random from random import choices import numpy as np import pandas as pd import torch from torch_geometric.datasets import TUDataset from torch_geometric.loader import DataLoader from torch_geometric.transforms import OneHotDegree from models_ import GIN, serverGIN, GIN_dc, serverGIN_dc # from server import Server from server_ import Server from client import Client_GC from utils import get_maxDegree, get_stats, split_data, get_numGraphLabels, init_structure_encoding from scipy.special import rel_entr import scipy from torch_geometric.utils import erdos_renyi_graph, degree import itertools import time import os import pickle import networkx as nx from torch_geometric.data import Data import copy as cp def _randChunk(graphs, num_client, overlap, seed=None): random.seed(seed) np.random.seed(seed) totalNum = len(graphs) minSize = min(50, int(totalNum/num_client)) graphs_chunks = [] if not overlap: for i in range(num_client): graphs_chunks.append(graphs[i*minSize:(i+1)*minSize]) for g in graphs[num_client*minSize:]: idx_chunk = np.random.randint(low=0, high=num_client, size=1)[0] graphs_chunks[idx_chunk].append(g) else: sizes = np.random.randint(low=50, high=150, size=num_client) for s in sizes: graphs_chunks.append(choices(graphs, k=s)) return graphs_chunks def _randChunk_(graphs, num_client, overlap, seed=None): random.seed(seed) np.random.seed(seed) # print(set(list(graphs.y))) totalNum = len(graphs) # totalNum_=int(totalNum*0.9) minSize = min(50, int(totalNum/num_client)) graphs_chunks = [] if not overlap: for i in range(num_client): graphs_chunks.append(graphs[i*minSize:(i+1)*minSize]) for g in graphs[num_client*minSize:]: idx_chunk = np.random.randint(low=0, high=num_client, size=1)[0] graphs_chunks[idx_chunk].append(g) graph_global_test=graphs[num_client*minSize:] else: sizes = np.random.randint(low=50, high=150, size=num_client) for s in sizes: graphs_chunks.append(choices(graphs, k=s)) return graphs_chunks,graph_global_test def _randChunk2_(graphs, 
num_client, overlap, seed=None): random.seed(seed) np.random.seed(seed) # print(set(list(graphs.y))) totalNum = len(graphs) # totalNum_=int(totalNum*0.9) # minSize = min(100, int(totalNum/num_client)) minSize = min(100, int(totalNum/num_client)) # minSize = int(totalNum*0.6/num_client) graphs_chunks = [] if not overlap: for i in range(num_client): graphs_chunks.append(graphs[i*minSize:(i+1)*minSize]) for g in graphs[num_client*minSize:]: idx_chunk = np.random.randint(low=0, high=num_client, size=1)[0] graphs_chunks[idx_chunk].append(g) graph_global_test=graphs[num_client*minSize:] else: sizes = np.random.randint(low=50, high=150, size=num_client) for s in sizes: graphs_chunks.append(choices(graphs, k=s)) return graphs_chunks,graph_global_test def prepareData_oneDS(datapath, data, num_client, batchSize, convert_x=False, seed=None, overlap=False): if data == "COLLAB": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(491, cat=False)) elif data == "IMDB-BINARY": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(135, cat=False)) elif data == "IMDB-MULTI": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(88, cat=False)) else: tudataset = TUDataset(f"{datapath}/TUDataset", data) if convert_x: maxdegree = get_maxDegree(tudataset) tudataset = TUDataset(f"{datapath}/TUDataset", data, transform=OneHotDegree(maxdegree, cat=False)) graphs = [x for x in tudataset] print(" **", data, len(graphs)) graphs_chunks = _randChunk(graphs, num_client, overlap, seed=seed) splitedData = {} df = pd.DataFrame() num_node_features = graphs[0].num_node_features for idx, chunks in enumerate(graphs_chunks): ds = f'{idx}-{data}' ds_tvt = chunks ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5, shuffle=True, seed=seed) dataloader_train = DataLoader(ds_train, batch_size=batchSize, shuffle=True) dataloader_val = 
DataLoader(ds_val, batch_size=batchSize, shuffle=True) dataloader_test = DataLoader(ds_test, batch_size=batchSize, shuffle=True) num_graph_labels = get_numGraphLabels(ds_train) splitedData[ds] = ({'train': dataloader_train, 'val': dataloader_val, 'test': dataloader_test}, num_node_features, num_graph_labels, len(ds_train)) df = get_stats(df, ds, ds_train, graphs_val=ds_val, graphs_test=ds_test) return splitedData, df def prepareData_multiDS(args, datapath, group='chem', batchSize=32, seed=None): assert group in ['chem', "biochem", 'biochemsn', "biosncv"] if group == 'chem': datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1"] elif group == 'biochem': datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1", # small molecules "ENZYMES", "DD", "PROTEINS"] # bioinformatics elif group == 'biochemsn': datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1", # small molecules "ENZYMES", "DD", "PROTEINS", # bioinformatics "COLLAB", "IMDB-BINARY", "IMDB-MULTI"] # social networks elif group == 'biosncv': datasets = ["ENZYMES", "DD", "PROTEINS", # bioinformatics "COLLAB", "IMDB-BINARY", "IMDB-MULTI", # social networks "Letter-high", "Letter-low", "Letter-med"] # computer vision splitedData = {} df = pd.DataFrame() for data in datasets: if data == "COLLAB": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(491, cat=False)) elif data == "IMDB-BINARY": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(135, cat=False)) elif data == "IMDB-MULTI": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(88, cat=False)) elif "Letter" in data: tudataset = TUDataset(f"{datapath}/TUDataset", data, use_node_attr=True) else: tudataset = TUDataset(f"{datapath}/TUDataset", data) graphs = [x for x in tudataset] print(" **", data, len(graphs)) graphs_train, graphs_valtest = split_data(graphs, test=0.2, shuffle=True, seed=seed) graphs_val, graphs_test = 
split_data(graphs_valtest, train=0.5, test=0.5, shuffle=True, seed=seed) graphs_train = init_structure_encoding(args, gs=graphs_train, type_init=args.type_init) graphs_val = init_structure_encoding(args, gs=graphs_val, type_init=args.type_init) graphs_test = init_structure_encoding(args, gs=graphs_test, type_init=args.type_init) dataloader_train = DataLoader(graphs_train, batch_size=batchSize, shuffle=True) dataloader_val = DataLoader(graphs_val, batch_size=batchSize, shuffle=True) dataloader_test = DataLoader(graphs_test, batch_size=batchSize, shuffle=True) num_node_features = graphs[0].num_node_features num_graph_labels = get_numGraphLabels(graphs_train) splitedData[data] = ({'train': dataloader_train, 'val': dataloader_val, 'test': dataloader_test}, num_node_features, num_graph_labels, len(graphs_train)) df = get_stats(df, data, graphs_train, graphs_val=graphs_val, graphs_test=graphs_test) return splitedData, df def prepareData_multiDS_multi(args, datapath, group='small', batchSize=32, nc_per_ds=1, seed=None): assert group in ['chem', "biochem", 'biochemsn', "biosncv"] if group == 'chem': datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1"] elif group == 'biochem': datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1", # small molecules "ENZYMES", "DD", "PROTEINS"] # bioinformatics elif group == 'biochemsn': datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1", # small molecules "ENZYMES", "DD", "PROTEINS", # bioinformatics "COLLAB", "IMDB-BINARY", "IMDB-MULTI"] # social networks # "Letter-low", "Letter-med"] # computer vision elif group == 'biosncv': datasets = ["ENZYMES", "DD", "PROTEINS", # bioinformatics "COLLAB", "IMDB-BINARY", "IMDB-MULTI", # social networks "Letter-high", "Letter-low", "Letter-med"] # computer vision splitedData = {} df = pd.DataFrame() for data in datasets: if data == "COLLAB": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(491, cat=False)) elif data == 
"IMDB-BINARY": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(135, cat=False)) elif data == "IMDB-MULTI": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(88, cat=False)) elif "Letter" in data: tudataset = TUDataset(f"{datapath}/TUDataset", data, use_node_attr=True) else: tudataset = TUDataset(f"{datapath}/TUDataset", data) graphs = [x for x in tudataset] print(" **", data, len(graphs)) num_node_features = graphs[0].num_node_features graphs_chunks = _randChunk(graphs, nc_per_ds, overlap=False, seed=seed) for idx, chunks in enumerate(graphs_chunks): ds = f'{idx}-{data}' ds_tvt = chunks graphs_train, graphs_valtest = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) graphs_val, graphs_test = split_data(graphs_valtest, train=0.5, test=0.5, shuffle=True, seed=seed) graphs_train = init_structure_encoding(args, gs=graphs_train, type_init=args.type_init) graphs_val = init_structure_encoding(args, gs=graphs_val, type_init=args.type_init) graphs_test = init_structure_encoding(args, gs=graphs_test, type_init=args.type_init) dataloader_train = DataLoader(graphs_train, batch_size=batchSize, shuffle=True) dataloader_val = DataLoader(graphs_val, batch_size=batchSize, shuffle=True) dataloader_test = DataLoader(graphs_test, batch_size=batchSize, shuffle=True) num_graph_labels = get_numGraphLabels(graphs_train) splitedData[ds] = ({'train': dataloader_train, 'val': dataloader_val, 'test': dataloader_test}, num_node_features, num_graph_labels, len(graphs_train)) df = get_stats(df, ds, graphs_train, graphs_val=graphs_val, graphs_test=graphs_test) return splitedData, df def js_diver(P,Q): M=P+Q return 0.5*scipy.stats.entropy(P,M,base=2)+0.5*scipy.stats.entropy(Q,M,base=2) def setup_devices(splitedData, args): idx_clients = {} clients = [] for idx, ds in enumerate(splitedData.keys()): idx_clients[idx] = ds dataloaders, num_node_features, num_graph_labels, train_size = splitedData[ds] if args.alg == 'fedstar': 
cmodel_gc = GIN_dc(num_node_features, args.n_se, args.hidden, num_graph_labels, args.nlayer, args.dropout) else: cmodel_gc = GIN(num_node_features, args.hidden, num_graph_labels, args.nlayer, args.dropout) optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, cmodel_gc.parameters()), lr=args.lr, weight_decay=args.weight_decay) clients.append(Client_GC(cmodel_gc, idx, ds, train_size, dataloaders, optimizer, args)) if args.alg == 'fedstar': smodel = serverGIN_dc(n_se=args.n_se, nlayer=args.nlayer, nhid=args.hidden) else: smodel = serverGIN(nlayer=args.nlayer, nhid=args.hidden) server = Server(smodel, args.device) return clients, server, idx_clients def setup_devices_(splitedData,global_test_data, args): idx_clients = {} clients = [] for idx, ds in enumerate(splitedData.keys()): idx_clients[idx] = ds dataloaders, num_node_features, num_graph_labels, train_size = splitedData[ds] if args.alg == 'fedstar': cmodel_gc = GIN_dc(num_node_features, args.n_se, args.hidden, num_graph_labels, args.nlayer, args.dropout) else: cmodel_gc = GIN(num_node_features, args.hidden, num_graph_labels, args.nlayer, args.dropout) optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, cmodel_gc.parameters()), lr=args.lr, weight_decay=args.weight_decay) clients.append(Client_GC(cmodel_gc, idx, ds, train_size, dataloaders, optimizer, args)) if args.alg == 'fedstar': smodel = serverGIN_dc(num_node_features,n_se=args.n_se, nlayer=args.nlayer, nclass=num_graph_labels, nhid=args.hidden,dropout=args.dropout) else: smodel = serverGIN(nlayer=args.nlayer, nhid=args.hidden) server = Server(smodel,global_test_data, args.device) return clients, server, idx_clients def prepareData_oneDS_(args,datapath, data, num_client, batchSize, convert_x=False, seed=None, overlap=False): if data == "COLLAB": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(491, cat=False)) elif data == "IMDB-BINARY": tudataset = TUDataset(f"{datapath}/TUDataset", data, 
pre_transform=OneHotDegree(135, cat=False)) elif data == "IMDB-MULTI": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(88, cat=False)) else: tudataset = TUDataset(f"{datapath}/TUDataset", data) if convert_x: maxdegree = get_maxDegree(tudataset) tudataset = TUDataset(f"{datapath}/TUDataset", data, transform=OneHotDegree(maxdegree, cat=False)) graphs = [x for x in tudataset] # print(" **", data, len(graphs)) # ys=[] # for x in tudataset: # ys.append(x.y) # print(set(list(np.array(ys)))) # print(len(np.where(np.array(ys)==0)[0]),len(np.where(np.array(ys)==1)[0])) # exit() # if data == 'ENZYMES': random.shuffle(graphs) graphs_chunks,graph_global_test = _randChunk_(graphs, num_client, overlap, seed=seed) splitedData = {} df = pd.DataFrame() num_node_features = graphs[0].num_node_features for idx, chunks in enumerate(graphs_chunks): ds = f'{idx}-{data}' ds_tvt = chunks # print(ds_tvt) if data=='ENZYMES': ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) ds_val=ds_train ds_test=ds_train graphs_train=ds_train graphs_val=ds_val graphs_test=ds_test else: # ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) # ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5, shuffle=True, seed=seed) graphs_train, graphs_valtest = split_data(ds_tvt, test=0.2, shuffle=True, seed=seed) graphs_val, graphs_test = split_data(graphs_valtest, train=0.5, test=0.5, shuffle=True, seed=seed) # print(graphs_train) # exit() graphs_train = init_structure_encoding(args, gs=graphs_train, type_init=args.type_init) graphs_val = init_structure_encoding(args, gs=graphs_val, type_init=args.type_init) graphs_test = init_structure_encoding(args, gs=graphs_test, type_init=args.type_init) graph_global_test=init_structure_encoding(args, gs=graph_global_test, type_init=args.type_init) # ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5, shuffle=True, seed=seed) ds_train=graphs_train ds_val= graphs_val 
ds_test=graphs_test dataloader_train = DataLoader(ds_train, batch_size=batchSize, shuffle=True) dataloader_val = DataLoader(ds_val, batch_size=batchSize, shuffle=True) dataloader_test = DataLoader(ds_test, batch_size=len(ds_test), shuffle=True) num_graph_labels = get_numGraphLabels(ds_train) splitedData[ds] = ({'train': dataloader_train, 'val': dataloader_val, 'test': dataloader_test}, num_node_features, num_graph_labels, len(ds_train)) df = get_stats(df, ds, ds_train, graphs_val=ds_val, graphs_test=ds_test) # ds_global_test=split_data(graph_global_test, test=2, shuffle=True, seed=seed) # print(graph_global_test) dataloader_global_test = ({'test': DataLoader(graph_global_test, batch_size=len(graph_global_test), shuffle=True)}) return splitedData, df,dataloader_global_test def prepareData_oneDS2_(args,datapath, data, num_client, batchSize, convert_x=False, seed=None, overlap=False): if data == "COLLAB": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(491, cat=False)) elif data == "IMDB-BINARY": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(135, cat=False)) elif data == "IMDB-MULTI": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(88, cat=False)) else: tudataset = TUDataset(f"{datapath}/TUDataset", data) if convert_x: maxdegree = get_maxDegree(tudataset) tudataset = TUDataset(f"{datapath}/TUDataset", data, transform=OneHotDegree(maxdegree, cat=False)) graphs = [x for x in tudataset] # print(" **", data, len(graphs)) # ys=[] # for x in tudataset: # ys.append(x.y) # print(set(list(np.array(ys)))) # print(len(np.where(np.array(ys)==0)[0]),len(np.where(np.array(ys)==1)[0])) # exit() if data == 'ENZYMES': random.shuffle(graphs) graphs_chunks,graph_global_test = _randChunk2_(graphs, num_client, overlap, seed=seed) splitedData = {} df = pd.DataFrame() num_node_features = graphs[0].num_node_features for idx, chunks in enumerate(graphs_chunks): ds = f'{idx}-{data}' ds_tvt = 
chunks # print(ds_tvt) if data=='ENZYMES': ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) ds_val=ds_train ds_test=ds_train graphs_train=ds_train graphs_val=ds_val graphs_test=ds_test else: # ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) # ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5, shuffle=True, seed=seed) graphs_train, graphs_valtest = split_data(ds_tvt, test=0.2, shuffle=True, seed=seed) graphs_val, graphs_test = split_data(graphs_valtest, train=0.5, test=0.5, shuffle=True, seed=seed) print(graphs_train) # exit() graphs_train = init_structure_encoding(args, gs=graphs_train, type_init=args.type_init) graphs_val = init_structure_encoding(args, gs=graphs_val, type_init=args.type_init) graphs_test = init_structure_encoding(args, gs=graphs_test, type_init=args.type_init) graph_global_test=init_structure_encoding(args, gs=graph_global_test, type_init=args.type_init) # ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5, shuffle=True, seed=seed) ds_train=graphs_train ds_val= graphs_val ds_test=graphs_test dataloader_train = DataLoader(ds_train, batch_size=batchSize, shuffle=True) dataloader_val = DataLoader(ds_val, batch_size=batchSize, shuffle=True) dataloader_test = DataLoader(ds_test, batch_size=len(ds_test), shuffle=True) num_graph_labels = get_numGraphLabels(ds_train) splitedData[ds] = ({'train': dataloader_train, 'val': dataloader_val, 'test': dataloader_test}, num_node_features, num_graph_labels, len(ds_train)) df = get_stats(df, ds, ds_train, graphs_val=ds_val, graphs_test=ds_test) # ds_global_test=split_data(graph_global_test, test=2, shuffle=True, seed=seed) # print(graph_global_test) dataloader_global_test = ({'test': DataLoader(graph_global_test, batch_size=len(graph_global_test), shuffle=True)}) return splitedData, df,dataloader_global_test def prepareData_oneDS_time(args,datapath, data, num_client, batchSize, convert_x=False, seed=None, overlap=False): if data == 
"COLLAB": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(491, cat=False)) elif data == "IMDB-BINARY": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(135, cat=False)) elif data == "IMDB-MULTI": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(88, cat=False)) else: tudataset = TUDataset(f"{datapath}/TUDataset", data) if convert_x: maxdegree = get_maxDegree(tudataset) tudataset = TUDataset(f"{datapath}/TUDataset", data, transform=OneHotDegree(maxdegree, cat=False)) graphs = [x for x in tudataset] print(" **", data, len(graphs)) # ys=[] # for x in tudataset: # ys.append(x.y) # print(set(list(np.array(ys)))) # print(len(np.where(np.array(ys)==0)[0]),len(np.where(np.array(ys)==1)[0])) # exit() if data == 'ENZYMES': random.shuffle(graphs) # graphs_chunks,graph_global_test = _randChunk2_(graphs, num_client, overlap, seed=seed) # splitedData = {} # df = pd.DataFrame() # num_node_features = graphs[0].num_node_features # for idx, chunks in enumerate(graphs_chunks): # ds = f'{idx}-{data}' # ds_tvt = chunks # # print(ds_tvt) # if data=='ENZYMES': # ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) # ds_val=ds_train # ds_test=ds_train # graphs_train=ds_train # graphs_val=ds_val # graphs_test=ds_test # else: # # ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) # # ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5, shuffle=True, seed=seed) # graphs_train, graphs_valtest = split_data(ds_tvt, test=0.2, shuffle=True, seed=seed) # graphs_val, graphs_test = split_data(graphs_valtest, train=0.5, test=0.5, shuffle=True, seed=seed) # # print(graphs_train) # # exit() args.dataset=args.data_group data_dir = '/mnt/diskLv/luo/Federated-Learning-on-Graphs-main/Graph_Classification/federated/' file_name = args.dataset.lower() + '-train_feats_label_edge_list' file_path = os.path.join(data_dir, file_name) # feature_set = set() 
with open(file_path, 'rb') as f: feats_list_train0, label_list_train0, edge_list_train0 = pickle.load(f) for i in range(np.shape(label_list_train0)[0]): if np.max(edge_list_train0[i]) >= np.shape(feats_list_train0[i])[0]: print('error1') feats_list_train = [torch.tensor(ft_train) for ft_train in feats_list_train0] label_list_train = [torch.tensor(lb_train) for lb_train in label_list_train0] edge_list_train = [torch.tensor(eg_train) for eg_train in edge_list_train0] print('***', len(label_list_train)) file_name = args.dataset.lower() + '-test_feats_label_edge_list' file_path = os.path.join(data_dir, file_name) # feature_set = set() with open(file_path, 'rb') as f: feats_list_test0, label_list_test0, edge_list_test0 = pickle.load(f) for i in range(np.shape(label_list_test0)[0]): if np.max(edge_list_test0[i]) >= np.shape(feats_list_test0[i])[0]: print('error2') # if len(label_list_train)==len(label_list_test0): # print('error3') # print(np.shape(feats_list_test0[0])) feats_list_test = [torch.tensor(ft_test) for ft_test in feats_list_test0] label_list_test = [torch.tensor(lb_test) for lb_test in label_list_test0] edge_list_test = [torch.tensor(eg_test) for eg_test in edge_list_test0] # print(len(np.where(np.array(label_list_test0)[int(len(dataset) * args.split):] == 0)[0]), # len(np.where(np.array(label_list_test0)[int(len(dataset) * args.split):] == 1)[0])) # exit() # test_dataset = [] # for i in range(len(label_list_test[int(len(dataset) * args.split):])): # test_dataset.append(Data(x=feats_list_test[i], edge_index=edge_list_test[i], y=label_list_test[i])) graph_global_test = [] print(len(label_list_test)) dataset=tudataset if args.data_group == 'ENZYMES' or args.data_group == 'MUTAG': for i in range(len(label_list_test[int(len(dataset) * args.split):])): edge_index = torch.tensor(edge_list_test[i], dtype=torch.long) x = torch.tensor(feats_list_test[i], dtype=torch.float) y = torch.tensor(label_list_test[i], dtype=torch.long) pyg_graph = Data(x=x, 
edge_index=edge_index, y=y) graph_global_test.append(pyg_graph) print(len(graph_global_test)) else: for i in range(len(label_list_test)): edge_index = torch.tensor(edge_list_test[i], dtype=torch.long) x = torch.tensor(feats_list_test[i], dtype=torch.float) y = torch.tensor(label_list_test[i], dtype=torch.long) pyg_graph = Data(x=x, edge_index=edge_index, y=y) graph_global_test.append(pyg_graph) print(len(graph_global_test)) num_node_features = graph_global_test[0].num_node_features graphs_train = [] for jj in range(len(label_list_train)): edge_index = torch.tensor(edge_list_train[jj], dtype=torch.long) x = torch.tensor(feats_list_train[jj], dtype=torch.float) y = torch.tensor(label_list_train[jj], dtype=torch.long) pyg_graph = Data(x=x, edge_index=edge_index, y=y) # graph_global_test.append(pyg_graph) graphs_train.append(pyg_graph) # exit() startup = 0 Client_list = [] division = int(len(label_list_train) * args.split * 0.7 / args.clients) print(division) division_val = int(len(label_list_train) * args.split * 0.1 / args.clients) print('division_val',division_val) division_test = int(len(label_list_train) * args.split * 0.2 / args.clients) splitedData = {} df = pd.DataFrame() for i in range(args.clients): ds = f'{i}-{args.data_group}' client_data_train = graphs_train[startup:division + startup] client_data_val = graphs_train[division + startup:division + startup + division_val] client_data_test = graphs_train[ division + startup + division_val:division + startup + division_val + division_test] graphs_train = client_data_train graphs_val = client_data_val graphs_test = client_data_test # graph_global_test=graph_global_test graphs_train = init_structure_encoding(args, gs=graphs_train, type_init=args.type_init) graphs_val = init_structure_encoding(args, gs=graphs_val, type_init=args.type_init) graphs_test = init_structure_encoding(args, gs=graphs_test, type_init=args.type_init) graph_global_test = init_structure_encoding(args, gs=graph_global_test, 
type_init=args.type_init) # exit() # ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5, shuffle=True, seed=seed) ds_train = graphs_train ds_val = graphs_val ds_test = graphs_test dataloader_train = DataLoader(ds_train, batch_size=args.batch_size, shuffle=True) dataloader_val = DataLoader(ds_val, batch_size=args.batch_size, shuffle=True) dataloader_test = DataLoader(ds_test, batch_size=len(ds_test), shuffle=True) num_graph_labels = get_numGraphLabels(ds_train) splitedData[ds] = ({'train': dataloader_train, 'val': dataloader_val, 'test': dataloader_test}, num_node_features, num_graph_labels, len(ds_train)) df = get_stats(df, ds, ds_train, graphs_val=ds_val, graphs_test=ds_test) # ds_global_test=split_data(graph_global_test, test=2, shuffle=True, seed=seed) # print(graph_global_test) dataloader_global_test = ( {'test': DataLoader(graph_global_test, batch_size=len(graph_global_test), shuffle=True)}) return splitedData, df, dataloader_global_test def prepareData_oneDS_ours_103_(args,datapath, data, num_client, batchSize, convert_x=False, seed=None, overlap=False): if data == "COLLAB": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(491, cat=False)) elif data == "IMDB-BINARY": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(135, cat=False)) elif data == "IMDB-MULTI": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(88, cat=False)) else: tudataset = TUDataset(f"{datapath}/TUDataset", data) if convert_x: maxdegree = get_maxDegree(tudataset) tudataset = TUDataset(f"{datapath}/TUDataset", data, transform=OneHotDegree(maxdegree, cat=False)) graphs = [x for x in tudataset] print(" **", data, len(graphs)) # ys=[] # for x in tudataset: # ys.append(x.y) # print(set(list(np.array(ys)))) # print(len(np.where(np.array(ys)==0)[0]),len(np.where(np.array(ys)==1)[0])) # exit() if data == 'ENZYMES': random.shuffle(graphs) graphs_chunks,graph_global_test = _randChunk_(graphs, num_client, 
overlap, seed=seed) splitedData = {} df = pd.DataFrame() num_node_features = graphs[0].num_node_features for idx, chunks in enumerate(graphs_chunks): ds = f'{idx}-{data}' ds_tvt = chunks # print(ds_tvt) if data=='ENZYMES': ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) ds_val=ds_train ds_test=ds_train graphs_train=ds_train graphs_val=ds_val graphs_test=ds_test else: # ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2, shuffle=True, seed=seed) # ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5, shuffle=True, seed=seed) graphs_train, graphs_valtest = split_data(ds_tvt, test=0.2, shuffle=True, seed=seed) graphs_val, graphs_test = split_data(graphs_valtest, train=0.5, test=0.5, shuffle=True, seed=seed) # print(ds_vt) graphs_train = init_structure_encoding(args, gs=graphs_train, type_init=args.type_init) graphs_val = init_structure_encoding(args, gs=graphs_val, type_init=args.type_init) graphs_test = init_structure_encoding(args, gs=graphs_test, type_init=args.type_init) graph_global_test=init_structure_encoding(args, gs=graph_global_test, type_init=args.type_init) # exit() # ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5, shuffle=True, seed=seed) ds_train=graphs_train ds_val= graphs_val ds_test=graphs_test dataloader_train = DataLoader(ds_train, batch_size=batchSize, shuffle=True) dataloader_val = DataLoader(ds_val, batch_size=batchSize, shuffle=True) dataloader_test = DataLoader(ds_test, batch_size=len(ds_test), shuffle=True) num_graph_labels = get_numGraphLabels(ds_train) splitedData[ds] = ({'train': dataloader_train, 'val': dataloader_val, 'test': dataloader_test}, num_node_features, num_graph_labels, len(ds_train)) df = get_stats(df, ds, ds_train, graphs_val=ds_val, graphs_test=ds_test) # ds_global_test=split_data(graph_global_test, test=2, shuffle=True, seed=seed) # print(graph_global_test) dataloader_global_test = ({'test': DataLoader(graph_global_test, batch_size=len(graph_global_test), shuffle=True)}) 
return splitedData, df,dataloader_global_test def prepareData_multiDS_(args, datapath, group='chem', batchSize=32, seed=None): assert group in ['chem', "biochem", 'biochemsn', "biosncv"] if group == 'chem': datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1"] elif group == 'biochem': datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1", # small molecules "ENZYMES", "DD", "PROTEINS"] # bioinformatics elif group == 'biochemsn': datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1", # small molecules "ENZYMES", "DD", "PROTEINS", # bioinformatics "COLLAB", "IMDB-BINARY", "IMDB-MULTI"] # social networks elif group == 'biosncv': datasets = ["ENZYMES", "DD", "PROTEINS", # bioinformatics "COLLAB", "IMDB-BINARY", "IMDB-MULTI", # social networks "Letter-high", "Letter-low", "Letter-med"] # computer vision splitedData = {} df = pd.DataFrame() for data in datasets: if data == "COLLAB": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(491, cat=False)) elif data == "IMDB-BINARY": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(135, cat=False)) elif data == "IMDB-MULTI": tudataset = TUDataset(f"{datapath}/TUDataset", data, pre_transform=OneHotDegree(88, cat=False)) elif "Letter" in data: tudataset = TUDataset(f"{datapath}/TUDataset", data, use_node_attr=True) else: tudataset = TUDataset(f"{datapath}/TUDataset", data) graphs = [x for x in tudataset] print(" **", data, len(graphs)) graphs_train, graphs_valtest = split_data(graphs, test=0.2, shuffle=True, seed=seed) graphs_val, graphs_test = split_data(graphs_valtest, train=0.5, test=0.5, shuffle=True, seed=seed) graphs_train = init_structure_encoding(args, gs=graphs_train, type_init=args.type_init) graphs_val = init_structure_encoding(args, gs=graphs_val, type_init=args.type_init) graphs_test = init_structure_encoding(args, gs=graphs_test, type_init=args.type_init) dataloader_train = DataLoader(graphs_train, 
# NOTE(review): this chunk arrived whitespace-mangled (many statements per
# physical line) and has been re-laid-out.  A truncated tail of an earlier
# loader-building function preceded the first `def` here; it was cut mid
# statement at the chunk boundary and could not be reconstructed safely, so
# it is not reproduced.


def _load_tudataset(datapath, data, convert_x):
    """Load one TUDataset from `{datapath}/TUDataset`.

    The social datasets (COLLAB / IMDB-*) have no node features, so they get
    one-hot node-degree features via ``pre_transform`` with the dataset's
    known maximum degree.  Any other dataset is loaded as-is; when
    ``convert_x`` is true its features are replaced by one-hot degrees using
    the measured maximum degree.
    """
    if data == "COLLAB":
        tudataset = TUDataset(f"{datapath}/TUDataset", data,
                              pre_transform=OneHotDegree(491, cat=False))
    elif data == "IMDB-BINARY":
        tudataset = TUDataset(f"{datapath}/TUDataset", data,
                              pre_transform=OneHotDegree(135, cat=False))
    elif data == "IMDB-MULTI":
        tudataset = TUDataset(f"{datapath}/TUDataset", data,
                              pre_transform=OneHotDegree(88, cat=False))
    else:
        tudataset = TUDataset(f"{datapath}/TUDataset", data)
        if convert_x:
            maxdegree = get_maxDegree(tudataset)
            tudataset = TUDataset(f"{datapath}/TUDataset", data,
                                  transform=OneHotDegree(maxdegree, cat=False))
    return tudataset


def _datasets_for_group(group, mix_all):
    """Map a group name to its list of TUDataset names.

    ``mix_all`` supplies the group-specific 'mix_all' list (the variants in
    this file disagree on its ordering).  Raises ValueError for an unknown
    group instead of the silent NameError the original code produced.
    """
    if group == 'molecules' or group == 'molecules_tiny':
        return ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1"]
    if group == 'small':
        return ["MUTAG",     # small molecules
                "ENZYMES"]   # bioinformatics
    if group == 'PROTEINS1':
        return ["MUTAG", "PROTEINS"]
    if group == 'PROTEINS2':
        return ["MUTAG", "ENZYMES", "PROTEINS"]
    if group == 'PROTEINS3':
        return ["ENZYMES", "PROTEINS"]
    if group == 'mix' or group == 'mix_tiny':
        return ["MUTAG",                      # small molecules
                "IMDB-BINARY", "IMDB-MULTI"]  # social networks
    if group == 'biochem' or group == 'biochem_tiny':
        return ["MUTAG", "ENZYMES"]
    if group == 'social':
        return ["IMDB-BINARY", "IMDB-MULTI"]
    if group == 'mix_all':
        return list(mix_all)
    raise ValueError(f"unknown dataset group: {group!r}")


def prepareData_multiDS_protein_(args, datapath, num_client, group='small',
                                 batchSize=32, convert_x=False, seed=None,
                                 overlap=False):
    """Build per-client train/val/test dataloaders over several TUDatasets.

    Each dataset named by *group* is chunked across ``num_client`` clients via
    ``_randChunk2_``; every chunk is split into train/val/test, structure
    encodings (``args.type_init``) are attached, and loaders are built.  The
    leftover graphs of every dataset are pooled into one global test loader.

    Returns:
        (splitedData, df, dataloader_global_test) where ``splitedData`` maps
        ``"<client_idx>-<dataset>"`` to ``({'train','val','test'} loaders,
        num_node_features, num_graph_labels, n_train)`` and ``df`` carries the
        accumulated ``get_stats`` table.
    """
    datasets = _datasets_for_group(
        group, mix_all=["MUTAG", "IMDB-BINARY", "IMDB-MULTI", "ENZYMES"])

    splitedData = {}
    df = pd.DataFrame()
    graph_global_test = []
    for data in datasets:
        tudataset = _load_tudataset(datapath, data, convert_x)
        graphs = [x for x in tudataset]
        # Graphs beyond the per-client chunks become the global test pool.
        graphs_chunks, graph_global_test_ = _randChunk2_(
            graphs, num_client, overlap=False, seed=seed)
        num_node_features = graphs[0].num_node_features
        for idx, chunks in enumerate(graphs_chunks):
            ds = f'{idx}-{data}'
            ds_tvt = chunks
            if data == 'ENZYMES':
                # ENZYMES clients reuse their train split as val and test.
                ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2,
                                             shuffle=True, seed=seed)
                graphs_train = ds_train
                graphs_val = ds_train
                graphs_test = ds_train
            else:
                # NOTE(review): the first two split_data calls are discarded;
                # kept because split_data may advance shared RNG state —
                # confirm before deleting.
                ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2,
                                             shuffle=True, seed=seed)
                ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5,
                                             shuffle=True, seed=seed)
                graphs_train, graphs_valtest = split_data(
                    ds_tvt, test=0.2, shuffle=True, seed=seed)
                graphs_val, graphs_test = split_data(
                    graphs_valtest, train=0.5, test=0.5, shuffle=True, seed=seed)
            graphs_train = init_structure_encoding(args, gs=graphs_train,
                                                   type_init=args.type_init)
            graphs_val = init_structure_encoding(args, gs=graphs_val,
                                                 type_init=args.type_init)
            graphs_test = init_structure_encoding(args, gs=graphs_test,
                                                  type_init=args.type_init)
            ds_train = graphs_train
            ds_val = graphs_val
            ds_test = graphs_test
            dataloader_train = DataLoader(ds_train, batch_size=batchSize, shuffle=True)
            dataloader_val = DataLoader(ds_val, batch_size=batchSize, shuffle=True)
            # Local test split is served as one full-size batch.
            dataloader_test = DataLoader(ds_test, batch_size=len(ds_test), shuffle=True)
            num_graph_labels = get_numGraphLabels(ds_train)
            splitedData[ds] = ({'train': dataloader_train,
                                'val': dataloader_val,
                                'test': dataloader_test},
                               num_node_features, num_graph_labels, len(ds_train))
            df = get_stats(df, ds, ds_train, graphs_val=ds_val, graphs_test=ds_test)
        graph_global_test.append(graph_global_test_)

    graph_global_test = list(itertools.chain.from_iterable(graph_global_test))
    dataloader_global_test = {'test': DataLoader(graph_global_test,
                                                 batch_size=len(graph_global_test),
                                                 shuffle=True)}
    return splitedData, df, dataloader_global_test


def prepareData_multiDS_protein_loss_(args, datapath, num_client, group='small',
                                      batchSize=32, convert_x=False, seed=None,
                                      overlap=False):
    """Like ``prepareData_multiDS_protein_`` but with two regimes.

    * ``group != 'mix_all'``: per-dataset chunking as above, except
      ``num_node_features`` is forced to the group's ``ft_dim``.
    * ``group == 'mix_all'``: client data is read from pre-pickled
      feats/label/edge lists on disk and sliced across ``args.clients``
      clients; the pickled test graphs form the global test set.

    Returns the same triple as ``prepareData_multiDS_protein_`` (the global
    test loader uses ``batchSize`` batches here, not one full batch).
    """
    datasets = _datasets_for_group(
        group, mix_all=["MUTAG", "ENZYMES", "IMDB-BINARY", "IMDB-MULTI"])
    # Feature width used as num_node_features / pickle feature truncation.
    ft_dim = 89 if group == 'social' else 3

    splitedData = {}
    df = pd.DataFrame()
    graph_global_test = []
    tudataset_list = []
    for data in datasets:
        # Social datasets get degree one-hot features via a runtime
        # ``transform`` (not ``pre_transform``) in this variant.
        if data == "COLLAB":
            tudataset = TUDataset(f"{datapath}/TUDataset", data,
                                  pre_transform=OneHotDegree(491, cat=False))
        elif data == "IMDB-BINARY" or data == "IMDB-MULTI":
            dataset = TUDataset(f"{datapath}/TUDataset", data)
            maxdegree = get_maxDegree(dataset)
            tudataset = TUDataset(f"{datapath}/TUDataset", data,
                                  transform=OneHotDegree(maxdegree, cat=False))
        elif data.lower() in ('enzymes', 'mutag', 'proteins'):
            tudataset = TUDataset(f"{datapath}/TUDataset", data)
        else:
            tudataset = TUDataset(f"{datapath}/TUDataset", data)
            if convert_x:
                maxdegree = get_maxDegree(tudataset)
                tudataset = TUDataset(f"{datapath}/TUDataset", data,
                                      transform=OneHotDegree(maxdegree, cat=False))
        tudataset_list.append(tudataset)
        tudataset_num = len(tudataset)
        graphs = [x for x in tudataset]

        if group != 'mix_all':
            graphs_chunks, graph_global_test_ = _randChunk2_(
                graphs, num_client, overlap=False, seed=seed)
            num_node_features = ft_dim
            for idx, chunks in enumerate(graphs_chunks):
                ds = f'{idx}-{data}'
                ds_tvt = chunks
                if data == 'ENZYMES':
                    ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2,
                                                 shuffle=True, seed=seed)
                    graphs_train = ds_train
                    graphs_val = ds_train
                    graphs_test = ds_train
                else:
                    # First two splits discarded; kept for RNG parity (see
                    # prepareData_multiDS_protein_).
                    ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2,
                                                 shuffle=True, seed=seed)
                    ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5,
                                                 shuffle=True, seed=seed)
                    graphs_train, graphs_valtest = split_data(
                        ds_tvt, test=0.2, shuffle=True, seed=seed)
                    graphs_val, graphs_test = split_data(
                        graphs_valtest, train=0.5, test=0.5, shuffle=True, seed=seed)
                graphs_train = init_structure_encoding(args, gs=graphs_train,
                                                       type_init=args.type_init)
                graphs_val = init_structure_encoding(args, gs=graphs_val,
                                                     type_init=args.type_init)
                graphs_test = init_structure_encoding(args, gs=graphs_test,
                                                      type_init=args.type_init)
                ds_train = graphs_train
                ds_val = graphs_val
                ds_test = graphs_test
                dataloader_train = DataLoader(ds_train, batch_size=batchSize, shuffle=True)
                dataloader_val = DataLoader(ds_val, batch_size=batchSize, shuffle=True)
                dataloader_test = DataLoader(ds_test, batch_size=len(ds_test), shuffle=True)
                num_graph_labels = get_numGraphLabels(ds_train)
                splitedData[ds] = ({'train': dataloader_train,
                                    'val': dataloader_val,
                                    'test': dataloader_test},
                                   num_node_features, num_graph_labels, len(ds_train))
                df = get_stats(df, ds, ds_train, graphs_val=ds_val, graphs_test=ds_test)
            graph_global_test.append(graph_global_test_)
        else:
            # mix_all: everything comes from pre-pickled splits on disk.
            # NOTE(review): hard-coded absolute path — parameterize for
            # portability.
            j_data = 0
            for data in datasets:  # rebinds the outer loop variable
                data_dir = '/mnt/diskLv/luo/Federated-Learning-on-Graphs-main/Graph_Classification/federated/'
                file_path = os.path.join(data_dir, data.lower() + '-train_feats_label_edge_list')
                with open(file_path, 'rb') as f:
                    feats_list_train0, label_list_train0, edge_list_train0 = pickle.load(f)
                for i in range(np.shape(label_list_train0)[0]):
                    # Edge endpoints must index into the node-feature matrix.
                    if np.max(edge_list_train0[i]) >= np.shape(feats_list_train0[i])[0]:
                        print('error1')
                feats_list_train = [torch.tensor(ft_train[:, 0:ft_dim])
                                    for ft_train in feats_list_train0]
                label_list_train = [torch.tensor(lb_train) for lb_train in label_list_train0]
                edge_list_train = [torch.tensor(eg_train) for eg_train in edge_list_train0]
                print('***', len(label_list_train))

                file_path = os.path.join(data_dir, data.lower() + '-test_feats_label_edge_list')
                with open(file_path, 'rb') as f:
                    feats_list_test0, label_list_test0, edge_list_test0 = pickle.load(f)
                for i in range(np.shape(label_list_test0)[0]):
                    if np.max(edge_list_test0[i]) >= np.shape(feats_list_test0[i])[0]:
                        print('error2')
                feats_list_test = [torch.tensor(ft_test[:, 0:ft_dim])
                                   for ft_test in feats_list_test0]
                label_list_test = [torch.tensor(lb_test) for lb_test in label_list_test0]
                edge_list_test = [torch.tensor(eg_test) for eg_test in edge_list_test0]

                # How many pickled test graphs go into the global test pool.
                # NOTE(review): the slice only shrinks the COUNT — iteration
                # still starts at graph 0, not at the slice start.  Confirm
                # whether the tail portion was intended instead.
                print(len(label_list_test))
                if data == 'MUTAG':
                    n_test = len(label_list_test[int(tudataset_num * args.split):])
                elif data == 'ENZYMES':
                    n_test = len(label_list_test[int(len(label_list_test) * 0.5):])
                else:
                    n_test = len(label_list_test)
                for i in range(n_test):
                    pyg_graph = Data(
                        x=torch.tensor(feats_list_test[i], dtype=torch.float),
                        edge_index=torch.tensor(edge_list_test[i], dtype=torch.long),
                        y=torch.tensor(label_list_test[i], dtype=torch.long))
                    graph_global_test.append(pyg_graph)

                # Rebuild the train graphs and slice them across clients.
                graphs_train = []
                for jj in range(len(label_list_train)):
                    graphs_train.append(Data(
                        x=torch.tensor(feats_list_train[jj], dtype=torch.float),
                        edge_index=torch.tensor(edge_list_train[jj], dtype=torch.long),
                        y=torch.tensor(label_list_train[jj], dtype=torch.long)))

                startup = 0
                division = int(len(label_list_train) * args.split * 0.8 / args.clients)
                division_val = int(len(label_list_train) * args.split * 0.1 / args.clients)
                division_test = int(len(label_list_train) * args.split * 0.1 / args.clients)
                # NOTE(review): the slice sizes above use the OLD args.clients
                # while the loop below uses args.num_clients; if they differ
                # the partitioning is inconsistent — confirm intent.
                args.clients = args.num_clients
                graphs_train_ = cp.deepcopy(graphs_train)
                for i in range(args.clients):
                    ds = f'{i + j_data * 3}-{args.data_group}'
                    client_data_train = graphs_train_[startup:division + startup]
                    client_data_val = graphs_train_[division + startup:
                                                    division + startup + division_val]
                    if (args.data_group == 'PROTEINS3' and data == 'ENZYMES') or \
                            (args.data_group == 'mix_all' and data == 'ENZYMES'):
                        # ENZYMES: every client trains on the full set and
                        # tests on its own train data.
                        client_data_train = graphs_train_
                        client_data_test = client_data_train
                    else:
                        client_data_test = graphs_train_[
                            division + startup + division_val:
                            division + startup + division_val + division_test]
                    startup = division + startup + division_val + division_test
                    graphs_train = client_data_train
                    graphs_val = client_data_val
                    graphs_test = client_data_test
                    graphs_train = init_structure_encoding(args, gs=graphs_train,
                                                           type_init=args.type_init)
                    graphs_val = init_structure_encoding(args, gs=graphs_val,
                                                         type_init=args.type_init)
                    graphs_test = init_structure_encoding(args, gs=graphs_test,
                                                          type_init=args.type_init)
                    ds_train = graphs_train
                    ds_val = graphs_val
                    ds_test = graphs_test
                    dataloader_train = DataLoader(ds_train, batch_size=batchSize, shuffle=True)
                    dataloader_val = DataLoader(ds_val, batch_size=batchSize, shuffle=True)
                    dataloader_test = DataLoader(ds_test, batch_size=len(ds_test), shuffle=True)
                    num_graph_labels = get_numGraphLabels(ds_train)
                    splitedData[ds] = ({'train': dataloader_train,
                                        'val': dataloader_val,
                                        'test': dataloader_test},
                                       ft_dim, num_graph_labels, len(ds_train))
                    df = get_stats(df, ds, ds_train, graphs_val=ds_val, graphs_test=ds_test)
                j_data += 1
            # FIX: this pickle-based branch does not depend on the outer
            # `data` loop; previously it re-ran once per outer dataset,
            # duplicating every global test graph.  Run it once and stop.
            break

    if group != 'mix_all':
        # FIX: per-dataset leftover lists must be flattened before building a
        # DataLoader (this flatten was commented out and the loader received
        # a list of lists).
        graph_global_test = list(itertools.chain.from_iterable(graph_global_test))
    print(len(graph_global_test))
    dataloader_global_test = {'test': DataLoader(graph_global_test,
                                                 batch_size=batchSize,
                                                 shuffle=True)}
    print(len(dataloader_global_test['test']))
    return splitedData, df, dataloader_global_test


def prepareData_multiDS_protein_time(args, datapath, num_client, group='small',
                                     batchSize=32, convert_x=False, seed=None,
                                     overlap=False):
    """Timing variant of ``prepareData_multiDS_protein_``.

    Same chunking and loader layout, but every client's train/val/test splits
    are drawn from the FULL dataset rather than its own chunk (see NOTE in
    the loop).  Returns the same (splitedData, df, dataloader_global_test)
    triple.
    """
    datasets = _datasets_for_group(
        group, mix_all=["MUTAG", "IMDB-BINARY", "IMDB-MULTI", "ENZYMES"])

    splitedData = {}
    df = pd.DataFrame()
    graph_global_test = []
    for data in datasets:
        tudataset = _load_tudataset(datapath, data, convert_x)
        graphs = [x for x in tudataset]
        graphs_chunks, graph_global_test_ = _randChunk2_(
            graphs, num_client, overlap=False, seed=seed)
        num_node_features = graphs[0].num_node_features
        for idx, chunks in enumerate(graphs_chunks):
            ds = f'{idx}-{data}'
            ds_tvt = chunks
            # Chunk-level splits are computed then discarded; kept for RNG
            # parity with the other variants.
            ds_train, ds_vt = split_data(ds_tvt, train=0.8, test=0.2,
                                         shuffle=True, seed=seed)
            ds_val, ds_test = split_data(ds_vt, train=0.5, test=0.5,
                                         shuffle=True, seed=seed)
            # NOTE(review): splits below come from the FULL dataset
            # (`graphs`), not the client chunk — every client gets the same
            # 80% train split.  Looks intentional for timing runs; confirm.
            graphs_train, graphs_valtest = split_data(graphs, test=0.2,
                                                      shuffle=True, seed=seed)
            graphs_val, graphs_test = split_data(graphs_valtest, train=0.5,
                                                 test=0.5, shuffle=True, seed=seed)
            graphs_train = init_structure_encoding(args, gs=graphs_train,
                                                   type_init=args.type_init)
            graphs_val = init_structure_encoding(args, gs=graphs_val,
                                                 type_init=args.type_init)
            graphs_test = init_structure_encoding(args, gs=graphs_test,
                                                  type_init=args.type_init)
            ds_train = graphs_train
            ds_val = graphs_val
            ds_test = graphs_test
            dataloader_train = DataLoader(ds_train, batch_size=batchSize, shuffle=True)
            dataloader_val = DataLoader(ds_val, batch_size=batchSize, shuffle=True)
            dataloader_test = DataLoader(ds_test, batch_size=len(ds_test), shuffle=True)
            num_graph_labels = get_numGraphLabels(ds_train)
            splitedData[ds] = ({'train': dataloader_train,
                                'val': dataloader_val,
                                'test': dataloader_test},
                               num_node_features, num_graph_labels, len(ds_train))
            df = get_stats(df, ds, ds_train, graphs_val=ds_val, graphs_test=ds_test)
        graph_global_test.append(graph_global_test_)

    graph_global_test = list(itertools.chain.from_iterable(graph_global_test))
    dataloader_global_test = {'test': DataLoader(graph_global_test,
                                                 batch_size=len(graph_global_test),
                                                 shuffle=True)}
    return splitedData, df, dataloader_global_test


def prepareData_multiDS_protein(datapath, group='small', batchSize=32,
                                convert_x=False, seed=None, args=None):
    """One-client-per-dataset variant: each dataset becomes one federation
    member; 20% of each dataset is pooled into a shared global test set.

    Args:
        args: optional namespace providing ``type_init`` for structure
            encodings.  FIX: the original referenced an undefined global
            ``args`` (guaranteed NameError); encodings are now applied only
            when a namespace is supplied, keeping the old call signature
            working.

    Returns:
        (splitedData, df, dataloader_global_test) keyed by dataset name.
    """
    assert group in ['molecules', 'molecules_tiny', 'small', 'mix', "mix_tiny",
                     "biochem", "biochem_tiny", 'social']
    if group == 'molecules' or group == 'molecules_tiny':
        datasets = ["MUTAG", "BZR", "COX2", "DHFR", "PTC_MR", "AIDS", "NCI1"]
    if group == 'small':
        datasets = ["MUTAG",     # small molecules
                    "ENZYMES"]   # bioinformatics
    if group == 'mix' or group == 'mix_tiny':
        datasets = ["MUTAG",                      # small molecules
                    "ENZYMES",                    # bioinformatics
                    "IMDB-BINARY", "IMDB-MULTI"]  # social networks
    if group == 'biochem' or group == 'biochem_tiny':
        datasets = ["MUTAG", "ENZYMES"]
    if group == 'social':
        datasets = ["IMDB-BINARY", "IMDB-MULTI"]

    splitedData = {}
    df = pd.DataFrame()
    graph_global_test = []
    for data in datasets:
        tudataset = _load_tudataset(datapath, data, convert_x)
        graphs = [x for x in tudataset]
        # Hold out 20% of each dataset for the shared global test pool.
        graphs_train, graph_global_test_ = split_data(graphs, test=0.2,
                                                      shuffle=True, seed=seed)
        graphs_train, graphs_valtest = split_data(graphs_train, test=0.2,
                                                  shuffle=True, seed=seed)
        graphs_val, graphs_test = split_data(graphs_valtest, train=0.5,
                                             test=0.5, shuffle=True, seed=seed)
        if group.endswith('tiny'):
            # Tiny groups: cap at 150 graphs and re-split.
            graphs, _ = split_data(graphs, train=150, shuffle=True, seed=seed)
            graphs_train, graphs_valtest = split_data(graphs, test=0.2,
                                                      shuffle=True, seed=seed)
            graphs_val, graphs_test = split_data(graphs_valtest, train=0.5,
                                                 test=0.5, shuffle=True, seed=seed)
        num_node_features = graphs[0].num_node_features
        num_graph_labels = get_numGraphLabels(graphs_train)
        if args is not None:
            graphs_train = init_structure_encoding(args, gs=graphs_train,
                                                   type_init=args.type_init)
            graphs_val = init_structure_encoding(args, gs=graphs_val,
                                                 type_init=args.type_init)
            graphs_test = init_structure_encoding(args, gs=graphs_test,
                                                  type_init=args.type_init)
        dataloader_train = DataLoader(graphs_train, batch_size=batchSize, shuffle=True)
        dataloader_val = DataLoader(graphs_val, batch_size=batchSize, shuffle=True)
        dataloader_test = DataLoader(graphs_test, batch_size=batchSize, shuffle=True)
        splitedData[data] = ({'train': dataloader_train,
                              'val': dataloader_val,
                              'test': dataloader_test},
                             num_node_features, num_graph_labels, len(graphs_train))
        df = get_stats(df, data, graphs_train, graphs_val=graphs_val,
                       graphs_test=graphs_test)
        # FIX: the original used `torch.cat` on plain Python lists of Data
        # objects (TypeError) and never incremented its `ii` counter, so only
        # the LAST dataset's hold-out survived.  Accumulate with list
        # concatenation instead.
        graph_global_test = graph_global_test + graph_global_test_

    dataloader_global_test = {'test': DataLoader(graph_global_test,
                                                 batch_size=len(graph_global_test),
                                                 shuffle=True)}
    return splitedData, df, dataloader_global_test