import pickle

import networkx as nx
import numpy as np


class DBLPDataLoader:
    """Loads a pickled (adjacency, features) pair and prepares the edge and
    negative-node sampling tables used for LINE-style training."""

    def __init__(self, graph_file):
        # self.g = nx.read_gpickle(graph_file)
        with open(graph_file, 'rb') as f:
            adj, features = pickle.load(f, encoding='latin1')
        self.g = nx.Graph(adj)
        print(self.g)
        self.num_of_nodes = self.g.number_of_nodes()
        print(self.num_of_nodes)
        self.num_of_edges = self.g.number_of_edges()
        print(self.num_of_edges)
        self.edges_raw = self.g.edges(data=True)
        print(self.edges_raw)
        self.nodes_raw = self.g.nodes(data=True)
        print(self.nodes_raw)

        # edge sampling distribution, proportional to edge weight
        self.edge_distribution = np.array([attr['weight'] for _, _, attr in self.edges_raw],
                                          dtype=np.float32)
        self.edge_distribution /= np.sum(self.edge_distribution)
        self.edge_sampling = AliasSampling(prob=self.edge_distribution)

        # negative sampling distribution, proportional to (weighted degree)^0.75
        self.node_negative_distribution = np.power(
            np.array([self.g.degree(node, weight='weight') for node, _ in self.nodes_raw],
                     dtype=np.float32), 0.75)
        self.node_negative_distribution /= np.sum(self.node_negative_distribution)
        self.node_sampling = AliasSampling(prob=self.node_negative_distribution)

        # map node labels to contiguous integer indices and back
        self.node_index = {}
        self.node_index_reversed = {}
        for index, (node, _) in enumerate(self.nodes_raw):
            self.node_index[node] = index
            self.node_index_reversed[index] = node
        self.edges = [(self.node_index[u], self.node_index[v]) for u, v, _ in self.edges_raw]

    def fetch_batch(self, batch_size=16, K=10, edge_sampling='atlas', node_sampling='atlas'):
        # 'atlas' selects the alias-method sampler built in __init__
        if edge_sampling == 'numpy':
            edge_batch_index = np.random.choice(self.num_of_edges, size=batch_size, p=self.edge_distribution)
        elif edge_sampling == 'atlas':
            edge_batch_index = self.edge_sampling.sampling(batch_size)
        elif edge_sampling == 'uniform':
            edge_batch_index = np.random.randint(0, self.num_of_edges, size=batch_size)
        u_i = []
        u_j = []
        label = []
        for edge_index in edge_batch_index:
            edge = self.edges[edge_index]
            if self.g.__class__ == nx.Graph:
                # second-order proximity treats an undirected edge as two directed
                # edges, so flip the direction with probability 0.5
                if np.random.rand() > 0.5:
                    edge = (edge[1], edge[0])
            u_i.append(edge[0])
            u_j.append(edge[1])
            label.append(1)
            # draw K negative nodes that are not neighbours of the source node
            for _ in range(K):
                while True:
                    if node_sampling == 'numpy':
                        negative_node = np.random.choice(self.num_of_nodes, p=self.node_negative_distribution)
                    elif node_sampling == 'atlas':
                        negative_node = self.node_sampling.sampling()
                    elif node_sampling == 'uniform':
                        negative_node = np.random.randint(0, self.num_of_nodes)
                    if not self.g.has_edge(self.node_index_reversed[negative_node],
                                           self.node_index_reversed[edge[0]]):
                        break
                u_i.append(edge[0])
                u_j.append(negative_node)
                label.append(-1)
        return u_i, u_j, label

    def embedding_mapping(self, embedding):
        return {node: embedding[self.node_index[node]] for node, _ in self.nodes_raw}


class DBLPDataLoader1(DBLPDataLoader):
    """Variant of DBLPDataLoader that perturbs the adjacency matrix with
    Laplace noise of scale b before building the graph; batching and
    embedding mapping are inherited unchanged."""

    def __init__(self, graph_file, b):
        # self.g = nx.read_gpickle(graph_file)
        with open(graph_file, 'rb') as f:
            adj, features = pickle.load(f, encoding='latin1')

        def add_laplace_noise(data_list, mu=0, b=2):
            laplace_noise = np.random.laplace(mu, b, np.shape(data_list))
            return laplace_noise + data_list

        # adj = nx.adjacency_matrix(G)
        u = 0  # mean of the Laplace noise
        adj = np.array(adj.todense())
        adj1 = add_laplace_noise(np.array(adj), u, b)
        # build the graph from the noised matrix; negative noisy weights are
        # clipped to zero here (an assumption) so that the edge and degree
        # distributions below remain valid, non-negative probabilities
        adj1 = np.clip(adj1, 0, None)
        self.g = nx.Graph(adj1)
        print(self.g)
        self.num_of_nodes = self.g.number_of_nodes()
        print(self.num_of_nodes)
        self.num_of_edges = self.g.number_of_edges()
        print(self.num_of_edges)
        self.edges_raw = self.g.edges(data=True)
        print(self.edges_raw)
        self.nodes_raw = self.g.nodes(data=True)
        print(self.nodes_raw)

        # edge sampling distribution, proportional to (noised) edge weight
        self.edge_distribution = np.array([attr['weight'] for _, _, attr in self.edges_raw],
                                          dtype=np.float32)
        self.edge_distribution /= np.sum(self.edge_distribution)
        self.edge_sampling = AliasSampling(prob=self.edge_distribution)

        # negative sampling distribution, proportional to (weighted degree)^0.75
        self.node_negative_distribution = np.power(
            np.array([self.g.degree(node, weight='weight') for node, _ in self.nodes_raw],
                     dtype=np.float32), 0.75)
        self.node_negative_distribution /= np.sum(self.node_negative_distribution)
        self.node_sampling = AliasSampling(prob=self.node_negative_distribution)

        # map node labels to contiguous integer indices and back
        self.node_index = {}
        self.node_index_reversed = {}
        for index, (node, _) in enumerate(self.nodes_raw):
            self.node_index[node] = index
            self.node_index_reversed[index] = node
        self.edges = [(self.node_index[u], self.node_index[v]) for u, v, _ in self.edges_raw]


class AliasSampling:
    # Reference: https://en.wikipedia.org/wiki/Alias_method
    def __init__(self, prob):
        self.n = len(prob)
        self.U = np.array(prob) * self.n
        self.K = list(range(self.n))
        overfull, underfull = [], []
        for i, U_i in enumerate(self.U):
            if U_i > 1:
                overfull.append(i)
            elif U_i < 1:
                underfull.append(i)
        # pair each underfull bucket with an overfull one to build the alias table
        while len(overfull) and len(underfull):
            i, j = overfull.pop(), underfull.pop()
            self.K[j] = i
            self.U[i] = self.U[i] - (1 - self.U[j])
            if self.U[i] > 1:
                overfull.append(i)
            elif self.U[i] < 1:
                underfull.append(i)

    def sampling(self, n=1):
        # draw a bucket uniformly, then choose between it and its alias
        x = np.random.rand(n)
        i = np.floor(self.n * x)
        y = self.n * x - i
        i = i.astype(np.int32)
        res = [i[k] if y[k] < self.U[i[k]] else self.K[i[k]] for k in range(n)]
        if n == 1:
            return res[0]
        return res
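

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). AliasSampling is self-contained;
# the loader calls assume a pickle file holding an (adjacency, features)
# tuple, and the path 'dblp_graph.pkl', the batch size, and the Laplace
# scale below are hypothetical placeholders, not values from this module.
if __name__ == '__main__':
    # O(1) draws from an arbitrary discrete distribution via the alias method
    sampler = AliasSampling(prob=[0.1, 0.2, 0.3, 0.4])
    print(sampler.sampling(10))

    # hypothetical loader usage: each positive edge is followed by K negatives
    loader = DBLPDataLoader('dblp_graph.pkl')
    u_i, u_j, label = loader.fetch_batch(batch_size=8, K=5)
    print(len(u_i), len(u_j), len(label))

    # the noised variant additionally takes the Laplace scale b
    noisy_loader = DBLPDataLoader1('dblp_graph.pkl', b=2)
    print(noisy_loader.num_of_nodes, noisy_loader.num_of_edges)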