"""DynamicNetworkSimulation / dynamic_feature_extraction.py"""
import numpy as np
import os
import csv

from collections import OrderedDict
import networkx as nx

import matplotlib.pyplot as plt

class GraphAnalysis:
    """Analysis of a dynamically grown network.

    - load network from node and edge csv files
    - calculate the dynamic feature matrix (Cai 2021)
    - plot degree distribution
    - plot network growth over time
    """

    def __init__(self, id, fp = "\\output\\", verbose=False):
        """Load the network identified by `id` from directory `fp`.

        @param id: identifier; files are expected at <fp>/<id>_nodes.csv
                   and <fp>/<id>_edges.csv
        @param fp: directory containing the node/edge csv files
        @param verbose: whether to log progress messages
        """
        # whether to log
        self.verbose = verbose

        self.id = id
        self.fp = fp

        # graph representation
        self.edges = {} # dict edges {node : (set of adjacent nodes)}
        self.nodes = OrderedDict() # OrderedDictionary of form {node : time}

        filepath_edges = os.path.join(fp, str(id) + "_edges.csv")
        filepath_nodes = os.path.join(fp, str(id) + "_nodes.csv")

        with open(filepath_nodes, 'r') as f:
            self.load_nodes(f)

        with open(filepath_edges, 'r') as f:
            self.load_edges(f)

    def get_scaled_timesteps(self, a, up_to):
        """Get so-called scaled timestep bounds such that each of the `a`
        timesteps has the same number of nodes arriving.

        @param a: number of timesteps
        @param up_to: consider only the first `up_to` nodes

        @return list of a+1 timestamps bounding the a intervals

        @raise KeyError: if no node with a stringified-integer name exists
                         at or after a timestep's starting index
        """
        number_of_nodes_per_step = int(up_to / a) # number of nodes arriving per timestep
        time_max = list(self.nodes.items())[up_to-1][1] # total running time
        stamps = [] # timestamps indicating arrival of first node of each timestep
        for i in range(a):

            # node names are stringified integers, but some integers may be
            # missing; probe upwards for the first node that exists
            succes = False
            j = 0
            while not succes:
                try:
                    s = float(self.nodes[str((i*number_of_nodes_per_step)+j)])
                    stamps.append(s)
                    succes = True
                except KeyError:
                    j += 1
                    # fail fast instead of looping forever when node names
                    # are not (sparse) stringified integers at all
                    if j > up_to:
                        raise KeyError(
                            f"no node found at or after index {i*number_of_nodes_per_step}"
                        ) from None

        stamps.append(time_max)
        return stamps

    def calculate_feature_matrix(self, a=3, b=4, fp="", save=False, name=None, up_to=None, scaled_timesteps=True, normalize=True):
        """Calculate the feature matrix from Cai2021.

        @param a: number of rows; time intervals
        @param b: number of columns; degree-based node groups
        @param fp: directory to save the csv to (only used when save=True)
        @param name: filename postfix when saving; defaults to "_matrix"
        @param save: whether to save the matrix as csv
        @param up_to: cutoff node, i.e. consider only part of the network evolution
        @param scaled_timesteps: if True each time interval contains the same
               number of node arrivals; otherwise intervals are equal in duration
        @param normalize: whether to divide every entry by the matrix total

        @return a x b numpy matrix of floats.
        """
        if up_to is None:
            up_to = len(self.nodes)

        assert up_to <= len(self.nodes), "up_to parameter larger than number of nodes"

        matrix = np.zeros((a, b))

        if scaled_timesteps:
            time_bounds = self.get_scaled_timesteps(a, up_to)
        else:
            time_max = list(self.nodes.items())[up_to-1][1] # total running time
            time_bounds = [i*(time_max / len(matrix)) for i in range(len(matrix)+1)]

        node_groups = self.set_node_groups(up_to, len(matrix[0]))
        # fill the matrix for each row (one row per time interval)
        for i in range(len(matrix)):
            matrix[i] = self.get_matrix_row(node_groups, time_bounds[i], time_bounds[i+1])

        if self.verbose:
            print("all matrix rows are calculated")

        if normalize:
            self.normalize_matrix(matrix)

        if save:
            if name is None: # use default name
                self.save_matrix_to_csv(matrix, node_groups, fp, "_matrix")
            else:
                self.save_matrix_to_csv(matrix, node_groups, fp, name)

        return matrix

    def get_matrix_row(self, node_groups, t_start, t_end):
        """Calculate one matrix row: for nodes arriving in [t_start, t_end],
        count the edges they attach to each node group, averaged by group size.

        NOTE(review): both interval bounds are inclusive, so a node arriving
        exactly on a shared bound is counted in two consecutive rows — confirm
        whether that is intended.

        @param node_groups: dict {group index : set of node names}
        @param t_start: interval start timestamp (inclusive)
        @param t_end: interval end timestamp (inclusive)

        @return numpy row of length len(node_groups)
        """
        new_nodes = []

        # find which nodes were added in this timestep; nodes are ordered by
        # arrival time, so we can stop at the first node past t_end
        for n, t in self.nodes.items(): # TODO optimize this iteration by storing some sort of iterator
            t = float(t)
            if t >= t_start:
                if t <= t_end:
                    new_nodes.append(n)
                else:
                    break

        row = np.zeros(len(node_groups))

        # count edges from new nodes into each node group
        for n in new_nodes:
            for v in self.edges[n]:
                for i, group in node_groups.items():
                    if v in group:
                        row[i] += 1

        # average out values, the matrix represents the average number of degrees a cell in the node group has received.
        for i, group in node_groups.items():
            if len(group) != 0:
                row[i] = row[i] / len(group)
            else:
                if self.verbose:
                    print(" Group size of 0! ")

        return row

    def normalize_matrix(self, matrix):
        """Normalize `matrix` in place by dividing every element by the sum
        over all elements. An all-zero matrix is left untouched (avoids
        division by zero / NaNs)."""
        total = sum(sum(matrix[i]) for i in range(len(matrix)))

        if total == 0:
            if self.verbose:
                print("matrix sums to 0, skipping normalization")
            return

        for i in range(len(matrix)):
            for j in range(len(matrix[i])):
                matrix[i][j] = matrix[i][j] / total

        if self.verbose:
            print("matrix has been normalized")

    def set_node_groups(self, up_to, group_amount):
        """Partition the first `up_to` nodes into `group_amount` groups (G_j)
        by the empirical CDF of their degree (group 0 = highest degrees).

        Side effect: nodes without outgoing edges get an empty set in
        self.edges.

        @param up_to: consider only the first `up_to` nodes
        @param group_amount: number of groups

        @return dict {group index : set of node names}
        """
        degree_sequence = {}

        # calculate the degree sequence until the "up_to" cutoff
        for n in list(self.nodes.keys())[:up_to]:
            if n in self.edges:
                degree_sequence[n] = len(self.edges[n])
                for v in self.edges[n]: # since OrderedDict and PA only connects backwards, all v's already in degree_sequence
                    if v in degree_sequence:
                        degree_sequence[v] += 1
                    else:
                        degree_sequence[v] = 1
            else:
                self.edges[n] = set()
                degree_sequence[n] = 0

        # get a sorted degree-sequence
        degrees = list(degree_sequence.values())
        degrees.sort()

        # calculate the empirical CDF (F_T in Weiting Cai)
        empirical_cdf = {}
        n = len(degrees)

        prev = 0
        empirical_cdf[0] = 0
        for v in degrees:
            if v > prev:
                empirical_cdf[v] = empirical_cdf[prev] + 1/n
                prev = v
            else:
                empirical_cdf[prev] += 1/n

        assert abs(empirical_cdf[degrees[-1]] - 1) < 0.00001
        empirical_cdf[degrees[-1]] = 0.999999999 # last cannot be >= 1 by overflow

        node_groups = {i : set() for i in range(group_amount)}
        for node, degree in degree_sequence.items():
            # int cast: float floor-division would otherwise yield a float key
            i = int(group_amount - 1 - ((empirical_cdf[degree]) // (1/group_amount)))
            node_groups[i].add(node)

        if self.verbose:
            print("node_groups are set")

        return node_groups

    def load_nodes(self, file, up_to=None):
        """Populate self.nodes from an open csv file of rows: name, timestamp.

        @param file: open file-like object
        @param up_to: if given, load only the first `up_to` nodes
        """
        if not up_to is None:
            assert isinstance(up_to, int) and up_to > 0, "up_to should be positive integer"

        reader = csv.reader(file)

        counter = 0
        for node in reader:         # node is stored as name_node : timestamp
            # respect the cutoff (previously validated but never applied)
            if up_to is not None and counter >= up_to:
                break
            self.nodes[node[0]] = float(node[1])
            counter += 1

        if self.verbose:
            print(f"loaded all {counter} nodes")

    def load_edges(self, file):
        """Populate self.edges from an open csv file of rows:
        node1, node2, timestamp."""
        reader = csv.reader(file)

        # we add the root manually as it has no originating edges
        self.edges['0'] = set()

        counter = 0
        for edge in reader:         # edge in file is stored as node1, node2, timestamp

            if edge[0] in self.edges:
                self.edges[edge[0]].add(edge[1])
            else:
                self.edges[edge[0]] = set((edge[1],))

            if self.verbose:
                counter += 1
                if counter % 1_000_000 == 0: # print update every million edges
                    print(f"++ loaded {counter} edges ")

        if self.verbose:
            print(f"Loaded edges")

    def save_matrix_to_csv(self, matrix, node_groups, fp, post_script):
        """Write the matrix to <fp>/<id><post_script>.csv; the first row holds
        the group sizes, the remaining rows the matrix values."""
        # str(): id may be an int (the constructor converts it the same way)
        with open(os.path.join(fp, str(self.id) + post_script + ".csv"), "w", newline="") as f:
            writer = csv.writer(f)

            writer.writerow([len(group) for group in node_groups.values()])

            for row in matrix:
                writer.writerow(row)

    def set_graph_representation(self):
        """Set self.G to the networkx.Graph() object representation."""
        self.G = nx.Graph()

        for node in self.nodes.keys():
            self.G.add_node(node)

        for u in self.edges.keys():
            for v in self.edges[u]:
                self.G.add_edge(u, v)

        # remove self loops
        if self.G.has_edge('0', '0'):
            self.G.remove_edge('0', '0')

    def get_graph_representation(self):
        """Get the networkx.Graph object representation; created lazily on
        first call."""
        if hasattr(self, 'G'):
            return self.G
        else:
            self.set_graph_representation()
            return self.G

    def plot_tail_degree_distribution(self, loglog=True, title=None, save=False, show=True):
        """Plot the tail degree distribution.

        @param loglog: plot on log-log axes
        @param title: plot title; required when save=True
        @param save: save the figure as <fp>/<title>_degrees.png
        @param show: show the figure interactively
        """
        assert (not save) or not title is None, "need title to save"
        assert save or show, "neither saving nor showing, what u want?"

        # https://github.com/LourensT/degreedistributions
        from DegreeDistributions.DegreeDistributions import DegreeDistribution
        distr = DegreeDistribution(self.get_graph_representation() ,tail=True)
        plt.scatter(x=distr.keys(), y=distr.values(), color='red')

        if loglog:
            plt.yscale("log")
            plt.xscale("log")

        if not title is None:
            plt.title(title)

        if save:
            # os.path.join for consistency with the constructor's path handling
            plt.savefig(os.path.join(self.fp, title + "_degrees.png"))
            print(f"Saved {title}_degrees.png")

        if show:
            plt.show()
        else:
            plt.clf()

    def plot_size_over_time(self, title=None, save=False, fit_exp = False, show = True, log=False):
        """Plot the network size over time. @pre's are checked, so method is
        reasonably robust.

        optional params:
        @title: string - title of the plot
        @save: bool    - if True, save as @title.png
                @pre: title != None
        @fit_exp: bool - if True, an exponential curve, OLS fit of y = a*exp(bx)
        @show: bool    - if True, show the figure interactively
        @log: bool     - if True, use a logarithmic y-axis
        """
        assert (not save) or not title is None, "need title to save"
        assert save or show, "neither saving nor showing, what u want?"

        # cumulative node count keyed by arrival time
        cumulative_growth = {}
        prev = 0
        cumulative_growth[prev] = 0
        for time in self.nodes.values():
            if time in cumulative_growth:
                cumulative_growth[time] += 1
            else:
                assert prev <= time, "not ordered; what the fuck"
                cumulative_growth[time] = 1 + cumulative_growth[prev]
                prev = time

        plt.scatter(x=cumulative_growth.keys(), y=cumulative_growth.values())

        legend = ["Population size over time"]

        # fit the exponential curve y = a*exp(bx) via OLS on log-counts,
        # skipping the first 10 points
        if fit_exp:
            logY = [np.log(i) for i in cumulative_growth.values() if i > 0][10:]
            x = [ [i,] for i in cumulative_growth.keys()][10:]

            from sklearn.linear_model import LinearRegression
            reg = LinearRegression().fit(x, logY)
            a = np.exp(reg.intercept_)
            b = reg.coef_[0]

            x_range = np.linspace(x[9][0], max(cumulative_growth.keys()), num=100)
            y_pred = [a*np.exp(b*n) for n in x_range]

            plt.plot(x_range, y_pred, color="red")
            legend.append(f"fit of y = a*exp(bx) with a = {a}, b = {b}")

        if not title is None:
            plt.title(title)

        if log:
            plt.yscale("log")

        if save:
            # os.path.join for consistency with the constructor's path handling
            plt.savefig(os.path.join(self.fp, title + "_growth.png"))
            print(f"Saved {title}_growth.png")

        if show:
            plt.show()
        else:
            plt.clf()