# DynamicNetworkSimulation / generate_dataset.py
# generate_dataset.py
# Raw
from dynamic_feature_extraction import *
from simulation import *
from empirical_distribution_outdegree import sample_empirical_outdegree

from typing import List
import os
import random

from multiprocessing import Pool

import scipy.stats
from datetime import datetime

# root folder for everything this script writes, located in the current working directory
FILEPATH = os.path.join(os.getcwd(), "Dataset")

# absolute filepath in which to put the features
FEATUREMATRIX_FP = os.path.join(FILEPATH, "DatasetFeatures")
# absolute filepath in which to store the generated networks
SOURCE_FP = os.path.join(FILEPATH, "DatasetNetworkSource")

# Create the output folders if they do not exist yet.
# makedirs (unlike mkdir) also creates the missing "Dataset" parent folder, and
# exist_ok=True avoids a crash when the folder already exists or is created by
# another process between a check and the creation.
os.makedirs(FEATUREMATRIX_FP, exist_ok=True)
os.makedirs(SOURCE_FP, exist_ok=True)

# index range of the networks generated per category: the indices run from
# START_NUMBER up to (but not including) FINAL_NUMBER
START_NUMBER = 0
FINAL_NUMBER = 750
# number of worker processes in the multiprocessing pool
NR_OF_PROCESSES = 8

# network size handed to the simulator for every generated network
NETWORK_SIZE = 20000
# NOTE(review): a "number of different sizes to calculate features for" setting
# was announced here, but no such constant is defined in this file.

# dimensions (a, b) of the feature matrix; each dimension has a minimum of 2
MATRIX_DIMENSIONS = (10,10)

def dump_features(name : str) -> None:
    '''
    Calculates and dumps the feature matrix of one generated network.

    The network is loaded through GraphAnalysis from SOURCE_FP and the feature
    matrix is calculated with the dimensions given by MATRIX_DIMENSIONS. The
    matrix is saved (save=True) with FEATUREMATRIX_FP as target filepath.

    @arg name: string, name of the network; it is passed to GraphAnalysis
               together with SOURCE_FP to locate the network files
    '''
    dim_a, dim_b = MATRIX_DIMENSIONS[0], MATRIX_DIMENSIONS[1]
    analysis = GraphAnalysis(name, SOURCE_FP, verbose=False)
    analysis.calculate_feature_matrix(a=dim_a, b=dim_b, fp=FEATUREMATRIX_FP, scaled_timesteps=True, save=True)

def append_result(name, params, result):
    '''
    Appends one row with the outcome of a generation run to FILEPATH/results.csv.

    The row consists of the name, the entries of params and the result, and
    must contain exactly 7 entries (matching the header columns).

    @arg name: name of the network
    @arg params: iterable with the 5 entries Code, Network Size, PA, Fitness, Aging
    @arg result: outcome to record (e.g. number of tries or an error message)
    '''
    # local import: this file has no explicit top-level "import csv"
    # (it was presumably only available through one of the wildcard imports)
    import csv

    row = [name, *params, result]
    assert len(row) == 7, "incorrect length of entry"

    filename = os.path.join(FILEPATH, "results.csv")
    # Always open in append mode: this function is called from several worker
    # processes, and opening with "w" after a separate existence check could
    # truncate rows another process has just written.
    with open(filename, "a", newline="") as f:
        writer = csv.writer(f)
        # in append mode the position starts at the end of the file, so a
        # position of 0 means the file is new or empty and needs its header
        if f.tell() == 0:
            writer.writerow(["Name", "Code", "Network Size", "PA", "Fitness", "Aging", "Result"])
        writer.writerow(row)

def name_exists(name):
    '''
    Tells whether the feature matrix file "<name>_matrix.csv" is already
    present in FEATUREMATRIX_FP.

    @arg name: string, name of the network
    @return: True if the matrix file exists as a regular file, otherwise False
    '''
    matrix_filename = name + "_matrix.csv"
    matrix_path = os.path.join(FEATUREMATRIX_FP, matrix_filename)
    return os.path.isfile(matrix_path)

def generate_network(name, code, *args):
    '''
    Generates one network, records the outcome in results.csv and dumps its features.

    Nothing is done when the feature matrix of this network already exists.
    Otherwise a new simulator is built and run until generate() returns a
    truthy value, with at most 1000 tries. A falsy return value is treated as
    the network having died out. On success the number of tries is recorded
    and the features are dumped; when every try failed, this is recorded
    instead and no features are dumped.

    @arg name: string, name of the network
    @arg code: category code of the network, stored in the results file
    @arg args: remaining model parameters, passed on unchanged to
               simulator_from_parameters and stored in the results file
    '''
    if name_exists(name):
        print(datetime.now().strftime("%H:%M:%S"), f"{name} already exists")
        return

    print(datetime.now().strftime("%H:%M:%S"), f"generating {name}")
    info = [code, NETWORK_SIZE] + list(args)

    max_tries = 1000
    # A bounded for/else makes exactly max_tries attempts and only reports
    # "died out" when no attempt succeeded; a success on the final attempt is
    # recorded as a success.
    for count in range(1, max_tries + 1):
        ctbp = simulator_from_parameters(NETWORK_SIZE, sample_empirical_outdegree, *args)
        if ctbp.generate(save_to=[SOURCE_FP, name], debug=False):
            break
    else:
        # died out too many times
        append_result(name, info, f"Died out for {max_tries} tries")
        return

    append_result(name, info, count)

    dump_features(name)

# 0 - Uniform Attachment Model
# 1 - Affine PA(a,b)
# 2 - Power-law (xmin, tau) Fitness
# 3 - Exponential (lambda) Fitness 
# 4 - Power-law Fitness (xmin, tau), 
#     Aging (mu, st.dev).
# 5 - Exponential Fitness (lambda),
#     Aging (mu, st.dev)
# 6 - PA(a,b) with Uniform (a, b) Fitness
# 7 - PA(a,b),
#     Aging(mu, st.dev).
# 8 - PA(a,b), 
#     Exponential fitness (lambda),  
#     Aging (mu, st.dev).

def main_procedure():
    '''
    Draws random model parameters for every network of every category (0-8),
    shuffles the resulting jobs and generates all networks in a pool of
    NR_OF_PROCESSES worker processes.

    Every job is a list [name, code, pa, fitness, aging] that is unpacked into
    the arguments of generate_network; a component a category does not use is None.
    '''
    print(datetime.now().strftime("%H:%M:%S"), "Generating parameters")

    def draw(loc, scale):
        # one sample of scipy's uniform distribution with the given loc and scale
        return scipy.stats.uniform(loc=loc, scale=scale).rvs()

    indices = range(START_NUMBER, FINAL_NUMBER)
    jobs = []

    # 0 - Uniform Attachment
    for idx in indices:
        jobs.append([f"0-{idx}", 0, None, None, None])

    # 1 - Affine PA(a,b)
    for idx in indices:
        pa = (draw(1, 3), draw(1, 3))
        jobs.append([f"1-{idx}", 1, pa, None, None])

    # 2 - Power-law (xmin, tau) Fitness
    for idx in indices:
        fitness = (draw(0, 1), draw(2, 2))
        jobs.append([f"2-{idx}", 2, None, fitness, None])

    # 3 - Exponential (lambda) Fitness
    for idx in indices:
        fitness = draw(0.1, 2.9)
        jobs.append([f"3-{idx}", 3, None, fitness, None])

    # 4 - Power-law Fitness (xmin, tau),
    #     Aging (mu, st.dev).
    for idx in indices:
        fitness = (draw(0.5, 0.5), draw(2, 0.7))
        aging = (draw(0.1, 2.9), 1)
        jobs.append([f"4-{idx}", 4, None, fitness, aging])

    # 5 - Exponential Fitness (lambda),
    #     Aging (mu, st.dev)
    for idx in indices:
        fitness = draw(0.1, 0.9)
        aging = (draw(0.1, 2.9), 1)
        jobs.append([f"5-{idx}", 5, None, fitness, aging])

    # 6 - PA(a,b) with Uniform (a, b) Fitness
    for idx in indices:
        pa = (draw(1, 3), draw(1, 3))
        fitness = ("uniform", draw(0.1, 0.9), draw(1, 4))
        jobs.append([f"6-{idx}", 6, pa, fitness, None])

    # 7 - PA(a,b),
    #     Aging(mu, st.dev).
    for idx in indices:
        pa = (draw(3.3, 3.7), draw(1, 3))
        # NOTE(review): every other category draws the aging mu with scale 2.9;
        # confirm that scale 29 is intended here and not a typo.
        aging = (draw(0.1, 29), 1)
        jobs.append([f"7-{idx}", 7, pa, None, aging])

    # 8 - PA(a,b),
    #     Exponential fitness (lambda),
    #     Aging (mu, st.dev).
    for idx in indices:
        pa = (draw(1, 3), draw(1, 3))
        # lambda  ∈ [0.1,  a + b/E[M]]
        lambda_width = (pa[0] + (pa[1] / 10.29)) - 0.1
        fitness = draw(0.1, lambda_width)
        aging = (draw(0.1, 2.9), 1)
        jobs.append([f"8-{idx}", 8, pa, fitness, aging])

    # mix the categories so that they are spread over the whole run
    random.shuffle(jobs)

    print(datetime.now().strftime("%H:%M:%S"), f"Generated Params, moving on to generating {len(jobs)} networks.")

    with Pool(processes=NR_OF_PROCESSES) as pool:
        pool.starmap(generate_network, jobs)

# Script entry point. The guard makes sure main_procedure only runs when this
# file is executed directly, not when the module is imported (worker processes
# may re-import this module, depending on the multiprocessing start method).
if __name__ == "__main__":
    main_procedure()