# %%
import networkx as nx
import os
import csv
from datetime import datetime

from multiprocessing import Pool

from dynamic_feature_extraction import GraphAnalysis

import numpy as np
import pandas as pd

""" Static features feasible for classifying medium-to-large networks. Feature not repeated if already used by earlier paper.
            NB: scale invariance not needed, since training networks have same #nodes and approx. same #edges. Density is pointless, since constant.

    -- global clust. coef. (=transitivity), degree assortativity, 4-motif counts
            (Characterizing the structural diversity of complex networks across domains, 2017)
    -- avg local clust. coef., modularity, DDQC of degree distr.
            (Classification of complex networks based on similarity of topological network features, 2017)
    -- diameter, effective diameter, power-law exponent, discretized distr. of: degree, local clust. coef., core, betweenness, closeness, Katz, PageRank centralities. 
            (Towards a Systematic Evaluation of Generative Network Models, 2018)
    -- total/avg/max triangles centered on edge, max k-core number, max clique
            (Complex networks are structurally distinguishable by domain, 2019, 
             peer-reviewed later version of retracted paper from 2018)
    -- entropy of degree distr., #communities, 3-node and 4-node motif counts.
            (Empirically Classifying Network Mechanisms, 2021)
"""

def calc_static_features(name):
    start = datetime.now()

    print(datetime.now().strftime("%H:%M:%S"), f"calculating static features for {name}")
    g = GraphAnalysis(name, fp=os.path.join(os.getcwd(), "../", "Dataset", "DatasetNetworkSource"))
    G = g.get_graph_representation()

    # assert there are no self-loops
    selfloops = list(nx.selfloop_edges(G))
    assert len(selfloops) == 0, f"self-loops found in {name}: {selfloops}"

    # these are too expensive at this scale (kept for reference);
    # a sampling-based workaround is sketched in sampled_closeness() below
    # print("\tpagerank distr.:", pageranks(G))
    # print("\tcloseness distr.:", closeness(G))
    # print("\tdiameter:", nx.diameter(G))

    row = ([name] + list(degrees(G)) + list(clustering(G)) + list(triangles(G))
           + [nx.transitivity(G), nx.degree_assortativity_coefficient(G)]
           + list(cores(G)))
    add_to_csv(row)
    duration = datetime.now() - start
    print(f"network {name} done in {duration}")

def _stats(vals, quantiles):
    # mean, std, min, max, followed by the requested quantiles
    return (np.mean(vals), np.std(vals), min(vals), max(vals),
            *np.quantile(vals, quantiles))

def degrees(G):
    vals = [d for _, d in nx.degree(G)]  # [(node, degree), ...] -> degrees
    return _stats(vals, [.125, .25, .5, .75, .875])

def clustering(G):
    vals = list(nx.clustering(G).values())
    return _stats(vals, [.5, .6, .7, .8, .9])

def triangles(G):
    vals = list(nx.triangles(G).values())  # incident on nodes, not edges!
    return _stats(vals, [.80, .90, .95, .97, .99])

def cores(G):
    vals = list(nx.core_number(G).values())
    return _stats(vals, [.25, .5, .75])

def closeness(G):
    vals = list(nx.closeness_centrality(G).values())
    return _stats(vals, [.25, .5, .75])

def pageranks(G):
    vals = list(nx.pagerank(G, alpha=0.85).values())
    return _stats(vals, [.25, .5, .75])
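
# Untested sketch (our own addition): estimate expensive centrality
# distributions on a uniform node sample instead of on all nodes.
# nx.closeness_centrality(G, u) runs one single-source shortest-path search,
# so the cost drops from O(n*(n+m)) to O(k*(n+m)). The function name, sample
# size k, and seed are assumptions, not part of the existing pipeline.
def sampled_closeness(G, k=1000, seed=0):
    rng = np.random.default_rng(seed)
    sample = rng.choice(list(G), size=min(k, G.number_of_nodes()), replace=False)
    vals = [nx.closeness_centrality(G, u) for u in sample]
    return (np.mean(vals), np.std(vals), min(vals), max(vals),
            *np.quantile(vals, [.25, .5, .75]))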

def add_to_csv(row):
    fp = os.path.join(os.getcwd(), "../", "Dataset", "StaticFeatures.csv")
    with open(fp, "a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(row)
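
# NB (our own note): the Pool workers all append to this one CSV concurrently.
# A single small writerow is usually flushed in one write() call, but that
# atomicity is not guaranteed on every platform. A hypothetical lock-guarded
# variant: add Lock to the multiprocessing import, create the pool with
# Pool(processes=4, initializer=_init_worker, initargs=(Lock(),)), and have
# workers call add_to_csv_locked instead of add_to_csv.
_csv_lock = None

def _init_worker(lock):
    # runs once in each worker process; stores the shared lock globally
    global _csv_lock
    _csv_lock = lock

def add_to_csv_locked(row):
    with _csv_lock:
        add_to_csv(row)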

def get_ids():
    FP_RES = os.path.join(os.getcwd(), "../", "Dataset", "results.csv")
    df_res = pd.read_csv(FP_RES, header=0)
    df_res = df_res[df_res["Result"] != "Died out for 1000 tries"]
    names = list(df_res["Name"].values)

    # remove names already processed locally
    FP_IDS = os.path.join(os.getcwd(), "../", "Dataset", "StaticFeatures.csv")
    df = pd.read_csv(FP_IDS, header=0)
    processed = list(df["Name"].values)
    names = [n for n in names if n not in processed]

    # remove names whose raw .json response is already in the folder
    names = [n for n in names if n + ".json" not in os.listdir(os.path.join(os.getcwd(), "rawresponse"))]

    # only category 0
    # names = [n for n in names if n[0] == str(group)]
    return names

# %%
# quick sanity check on a single network; guarded so that worker processes
# spawned by the Pool below do not execute it on import
if __name__ == '__main__' and os.path.exists("justone.graphml"):
    G = nx.read_graphml("justone.graphml")
    row = (list(degrees(G)) + list(clustering(G)) + list(triangles(G))
           + [nx.transitivity(G), nx.degree_assortativity_coefficient(G)]
           + list(cores(G)))
    print(row)

# %%
if __name__ == '__main__':
    # quantile columns are named after the actual quantiles each feature uses
    columns = ['avg deg', 'std deg', 'min deg', 'max deg',
               'q12.5 deg', 'q25 deg', 'q50 deg', 'q75 deg', 'q87.5 deg',
               'avg clust', 'std clust', 'min clust', 'max clust',
               'q50 clust', 'q60 clust', 'q70 clust', 'q80 clust', 'q90 clust',
               'avg tri', 'std tri', 'min tri', 'max tri',
               'q80 tri', 'q90 tri', 'q95 tri', 'q97 tri', 'q99 tri',
               'transitivity',
               'assortativity',
               'avg core', 'std core', 'min core', 'max core',
               'q25 core', 'q50 core', 'q75 core'
              ]

    # create StaticFeatures.csv with a header row if it does not exist yet
    fp = os.path.join(os.getcwd(), "../", "Dataset", "StaticFeatures.csv")

    if not os.path.exists(fp):
        with open(fp, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["Name"] + columns)

    # networks still to be processed
    names = get_ids()

    print(f"Calculating static features for still {len(names)} networks")

    # calculate features with parallel processes
    with Pool(processes=4) as pool:
        pool.map(calc_static_features, names)
