Learn-to-Compress / scripts / gen_bitmap.py
gen_bitmap.py
Raw
import numpy as np
import struct
from scipy.stats import norm, lognorm
import os
import threading

# any arbitrary seed value will do, but this one is clearly the best.
np.random.seed(seed=42) 

# NUM_KEYS = 200000000
# NUM_KEYS = 289000
# NUM_KEYS_list = [1000000]
# NUM_KEYS_list = [200000000, 199999994]
# NUM_KEYS_list = [14057565]
NUM_KEYS_list = [200015910]
# NUM_KEYS_list = [200000000, 199999994, 289000, 2076000, 233000]


prob = [0.00001, 0.0001, 0.0005, 0.001, 0.01, 0.1]
# prob = [0.1]
# 0.001%, 0.01%, 0.05%, 0.1%, 1%

def uniform():
    for NUM_KEYS in NUM_KEYS_list:
        for prob_ in prob:
            print("Generating random bitmap...")
            print(prob_)
            file_name = "../data/bitmap_random/bitmap_random_"+str(prob_)+"_"+str(NUM_KEYS)+".txt"
            if not os.path.exists(file_name):
                count = 0
                keys = np.random.choice([0, 1], size=NUM_KEYS, p=[1.0-prob_, prob_])
                for i in range(NUM_KEYS):
                    if keys[i] == 1:
                        count+=1
                print(count)
                keys = np.array(keys)
                keys = keys.astype(np.uint32)
            

                np.savetxt(file_name, keys,fmt='%d')

def myThread(divide_base, num_cluster, prob_, NUM_KEYS):
    factor = 1 / divide_base**num_cluster
    print("Generating random bitmap...")
    print(prob_)
    file_name = "../data/bitmap_random_cluster/bitmap_random_"+str(prob_)+"_"+str(NUM_KEYS)+".txt"
    # if not os.path.exists(file_name):
    count = 0
    keys_list = []
    sum_prob = 0
    for i in range(num_cluster - 1):
        keys_list.append(np.random.choice([0, 1], size=NUM_KEYS//num_cluster, p=[1.0-prob_*factor, prob_*factor]))
        sum_prob += factor
        factor *= divide_base
    keys_list.append(np.random.choice([0, 1], size=NUM_KEYS//num_cluster, p=[1.0-prob_*(num_cluster - sum_prob), prob_*(num_cluster - sum_prob)]))
    for i in range(NUM_KEYS//num_cluster):
        for j in keys_list:
            if j[i] == 1:
                count += 1
    print(count)
    keys = np.concatenate(tuple(keys_list))
    keys = keys.astype(np.uint32)
    np.savetxt(file_name, keys,fmt='%d')
    
def cluster():
    num_cluster = 10
    divide_base = 2
    ts = []
    for NUM_KEYS in NUM_KEYS_list:
        for prob_ in prob:
            t = threading.Thread(target=myThread, args=(divide_base, num_cluster, prob_, NUM_KEYS))
            t.start()
            ts.append(t)
    for t in ts:
        t.join()


def cluster_v0():
    for NUM_KEYS in NUM_KEYS_list:
        selectivity_ = prob
        cluster_ = [10]
        print("Generating cluster bitmap...")
        for selectivity in selectivity_:
            for cluster in cluster_:
                if cluster > selectivity*NUM_KEYS:
                    continue
                
                file_name = f"../data/bitmap/bitmap_random_{selectivity}_{NUM_KEYS}.txt"
                print(file_name)
                if not os.path.exists(file_name):
                    start_ind = []
                    keys = []
                    # selectivity * NUM_KEYS
                    count = 0
                    for i in range(cluster):
                        ind = np.random.randint(0,(1-selectivity)*NUM_KEYS/cluster) #avoid that the last 1 is in the next seg
                        start_ind.append(i*NUM_KEYS/cluster + ind)
                    for i in range(cluster):
                        for j in range(int(NUM_KEYS/cluster)):
                            tmp = i*NUM_KEYS/cluster + j
                            if tmp>=start_ind[i] and tmp<start_ind[i]+selectivity*NUM_KEYS/cluster:
                                keys.append(1)
                                count+=1
                            else:
                                keys.append(0)
                    print(count)
                    keys = np.array(keys)
                    keys = keys.astype(np.uint32)
            

                    np.savetxt(file_name, keys,fmt='%d')

if __name__ == "__main__":
    cluster()
    # cluster_v0()