# Learn-to-Compress / scripts / gen_norm.py
import numpy as np
import struct
from scipy.stats import norm, lognorm
import os

# Pin NumPy's global RNG so any random draws are reproducible between runs.
# The specific seed value is arbitrary.
np.random.seed(42)

# Number of keys written to each generated dataset (200 million).
NUM_KEYS = 200 * 1000 * 1000

print("Generating linear data...")
# Evenly spaced keys stretched over the full uint32 range, written as text
# with one integer per line. Generation is skipped if the file already exists.
linear_path = "../integer_data/linear_200M_uint32.txt"
if not os.path.exists(linear_path):
    print("32 bit...")
    # Make sure the output directory exists *before* the expensive work, so a
    # missing folder does not surface as a FileNotFoundError only at save time.
    os.makedirs(os.path.dirname(linear_path), exist_ok=True)

    # NUM_KEYS interior points of [0, 1]; both endpoints are dropped.
    keys = np.linspace(0, 1, NUM_KEYS + 2)[1:-1]

    # Min-max rescale so the smallest key maps to 0 and the largest to 1.
    lo, hi = np.min(keys), np.max(keys)
    keys = (keys - lo) / (hi - lo)

    # Stretch to [0, 2**32 - 1] and truncate to unsigned 32-bit integers.
    keys *= 2**32 - 1
    keys = keys.astype(np.uint32)
    np.savetxt(linear_path, keys, fmt='%d')


print("Generating normal data...")
# Keys following a normal distribution: the normal quantile function (PPF) is
# applied to evenly spaced probabilities, then the result is rescaled to the
# full uint32 range and written as text with one integer per line.
# Generation is skipped if the file already exists.
normal_path = "../integer_data/normal_200M_uint32.txt"
if not os.path.exists(normal_path):
    print("32 bit...")
    # Make sure the output directory exists *before* the expensive work, so a
    # missing folder does not surface as a FileNotFoundError only at save time.
    os.makedirs(os.path.dirname(normal_path), exist_ok=True)

    # NUM_KEYS interior points of (0, 1); the endpoints are dropped, which
    # keeps the PPF away from its infinite values at 0 and 1.
    keys = np.linspace(0, 1, NUM_KEYS + 2)[1:-1]

    # The original author observed that the PPF's memory use grows much
    # faster than its input size, so it is evaluated in 1000 chunks.
    # np.concatenate joins the per-chunk results in order and, unlike
    # np.array(...).flatten(), also works when the chunks have unequal
    # lengths (NUM_KEYS not divisible by 1000) and copies the data only once.
    chunks = np.array_split(keys, 1000)
    keys = np.concatenate([norm.ppf(chunk) for chunk in chunks])

    # Min-max rescale so the smallest key maps to 0 and the largest to 1.
    lo, hi = np.min(keys), np.max(keys)
    keys = (keys - lo) / (hi - lo)

    # Stretch to [0, 2**32 - 1] and truncate to unsigned 32-bit integers.
    keys *= 2**32 - 1
    keys = keys.astype(np.uint32)
    np.savetxt(normal_path, keys, fmt='%d')


# NOTE(review): the two generators below are disabled (commented out).
# They read/write under "data/" rather than "../integer_data/" like the
# active generators above, so the paths need reconciling before re-enabling.
# print("Generating log normal data...")
# if not os.path.exists("data/lognormal_200M_uint32.txt"):
#     print("32 bit...")
#     keys = np.linspace(0, 1, NUM_KEYS + 2)[1:-1]

#     # using a sigma of 2 for the 32 bit keys produces WAY too many
#     # duplicates, so we will deviate from the RMI paper
#     # and use 1.
#     keys = np.array_split(keys, 1000)
#     keys = [lognorm.ppf(x, 1) for x in keys]
#     keys = np.array(keys).flatten()

#     keys = (keys - np.min(keys)) / (np.max(keys) - np.min(keys))
#     keys *= 2**32 - 1
#     keys = keys.astype(np.uint32)
#     np.savetxt("data/lognormal_200M_uint32.txt", keys,fmt='%d')


# NOTE(review): this block converts a binary uint32 file to text. The first
# two uint32 values are skipped ([2:]) -- presumably a length header in the
# input file; TODO confirm. `chardet` is imported but not used here.
# print("Generating books data...")
# if not os.path.exists("data/books_200M_uint32.txt"):
#     print("32 bit...")
#     import chardet
#     keys = np.fromfile("data/books_200M_uint32", dtype=np.uint32)[2:]
#     np.savetxt("data/books_200M_uint32.txt", keys,fmt='%d')