// WiscSort / variable_size / generate_binary.cc
// Generates a binary file containing the requested number of random records.
// Command-line arguments: file name, key size, value size mean,
// value size deviation, and number of records.
// Each key is exactly key size bytes long.
// The value size is chosen randomly for every record from the range between
// (value size mean - value size deviation) and value size mean.
// Each record has the format: key, value size (4B), value.
// The file has the format: record 1, record 2, ...
// The file size is the sum over all records of (key size + 4B + value size).
// The keys are uniformly distributed.
// The values are uniformly distributed.
// The value size is uniformly distributed.

#include <iostream>
#include <fstream>
#include <cstdlib>
#include <ctime>
#include <string>
#include <cstring>
#include <math.h>
#include <random>

using namespace std;

/**
 * Returns one normally distributed sample built from rand(), using the polar
 * (rejection) form of the Box-Muller transform.
 *
 * Every accepted point yields two standard-normal deviates: the first is
 * returned immediately, the second is kept in function-local static storage
 * and handed out by the following call. The spare is stored unscaled, so the
 * mean/stddev of the call that consumes it are the ones applied to it.
 *
 * @param mean      value added to the scaled deviate
 * @param stddev    factor the standard-normal deviate is multiplied by
 * @param max_range not used by this function
 * @return deviate * stddev + mean
 */
double rand_normal(double mean, double stddev, int max_range)
{
    // Unscaled second deviate of the most recent pair, and whether it is pending.
    static double spare = 0.0;
    static int have_spare = 0;

    // Fast path: hand out the deviate left over from the previous call.
    if (have_spare)
    {
        have_spare = 0;
        return spare * stddev + mean;
    }

    // Draw points in the square [-1, 1] x [-1, 1] until one lies inside the
    // unit circle and is not the origin (log(0) and division by 0 otherwise).
    double u, v, s;
    for (;;)
    {
        u = 2.0 * rand() / RAND_MAX - 1;
        v = 2.0 * rand() / RAND_MAX - 1;
        s = u * u + v * v;
        if (s != 0.0 && !(s > 1.0))
        {
            break;
        }
    }

    const double scale = sqrt(-2.0 * log(s) / s);
    const double first = u * scale;

    // Remember the partner deviate for the next call.
    spare = v * scale;
    have_spare = 1;

    return first * stddev + mean;
}

/**
 * Entry point: writes a binary file of randomly generated variable-size records.
 *
 * Command line:
 *   argv[1]  output file name
 *   argv[2]  key size in bytes (must be >= 0)
 *   argv[3]  value size mean; one end of the value-size range
 *   argv[4]  value size deviation; the other end of the range is
 *            (mean - deviation)
 *   argv[5]  number of records
 *
 * For every record the value size is drawn uniformly from the range spanned by
 * (mean - deviation) and mean, then rounded to the nearest integer. A record is
 * written as: key bytes, the value size as a native int, value bytes.
 *
 * Returns 0 on success and 1 on a usage error, an invalid argument, or an
 * I/O failure.
 */
int main(int argc, char *argv[])
{
    if (argc != 6)
    {
        std::cout << "Usage:" << argv[0] << " <file name> <key size> <value size mean> <value size std dev> <number of records>" << std::endl;
        return 1;
    }

    const std::string file_name = argv[1];
    const int key_size = std::atoi(argv[2]);
    const int value_sz_mean = std::atoi(argv[3]);
    const int value_sz_dev = std::atoi(argv[4]);

    // A leading '-' would be wrapped to a huge unsigned count by strtoull.
    if (argv[5][0] == '-')
    {
        std::cerr << "Error: number of records must not be negative" << std::endl;
        return 1;
    }
    // strtoull instead of atoi so counts above INT_MAX are not truncated.
    const size_t num_records = std::strtoull(argv[5], nullptr, 10);

    // The value-size range runs between (mean - deviation) and mean. Order the
    // two ends explicitly: uniform_real_distribution requires lower <= upper,
    // which a positive deviation would otherwise violate.
    const int value_sz_other = value_sz_mean - value_sz_dev;
    const int value_sz_lo = value_sz_other < value_sz_mean ? value_sz_other : value_sz_mean;
    const int value_sz_hi = value_sz_other < value_sz_mean ? value_sz_mean : value_sz_other;

    if (key_size < 0)
    {
        std::cerr << "Error: key size must not be negative" << std::endl;
        return 1;
    }
    if (value_sz_lo < 0)
    {
        // A negative bound could produce negative value sizes, which would be
        // written into the file and passed to write() as a length.
        std::cerr << "Error: value size range [" << value_sz_lo << ", " << value_sz_hi
                  << "] must not contain negative sizes" << std::endl;
        return 1;
    }

    // print parameters
    std::cout << "File name: " << file_name << std::endl;
    std::cout << "Key size: " << key_size << std::endl;
    std::cout << "Value size mean: " << value_sz_mean << std::endl;
    std::cout << "Value size std dev: " << value_sz_dev << std::endl;
    std::cout << "Value size range: [" << value_sz_lo << ", " << value_sz_hi << "]" << std::endl;
    std::cout << "Number of records: " << num_records << std::endl;

    std::ofstream file(file_name.c_str(), std::ios::out | std::ios::binary);
    if (!file)
    {
        std::cerr << "Error: cannot open " << file_name << " for writing" << std::endl;
        return 1;
    }

    // rand() supplies the key/value bytes; the Mersenne Twister supplies value sizes.
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    std::random_device rd;
    std::mt19937 e2(rd());
    std::uniform_real_distribution<> dist(value_sz_lo, value_sz_hi);

    // Buffers are reused across records instead of being rebuilt byte by byte.
    std::string key(static_cast<size_t>(key_size), '\0');
    std::string value;

    for (size_t i = 0; i < num_records; i++)
    {
        // generate key
        for (char &byte : key)
        {
            byte = static_cast<char>(std::rand() % 256);
        }

        // generate value size (non-negative because value_sz_lo >= 0)
        const int value_size = static_cast<int>(std::round(dist(e2)));

        // generate value
        value.resize(static_cast<size_t>(value_size));
        for (char &byte : value)
        {
            byte = static_cast<char>(std::rand() % 256);
        }

        // write key, value size, value
        file.write(key.data(), key_size);
        file.write(reinterpret_cast<const char *>(&value_size), sizeof(value_size));
        file.write(value.data(), value_size);

        if (!file)
        {
            std::cerr << "Error: write to " << file_name << " failed at record " << i << std::endl;
            return 1;
        }
    }

    file.close();
    if (!file)
    {
        std::cerr << "Error: closing " << file_name << " failed" << std::endl;
        return 1;
    }

    return 0;
}