Learn-to-Compress / experiments / snappy_int.cpp
snappy_int.cpp
Raw
#include "../headers/common.h"
#include "../headers/codecfactory.h"
#include "../headers/caltime.h"
#include "../headers/lr.h"
#include "snappy.h"
typedef uint64_t leco_type;
int random(int m)
{
    return rand() % m;
}
template <typename T>
static std::vector<T> load_data_binary(const std::string& filename,
    bool print = true) {
    std::vector<T> data;

    std::ifstream in(filename, std::ios::binary);
    if (!in.is_open()) {
        std::cerr << "unable to open " << filename << std::endl;
        exit(EXIT_FAILURE);
    }
    // Read size.
    uint64_t size;
    in.read(reinterpret_cast<char*>(&size), sizeof(uint64_t));
    data.resize(size);
    // Read values.
    in.read(reinterpret_cast<char*>(data.data()), size * sizeof(T));
    in.close();

    return data;
}


template <typename T>
static std::vector<T> load_data(const std::string& filename) {
    std::vector<T> data;
    std::ifstream srcFile(filename, std::ios::in);
    if (!srcFile) {
        std::cout << "error opening source file." << std::endl;
        return data;
    }

    while (srcFile.good()) {
        T next;
        srcFile >> next;
        if (!srcFile.good()) { break; }
        data.emplace_back(next);

    }
    srcFile.close();

    return data;
}

int main(int argc, const char *argv[])
{
    using namespace Codecset;
    std::string source_file = std::string(argv[1]);
    int blocks = atoi(argv[2]);
    int model_size = atoi(argv[3]);
    bool binary = atoi(argv[4]);
    std::ios::sync_with_stdio(false);
    // We pick a CODEC

    std::vector<leco_type> data;
    source_file = "../data/"+source_file;

    if(binary){
        data = load_data_binary<leco_type>(source_file);
    }
    else{
        data = load_data<leco_type>(source_file);
    }
    int N = data.size();
    if (data.size() == 0)
    {
        std::cout << "Empty vector" << std::endl;
        return 0;
    }
    
    // std::sort(data.begin(),data.end());
    int block_size = data.size() / blocks;
    blocks = data.size() / block_size;
    if (blocks * block_size < N)
    {
        blocks++;
    }
    int delta = 32;
    std::vector<std::string> block_start_vec;
    std::vector<int> start_index;
    std::vector<int> seglen;

    int totalsize = 0;
    double start = getNow();
    for (int i = 0; i < blocks; i++)
    {
        int block_length = block_size;
        if (i == blocks - 1)
        {
            block_length = N - (blocks - 1) * block_size;
        }
        // uint8_t *descriptor = (uint8_t *)malloc(block_length * sizeof(uint64_t)+1000);
        // uint8_t *res = descriptor;
        std::string compressed;
        int segment_size = snappy::Compress((char *)(data.data() + i * block_size), block_length * sizeof(leco_type), &compressed);
        // std::cout<<segment_size<<std::endl;
        seglen.push_back(segment_size);
        // descriptor = (uint8_t *)realloc(descriptor,segment_size);
        block_start_vec.push_back(compressed);
        totalsize += segment_size;
    }
    double end = getNow();
    double compress_time = end - start;

    double compress_throughput = N * sizeof(leco_type) / (compress_time * 1000000000);

    double origin_size = (sizeof(leco_type) * N * 1.0);
    double total_model_size = model_size * blocks;
    double cr_wo_model = (totalsize - total_model_size) * 100.0 / origin_size;
    double cr_model = total_model_size * 100.0 / origin_size;
    double compressrate = (totalsize)*100.0 / origin_size;

    bool flag = true;
    std::vector<leco_type> recover(data.size());
    double totaltime = 0.0;
    // std::cout << "decompress all!" << std::endl;
    int repeat = 10;

    start = getNow();
    for (int iter = 0; iter < repeat; iter++)
    {
        for (int i = 0; i < blocks; i++)
        {
            int block_length = block_size;
            if (i == blocks - 1)
            {
                block_length = N - (blocks - 1) * block_size;
            }
            std::string uncomp_str;
            // const char* reover= reinterpret_cast<const char*>(recover + block_size*i);
            snappy::Uncompress(block_start_vec[i].c_str(), seglen[i], &uncomp_str);
            // std::cout<<uncomp_str<<std::endl;
            // &recover[i*block_size] = reinterpret_cast<leco_type*>
        }
        
    }
    end = getNow();

    totaltime += (end - start);
    double da_ns = totaltime / (repeat * data.size()) * 1000000000;
    // std::cout << da_ns << std::endl;
    // std::cout << "random access decompress!" << std::endl;
    
    std::vector<uint32_t> ra_pos;
    repeat = 1;
    for(int i=0;i<N*repeat;i++){
        ra_pos.emplace_back(random(N));
        // ra_pos.push_back(i);
    }
    std::vector<uint32_t> buffer(data.size());

    double randomaccesstime = 0.0;
    start = getNow();
    uint32_t mark = 0;
    for (auto index : ra_pos)
    {
        std::string uncomp_str;
        snappy::Uncompress(block_start_vec[(int)index / block_size].c_str(), seglen[(int)index / block_size], &uncomp_str);
        leco_type tmpvalue;
        memcpy(&tmpvalue, uncomp_str.c_str() + (index % block_size) * sizeof(leco_type), sizeof(leco_type));
        // std::cout<<tmpvalue<<std::endl;
        mark += tmpvalue;

        // if (data[index] != tmpvalue)
        // {

        //     std::cout << "num: " << index << "true is: " << data[index] << " predict is: " << tmpvalue << std::endl;
        //     flag = false;
        //     std::cout << "something wrong! decompress failed" << std::endl;
        // }
        // if (!flag)
        // {
        //     break;
        // }
    }
    
    end = getNow();
    randomaccesstime += (end - start);
    std::ofstream outfile("fix_log", std::ios::app);
    outfile<<mark<<std::endl;

    double ra_ns = randomaccesstime / (N*repeat) * 1000000000;

    std::cout<<"snappy "<<source_file<<" "<<blocks<<" "<<compressrate<<" "<<cr_model<<" "<<cr_wo_model<<" "<<da_ns<<" "<<ra_ns<<" "<<compress_throughput<<std::endl;
    

    /*
        uint64_t total_sum = 0;
        start = getNow();
        for (int i = 0; i < blocks; i++)
        {
            int block_length = block_size;
            if (i == blocks - 1)
            {
                block_length = N - (blocks - 1) * block_size;
            }
            uint64_t block_sum = codec.summation(block_start_vec[i], block_length, i);
            total_sum += block_sum;
            uint64_t ground_truth = 0;
            for (int j = 0; j < block_length; j++)
            {
                ground_truth+=data[i*block_size+j];
            }
            // std::cout<<"block "<<i<<" "<<ground_truth<<" "<<block_sum<<std::endl;
            if(ground_truth!=block_sum){
                std::cout<<"block "<<i<<" "<<ground_truth<<" "<<block_sum<<std::endl;
            }
            assert(block_sum == ground_truth);
        }
        end = getNow();
        std::cout<<total_sum<<std::endl;
        double sum_time = end - start;
        std::cout << "sum time: " << sum_time/N * 1000000000 << std::endl;
    */
    // for (int i = 0; i < (int)block_start_vec.size(); i++)
    // {
    //     free(block_start_vec[i]);
    // }
}