// Learn-to-Compress / experiments / example_fsst.cpp
// Searches for hyper-parameters such as the block number.

#include "../headers/common.h"
#include "../headers/caltime.h"
#include "../headers/string/fsst_string.h"
#include "../headers/bit_read.h"
#include "../headers/bit_write.h"
#include "../headers/delta_my.h"
using namespace Codecset;

/**
 * Returns a pseudo-random integer drawn from rand() and reduced modulo m.
 *
 * @param m exclusive upper bound of the result; expected to be positive.
 * @return a value in [0, m) for positive m, or 0 when m is 0 (there is no
 *         valid value in an empty range, and `rand() % 0` is undefined).
 */
int random(int m)
{
  // Guard the modulo: a zero divisor is undefined behaviour and traps on most targets.
  if (m == 0)
  {
    return 0;
  }
  return rand() % m;
}
/**
 * Benchmark driver: compresses the whitespace-separated strings of a file with
 * FSST_string and times random access to them, optionally delta-compressing the
 * offset array in fixed-size blocks.
 *
 * Usage: <prog> <source_file> <offset_delta_compress 0|1> <delta_block_size>
 *
 * On success prints one line to stdout:
 *   <source_file> <delta_block_size> <compress rate %> <compress rate incl. index %> <random access ns>
 *
 * @return 0 on success (and, as before, when the source file cannot be opened);
 *         1 when the arguments or the input are unusable.
 */
int main(int argc, const char* argv[])
{
    if (argc < 4)
    {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "example_fsst")
                  << " <source_file> <offset_delta_compress 0|1> <delta_block_size>" << std::endl;
        return 1;
    }
    const std::string source_file = std::string(argv[1]);
    const bool offset_delta_compress = std::atoi(argv[2]) != 0;
    const int delta_block_size = std::atoi(argv[3]);
    if (delta_block_size <= 0)
    {
        // The block size is used as a divisor below.
        std::cerr << "delta_block_size must be a positive integer." << std::endl;
        return 1;
    }

    std::vector<std::string> string_vec;
    std::ifstream srcFile(source_file, std::ios::in);
    if (!srcFile)
    {
        std::cout << "error opening source file." << std::endl;
        return 0;
    }
    // Testing the extraction itself keeps the final token even when the file
    // does not end with whitespace (checking good() after the read would drop it).
    std::string tmp_str;
    while (srcFile >> tmp_str)
    {
        string_vec.push_back(tmp_str);
    }
    srcFile.close();

    const int N = string_vec.size();
    if (N == 0)
    {
        std::cerr << "source file contains no strings." << std::endl;
        return 1;
    }
    if (N < delta_block_size)
    {
        // Random positions are drawn from the full blocks only; with no full
        // block the range would be empty.
        std::cerr << "delta_block_size must not exceed the number of strings (" << N << ")." << std::endl;
        return 1;
    }

    FSST_string codec;
    delta_my delta_codec;

    // Number of delta blocks, rounded up so a trailing partial block is counted.
    const int delta_blocks = N / delta_block_size + (N % delta_block_size != 0 ? 1 : 0);
    if (offset_delta_compress)
    {
        delta_codec.init(delta_blocks, delta_block_size, 0);
    }

    uint64_t totalsize_without_padding = 0;
    for (const auto& item : string_vec)
    {
        totalsize_without_padding += item.size();
    }

    // All strings are encoded as one block; the codec fills `offset`.
    std::vector<uint32_t> offset;
    const uint64_t totalsize = codec.encodeArray8_string(string_vec, offset);
    offset.insert(offset.begin(), 0);

    uint64_t totalsize_delta_offset = 0;
    std::vector<uint8_t*> delta_descriptor_of_each_block;
    // Releases every encoded offset block allocated below.
    auto free_descriptors = [&delta_descriptor_of_each_block]() {
        for (uint8_t* block : delta_descriptor_of_each_block)
        {
            free(block);
        }
        delta_descriptor_of_each_block.clear();
    };

    if (offset_delta_compress)
    {
        delta_descriptor_of_each_block.reserve(delta_blocks);
        for (int i = 0; i < delta_blocks; i++)
        {
            int block_length = delta_block_size;
            if (i == delta_blocks - 1)
            {
                block_length = N - (delta_blocks - 1) * delta_block_size;
            }
            uint8_t* descriptor = static_cast<uint8_t*>(malloc(block_length * sizeof(uint64_t) * 2));
            if (descriptor == nullptr)
            {
                std::cerr << "out of memory while encoding offsets." << std::endl;
                free_descriptors();
                return 1;
            }
            uint8_t* res = delta_codec.encodeArray8(offset.data() + (i * delta_block_size), block_length, descriptor, i);
            // Measure the encoded size BEFORE realloc: realloc may move the
            // buffer, after which `res - descriptor` is meaningless.
            const uint64_t encoded_size = static_cast<uint64_t>(res - descriptor);
            if (encoded_size > 0)
            {
                // Shrink to fit; if realloc fails the original buffer is still valid.
                uint8_t* shrunk = static_cast<uint8_t*>(realloc(descriptor, encoded_size));
                if (shrunk != nullptr)
                {
                    descriptor = shrunk;
                }
            }
            delta_descriptor_of_each_block.push_back(descriptor);
            totalsize_delta_offset += encoded_size;
        }
    }

    totalsize_without_padding += (N + 1);
    double no_pad_compressrate = (totalsize) * 100.0 / (totalsize_without_padding * 1.0);

    uint64_t totalsize_with_index = N * sizeof(uint32_t) + totalsize;
    if (offset_delta_compress)
    {
        totalsize_with_index = totalsize_delta_offset + totalsize;
    }
    double no_pad_compressrate_with_ind = (totalsize_with_index) * 100.0 / (totalsize_without_padding * 1.0);

    // Draw N * repeat random positions, restricted to the full delta blocks.
    const int repeat = 100;
    const int ra_range = delta_block_size * (N / delta_block_size);
    const uint64_t ra_count = static_cast<uint64_t>(N) * repeat; // 64-bit: N * repeat can overflow int
    std::vector<int> ra_pos;
    ra_pos.reserve(ra_count);
    for (uint64_t i = 0; i < ra_count; i++)
    {
        ra_pos.push_back(random(ra_range));
    }

    // Scratch buffers for decoded offset blocks, owned by vectors so they are
    // released automatically. `out` / `out2` are re-assigned from the decoder's
    // return value exactly as before.
    std::vector<uint32_t> out_storage(delta_block_size);
    std::vector<uint32_t> out2_storage(delta_block_size);
    uint32_t* out = out_storage.data();
    uint32_t* out2 = out2_storage.data();

    double randomaccesstime = 0.0;
    double start = getNow();
    for (auto index : ra_pos)
    {
        std::string result;
        if (offset_delta_compress)
        {
            uint32_t offset_val = 0;
            uint32_t offset_val2 = 0;

            out = delta_codec.decodeArray8(delta_descriptor_of_each_block[(int)(index) / delta_block_size], delta_block_size, out, N);
            offset_val = out[(index) % delta_block_size];

            if ((index + 1) % delta_block_size == 0 && (index + 1) < N)
            {
                // The end offset lives in the first slot of the next block.
                out2 = delta_codec.decodeArray8(delta_descriptor_of_each_block[(int)(index + 1) / delta_block_size], delta_block_size, out2, N);
                offset_val2 = out2[(index + 1) % delta_block_size];
            }
            else
            {
                offset_val2 = out[(index + 1) % delta_block_size];
            }
            if (index == N - 1)
            {
                // The encoded blocks cover only N offset entries, so no end
                // offset is stored for the last string; a placeholder is used.
                offset_val2 = offset_val + 100;
            }
            result = codec.randomdecode_string(index, offset_val, offset_val2);
        }
        else
        {
            // NOTE(review): after the leading 0 is inserted into `offset`, the
            // delta path above appears to read entries index / index+1 while
            // this path reads index-1 / index -- confirm against FSST_string
            // which pair is intended.
            uint32_t offset_val = 0;
            if (index)
            {
                offset_val = offset[(index - 1)];
            }
            result = codec.randomdecode_string(index, offset_val, offset[index]);
        }

        // Optional correctness check (disabled so it does not affect timing):
        // if (memcmp(result.c_str(), string_vec[index].c_str(), string_vec[index].size()) != 0)
        // {
        //     std::cout << "error at index " << index << " result " << result << " expected " << string_vec[index] << std::endl;
        //     break;
        // }
    }
    double end = getNow();
    randomaccesstime += (end - start) / repeat;
    double ra_ns = randomaccesstime / N * 1000000000;

    std::cout<<source_file<<" "<<delta_block_size<<" "<<no_pad_compressrate<<" "<<no_pad_compressrate_with_ind<<" "<<ra_ns<<std::endl;

    // Optional binary-search benchmark (disabled):
    // int sample_size = 1000;
    // start = getNow();
    // codec.TestBsearch(sample_size, string_vec, N);
    // end = getNow();
    // double ourbinarytime = end - start;
    // std::cout << "binary time per time: " << std::setprecision(8)
    //           << ourbinarytime / sample_size * 1000000000 << " ns" << std::endl;

    free_descriptors();
    return 0;
}