// Learn-to-Compress / headers / search_blocksize.h
#ifndef SEARCH_BLOCKSIZE_H_
#define SEARCH_BLOCKSIZE_H_

#include "../headers/common.h"
#include "../headers/codecfactory.h"
#include "../headers/caltime.h"
#include "../headers/lr.h"

namespace Codecset
{
    class search_bsize
    {
    public:
        int search_block_size(std::string codec_name, std::vector<uint32_t> &data, 
                                double sample_size_percent, int times)
        {
            int bsize[7] = {200, 400, 800, 1600, 3200, 6400, 10000};
            int select_;

            int delta = 0;
            int N = data.size();

            double mini_size = 1.0;
            double start = getNow();
            std::vector<int> pick_vec;
            int sample_size = N * sample_size_percent;

            for (int j = 0; j < 7; ++j)
            {
                IntegerCODEC &codec = *CODECFactory::getFromName(codec_name);
                int block_size = bsize[j];
                int blocks = N / block_size;
                if(block_size*blocks<N) ++blocks;
                codec.init(blocks, block_size, delta);

                int totalsize = 0;
                for (int p = 0; p < times; p++)
                {
                    int sample_blocks = sample_size / block_size;
                    if (sample_size % block_size != 0) ++sample_blocks;
                    std::set<int> s;
                    while (s.size() < sample_blocks) {
                        s.insert(rand() % (N / block_size));
                    }
                    auto it = s.begin();
                    for (int i = 0; i < sample_blocks; i++)
                    {
                        int block_length = block_size;
                        if(i == sample_blocks-1) block_length = sample_size - i * block_size;
                        uint8_t *descriptor = (uint8_t *)malloc(2 * block_length * sizeof(uint64_t));
                        uint8_t *res = descriptor;
                        int t = *it;
                        ++it;
                        res = codec.encodeArray8(data.data() + t * block_size, block_length, descriptor, i);
                        descriptor = (uint8_t *)realloc(descriptor, (res - descriptor));
                        totalsize += (res - descriptor);
                        free(descriptor);
                    }
                }
                double compressrate = totalsize * 1.0 / (4 * times * sample_size * 1.0);
                if (compressrate < mini_size)
                {
                    mini_size = compressrate;
                    select_ = j;
                }
            }
            //std::cout << "ok" << std::endl;
            mini_size = 1.0;
            int select_delta = 0;
            for (int j = -100; j <= 100; j += 50)
            {
                IntegerCODEC &codec = *CODECFactory::getFromName(codec_name);
                int block_size = bsize[select_] + j;
                int blocks = N / block_size;
                if(block_size*blocks<N) ++blocks;
                codec.init(blocks, block_size, delta);
                int totalsize = 0;
                for (int p = 0; p < times; p++)
                {
                    int sample_blocks = sample_size / block_size;
                    if (sample_size % block_size != 0) ++sample_blocks;
                    std::set<int> s;
                    while (s.size() < sample_blocks) {
                        s.insert(rand() % (N / block_size));
                    }
                    auto it = s.begin();
                    for (int i = 0; i < sample_blocks; i++)
                    {
                        int block_length = block_size;
                        if(i == sample_blocks-1) block_length = sample_size - i * block_size;
                        uint8_t *descriptor = (uint8_t *)malloc(2 * block_length * sizeof(uint64_t));
                        //std::cout << "malloc" << std::endl;
                        uint8_t *res = descriptor;
                        int t = *it;
                        ++it;
                        res = codec.encodeArray8(data.data() + t * block_size, block_length, descriptor, i);
                        descriptor = (uint8_t *)realloc(descriptor, (res - descriptor));
                        totalsize += (res - descriptor);
                        free(descriptor);
                    }
                }
                double compressrate = totalsize * 1.0 / (4 * times * sample_size * 1.0);
                if (compressrate < mini_size)
                {
                    mini_size = compressrate;
                    select_delta = j;
                }
            }

            double end = getNow();
            double search_time = end - start;

            return bsize[select_] + select_delta;
        }
    };
}

#endif