Learn-to-Compress / headers / string / piecewise_fix_string_outlier_detect.h
piecewise_fix_string_outlier_detect.h
Raw
#ifndef PIECEWISEFIX_STRING_OUTLIER_H_
#define PIECEWISEFIX_STRING_OUTLIER_H_

#include "../common.h"
#include "../codecs.h"
#include "../bit_read.h"
#include "../bit_write.h"
#include "lr_string.h"
#include "string_utils.h"
#include "bit_read_string.h"
#include <boost/multiprecision/cpp_int.hpp>
#include <boost/multiprecision/cpp_int/serialize.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/iostreams/device/back_inserter.hpp>
#include <boost/iostreams/device/array.hpp>
#include <boost/iostreams/stream.hpp>
#define INF 0x7f7fffff

namespace Codecset
{
    // Width class of one residual: bits_long(delta) + 1, capped at max_string.
    // Callers in this file pass abs(residual), so the extra bit presumably
    // holds the sign (TODO confirm against bits_long).
    //
    // delta      : value whose width is measured.
    // max_string : upper bound for the result (padded string size in bits).
    // returns    : min(max_string, bits_long(delta) + 1).
    //
    // Declared inline because this header defines the function; a non-inline
    // definition breaks the link as soon as two translation units include it.
    inline int cal_bits(long_int delta, int max_string)
    {
        const int needed_bits = bits_long(delta) + 1;
        return std::min(max_string, needed_bits);
    }
    // bit + theta0 + theta1 + delta
    // max_string is the max string size (each string in string_vec is padding to this size, calculate by bit)
    uint8_t *encodeArray8_string(std::vector<std::string> &string_vec, int start_idx, const size_t length, uint8_t *res, size_t max_string /*, long_int & theta0, long_int & theta1*/)
    {
        uint8_t *out = res;
        std::vector<long_int> ascii_vec;
        std::vector<long_int> index;

        for (int i = 0; i < length; i++)
        {
            ascii_vec.emplace_back(convertToASCII(string_vec[i + start_idx]));
            index.emplace_back(long_int(i));
        }

        string_lr mylr;
        mylr.caltheta(index, ascii_vec, length);

        long_float theta0 = mylr.theta0;
        long_int theta0_int = theta0.convert_to<long_int>();
        long_float theta1 = mylr.theta1;
        long_int theta1_int = theta1.convert_to<long_int>();

        std::vector<long_int> delta;
        long_int max_delta = 0;

        // a counter to plot the histogram
        int counter[max_string] = {0};
        // because strings are not fixed length, we take 256 as a back-up.
        for (auto i = 0; i < length; i++)
        {
            long_int tmp_val = ascii_vec[i] - (theta1_int * i + theta0_int);
            // std::cout<<"delta "<<i<<" : "<<tmp_val<<std::endl;
            delta.emplace_back(tmp_val);
            counter[cal_bits(abs(tmp_val), max_string)]++;
            if (abs(delta[i]) > max_delta)
            {
                max_delta = abs(delta[i]);
            }
        }

        int quantile_sum = 0;
        int threshold = 0;
        int compress_len = 0;
        int compress_min = length * max_string;
        int max_bit = 0;
        int temp = ceil((double)length / 64.);
        for (int i = 0; i < max_string; i++)
        {
            quantile_sum += counter[i];
            compress_len = quantile_sum * i + (length - quantile_sum) * max_string;
            if (quantile_sum == length)
            {
                compress_len = quantile_sum * i - temp * 12 * 8 - 12 * 8; // bitmap & lookup table
                max_bit = i;
            }
            if (compress_len < compress_min)
            {
                compress_min = compress_len;
                threshold = i;
            }
            if (quantile_sum == length)
            {
                break;
            }
        }

        // GONNA USE PIECEWISE, BECAUSE NOT A SINGLE OUTLIER OCCURS
        if (threshold == max_bit)
        {
            max_bit = bits_long(max_delta) + 1;
            memcpy(out, &max_bit, sizeof(uint32_t));
            out += sizeof(uint32_t);
            // models[nvalue*2] = theta0;
            // models[nvalue*2+1] = theta1;
            mpz_t z;
            mpz_init(z);
            mpz_set(z, theta0_int.backend().data());
            auto theta0_len = (mpz_sizeinbase(z, 2) + 7) / 8;
            memcpy(out, &theta0_len, sizeof(uint32_t));
            out += sizeof(uint32_t);
            mpz_export(out, &theta0_len, -1, 1, 0, 0, z);
            out += theta0_len;

            mpz_set(z, theta1_int.backend().data());
            auto theta1_len = (mpz_sizeinbase(z, 2) + 7) / 8;
            memcpy(out, &theta1_len, sizeof(uint32_t));
            out += sizeof(uint32_t);
            mpz_export(out, &theta1_len, -1, 1, 0, 0, z);
            out += theta1_len;
            mpz_clear(z);

            out = write_string_delta_string(delta.data(), out, max_bit, length);

            return out;
        }
        else
        {
            // GONNA USE OUTLIER DETECT
            uint64_t writebitmap[temp] = {0};
            bool vote[length];
            int usedData = 0;
            for (int i = 0; i < length; i++)
            {
                if (cal_bits(abs(delta[i]), max_string) > threshold)
                {
                    vote[i] = 0;
                }
                else
                {
                    vote[i] = 1;
                    usedData++;
                }
            }
            uint64_t tmpbit = 0;
            int wtitebittmp = 0;
            int outlier_num = length - usedData;
            bool have_outlier = true;
            std::vector<long_int> tmpoutlier;
            if (!outlier_num)
            {
                have_outlier = false;
            }

            max_delta = 0;
            std::vector<long_int> deltax;
            int wtiteoutlier = 0;
            for (int i = 0; i < length; i++)
            {

                if (vote[i])
                {
                    long_int tmp = delta[i];
                    deltax.emplace_back(tmp);
                    if (abs(tmp) > max_delta)
                    {
                        max_delta = abs(tmp);
                    }
                }
                else
                {
                    tmpbit += ((1L) << (63 - i % 64));
                    deltax.emplace_back((long_int)1);
                    tmpoutlier.emplace_back(convertToASCII(string_vec[i + start_idx]));
                }

                if (i % 64 == 63)
                {
                    writebitmap[wtitebittmp] = tmpbit;
                    wtitebittmp++;
                    tmpbit = 0;
                }
            }
            if (length % 64 != 0)
            {
                writebitmap[wtitebittmp] = tmpbit;
                wtitebittmp++;
            }
            int rank_lut_[temp] = {0};
            int basic_block_size_ = 64;
            initRankLut(length, writebitmap, rank_lut_, basic_block_size_);

            // WRITE IN OUTLIER FORM
            // bit + theta0 + theta1 + #outlier + lookupsize64 + outlier_pos + bitmap + lookup + delta + outlier
            // ( 1L<<7 is to mark if we are using outlier or not)
            out[0] = (uint32_t)max_bit + (1L << 31);
            out += sizeof(uint32_t);

            mpz_t z;
            mpz_init(z);
            mpz_set(z, theta0_int.backend().data());
            auto theta0_len = (mpz_sizeinbase(z, 2) + 7) / 8;
            memcpy(out, &theta0_len, sizeof(uint32_t));
            out += sizeof(uint32_t);
            mpz_export(out, &theta0_len, -1, 1, 0, 0, z);
            out += theta0_len;

            mpz_set(z, theta1_int.backend().data());
            auto theta1_len = (mpz_sizeinbase(z, 2) + 7) / 8;
            memcpy(out, &theta1_len, sizeof(uint32_t));
            out += sizeof(uint32_t);
            mpz_export(out, &theta1_len, -1, 1, 0, 0, z);
            out += theta1_len;
            mpz_clear(z);

            memcpy(out, &outlier_num, sizeof(outlier_num));
            out += sizeof(outlier_num);
            memcpy(out, &basic_block_size_, sizeof(basic_block_size_));
            out += sizeof(basic_block_size_);
            uint8_t *outlier_pos = out;
            out += sizeof(basic_block_size_);

            memcpy(out, writebitmap, temp * sizeof(writebitmap[0]));

            out += sizeof(writebitmap[0]) * temp;
            memcpy(out, rank_lut_, temp * sizeof(rank_lut_[0]));
            out += sizeof(rank_lut_[0]) * temp;

            out = write_string_delta_string(deltax.data(), out, max_bit, length);

            int shift = out - res;

            memcpy(outlier_pos, &shift, sizeof(shift));
            if (have_outlier)
            {
                for (auto outlier : tmpoutlier)
                {
                    mpz_t z;
                    mpz_init(z);
                    mpz_set(z, outlier.backend().data());
                    mpz_export(out, NULL, -1, 1, 0, 0, z);
                    out += sizeof(long_int);
                    mpz_clear(z);
                }
            }

            return out;
        }
    }

    // max_string is the length of each string, measured in bit
    void decodeArray8_string(uint8_t *in, const size_t length, long_int *out, size_t max_string, std::vector<std::string> &string_vec)
    {
        // bit + theta0 + theta1 + #outlier + lookupsize64 + outlier_pos + bitmap + lookup + delta + outlier
        uint32_t whether_outlier;
        uint8_t *tmpin = in;
        whether_outlier = (tmpin[0] >> 31); // 1000 0000
        if (whether_outlier)
        {
            uint32_t maxerror;
            int *outlier_num = 0;
            maxerror = tmpin[0] - (1L << 31);
            tmpin += sizeof(maxerror);
            std::cout << maxerror << std::endl;

            tmpin += sizeof(uint32_t);
            uint32_t theta0_len;
            memcpy(&theta0_len, tmpin, sizeof(uint32_t));
            tmpin += sizeof(uint32_t);
            // theta0 = reinterpret_cast<long_int*>(tmpin);
            mpz_t tmp;
            mpz_init(tmp);
            mpz_import(tmp, theta0_len, -1, 1, 0, 0, tmpin);
            long_int theta0(tmp);
            // std::cout<<theta0<<std::endl;

            tmpin += theta0_len;

            uint32_t theta1_len;
            memcpy(&theta1_len, tmpin, sizeof(uint32_t));
            tmpin += sizeof(uint32_t);

            mpz_import(tmp, theta1_len, -1, 1, 0, 0, tmpin);
            long_int theta1(tmp);
            // std::cout<<theta1<<std::endl;
            mpz_clear(tmp);
            tmpin += theta1_len;

            outlier_num = reinterpret_cast<int *>(tmpin);
            tmpin += 8;
            int *outlier_position;
            outlier_position = reinterpret_cast<int *>(tmpin);
            tmpin += 4;
            uint8_t *bitmap_pos = tmpin;
            int temp = ceil((double)length / 64.);
            tmpin += temp * 12;

            uint8_t *outlier_pos = in + outlier_position[0];
            read_outlier_detect_string(tmpin, 0, 0, length, maxerror, theta1, theta0, out, outlier_pos, outlier_num[0], bitmap_pos, max_string);

            for (int i = 0; i < length; i++)
            {
                std::string tmp_string = convertToString(out[0]);
                string_vec.emplace_back(tmp_string);
                out++;
            }
        }
        else
        {
            uint32_t maxerror;
            maxerror = tmpin[0];
            tmpin += sizeof(maxerror);

            tmpin += sizeof(uint32_t);
            uint32_t theta0_len;
            memcpy(&theta0_len, tmpin, sizeof(uint32_t));
            tmpin += sizeof(uint32_t);
            // theta0 = reinterpret_cast<long_int*>(tmpin);
            mpz_t tmp;
            mpz_init(tmp);
            mpz_import(tmp, theta0_len, -1, 1, 0, 0, tmpin);
            long_int theta0(tmp);
            // std::cout<<theta0<<std::endl;

            tmpin += theta0_len;

            uint32_t theta1_len;
            memcpy(&theta1_len, tmpin, sizeof(uint32_t));
            tmpin += sizeof(uint32_t);

            mpz_import(tmp, theta1_len, -1, 1, 0, 0, tmpin);
            long_int theta1(tmp);
            // std::cout<<theta1<<std::endl;
            mpz_clear(tmp);
            tmpin += theta1_len;

            read_all_bit_fix_string(tmpin, 0, 0, length, maxerror, theta1, theta0, out);

            for (int i = 0; i < length; i++)
            {
                std::string tmp_string = convertToString(out[0]);
                string_vec.emplace_back(tmp_string);
                out++;
            }
        }
    }

} // namespace Codecset

#endif /* PIECEWISEFIX_STRING_OUTLIER_H_ */