// Learn-to-Compress / experiments / search_hyper_regress.cpp
//
// Searches for a hyper-parameter (the block size), then compresses the data
// block by block, choosing the codec for each block with a regression model.

#include "../headers/common.h"
#include "../headers/codecfactory.h"
#include "../headers/caltime.h"
#include "../headers/lr.h"
#include "../headers/search_hyper.h"
#include "../headers/create_feature.h"
#include "../headers/microunit.h"
#include "../headers/easylogging++.h"
#include "../headers/MLP.h"
#include "../headers/decision_tree.h"
#include "../headers/regress_tree.h"
#include "../headers/file_manage.h"
#include "../headers/model_selection.h"
using namespace Eigen;

INITIALIZE_EASYLOGGINGPP

using namespace Codecset;
/**
 * Benchmark driver: block-size search + per-block codec selection.
 *
 * Steps:
 *   1. Load a text file of uint32 values.
 *   2. Let every candidate codec vote for a block size (pick_block_size); the
 *      block size with the largest accumulated vote wins.
 *   3. For each block, ask one regression tree per codec for a predicted
 *      compression rate and encode the block with the lowest-predicted codec.
 *   4. Report compression statistics, then decode everything sequentially and
 *      by random access, checking each decoded value against the input.
 *
 * @return 0 on normal completion and on the pre-existing "cannot open source
 *         file" / "empty input" paths (exit status kept as before); 1 when a
 *         model file cannot be opened or a block buffer cannot be allocated.
 */
int main()
{
    // Candidate codecs and, index-aligned with them, their regression models.
    std::vector<std::string> codec_name = {"piecewise_fix","FOR","rle"};
    std::vector<std::string> weights = {"../reg_model/reg_model_piecewise.txt","../reg_model/reg_model_FOR.txt","../reg_model/reg_model_rle.txt"};

    // ------------------------------ load input ------------------------------
    std::vector<uint32_t> data;
    std::ifstream srcFile("../data/gap/gap_47_0.8_100M_uint32.txt", std::ios::in);
    if (!srcFile)
    {
        std::cout << "error opening source file." << std::endl;
        return 0;
    }
    // Using the extraction itself as the loop condition stops on end of file
    // AND on a malformed token (the old eof()-only test looped forever on bad
    // input), and it keeps a final value that has no trailing whitespace.
    uint32_t next;
    while (srcFile >> next)
    {
        data.push_back(next);
    }
    srcFile.close();

    if (data.empty())
    {
        std::cout << "Empty vector" << std::endl;
        return 0;
    }
    const int N = static_cast<int>(data.size());
    std::cout << "vector size = " << data.size() << std::endl;
    std::cout << "vector size = " << data.size() * sizeof(uint32_t) / 1024.0 << "KB"
              << std::endl;

    // --------------------------- block-size search --------------------------
    const int num_block_sizes = 7;
    int bsize[num_block_sizes] = {200, 400, 800, 1600, 3200, 6400, 10000};
    // Votes are fractional weights (1 / compression_rate), so they must be
    // accumulated in floating point; an int accumulator truncates each one.
    double vote[num_block_sizes] = {0.0};
    int sample_time = 1;
    double sample_rate = 0.01;
    int delta = 0;

    double start = getNow();
    for (size_t i = 0; i < codec_name.size(); i++)
    {
        codec_vote tmp = pick_block_size(bsize, num_block_sizes, sample_time, sample_rate, N, data.data(), codec_name[i]);
        vote[tmp.select] += 1.0 / tmp.compression_rate;
    }
    // First candidate with the strictly largest vote wins; index 0 is the
    // fallback when no vote is positive.
    int pos = 0;
    double max_vote = 0.0;
    for (int i = 0; i < num_block_sizes; i++)
    {
        if (vote[i] > max_vote)
        {
            max_vote = vote[i];
            pos = i;
        }
    }
    const int block_size = bsize[pos];
    double end = getNow();
    const double search_time = end - start;

    // Round up so a trailing partial block (shorter than block_size) is counted.
    int blocks = N / block_size;
    if (blocks * block_size < N)
    {
        blocks++;
    }
    // Length of block `block_index`; only the last block may be shorter.
    const auto block_length_at = [&](int block_index) {
        return (block_index == blocks - 1) ? N - (blocks - 1) * block_size : block_size;
    };
    const int input_size = 7;

    // ------------------------ load regression models ------------------------
    std::vector<RegressionTree> models;
    for (size_t i = 0; i < weights.size(); i++)
    {
        std::ifstream infile(weights[i], std::ios::in);
        if (!infile)
        {
            std::cout << "error opening model file: " << weights[i] << std::endl;
            return 1;
        }
        RegressionTree model;
        model.rebuild(infile, 0);
        models.push_back(model);
        infile.close();
    }

    std::cout << "Total blocks " << blocks << " block size " << block_size << std::endl;
    std::vector<IntegerCODEC*> codec_fac;
    for (size_t i = 0; i < codec_name.size(); i++)
    {
        IntegerCODEC &codec = *CODECFactory::getFromName(codec_name[i]);
        codec.init(blocks, block_size, delta);
        codec_fac.push_back(&codec);
    }

    //************************************ predict codec ************************
    std::vector<uint8_t*> block_start_vec; // one malloc'ed buffer per encoded block
    std::vector<int> method_vec;           // codec index chosen for each block
    // Frees every buffer collected so far; shared by the error and normal exits.
    const auto release_blocks = [&block_start_vec]() {
        for (size_t i = 0; i < block_start_vec.size(); i++)
        {
            free(block_start_vec[i]);
        }
    };

    int totalsize = 0;
    start = getNow();
    double totaltime_realcom = 0; // time spent in allocation + encoding only
    const double percent = 1 / (double)blocks;
    for (int i = 0; i < blocks; i++)
    {
        const int block_length = block_length_at(i);

        seg_feature seg;
        seg.cal_feature(data.data() + (i * block_size), block_length);

        // Keep the codec whose model predicts the smallest rate; codec 0 stays
        // selected when no prediction is below 1.0.
        int pick_method = 0;
        double pick_rate = 1.0;
        for (int j = 0; j < (int)codec_name.size(); j++)
        {
            Eigen::MatrixXd tmp_feature = Eigen::MatrixXd::Zero(1, input_size);
            tmp_feature << seg.logdelta, seg.quarter, seg.half, seg.threequarter, seg.rl, j, percent;
            VectorXd pred(tmp_feature.rows());
            pred = models[j].predict(tmp_feature);
            const double pred_rate = pred[0];
            if (pred_rate < pick_rate)
            {
                pick_rate = pred_rate;
                pick_method = j;
            }
        }

        const double start2 = getNow();
        // Scratch sized at 16 bytes per input value, as in the original code.
        uint8_t *descriptor = static_cast<uint8_t*>(malloc(block_length * sizeof(uint64_t) * 2));
        if (descriptor == nullptr)
        {
            std::cout << "failed to allocate buffer for block " << i << std::endl;
            release_blocks();
            return 1;
        }
        uint8_t *res = codec_fac[pick_method]->encodeArray8(data.data() + (i * block_size), block_length, descriptor, i);
        const int tmp_size = (res - descriptor);
        const double end2 = getNow();
        totaltime_realcom += (end2 - start2);

        method_vec.push_back(pick_method);
        block_start_vec.push_back(descriptor);
        totalsize += tmp_size;
    }
    end = getNow();
    double totaltime = end - start;
    std::cout << "compressionrate time:" << std::setprecision(8) << 4*data.size() / totaltime << " Bytes/s" << std::endl;

    std::cout << "real compress speed: " << std::setprecision(10) << data.size()/(totaltime_realcom*1000) <<  std::endl;

    // Share of blocks assigned to each codec (vector instead of a leaked new[]).
    std::vector<int> times(codec_name.size(), 0);
    for (int i = 0; i < blocks; i++)
    {
        times[method_vec[i]]++;
    }
    for (size_t i = 0; i < codec_name.size(); i++)
    {
        std::cout<< "method "<<codec_name[i]<<" percentage "<<(double)times[i]/(double)blocks<<std::endl;
    }
    const double compressrate = (totalsize)*100.0  / (4*N*1.0);
    std::cout << "total compression rate:" << std::setprecision(4)<< compressrate << std::endl;
    std::cout << "time searching for hyper-parameter / total time: " << std::setprecision(8)
              << search_time / totaltime << std::endl;
    //************************************************************

    // --------------------------- sequential decode --------------------------
    bool sequential_ok = true;
    std::vector<uint32_t> recover(data.size());
    totaltime = 0.0;
    std::cout<<"decompress all!"<<std::endl;
    start = getNow();
    for (int i = 0; i < blocks && sequential_ok; i++)
    {
        const int block_length = block_length_at(i);
        const int offset = i * block_size;
        codec_fac[method_vec[i]]->decodeArray8(block_start_vec[i], block_length, recover.data() + offset, i);

        // Stop at the first mismatch; one report is enough.
        for (int j = 0; j < block_length; j++)
        {
            if (data[j + offset] != recover[j + offset])
            {
                std::cout<<"block: "<<i<<" num: "<<j<< " true is: "<<data[j + offset]<<" predict is: "<<recover[j + offset]<<std::endl;
                std::cout<<"something wrong! decompress failed"<<std::endl;
                sequential_ok = false;
                break;
            }
        }
    }
    end = getNow();
    totaltime = (end - start);

    std::cout << "all decoding time per int: " << std::setprecision(8)
         << totaltime / data.size() * 1000000000 << "ns" << std::endl;
    std::cout << "all decoding speed: " << std::setprecision(10)
         << data.size()/(totaltime*1000) <<  std::endl;

    // ------------------------- random-access decode -------------------------
    std::cout<<"random access decompress!"<<std::endl;
    std::vector<uint32_t> buffer(data.size());
    double randomaccesstime = 0.0;
    start = getNow();
    uint32_t mark = 0; // running sum of decoded values; never read afterwards
    // Own flag: a failure in the sequential pass above must not cut this
    // measurement short after a single element.
    bool random_ok = true;
    for (int i = 0; i < N && random_ok; i++)
    {
        const int block_index = i / block_size;
        const int block_length = block_length_at(block_index);
        const uint32_t tmpvalue = codec_fac[method_vec[block_index]]->randomdecodeArray8(block_start_vec[block_index], i % block_size, buffer.data(), block_length);
        mark += tmpvalue;

        if (data[i] != tmpvalue)
        {
            // method_vec holds one entry per block, so it is indexed by the
            // block index, not by the element index i.
            std::cout<<"block "<<block_index<<" method "<<codec_name[method_vec[block_index]]<<std::endl;
            std::cout<<"num: "<<i<< " true is: "<<data[i]<<" predict is: "<<tmpvalue<<std::endl;
            std::cout<<"something wrong! decompress failed"<<std::endl;
            random_ok = false;
        }
    }
    end = getNow();
    randomaccesstime += (end - start);

    std::cout << "random decoding time per int: " << std::setprecision(8)
         << randomaccesstime / data.size() * 1000000000 << "ns" << std::endl;
    std::cout << "random decoding speed: " << std::setprecision(10)
         << data.size()/(randomaccesstime*1000) <<  std::endl;

    release_blocks();
    return 0;
}