#ifndef FSST_STRING_TEMPLATE_H_ #define FSST_STRING_TEMPLATE_H_ #include "../common.h" #include "../codecs.h" #include "fsst.h" #define INF 0x7f7fffff #include using namespace std; namespace Codecset { class FSST_string { public: uint64_t encodeArray8_string(const std::vector& data, std::vector& offset) { compressedData.clear(); offsets.clear(); vector rowLens, compressedRowLens; vector rowPtrs, compressedRowPtrs; rowLens.reserve(data.size()); compressedRowLens.resize(data.size()); rowPtrs.reserve(data.size()); compressedRowPtrs.resize(data.size() + 1); unsigned totalLen = 0; for (auto& d : data) { totalLen += d.size(); rowLens.push_back(d.size()); rowPtrs.push_back(reinterpret_cast(const_cast(d.data()))); } auto startTime = std::chrono::steady_clock::now(); auto encoder = fsst_create(data.size(), rowLens.data(), rowPtrs.data(), false); auto createTime = std::chrono::steady_clock::now(); vector compressionBuffer; compressionBuffer.resize(16 + 2 * totalLen); auto compressTime = std::chrono::steady_clock::now(); fsst_compress(encoder, data.size(), rowLens.data(), rowPtrs.data(), compressionBuffer.size(), compressionBuffer.data(), compressedRowLens.data(), compressedRowPtrs.data()); auto stopTime = std::chrono::steady_clock::now(); unsigned long compressedLen = data.empty() ? 0 : (compressedRowPtrs[data.size() - 1] + compressedRowLens[data.size() - 1] - compressionBuffer.data()); compressedData.resize(compressedLen + 8192); memcpy(compressedData.data(), compressionBuffer.data(), compressedLen); offsets.reserve(data.size()); compressedRowPtrs[data.size()] = compressionBuffer.data() + compressedLen; for (unsigned index = 0, limit = data.size(); index != limit; ++index) offsets.push_back(compressedRowPtrs[index + 1] - compressionBuffer.data()); uint64_t result = compressedData.size()/* + (offsets.size() * sizeof(unsigned))*/; { unsigned char buffer[sizeof(fsst_decoder_t)]; unsigned dictLen = fsst_export(encoder, buffer); fsst_destroy(encoder); result += dictLen; fsst_import(&decoder, buffer); } offset = offsets; offsets.clear(); return result; } /// Decompress a single row. The target buffer is guaranteed to be large enough std::string randomdecode_string(int idx, int offset_val, int offset_val2) { vector target; target.resize((offset_val2-offset_val)*10); char* writer = target.data(); auto limit = writer + target.size(); auto data = compressedData.data(); auto start = offset_val; auto end = offset_val2; unsigned len = fsst_decompress(&decoder, end - start, data + start, limit - writer, reinterpret_cast(writer)); return std::string(writer, writer + len); } private: fsst_decoder_t decoder; /// The compressed data vector compressedData; /// The offsets vector offsets; uint8_t* compressedOffset; }; } // namespace FastPFor #endif /* SIMDFASTPFOR_H_ */