#ifndef ENTROPY_H_ #define ENTROPY_H_ #include "common.h" #include "util.h" namespace FastPForLib { class EntropyRecorder { public: EntropyRecorder() : counter(), totallength(0) {} void clear() { counter.clear(); totallength = 0; } void eat(const uint32_t *in, const size_t length) { if (length == 0) return; totallength += length; for (uint32_t k = 0; k < length; ++k, ++in) { maptype::iterator i = counter.find(*in); if (i != counter.end()) i->second += 1; else counter[*in] = 1; } } double computeShannon() { double total = 0; for (maptype::iterator i = counter.begin(); i != counter.end(); ++i) { const double x = static_cast(i->second); total += x / static_cast(totallength) * log(static_cast(totallength) / x) / log(2.0); } return total; } __attribute__((pure)) double computeDataBits() { double total = 0; for (maptype::const_iterator i = counter.begin(); i != counter.end(); ++i) { total += static_cast(i->second) / static_cast(totallength) * static_cast(gccbits(i->first)); } return total; } typedef std::unordered_map maptype; maptype counter; size_t totallength; }; /** * An entropic measure, * Index compression using 64-bit words by Vo Ngoc Anh and Alistair Moffat */ __attribute__((pure)) double databits(const uint32_t *in, const size_t length) { double sum = 0.0; for (size_t k = 0; k < length; ++k) { sum += static_cast(gccbits(in[k])) / static_cast(length); } return sum; } double entropy(const uint32_t *in, const size_t length) { if (length == 0) return 0; std::map counter; for (size_t k = 0; k < length; ++k, ++in) { std::map::iterator i = counter.find(*in); if (i != counter.end()) i->second += 1; else counter[*in] = 1; } double total = 0; for (std::map::iterator i = counter.begin(); i != counter.end(); ++i) { double x = i->second; total += x / static_cast(length) * log(static_cast(length) / x) / log(2.0); } return total; } } // namespace FastPFor #endif /* ENTROPY_H_ */