Learn-to-Compress / headers / bitvector.hpp
bitvector.hpp
Raw
#ifndef BITVECTOR_H_
#define BITVECTOR_H_

#include <assert.h>
#include <stdint.h>
#include <string.h>

#include <vector>

typedef uint64_t word_t;
typedef uint32_t position_t;
typedef uint32_t level_t;

static const unsigned kWordSize = 64;
static const word_t kMsbMask = 0x8000000000000000;
static const word_t kOneMask = 0xFFFFFFFFFFFFFFFF;

class Bitvector
{
public:
	Bitvector() : num_bits_(0), bits_(nullptr){};

	Bitvector(const std::vector<std::vector<word_t>> &bitvector_per_level,
			  const std::vector<position_t> &num_bits_per_level,
			  const level_t start_level = 0,
			  level_t end_level = 0 /* non-inclusive */)
	{
		if (end_level == 0)
			end_level = bitvector_per_level.size();
		num_bits_ = totalNumBits(num_bits_per_level, start_level, end_level);
		bits_ = new word_t[numWords()];
		memset(bits_, 0, bitsSize());
		concatenateBitvectors(bitvector_per_level, num_bits_per_level, start_level, end_level);
	}

	Bitvector(const std::vector<uint32_t> &bitmap)
	{
		num_bits_ = bitmap.size();
		bits_ = new word_t[numWords()];
		memset(bits_, 0, bitsSize());
		for (int i = 0; i < num_bits_; i++)
		{
			setBit(i, bitmap[i]);
		}
	}

	~Bitvector()
	{
		if (bits_ != nullptr)
			delete[] bits_;
	}

	position_t numBits() const
	{
		return num_bits_;
	}

	position_t numWords() const
	{
		if (num_bits_ % kWordSize == 0)
			return (num_bits_ / kWordSize);
		else
			return (num_bits_ / kWordSize + 1);
	}

	// in bytes
	position_t bitsSize() const
	{
		return (numWords() * (kWordSize / 8));
	}

	// in bytes
	position_t size() const
	{
		return (sizeof(Bitvector) + bitsSize());
	}

	bool readBit(const position_t pos) const;
	void setBit(const position_t pos, const bool value);
	uint64_t getWord(const position_t word_id) const;

	position_t distanceToNextSetBit(const position_t pos) const;
	position_t distanceToPrevSetBit(const position_t pos) const;

private:
	position_t totalNumBits(const std::vector<position_t> &num_bits_per_level,
							const level_t start_level,
							const level_t end_level /* non-inclusive */);

	void concatenateBitvectors(const std::vector<std::vector<word_t>> &bitvector_per_level,
							   const std::vector<position_t> &num_bits_per_level,
							   const level_t start_level,
							   const level_t end_level /* non-inclusive */);

protected:
	position_t num_bits_;
	word_t *bits_;
};

bool Bitvector::readBit(const position_t pos) const
{
	assert(pos <= num_bits_);
	position_t word_id = pos / kWordSize;
	position_t offset = pos & (kWordSize - 1);
	return bits_[word_id] & (kMsbMask >> offset);
}

uint64_t Bitvector::getWord(const position_t word_id) const
{
	assert(word_id < numWords());
	return bits_[word_id];
}

void Bitvector::setBit(const position_t pos, const bool value)
{
	assert(pos <= num_bits_);
	position_t word_id = pos / kWordSize;
	position_t offset = pos & (kWordSize - 1);
	if (value)
		bits_[word_id] |= (kMsbMask >> offset);
	else
		bits_[word_id] &= ~(kMsbMask >> offset);
}

position_t Bitvector::distanceToNextSetBit(const position_t pos) const
{
	assert(pos < num_bits_);
	position_t distance = 1;

	position_t word_id = (pos + 1) / kWordSize;
	position_t offset = (pos + 1) % kWordSize;

	// first word left-over bits
	word_t test_bits = bits_[word_id] << offset;
	if (test_bits > 0)
	{
		return (distance + __builtin_clzll(test_bits));
	}
	else
	{
		if (word_id == numWords() - 1)
			return (num_bits_ - pos);
		distance += (kWordSize - offset);
	}

	while (word_id < numWords() - 1)
	{
		word_id++;
		test_bits = bits_[word_id];
		if (test_bits > 0)
			return (distance + __builtin_clzll(test_bits));
		distance += kWordSize;
	}
	return distance;
}

position_t Bitvector::distanceToPrevSetBit(const position_t pos) const
{
	assert(pos <= num_bits_);
	if (pos == 0)
		return 0;
	position_t distance = 1;

	position_t word_id = (pos - 1) / kWordSize;
	position_t offset = (pos - 1) % kWordSize;

	// first word left-over bits
	word_t test_bits = bits_[word_id] >> (kWordSize - 1 - offset);
	if (test_bits > 0)
	{
		return (distance + __builtin_ctzll(test_bits));
	}
	else
	{
		// if (word_id == 0)
		// return (offset + 1);
		distance += (offset + 1);
	}

	while (word_id > 0)
	{
		word_id--;
		test_bits = bits_[word_id];
		if (test_bits > 0)
			return (distance + __builtin_ctzll(test_bits));
		distance += kWordSize;
	}
	return distance;
}

position_t Bitvector::totalNumBits(const std::vector<position_t> &num_bits_per_level,
								   const level_t start_level,
								   const level_t end_level /* non-inclusive */)
{
	position_t num_bits = 0;
	for (level_t level = start_level; level < end_level; level++)
		num_bits += num_bits_per_level[level];
	return num_bits;
}

void Bitvector::concatenateBitvectors(const std::vector<std::vector<word_t>> &bitvector_per_level,
									  const std::vector<position_t> &num_bits_per_level,
									  const level_t start_level,
									  const level_t end_level /* non-inclusive */)
{
	position_t bit_shift = 0;
	position_t word_id = 0;
	for (level_t level = start_level; level < end_level; level++)
	{
		if (num_bits_per_level[level] == 0)
			continue;
		position_t num_complete_words = num_bits_per_level[level] / kWordSize;
		for (position_t word = 0; word < num_complete_words; word++)
		{
			bits_[word_id] |= (bitvector_per_level[level][word] >> bit_shift);
			word_id++;
			if (bit_shift > 0)
				bits_[word_id] |= (bitvector_per_level[level][word] << (kWordSize - bit_shift));
		}

		word_t bits_remain = num_bits_per_level[level] - num_complete_words * kWordSize;
		if (bits_remain > 0)
		{
			word_t last_word = bitvector_per_level[level][num_complete_words];
			bits_[word_id] |= (last_word >> bit_shift);
			if (bit_shift + bits_remain < kWordSize)
			{
				bit_shift += bits_remain;
			}
			else
			{
				word_id++;
				bits_[word_id] |= (last_word << (kWordSize - bit_shift));
				bit_shift = bit_shift + bits_remain - kWordSize;
			}
		}
	}
}

#endif // BITVECTOR_H_