bitmap-index.cc
8.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
// See www.openfst.org for extensive documentation on this weighted
// finite-state transducer library.
#include <fst/extensions/ngram/bitmap-index.h>
#include <algorithm>
#include <iterator>
#include <fst/log.h>
#include <fst/extensions/ngram/nthbit.h>
namespace fst {
namespace {
const size_t kPrimaryBlockBits =
BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize;
// If [begin, begin+size) is a monotonically increasing running sum of
// popcounts for a bitmap, this will return the index of the word that contains
// the value'th zero. If value is larger then the number of zeros in the
// bitmap, size will be returned. The idea is that the number of zerocounts
// (i.e. the popcount of logical NOT of values) is offset * kStorageBitSize
// minus the value for each element of the running sum.
template <size_t BlockSize, typename Container>
size_t InvertedSearch(const Container& c,
size_t first_idx,
size_t last_idx,
size_t value) {
const size_t begin_idx = first_idx;
while (first_idx != last_idx) {
// Invariant: [first_idx, last_idx) is the search range.
size_t mid_idx = first_idx + ((last_idx - first_idx) / 2);
size_t mid_value = BlockSize * (1 + (mid_idx - begin_idx)) - c[mid_idx];
if (mid_value < value) {
first_idx = mid_idx + 1;
} else {
last_idx = mid_idx;
}
}
return first_idx;
}
} // namespace
size_t BitmapIndex::Rank1(size_t end) const {
if (end == 0) return 0;
const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize;
const uint32 sum = get_index_ones_count(end_word);
const size_t masked = end & kStorageBlockMask;
if (masked == 0) {
return sum + __builtin_popcountll(bits_[end_word]);
} else {
const uint64 zero = 0;
return sum + __builtin_popcountll(bits_[end_word] &
(~zero >> (kStorageBitSize - masked)));
}
}
size_t BitmapIndex::Select1(size_t bit_index) const {
if (bit_index >= GetOnesCount()) return Bits();
// search primary index for the relevant block
uint32 rembits = bit_index + 1;
const uint32 block = find_primary_block(bit_index + 1);
uint32 offset = 0;
if (block > 0) {
rembits -= primary_index_[block - 1];
offset += block * kSecondaryBlockSize;
}
// search the secondary index
uint32 word = find_secondary_block(offset, rembits);
if (word > 0) {
rembits -= secondary_index_[offset + word - 1];
offset += word;
}
int nth = nth_bit(bits_[offset], rembits);
return (offset << BitmapIndex::kStorageLogBitSize) + nth;
}
size_t BitmapIndex::Select0(size_t bit_index) const {
if (bit_index >= Bits() - GetOnesCount()) return Bits();
// search inverted primary index for relevant block
uint32 remzeros = bit_index + 1;
uint32 offset = 0;
const uint32 block = find_inverted_primary_block(bit_index + 1);
if (block > 0) {
remzeros -= kPrimaryBlockBits * block - primary_index_[block - 1];
offset += block * kSecondaryBlockSize;
}
// search the inverted secondary index
uint32 word = find_inverted_secondary_block(offset, remzeros);
if (word > 0) {
remzeros -= BitmapIndex::kStorageBitSize * word -
secondary_index_[offset + word - 1];
offset += word;
}
int nth = nth_bit(~bits_[offset], remzeros);
return (offset << BitmapIndex::kStorageLogBitSize) + nth;
}
std::pair<size_t, size_t> BitmapIndex::Select0s(size_t bit_index) const {
const uint64 zero = 0;
const uint64 ones = ~zero;
size_t zeros_count = Bits() - GetOnesCount();
if (bit_index >= zeros_count) return std::make_pair(Bits(), Bits());
if (bit_index + 1 >= zeros_count) {
return std::make_pair(Select0(bit_index), Bits());
}
// search inverted primary index for relevant block
uint32 remzeros = bit_index + 1;
uint32 offset = 0;
const uint32 block = find_inverted_primary_block(bit_index + 1);
size_t num_zeros_in_block =
kPrimaryBlockBits * (1 + block) - primary_index_[block];
if (block > 0) {
size_t num_zeros_next =
kPrimaryBlockBits * block - primary_index_[block - 1];
num_zeros_in_block -= num_zeros_next;
remzeros -= num_zeros_next;
offset += block * kSecondaryBlockSize;
}
// search the inverted secondary index
uint32 word = find_inverted_secondary_block(offset, remzeros);
uint32 sum_zeros_next_word = BitmapIndex::kStorageBitSize * (1 + word) -
secondary_index_[offset + word];
uint32 sum_zeros_this_word = 0;
if (word > 0) {
sum_zeros_this_word = BitmapIndex::kStorageBitSize * word -
secondary_index_[offset + word - 1];
remzeros -= sum_zeros_this_word;
offset += word;
}
int nth = nth_bit(~bits_[offset], remzeros);
size_t current_zero = (offset << BitmapIndex::kStorageLogBitSize) + nth;
size_t next_zero;
// Does the current block contain the next zero?
if (num_zeros_in_block > remzeros + 1) {
if (sum_zeros_next_word - sum_zeros_this_word >= remzeros + 1) {
// the next zero is in this word
next_zero = (offset << BitmapIndex::kStorageLogBitSize) +
nth_bit(~bits_[offset], remzeros + 1);
} else {
// Find the first field that is not all ones by linear scan.
// In the worst case, this may scan 8Kbytes. The alternative is
// to inspect secondary_index_ looking for a place to jump to, but
// that would probably use more cache.
while (bits_[++offset] == ones) {
}
next_zero = (offset << BitmapIndex::kStorageLogBitSize) +
__builtin_ctzll(~bits_[offset]);
}
} else {
// the next zero is in a different block, a full search is required.
next_zero = Select0(bit_index + 1);
}
return std::make_pair(current_zero, next_zero);
}
size_t BitmapIndex::get_index_ones_count(size_t array_index) const {
uint32 sum = 0;
if (array_index > 0) {
sum += secondary_index_[array_index - 1];
uint32 end_block = (array_index - 1) / kSecondaryBlockSize;
if (end_block > 0) sum += primary_index_[end_block - 1];
}
return sum;
}
void BitmapIndex::BuildIndex(const uint64 *bits, size_t size) {
bits_ = bits;
size_ = size;
primary_index_.resize(primary_index_size());
secondary_index_.resize(ArraySize());
const uint64 zero = 0;
const uint64 ones = ~zero;
uint32 popcount = 0;
for (uint32 block = 0; block * kSecondaryBlockSize < ArraySize(); block++) {
uint32 block_popcount = 0;
uint32 block_begin = block * kSecondaryBlockSize;
uint32 block_end = block_begin + kSecondaryBlockSize;
if (block_end > ArraySize()) block_end = ArraySize();
for (uint32 j = block_begin; j < block_end; ++j) {
uint64 mask = ones;
if (j == ArraySize() - 1) {
mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask);
}
block_popcount += __builtin_popcountll(bits_[j] & mask);
secondary_index_[j] = block_popcount;
}
popcount += block_popcount;
primary_index_[block] = popcount;
}
}
size_t BitmapIndex::find_secondary_block(size_t block_begin,
size_t rem_bit_index) const {
size_t block_end = block_begin + kSecondaryBlockSize;
if (block_end > ArraySize()) block_end = ArraySize();
return std::distance(
secondary_index_.begin() + block_begin,
std::lower_bound(secondary_index_.begin() + block_begin,
secondary_index_.begin() + block_end, rem_bit_index));
}
size_t BitmapIndex::find_inverted_secondary_block(size_t block_begin,
size_t rem_bit_index) const {
size_t block_end = block_begin + kSecondaryBlockSize;
if (block_end > ArraySize()) block_end = ArraySize();
return InvertedSearch<BitmapIndex::kStorageBitSize>(secondary_index_,
block_begin, block_end,
rem_bit_index)
- block_begin;
}
inline size_t BitmapIndex::find_primary_block(size_t bit_index) const {
return std::distance(
primary_index_.begin(),
std::lower_bound(primary_index_.begin(),
primary_index_.begin() + primary_index_size(),
bit_index));
}
size_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const {
return InvertedSearch<kPrimaryBlockBits>(
primary_index_, 0, primary_index_.size(), bit_index);
}
} // end namespace fst