Cachemere
Modular Caching Library for C++
counting_bloom_filter.hpp
1 #include <cassert>
2 #include <limits>
3 #include <vector>
4 
5 #include "bloom_filter_math.h"
6 
7 namespace cachemere::policy::detail {
8 
9 template<typename ItemHash>
11  : m_cardinality{cardinality},
12  m_filter(optimal_filter_size(cardinality), 0),
13  m_nb_hashes{optimal_nb_of_hash_functions(cardinality, m_filter.size())}
14 {
15 }
16 
17 template<typename ItemHash> template<typename ItemKey> void CountingBloomFilter<ItemHash>::add(const ItemKey& item)
18 {
19  HashMixer<ItemKey, ItemHash> mixer{item, m_filter.size()};
20 
21  std::vector<size_t> indices;
22  indices.reserve(m_nb_hashes);
23 
24  uint32_t minimum_val = std::numeric_limits<uint32_t>::max();
25 
26  // Generate the indices and find the minimum.
27  for (size_t i = 0; i < m_nb_hashes; ++i) {
28  const size_t idx = mixer();
29  assert(idx < m_filter.size());
30 
31  indices.push_back(idx);
32  minimum_val = std::min(m_filter[idx], minimum_val);
33  }
34 
35  // Increment all filter slots corresponding to the minimum value.
36  const uint8_t is_zero_increment = minimum_val == 0 ? 1 : 0;
37  for (const size_t idx : indices) {
38  if (m_filter[idx] == minimum_val) {
39  ++m_filter[idx];
40 
41  // We track the number of non-zero values in the filter.
42  // This is used to compute the filter saturation.
43  m_nb_nonzero += is_zero_increment;
44  }
45  }
46 }
47 
48 template<typename ItemHash> void CountingBloomFilter<ItemHash>::clear()
49 {
50  std::fill(m_filter.begin(), m_filter.end(), 0);
51  m_nb_nonzero = 0;
52 }
53 
54 template<typename ItemHash> void CountingBloomFilter<ItemHash>::decay()
55 {
56  for (auto& counter : m_filter) {
57  if (counter == 1) {
58  --m_nb_nonzero;
59  }
60  counter /= 2;
61  }
62 }
63 
64 template<typename ItemHash> template<typename ItemKey> uint32_t CountingBloomFilter<ItemHash>::estimate(const ItemKey& item) const
65 {
66  assert(m_nb_hashes > 0);
67 
68  HashMixer<ItemKey, ItemHash> mixer{item, m_filter.size()};
69 
70  uint32_t minimum_val = std::numeric_limits<uint32_t>::max();
71  for (size_t i = 0; i < m_nb_hashes; ++i) {
72  const size_t idx = mixer();
73  assert(idx < m_filter.size());
74 
75  minimum_val = std::min(m_filter[idx], minimum_val);
76  }
77 
78  return minimum_val;
79 }
80 
81 template<typename ItemHash> uint32_t CountingBloomFilter<ItemHash>::cardinality() const noexcept
82 {
83  return m_cardinality;
84 }
85 
86 template<typename ItemHash> size_t CountingBloomFilter<ItemHash>::memory_used() const noexcept
87 {
88  // Note:
89  // The amount of memory could be reduced by using variable-length counters.
90  // Since we know that reset is triggered once we reach a count of
91  // m_Cardinality, we could use the smallest numeric type that holds
92  // m_Cardinality for the internal counters.
93  return m_filter.size() * sizeof(uint32_t) + sizeof(m_cardinality) + sizeof(m_nb_hashes) + sizeof(m_nb_nonzero);
94 }
95 
96 template<typename ItemHash> double CountingBloomFilter<ItemHash>::saturation() const noexcept
97 {
98  assert(m_filter.size() > 0);
99  return static_cast<double>(m_nb_nonzero) / static_cast<double>(m_filter.size());
100 }
101 
102 } // namespace cachemere::policy::detail
cachemere::policy::detail::HashMixer
Functor used for generating a uniform sequence of numbers in a given value range for a given key.
Definition: hash_mixer.h:13
cachemere::policy::detail::CountingBloomFilter
Space-efficient probabilistic data structure to estimate the number of times an item was inserted in ...
Definition: counting_bloom_filter.h:18
cachemere::policy::detail::CountingBloomFilter::CountingBloomFilter
CountingBloomFilter(uint32_t cardinality)
Constructor.
Definition: counting_bloom_filter.hpp:10