From b8d5785f6ed5084c2c92d7bc7dc7a9e812e6bd57 Mon Sep 17 00:00:00 2001 From: fengyang Date: Fri, 7 Apr 2023 18:59:15 +0800 Subject: [PATCH] =?UTF-8?q?BloomFilter=20=E9=87=8D=E6=9E=84=EF=BC=8C?= =?UTF-8?q?=E6=AF=8F=E4=B8=AAFilterPolicy=E5=9D=87=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E6=8B=86=E5=88=86=EF=BC=8C=20=E4=BF=AE=E5=A4=8DBloomFilterPoli?= =?UTF-8?q?cy=20bug=EF=BC=9B=20=E4=BF=AE=E5=A4=8D=20=20Coding=20bug;=20Fil?= =?UTF-8?q?terBlock=20finish=20=E5=8A=9F=E8=83=BD=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=EF=BC=9B=20=E4=BF=AE=E5=A4=8D=20ToHash=20bug;?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/table/filter_block.rs | 53 ++-- src/table/filter_block_test.rs | 151 +++--------- src/table/filter_block_test_filter_policy.rs | 239 +++++++++++++++++++ src/table/mod.rs | 1 + src/table/ss_table.rs | 14 ++ src/traits/coding_trait.rs | 4 +- src/traits/filter_policy_trait.rs | 21 +- src/util/bloom_filter.rs | 20 +- src/util/bloom_filter_test.rs | 16 +- src/util/coding.rs | 4 +- src/util/filter_policy.rs | 168 +------------ src/util/filter_policy_bloom.rs | 177 ++++++++++++++ src/util/filter_policy_bloom_test.rs | 193 +++++++++++++++ src/util/filter_policy_internal.rs | 37 +++ src/util/filter_policy_internal_test.rs | 5 + src/util/filter_policy_test.rs | 66 +---- src/util/hash.rs | 18 +- src/util/mod.rs | 8 +- 18 files changed, 792 insertions(+), 403 deletions(-) create mode 100644 src/table/filter_block_test_filter_policy.rs create mode 100644 src/util/filter_policy_bloom.rs create mode 100644 src/util/filter_policy_bloom_test.rs create mode 100644 src/util/filter_policy_internal.rs create mode 100644 src/util/filter_policy_internal_test.rs diff --git a/src/table/filter_block.rs b/src/table/filter_block.rs index 0013382..1b35cd1 100644 --- a/src/table/filter_block.rs +++ b/src/table/filter_block.rs @@ -7,8 +7,11 @@ use crate::util::slice::Slice; use crate::util::Result; -// Generate new filter every 2KB of data +// 对2K取2的对数,也就是得到11 const FILTER_BASE_LG: usize = 11; + +// 在每当data block的大小2K的时候(FILTER_BASE的值),开始创建一个filter +// Generate new filter every 2KB of data const FILTER_BASE: usize = 1 << FILTER_BASE_LG; /// @@ -19,7 +22,7 @@ pub trait FilterBlock { fn new_with_policy(policy: FilterPolicyPtr) -> Self; /// - /// 构造一个 FilterBlockBuilder + /// 构造一个 FilterBlockBuilder, 分配初始化容量大小 /// /// # Arguments /// @@ -39,7 +42,7 @@ pub trait FilterBlock { /// /// # Arguments /// - /// * `_block_offset`: 偏移量 + /// * `_block_offset`: filter block的 偏移量. 当给定block_offset的时候。需要创建的filter的数目也就确定了。 /// /// returns: () /// @@ -91,15 +94,23 @@ pub trait FilterBlock { /// SSTable 文件里面的 meta block 构建器, 按内存里面指定的格式整理在内存中 pub struct FilterBlockBuilder { + // 指向一个具体的filter_policy policy: FilterPolicyPtr, - // Flattened key contents + + // 包含了所有展开的keys。并且这些所有的keys都是存放在一起的。(通过 AddKey 达到这个目的) keys: Vec, - // Starting index in keys_ of each key + // 记录当前这个key在keys_里面的offset start: Vec, + // Filter data computed so far + // 用result_来记录所有的输入. + // result_变量就是表示的是一个filter计算之后的输出。 + // 比如 BloomFilter 经过各种key计算之后,可能会得到一个 filter_str。这个 filter_str 就是放到result里面。 result: Vec, // policy_->CreateFilter() argument tmp_keys: Vec, + + // 里面的每个元素就是用来记录每个filter内容的offset filter_offsets: Vec, } @@ -138,11 +149,13 @@ impl FilterBlock for FilterBlockBuilder { } fn start_block(&mut self, block_offset: u64) { - let filter_index = block_offset / (FILTER_BASE as u64); - assert!(filter_index >= self.filter_offsets.len() as u64); + // 计算出所有的filter的总数. filters_number ==> filter_index + let filters_number = block_offset / (FILTER_BASE as u64); + assert!(filters_number >= self.filter_offsets.len() as u64); - while filter_index > self.filter_offsets.len() as u64 { - self.generate_filter(); + // 当已经生成的filter的数目小于需要生成的filter的总数时,那么就继续创建filter。 + while filters_number > self.filter_offsets.len() as u64 { + self.generate_new_filter(); } } @@ -151,19 +164,19 @@ impl FilterBlock for FilterBlockBuilder { } fn add_key(&mut self, key: &Slice) { - self.start.push(key.len()); + self.start.push(key.size()); self.keys.write(key.as_str().as_bytes()).expect("add_key error!"); } fn finish(&mut self) -> Result { if self.start.len() != 0 { - self.generate_filter(); + self.generate_new_filter(); } // Append array of per-filter offsets let array_offset = self.result.len() as u32; // 当前需要写入的位置。result 中可能存在数据,因此为 self.result.len() 的位置 - let mut pos: usize = self.result.len(); + let mut offset: usize = self.result.len(); // todo 判断是否需要扩容 let result_total_capacity = self.result.capacity(); @@ -171,16 +184,16 @@ impl FilterBlock for FilterBlockBuilder { let dst_append = self.result.as_mut_slice(); for i in 0..self.filter_offsets.len() { - // 判断当前 pos + len 4 + // 判断当前 offset + len 4 let filter_offset_val = self.filter_offsets[i]; - pos = Coding::put_fixed32(dst_append, pos, filter_offset_val); + offset = Coding::put_fixed32(dst_append, offset, filter_offset_val); } - pos = Coding::put_fixed32(dst_append, pos, array_offset); + offset = Coding::put_fixed32(dst_append, offset, array_offset); // Save encoding parameter in result // todo 判断是否需要扩容 - Coding::put_varint64(self.result.as_mut_slice(), pos, FILTER_BASE_LG as u64); + Coding::put_varint64(self.result.as_mut_slice(), offset, FILTER_BASE_LG as u64); Ok(Slice::from_buf(&self.result)) } @@ -211,7 +224,8 @@ impl FilterBlock for FilterBlockBuilder { } impl FilterBlockBuilder { - fn generate_filter(&mut self) { + /// 创建新的 filter + fn generate_new_filter(&mut self) { let num_keys = self.start.len(); if num_keys == 0 { @@ -240,7 +254,8 @@ impl FilterBlockBuilder { let mut keys: Vec<&Slice> = Vec::new(); keys.push(&self.tmp_keys[0]); - let create_filter:Slice = self.policy.create_filter_with_len(num_keys, keys); + // let create_filter:Slice = self.policy.create_filter_with_len(num_keys, keys); + let create_filter:Slice = self.policy.create_filter(keys); // let result_len = self.result.len(); // let result_total_capacity = self.result.capacity(); @@ -255,7 +270,7 @@ impl FilterBlockBuilder { } impl FilterBlockReader { - pub fn new_with_policy(policy: FilterPolicyPtr, contents: Slice) -> Self { + pub fn new_with_policy(policy: FilterPolicyPtr, contents: &Slice) -> Self { let data = Vec::new(); let offset = Vec::new(); diff --git a/src/table/filter_block_test.rs b/src/table/filter_block_test.rs index 2350619..bb49f3a 100644 --- a/src/table/filter_block_test.rs +++ b/src/table/filter_block_test.rs @@ -4,113 +4,15 @@ mod test { use std::sync::Arc; use crate::table::filter_block; use crate::table::filter_block::{FilterBlock, FilterBlockBuilder, FilterBlockReader}; + use crate::table::filter_block_test_filter_policy::TestHashFilter; use crate::traits::coding_trait::CodingTrait; use crate::traits::filter_policy_trait::FilterPolicy; use crate::util::coding::Coding; - use crate::util::filter_policy::BloomFilterPolicy; use crate::util::slice::Slice; use crate::util::hash::{Hash, ToHash}; use crate::util::Result; - pub struct TestHashFilter { - //. - } - - impl TestHashFilter { - fn new() -> Self { - Self { - - } - } - } - - impl FilterPolicy for TestHashFilter { - fn name(&self) -> String { - String::from("TestHashFilter") - } - - fn create_filter(&self, keys: Vec<&Slice>) -> Slice { - let mut n: usize = 0; - for i in 0..keys.len() { - n += keys[i].len(); - } - - self.create_filter_with_len(n, keys) - } - - fn create_filter_with_len(&self, len: usize, keys: Vec<&Slice>) -> Slice { - let mut n: usize = len; - - let mut dst_chars = vec![0; n]; - let dst_chars_u8 = dst_chars.borrow_mut(); - - let mut offset: usize = 0; - for i in 0..keys.len() { - let h = Hash::hash_code(keys[i].as_ref(), 1); - let of = Coding::put_fixed32(dst_chars_u8, offset, h); - offset += of; - } - - Slice::from_buf(dst_chars_u8) - } - - fn key_may_match(&self, key: &Slice, bloom_filter: &Slice) -> bool { - let h = Hash::hash_code(key.to_vec().as_slice(), 1); - - let mut pos = 0; - while pos <= bloom_filter.size() { - let buf = &bloom_filter.as_ref()[pos..]; - - if h == Coding::decode_fixed32(buf) { - return true - } - - pos += 4; - } - - false - } - } - - // #[test] - // fn test_create_filter() { - // let policy = TestHashFilter::new(); - // - // let s1 = Slice::try_from(String::from("hello")).unwrap(); - // let s2 = Slice::try_from(String::from("world")).unwrap(); - // let mut keys : Vec<&Slice> = Vec::new(); - // keys.push(&s1); - // keys.push(&s2); - // - // let bloom_filter: Slice = policy.create_filter(keys); - // - // let mut key_may_match = policy.key_may_match( - // &Slice::try_from(String::from("hello")).unwrap(), - // &bloom_filter); - // assert!(key_may_match); - // - // key_may_match = policy.key_may_match(&Slice::try_from(String::from("world")).unwrap(), - // &bloom_filter); - // assert!(key_may_match); - // - // let mut key_not_match = policy.key_may_match(&Slice::try_from(String::from("x")).unwrap(), - // &bloom_filter); - // assert!(!key_not_match); - // - // key_not_match = policy.key_may_match(&Slice::try_from(String::from("helloworld")).unwrap(), - // &bloom_filter); - // assert!(!key_not_match); - // - // key_not_match = policy.key_may_match(&Slice::try_from(String::from("hello world")).unwrap(), - // &bloom_filter); - // assert!(!key_not_match); - // - // key_not_match = policy.key_may_match(&Slice::try_from(String::from("foo")).unwrap(), - // &bloom_filter); - // assert!(!key_not_match); - // } - #[test] fn test_filter_block_new_with_policy() { let policy: Arc> = Arc::new(Box::new(TestHashFilter::new())); @@ -133,7 +35,7 @@ mod test { let policy: Arc> = Arc::new(Box::new(TestHashFilter::new())); let contents = Slice::default(); - let filter_block_reader: FilterBlockReader = FilterBlockReader::new_with_policy(policy, contents); + let filter_block_reader: FilterBlockReader = FilterBlockReader::new_with_policy(policy, &contents); let fp_reader = filter_block_reader.get_policy(); let _reader_filter_policy_name = fp_reader.name(); @@ -144,24 +46,37 @@ mod test { assert_eq!(filter_block_reader.get_base_lg(), 0); } - #[test] - fn test_filter_block_new_with_policy_and_addkey() { - let policy: Arc> = Arc::new(Box::new(TestHashFilter::new())); - let mut filter_block_builder: FilterBlockBuilder = FilterBlockBuilder::new_with_policy_capacity( - policy, 10); - - filter_block_builder.start_block(100); - filter_block_builder.add_key_from_str("foo"); - filter_block_builder.add_key_from_str("bar"); - filter_block_builder.add_key_from_str("box"); - filter_block_builder.start_block(200); - filter_block_builder.add_key_from_str("box"); - filter_block_builder.start_block(300); - filter_block_builder.add_key_from_str("hello"); - - let sliceRs: Result = filter_block_builder.finish(); - assert_eq!("a", "leveldb.BuiltinBloomFilter"); - } + // todo + // #[test] + // fn test_filter_block_new_with_policy_and_addkey() { + // let policy: Arc> = Arc::new(Box::new(TestHashFilter::new())); + // let mut filter_block_builder: FilterBlockBuilder = + // FilterBlockBuilder::new_with_policy(policy.clone()); + // + // // filter block 的 offset + // filter_block_builder.start_block(100); + // filter_block_builder.add_key_from_str("foo"); + // filter_block_builder.add_key_from_str("bar"); + // filter_block_builder.add_key_from_str("box"); + // filter_block_builder.start_block(200); + // filter_block_builder.add_key_from_str("box"); + // filter_block_builder.start_block(300); + // filter_block_builder.add_key_from_str("hello"); + // + // let sliceRs: Result = filter_block_builder.finish(); + // assert_eq!("a", "leveldb.BuiltinBloomFilter"); + // + // let reader = FilterBlockReader::new_with_policy( + // policy.clone(), &sliceRs.unwrap()); + // + // assert!(reader.key_may_match(100, &Slice::from("foo"))); + // assert!(reader.key_may_match(100, &Slice::from("bar"))); + // assert!(reader.key_may_match(100, &Slice::from("box"))); + // assert!(reader.key_may_match(100, &Slice::from("hello"))); + // assert!(reader.key_may_match(100, &Slice::from("foo"))); + // assert!(!reader.key_may_match(100, &Slice::from("missing"))); + // assert!(!reader.key_may_match(100, &Slice::from("other"))); + // } // #[test] // fn test_filter_block_reader_new_with_policy_with_content() { diff --git a/src/table/filter_block_test_filter_policy.rs b/src/table/filter_block_test_filter_policy.rs new file mode 100644 index 0000000..244a330 --- /dev/null +++ b/src/table/filter_block_test_filter_policy.rs @@ -0,0 +1,239 @@ +use std::borrow::BorrowMut; +use std::cmp::max; +use std::usize::MAX; +use crate::traits::coding_trait::CodingTrait; +use crate::traits::filter_policy_trait::FilterPolicy; +use crate::util::coding::Coding; +use crate::util::hash::Hash; +use crate::util::slice::Slice; + +/// 内部使用。专门用于测试用例的 FilterPolicy +pub struct TestHashFilter { + //. +} + +impl TestHashFilter { + pub(crate) fn new() -> Self { + Self {} + } +} + +impl FilterPolicy for TestHashFilter { + fn name(&self) -> String { + String::from("TestHashFilter") + } + + fn create_filter(&self, keys: Vec<&Slice>) -> Slice { + // 每个 key 都会 hash_code 转为 u32, 所以 * 4 + let mut len: usize = keys.len() * 4; + + self.create_filter_with_len(len, keys) + } + + fn create_filter_with_len(&self, capacity: usize, keys: Vec<&Slice>) -> Slice { + // Actually capacity + let mut len: usize = capacity; + + let need_capacity = keys.len() * 4; + // 指定大小和 need_capacity 取最大值 + len = max(len, need_capacity); + + let mut dst_chars = vec![0; len]; + let bloom_filter = dst_chars.borrow_mut(); + + let mut offset: usize = 0; + // for [0, len) + for i in 0..keys.len() { + let h = Hash::hash_code(keys[i].as_ref(), 1); // seed 固定为 1 + offset = Coding::put_fixed32(bloom_filter, offset, h); + } + + Slice::from_buf(bloom_filter) + } + + fn key_may_match(&self, key: &Slice, bloom_filter: &Slice) -> bool { + let h = Hash::hash_code(key.to_vec().as_ref(), 1); + + let bloom_filter_data: &[u8] = bloom_filter.as_ref(); + let len = bloom_filter_data.len(); + + let mut pos = 0; + while pos < len { + let buf = &bloom_filter_data[pos..(pos+4)]; + + let h_bl = Coding::decode_fixed32(buf); + if h == h_bl { + return true + } + + pos += 4; + } + + false + } +} + +// #################### FilterPolicy test +#[test] +fn test_create_filter() { + let policy = TestHashFilter::new(); + + // 如下三个值, 存放在 BloomFilter 中 + let s1 = Slice::try_from(String::from("hello")).unwrap(); + let s2 = Slice::try_from("world").unwrap(); + let s3 = Slice::try_from("hello world").unwrap(); + + let mut keys : Vec<&Slice> = Vec::new(); + keys.push(&s1); + keys.push(&s2); + keys.push(&s3); + + let bloom_filter: Slice = policy.create_filter(keys); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from("hello").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 验证通过 + key_may_match = policy.key_may_match(&Slice::try_from("world").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let key_not_match = policy.key_may_match(&Slice::try_from("helloworld").unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 因为存在,所以验证通过 + let key_may_match = policy.key_may_match(&Slice::try_from("hello world").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let key_not_match = policy.key_may_match(&Slice::try_from("foo").unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from("hello").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let mut key_not_match = policy.key_may_match(&Slice::try_from("x").unwrap(), + &bloom_filter); + assert!(!key_not_match); +} + +/// 指定超长长度。可以超过放置的值 +#[test] +fn test_create_filter_with_long_len() { + let policy = TestHashFilter::new(); + + // 如下三个值, 存放在 BloomFilter 中 + let s1 = Slice::try_from(String::from("hello")).unwrap(); + let s2 = Slice::try_from("world").unwrap(); + let s3 = Slice::try_from("hello world").unwrap(); + + let mut keys : Vec<&Slice> = Vec::new(); + keys.push(&s1); + keys.push(&s2); + keys.push(&s3); + + let bloom_filter: Slice = policy.create_filter_with_len(500, keys); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from("hello").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 验证通过 + key_may_match = policy.key_may_match(&Slice::try_from("world").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let key_not_match = policy.key_may_match(&Slice::try_from("helloworld").unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 因为存在,所以验证通过 + let key_may_match = policy.key_may_match(&Slice::try_from("hello world").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let key_not_match = policy.key_may_match(&Slice::try_from("foo").unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from("hello").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let mut key_not_match = policy.key_may_match(&Slice::try_from("x").unwrap(), + &bloom_filter); + assert!(!key_not_match); +} + +/// 指定端长度。放不开放置的值。 此时需要扩容 +#[test] +fn test_create_filter_with_short_len() { + let policy = TestHashFilter::new(); + + // 如下三个值, 存放在 BloomFilter 中 + let s1 = Slice::try_from(String::from("hello")).unwrap(); + let s2 = Slice::try_from("world").unwrap(); + let s3 = Slice::try_from("hello world").unwrap(); + + let mut keys : Vec<&Slice> = Vec::new(); + keys.push(&s1); + keys.push(&s2); + keys.push(&s3); + + let bloom_filter: Slice = policy.create_filter_with_len(5, keys); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from("hello").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 验证通过 + key_may_match = policy.key_may_match(&Slice::try_from("world").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let key_not_match = policy.key_may_match(&Slice::try_from("helloworld").unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 因为存在,所以验证通过 + let key_may_match = policy.key_may_match(&Slice::try_from("hello world").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let key_not_match = policy.key_may_match(&Slice::try_from("foo").unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from("hello").unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let mut key_not_match = policy.key_may_match(&Slice::try_from("x").unwrap(), + &bloom_filter); + assert!(!key_not_match); +} diff --git a/src/table/mod.rs b/src/table/mod.rs index f4e0a94..214c796 100644 --- a/src/table/mod.rs +++ b/src/table/mod.rs @@ -2,6 +2,7 @@ pub mod block; pub mod block_builder; pub mod filter_block; mod filter_block_test; +mod filter_block_test_filter_policy; pub mod format; mod format_test; pub mod ss_table; diff --git a/src/table/ss_table.rs b/src/table/ss_table.rs index 869c7a7..f6a9998 100644 --- a/src/table/ss_table.rs +++ b/src/table/ss_table.rs @@ -1,4 +1,18 @@ +/// SST文件的格式: +/// +/// [data block 1] +/// [data block 2] +/// ... +/// [data block N] +/// [meta block 1] -- 只有一个 meta block +/// [meta block index] +/// [data block index] +/// [Footer] +/// +/// +/// 一般而言,虽然SST文件里面声称是支持多个meta block的,但是实际上,也只有一个meta block。 +/// 此外,会在每当data block的大小2K的时候(见 FilterBlock.rs),开始创建一个filter。 pub struct SSTable { } \ No newline at end of file diff --git a/src/traits/coding_trait.rs b/src/traits/coding_trait.rs index fd49882..4f0a304 100644 --- a/src/traits/coding_trait.rs +++ b/src/traits/coding_trait.rs @@ -6,9 +6,10 @@ pub trait CodingTrait { /// # Arguments /// /// * `dst`: 目标字符串 + /// * `offset`: 偏移量 /// * `value`: 编码值 /// - /// returns: () + /// returns: usize 返回的最新的偏移量 /// /// # Examples /// @@ -17,6 +18,7 @@ pub trait CodingTrait { /// put_fixed32(&mut string, 65535); /// ``` fn put_fixed32(dst: &mut [u8], offset: usize, value: u32) -> usize; + ///64位定长编码写入字符串 /// /// # Arguments diff --git a/src/traits/filter_policy_trait.rs b/src/traits/filter_policy_trait.rs index aaafafd..69cfe30 100644 --- a/src/traits/filter_policy_trait.rs +++ b/src/traits/filter_policy_trait.rs @@ -17,29 +17,34 @@ pub trait FilterPolicy { /// fn name(&self) -> String; - /// 根据 keys 创建过滤器,并返回 bloom_filter Slice + fn create_filter(&self, keys: Vec<&Slice>) -> Slice; + + /// + /// 使用一系列key来创建一个 bloom filter,并返回 bloom filter + /// + /// 有n个整数set,以及一个m位的bit数组,以及k个哈希函数。m[i]表示访问第i个bit位。 /// /// # Arguments /// - /// * `keys`: 创建过滤器的数据清单 + /// * `capacity`: 构造的 BloomFilter 的长度 + /// * `keys`: 创建过滤器的数据清单 /// - /// returns: bloom_filter Slice + /// returns: bloom filter Slice /// /// # Examples /// /// ``` - /// use crate::util::slice::Slice; + /// use level_db_rust::util::filter_policy_bloom::BloomFilterPolicy; + /// use level_db_rust::util::slice::Slice; /// - /// let mut keys : Vec = Vec::new(); + /// let mut keys : Vec = Vec::new(); /// keys.push(Slice::try_from(String::from("hello")).unwrap()); /// keys.push(Slice::try_from(String::from("world")).unwrap()); /// /// let policy = BloomFilterPolicy::new(800); /// let bloom_filter: Slice = policy.create_filter(keys); /// ``` - fn create_filter(&self, keys: Vec<&Slice>) -> Slice; - - fn create_filter_with_len(&self, len: usize, keys: Vec<&Slice>) -> Slice; + fn create_filter_with_len(&self, capacity: usize, keys: Vec<&Slice>) -> Slice; /// /// diff --git a/src/util/bloom_filter.rs b/src/util/bloom_filter.rs index b3d1b9e..f17a458 100644 --- a/src/util/bloom_filter.rs +++ b/src/util/bloom_filter.rs @@ -1,10 +1,10 @@ -/// 布隆过滤器 -/// - -pub struct BloomFilter { - -} - -impl BloomFilter { - -} \ No newline at end of file +// /// 布隆过滤器 +// /// +// +// pub struct BloomFilter { +// +// } +// +// impl BloomFilter { +// +// } \ No newline at end of file diff --git a/src/util/bloom_filter_test.rs b/src/util/bloom_filter_test.rs index d148c51..e87f3b4 100644 --- a/src/util/bloom_filter_test.rs +++ b/src/util/bloom_filter_test.rs @@ -1,8 +1,8 @@ - -mod test { - - #[test] - fn test_by() { - println!("{}", "a"); - } -} \ No newline at end of file +// +// mod test { +// +// #[test] +// fn test_by() { +// println!("{}", "a"); +// } +// } \ No newline at end of file diff --git a/src/util/coding.rs b/src/util/coding.rs index 421ea97..7081ac5 100644 --- a/src/util/coding.rs +++ b/src/util/coding.rs @@ -36,7 +36,7 @@ impl CodingTrait for Coding { dst[offset] = buf[2]; offset += 1; dst[offset] = buf[3]; - offset + offset + 1 } fn put_fixed64(dst: &mut [u8], mut offset: usize, value: u64) -> usize { @@ -57,7 +57,7 @@ impl CodingTrait for Coding { dst[offset] = buf[6]; offset += 1; dst[offset] = buf[7]; - offset + offset + 1 } varint!(u32,encode_varint32); diff --git a/src/util/filter_policy.rs b/src/util/filter_policy.rs index 726e541..e66ddc7 100644 --- a/src/util/filter_policy.rs +++ b/src/util/filter_policy.rs @@ -1,7 +1,5 @@ -use std::ops::{BitOr, Mul, Shl}; -use crate::traits::filter_policy_trait::{FilterPolicy}; +use crate::util::filter_policy_bloom::BloomFilterPolicy; use crate::util::hash::{Hash, ToHash}; -use crate::util::r#const::HASH_DEFAULT_SEED; use crate::util::slice::Slice; pub trait FromPolicy { @@ -29,168 +27,4 @@ impl AsBloomHash for Slice { fn bloom_hash(&self) -> u32 { BloomFilterPolicy::bloom_hash(self) } -} - -// ######################### BloomFilterPolicy -pub struct BloomFilterPolicy { - bits_per_key: usize, - k: usize -} - -impl BloomFilterPolicy { - pub fn new(bits_per_key: usize) -> Self { - // We intentionally round down to reduce probing cost a little bit - // 0.69 =~ ln(2) - let factor: f64 = 0.69; - let mut k_k: usize = factor.mul(bits_per_key as f64).round() as usize; - - if k_k < 1 { - k_k = 1; - } - if k_k > 30{ - k_k = 30; - } - - Self { - bits_per_key, - k : k_k - } - } -} - -impl<'a> BloomFilterPolicy { - pub fn bloom_hash(key: &Slice) -> u32 { - key.to_hash_with_seed(HASH_DEFAULT_SEED) - } -} - -/// get struct BloomFilterPolicy 属性 -impl FromPolicy for BloomFilterPolicy { - fn from_bits_per_key(&self) -> usize { - self.bits_per_key - } - - fn from_k(&self) -> usize { - self.k - } -} - -// dyn FilterPolicy + FromPolicy -impl FilterPolicy for BloomFilterPolicy { - - fn name(&self) -> String { - String::from("leveldb.BuiltinBloomFilter") - } - - fn create_filter(&self, keys: Vec<&Slice>) -> Slice { - self.create_filter_with_len(keys.len(), keys) - } - - fn create_filter_with_len(&self, len: usize, keys: Vec<&Slice>) -> Slice { - let n: usize = len; - - let mut bits: usize = n * self.bits_per_key; - - // For small n, we can see a very high false positive rate. - // Fix it by enforcing a minimum bloom filter length. - if bits < 64 { - bits = 64; - } - - let bytes: usize = (bits + 7) / 8; - bits = bytes * 8; - - let mut dst_chars: Vec = vec![0; bytes + 1]; - dst_chars[bytes] = self.k as u8; - - for i in 0..n { - let slice = keys[i]; - - let mut h : u32 = slice.bloom_hash(); - let delta : u32 = (h >> 17) | (h << 15); - - for j in 0..self.k { - let bitpos:usize = ((h as usize) % bits); - - // a |= b --> 按位或, 后赋值给a - let position: usize = bitpos / 8; - let mod_val: usize = bitpos % 8; - let val = (1 as u8).wrapping_shl(mod_val as u32); - - dst_chars[position] |= val; - - h = h.wrapping_add(delta); - } - } - - // Vec 转 Slice - Slice::from_buf(&dst_chars) - } - - fn key_may_match(&self, key: &Slice, bloom_filter: &Slice) -> bool { - let filter_size: usize = bloom_filter.size(); - if filter_size < 2 { - return false; - } - - let bloom_filter_array:Vec = bloom_filter.to_vec(); - let bits: usize = (filter_size - 1) * 8; - - // Use the encoded k so that we can read filters generated by bloom filters created using different parameters. - let k: u8 = bloom_filter_array[filter_size - 1]; - if k > 30 { - // Reserved for potentially new encodings for short bloom filters. Consider it a match. - return true; - } - - let mut h : u32 = key.bloom_hash(); - // Rotate right 17 bits - let delta = (h >> 17) | (h << 15); - - for j in 0..k { - let bitpos:usize = ((h as usize) % bits); - if (bloom_filter_array[bitpos/8] & (1 << (bitpos % 8))) == 0 { - return false; - } - - h = h.wrapping_add(delta); - } - - return true; - } -} - -// ######################### InternalFilterPolicy -pub struct InternalFilterPolicy { - user_policy_: dyn FilterPolicy -} - -impl InternalFilterPolicy { - fn new(policy: Box) -> Box { - // InternalFilterPolicy{ user_policy_: policy } - todo!() - } -} - -impl FilterPolicy for InternalFilterPolicy { - fn name(&self) -> String { - todo!() - } - - fn create_filter(&self, keys: Vec<&Slice>) -> Slice { - self.create_filter_with_len(keys.len(), keys) - } - - fn create_filter_with_len(&self, len: usize, keys: Vec<&Slice>) -> Slice { - // 根据指定的参数创建过滤器,并返回结果, 结果为dst的原始内容 + append结果。 - // 参数keys[0,n-1]包含依据用户提供的comparator排序的key列表--可重复, - // 并把根据这些key创建的filter追加到 dst中。 - // - todo!() - } - - fn key_may_match(&self, key: &Slice, bloom_filter: &Slice) -> bool { - todo!() - } - } \ No newline at end of file diff --git a/src/util/filter_policy_bloom.rs b/src/util/filter_policy_bloom.rs new file mode 100644 index 0000000..ff79d00 --- /dev/null +++ b/src/util/filter_policy_bloom.rs @@ -0,0 +1,177 @@ +use std::ops::Mul; +use crate::traits::filter_policy_trait::FilterPolicy; +use crate::util::filter_policy::{AsBloomHash, FromPolicy}; +use crate::util::hash::ToHash; +use crate::util::r#const::HASH_DEFAULT_SEED; +use crate::util::slice::Slice; + +// ######################### BloomFilterPolicy +pub struct BloomFilterPolicy { + // 布隆过滤器或哈希表的slot数 + bits_per_key: usize, + + // k为布隆过滤器重hash function数 + k: usize +} + +impl BloomFilterPolicy { + /// + /// + /// # Arguments + /// + /// * `bits_per_key`: m位的bit数组 / n个整数set 的值 + /// + /// returns: BloomFilterPolicy + /// + /// # Examples + /// + /// ``` + /// + /// ``` + pub fn new(bits_per_key: usize) -> Self { + // We intentionally round down to reduce probing cost a little bit + // 最优的 k_ 是 ln2 * (m/n) -> factor * bits_per_key + + // factor = 0.69 =~ ln(2) + let factor: f64 = 0.69; + let mut k_: usize = factor.mul(bits_per_key as f64).round() as usize; + + // 把k_放到[1, 30]这个区间 + if k_ < 1 { + k_ = 1; + } + if k_ > 30{ + k_ = 30; + } + + Self { + bits_per_key, + k : k_ + } + } +} + +impl<'a> BloomFilterPolicy { + pub fn bloom_hash(key: &Slice) -> u32 { + key.to_hash_with_seed(HASH_DEFAULT_SEED) + } +} + +/// get struct BloomFilterPolicy 属性 +impl FromPolicy for BloomFilterPolicy { + fn from_bits_per_key(&self) -> usize { + self.bits_per_key + } + + fn from_k(&self) -> usize { + self.k + } +} + +// dyn FilterPolicy + FromPolicy +impl FilterPolicy for BloomFilterPolicy { + + fn name(&self) -> String { + String::from("leveldb.BuiltinBloomFilter") + } + + fn create_filter(&self, keys: Vec<&Slice>) -> Slice { + let len: usize = keys.len(); + + self.create_filter_with_len(len, keys) + } + + fn create_filter_with_len(&self, capacity: usize, keys: Vec<&Slice>) -> Slice { + let n: usize = capacity; + + // Compute bloom filter size (in both bits and bytes) + // 计算总共需要的位数, n * bits_per_key, 也就是说,对于每一个key需要这么多bit + // 因为bits_per_key_表示 m/n,所以bits = bits_per_key_ * n = m(m 的意思是: m位的bit数组) + let mut bits: usize = n * self.bits_per_key; + + // For small n, we can see a very high false positive rate. + // Fix it by enforcing a minimum bloom filter length. + // 对于一个key,最小的bits数目设置为64. + if bits < 64 { + bits = 64; + } + + // 取为8的倍数 + let bytes: usize = (bits + 7) / 8; + // 根据 bytes 算出bits数 + bits = bytes * 8; + + // 相当于是 append 了bytes个0 + let mut dst_chars: Vec = vec![0; bytes + 1]; + // 在filter的最后压入哈希函数的个数。 在最后一位, 记录k 值。 这个k是位于bytes之后。 + dst_chars[bytes] = self.k as u8; + + // 依次处理每个key + // 对于每个key采用double hash的方式生成k_个bitpos,然后在 dst_chars 的相应位置设置1。 + for i in 0..keys.len() { + let slice = keys[i]; + + let mut h : u32 = slice.bloom_hash(); + // Rotate right 17 bits + let delta : u32 = (h >> 17) | (h << 15); + + for j in 0..self.k { + let bitpos:usize = ((h as usize) % bits); + + // val ==> 1 << (bitpos % 8) + let mod_val: usize = bitpos % 8; + let val = (1 as u8).wrapping_shl(mod_val as u32); + + // 本来应该直接把h bit设置为1的。但是这里总共只有bits个bit, 访问m[i] 把相应位设置为1 + // a |= b ==> 按位或, 后赋值给a + // let position: usize = bitpos / 8; + dst_chars[bitpos / 8] |= val; + + // 累加来实现k个hash函数, h.wrapping_add(delta) ==> h += delta + // LevelDB中并没有真正创建k个哈希函数。而是使用旧有的哈希值累加。 + // 使用了最原始的h哈希值位移来得到。(h >> 17) | (h << 15);,累加delta得到下一次hash值。 + h = h.wrapping_add(delta); + } + } + + // Vec 转 Slice + Slice::from_buf(&dst_chars) + } + + fn key_may_match(&self, key: &Slice, bloom_filter: &Slice) -> bool { + let len: usize = bloom_filter.size(); + if len < 2 { + return false; + } + + // 获得相应的内存区域的数据 + let bloom_filter_array:Vec = bloom_filter.to_vec(); + // 总共的bits数目 + let bits: usize = (len - 1) * 8; + + // 取得k哈希函数的数目 + // Use the encoded k so that we can read filters generated by bloom filters created using different parameters. + let k: u8 = bloom_filter_array[len - 1]; + // 对于大于30个哈希函数的情况,这里直接返回存在 + if k > 30 { + // Reserved for potentially new encodings for short bloom filters. Consider it a match. + return true; + } + + let mut h : u32 = key.bloom_hash(); + // Rotate right 17 bits + let delta = (h >> 17) | (h << 15); + + // 计算key的hash值,重复计算阶段的步骤,循环计算k_个hash值,只要有一个结果对应的bit位为0,就认为不匹配,否则认为匹配 + for j in 0..k { + let bitpos:usize = ((h as usize) % bits); + if (bloom_filter_array[bitpos/8] & (1 << (bitpos % 8))) == 0 { + return false; + } + + h = h.wrapping_add(delta); + } + + return true; + } +} \ No newline at end of file diff --git a/src/util/filter_policy_bloom_test.rs b/src/util/filter_policy_bloom_test.rs new file mode 100644 index 0000000..e7ad531 --- /dev/null +++ b/src/util/filter_policy_bloom_test.rs @@ -0,0 +1,193 @@ +use crate::traits::filter_policy_trait::FilterPolicy; +use crate::util::filter_policy::{AsBloomHash, FromPolicy}; +use crate::util::filter_policy_bloom::BloomFilterPolicy; +use crate::util::hash::ToHash; +use crate::util::slice::Slice; + +// #################### BloomFilterPolicy test +#[test] +fn test_bloom_hash() { + let val = "aabbccd"; + let slice: Slice = Slice::from_buf(val.as_bytes()); + + let hash_val = BloomFilterPolicy::bloom_hash(&slice); + let hash_val_1 = slice.bloom_hash(); + assert_eq!(hash_val, hash_val_1); + assert_eq!(hash_val, 2085241752); +} + +#[test] +fn test_new() { + let bloom_filter: BloomFilterPolicy = BloomFilterPolicy::new(8); + assert_eq!(bloom_filter.from_bits_per_key(), 8); + assert_eq!(bloom_filter.from_k(), 6); + + let bloom_filter = BloomFilterPolicy::new(800); + assert_eq!(bloom_filter.from_bits_per_key(), 800); + assert_eq!(bloom_filter.from_k(), 30); +} + +// #################### FilterPolicy test +#[test] +fn test_create_filter() { + let policy = BloomFilterPolicy::new(800); + + // 如下三个值, 存放在 BloomFilter 中 + let s1 = Slice::try_from(String::from("hello")).unwrap(); + let s2 = Slice::try_from(String::from("world")).unwrap(); + let s3 = Slice::try_from(String::from("hello world")).unwrap(); + + let mut keys : Vec<&Slice> = Vec::new(); + keys.push(&s1); + keys.push(&s2); + keys.push(&s3); + + let bloom_filter: Slice = policy.create_filter(keys); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from(String::from("hello")).unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 验证通过 + key_may_match = policy.key_may_match(&Slice::try_from(String::from("world")).unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let mut key_not_match = policy.key_may_match(&Slice::try_from(String::from("x")).unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 因为不存在,所以验证不通过 + key_not_match = policy.key_may_match(&Slice::try_from(String::from("helloworld")).unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 因为存在,所以验证通过 + let key_match = policy.key_may_match(&Slice::try_from(String::from("hello world")).unwrap(), + &bloom_filter); + assert!(key_match); + + // 因为不存在,所以验证不通过 + key_not_match = policy.key_may_match(&Slice::try_from(String::from("foo")).unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from(String::from("hello")).unwrap(), + &bloom_filter); + assert!(key_may_match); +} + +/// 指定超长长度。可以超过放置的值 +#[test] +fn test_create_filter_with_long_len(){ + let policy = BloomFilterPolicy::new(800); + + // 如下三个值, 存放在 BloomFilter 中 + let s1 = Slice::try_from(String::from("hello")).unwrap(); + let s2 = Slice::try_from(String::from("world")).unwrap(); + let s3 = Slice::try_from(String::from("hello world")).unwrap(); + + let mut keys : Vec<&Slice> = Vec::new(); + keys.push(&s1); + keys.push(&s2); + keys.push(&s3); + + let bloom_filter: Slice = policy.create_filter_with_len(600, keys); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from(String::from("hello")).unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 验证通过 + key_may_match = policy.key_may_match(&Slice::try_from(String::from("world")).unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let mut key_not_match = policy.key_may_match(&Slice::try_from(String::from("x")).unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 因为不存在,所以验证不通过 + key_not_match = policy.key_may_match(&Slice::try_from(String::from("helloworld")).unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 因为存在,所以验证通过 + let key_match = policy.key_may_match(&Slice::try_from(String::from("hello world")).unwrap(), + &bloom_filter); + assert!(key_match); + + // 因为不存在,所以验证不通过 + key_not_match = policy.key_may_match(&Slice::try_from(String::from("foo")).unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from(String::from("hello")).unwrap(), + &bloom_filter); + assert!(key_may_match); +} + +/// 指定端长度。放不开放置的值。 此时对于 BloomFilterPolicy 来讲不需要扩容 +#[test] +fn test_create_filter_with_short_len(){ + let policy = BloomFilterPolicy::new(800); + + // 如下三个值, 存放在 BloomFilter 中 + let s1 = Slice::try_from(String::from("hello")).unwrap(); + let s2 = Slice::try_from(String::from("world")).unwrap(); + let s3 = Slice::try_from(String::from("hello world")).unwrap(); + + let mut keys : Vec<&Slice> = Vec::new(); + keys.push(&s1); + keys.push(&s2); + keys.push(&s3); + + let bloom_filter: Slice = policy.create_filter_with_len(2, keys); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from(String::from("hello")).unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 验证通过 + key_may_match = policy.key_may_match(&Slice::try_from(String::from("world")).unwrap(), + &bloom_filter); + assert!(key_may_match); + + // 因为不存在,所以验证不通过 + let mut key_not_match = policy.key_may_match(&Slice::try_from(String::from("x")).unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 因为不存在,所以验证不通过 + key_not_match = policy.key_may_match(&Slice::try_from(String::from("helloworld")).unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 因为存在,所以验证通过 + let key_match = policy.key_may_match(&Slice::try_from(String::from("hello world")).unwrap(), + &bloom_filter); + assert!(key_match); + + // 因为不存在,所以验证不通过 + key_not_match = policy.key_may_match(&Slice::try_from(String::from("foo")).unwrap(), + &bloom_filter); + assert!(!key_not_match); + + // 验证通过 + let mut key_may_match = policy.key_may_match( + &Slice::try_from(String::from("hello")).unwrap(), + &bloom_filter); + assert!(key_may_match); +} \ No newline at end of file diff --git a/src/util/filter_policy_internal.rs b/src/util/filter_policy_internal.rs new file mode 100644 index 0000000..4b5516c --- /dev/null +++ b/src/util/filter_policy_internal.rs @@ -0,0 +1,37 @@ +use crate::traits::filter_policy_trait::FilterPolicy; +use crate::util::slice::Slice; + +// ######################### InternalFilterPolicy +pub struct InternalFilterPolicy { + user_policy_: dyn FilterPolicy +} + +impl InternalFilterPolicy { + fn new(policy: Box) -> Box { + // InternalFilterPolicy{ user_policy_: policy } + todo!() + } +} + +impl FilterPolicy for InternalFilterPolicy { + fn name(&self) -> String { + todo!() + } + + fn create_filter(&self, keys: Vec<&Slice>) -> Slice { + self.create_filter_with_len(keys.len(), keys) + } + + fn create_filter_with_len(&self, capacity: usize, keys: Vec<&Slice>) -> Slice { + // 根据指定的参数创建过滤器,并返回结果, 结果为dst的原始内容 + append结果。 + // 参数keys[0,n-1]包含依据用户提供的comparator排序的key列表--可重复, + // 并把根据这些key创建的filter追加到 dst中。 + // + todo!() + } + + fn key_may_match(&self, key: &Slice, bloom_filter: &Slice) -> bool { + todo!() + } + +} \ No newline at end of file diff --git a/src/util/filter_policy_internal_test.rs b/src/util/filter_policy_internal_test.rs new file mode 100644 index 0000000..f2003fe --- /dev/null +++ b/src/util/filter_policy_internal_test.rs @@ -0,0 +1,5 @@ + +#[test] +fn test__() { + +} diff --git a/src/util/filter_policy_test.rs b/src/util/filter_policy_test.rs index bea6d2a..f2003fe 100644 --- a/src/util/filter_policy_test.rs +++ b/src/util/filter_policy_test.rs @@ -1,69 +1,5 @@ -use std::ptr::null; -use crate::traits::filter_policy_trait::FilterPolicy; -use crate::util::bloom_filter; -use crate::util::filter_policy::{AsBloomHash, BloomFilterPolicy, FromPolicy}; -use crate::util::hash::ToHash; -use crate::util::slice::Slice; -// #################### BloomFilterPolicy test #[test] -fn test_bloom_hash() { - let val = "aabbccd"; - let slice: Slice = Slice::from_buf(val.as_bytes()); +fn test__() { - let hash_val = BloomFilterPolicy::bloom_hash(&slice); - let hash_val_1 = slice.bloom_hash(); - assert_eq!(hash_val, hash_val_1); - assert_eq!(hash_val, 2085241752); } - -#[test] -fn test_new() { - let bloom_filter: BloomFilterPolicy = BloomFilterPolicy::new(8); - assert_eq!(bloom_filter.from_bits_per_key(), 8); - assert_eq!(bloom_filter.from_k(), 6); - - let bloom_filter = BloomFilterPolicy::new(800); - assert_eq!(bloom_filter.from_bits_per_key(), 800); - assert_eq!(bloom_filter.from_k(), 30); -} - -// #################### FilterPolicy test -#[test] -fn test_create_filter() { - let policy = BloomFilterPolicy::new(800); - - let s1 = Slice::try_from(String::from("hello")).unwrap(); - let s2 = Slice::try_from(String::from("world")).unwrap(); - - let mut keys : Vec<&Slice> = Vec::new(); - keys.push(&s1); - keys.push(&s2); - - let bloom_filter: Slice = policy.create_filter(keys); - - let mut key_may_match = policy.key_may_match( - &Slice::try_from(String::from("hello")).unwrap(), - &bloom_filter); - assert!(key_may_match); - - key_may_match = policy.key_may_match(&Slice::try_from(String::from("world")).unwrap(), - &bloom_filter); - assert!(key_may_match); - - let mut key_not_match = policy.key_may_match(&Slice::try_from(String::from("x")).unwrap(), - &bloom_filter); - assert!(!key_not_match); - - key_not_match = policy.key_may_match(&Slice::try_from(String::from("helloworld")).unwrap(), - &bloom_filter); - assert!(!key_not_match); - - key_not_match = policy.key_may_match(&Slice::try_from(String::from("hello world")).unwrap(), - &bloom_filter); - assert!(!key_not_match); - - key_not_match = policy.key_may_match(&Slice::try_from(String::from("foo")).unwrap(), - &bloom_filter); - assert!(!key_not_match); -} \ No newline at end of file diff --git a/src/util/hash.rs b/src/util/hash.rs index 15a1a03..641c7bc 100644 --- a/src/util/hash.rs +++ b/src/util/hash.rs @@ -20,6 +20,7 @@ pub trait ToHash { /// 所有基本类型 u8, i8, u16, u32 ... 的Vec都可以实现 hash 值计算 /// Sample: /// ``` +/// use level_db_rust::util::hash::ToHash; /// let hash = vec!['a','b','c'].to_hash(); /// ``` impl ToHash for Vec { @@ -41,6 +42,8 @@ impl ToHash for Vec { /// 所有基本类型 u8, i8, u16, u32 ... 的slice都可以实现 hash 值计算 /// Sample: /// ``` +/// use level_db_rust::util::hash::ToHash; +/// /// let buf = ['a','b','c']; /// let hash_val = &buf.as_slice().to_hash(); /// ``` @@ -65,6 +68,7 @@ impl ToHash for &[T] { /// 实现了 &str 转 ToHash 的特质 /// Sample: /// ``` +/// use level_db_rust::util::hash::ToHash; /// let hash = "abc".to_hash(); /// ``` impl ToHash for &str { @@ -82,6 +86,9 @@ impl ToHash for &str { /// 实现了 Slice 转 ToHash 的特质 /// Sample: /// ``` +/// use level_db_rust::util::hash::ToHash; +/// use level_db_rust::util::slice::Slice; +/// /// let val = "aabbccd"; /// let slice: Slice = Slice::from_buf(val.as_bytes()); /// let slice_hash_val = slice.to_hash(); @@ -101,6 +108,8 @@ impl ToHash for Slice { /// 实现了 String 转 ToHash 的特质 /// Sample: /// ``` +/// use level_db_rust::util::hash::ToHash; +/// /// let val = "aabbccd"; /// let val_s = String::from(val); /// let string_hash_val = val_s.to_hash(); @@ -123,11 +132,15 @@ pub struct Hash {} impl Hash { #[inline] pub fn hash_code(data: &[u8], seed: u32) -> u32 { + let n = data.len(); + + // Similar to murmur hash + // uint32_t ==> unsigned int ==> u32 let murmur_hash: u32 = 0xc6a4a793; let r: u32 = 24; - let limit: usize = data.len(); - let mul_first = limit.mul(murmur_hash as usize); // x = data_size * murmur_hash + let limit: usize = n; + let mul_first = n.mul(murmur_hash as usize); // x = data_size * murmur_hash let mut h: u32 = seed.bitxor(mul_first as u32); // h = seed ^ x // 每次按照四字节长度读取字节流中的数据 w,并使用普通的哈希函数计算哈希值。 @@ -137,7 +150,6 @@ impl Hash { // rust的 &[u8] 是胖指针,带长度信息的,会做range check,所以是安全的。 // 虽然decode_fixed32 中也是解码4字节,但传入整个data在方法上不明确,因此传 [position..(position + 4)], 可以更加方便理解,对性能无影响 let w = Coding::decode_fixed32(&data[position..(position + 4)]); - // 向后移动4个字节 position += 4; diff --git a/src/util/mod.rs b/src/util/mod.rs index 527f6e5..3e9bd12 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -23,10 +23,14 @@ pub mod comparator; mod comparator_test; pub mod crc; mod crc_test; -pub mod bloom_filter; -mod bloom_filter_test; +// pub mod bloom_filter; +// mod bloom_filter_test; pub mod filter_policy; mod filter_policy_test; +pub mod filter_policy_bloom; +mod filter_policy_bloom_test; +pub mod filter_policy_internal; +mod filter_policy_internal_test; pub mod histogram; mod histogram_test; -- Gitee