From 75fd9f0dffcdeb8c61c036606e19f00d7f85001d Mon Sep 17 00:00:00 2001 From: Brian Pane Date: Wed, 11 Dec 2024 17:09:45 -0800 Subject: [PATCH] Refactoring to allow inlining of quick_insert_string for the common CRC32 case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before-and-after benchmarks from an x86_64 test system: ``` Benchmark 1 (56 runs): ./compress-baseline 1 rs silesia-small.tar measurement mean ± σ min … max outliers delta wall_time 89.8ms ± 2.42ms 88.2ms … 106ms 3 ( 5%) 0% peak_rss 26.7MB ± 65.7KB 26.5MB … 26.7MB 0 ( 0%) 0% cpu_cycles 333M ± 836K 332M … 335M 0 ( 0%) 0% instructions 747M ± 254 747M … 747M 0 ( 0%) 0% cache_references 400K ± 6.41K 396K … 434K 4 ( 7%) 0% cache_misses 299K ± 4.24K 282K … 311K 6 (11%) 0% branch_misses 3.15M ± 5.68K 3.14M … 3.16M 0 ( 0%) 0% Benchmark 2 (56 runs): ./target/release/examples/compress 1 rs silesia-small.tar measurement mean ± σ min … max outliers delta wall_time 89.3ms ± 582us 88.3ms … 90.8ms 2 ( 4%) - 0.5% ± 0.7% peak_rss 26.7MB ± 78.2KB 26.5MB … 26.7MB 0 ( 0%) - 0.1% ± 0.1% cpu_cycles 333M ± 1.45M 331M … 341M 1 ( 2%) - 0.1% ± 0.1% instructions 736M ± 268 736M … 736M 1 ( 2%) ⚡- 1.5% ± 0.0% cache_references 400K ± 3.33K 397K … 411K 3 ( 5%) + 0.1% ± 0.5% cache_misses 296K ± 6.42K 277K … 306K 6 (11%) - 0.9% ± 0.7% branch_misses 3.09M ± 7.74K 3.07M … 3.11M 2 ( 4%) ⚡- 1.9% ± 0.1% ``` --- zlib-rs/src/deflate.rs | 24 ++++++++++++++++++++---- zlib-rs/src/deflate/algorithm/quick.rs | 4 +++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/zlib-rs/src/deflate.rs b/zlib-rs/src/deflate.rs index e256df4..726d221 100644 --- a/zlib-rs/src/deflate.rs +++ b/zlib-rs/src/deflate.rs @@ -1353,11 +1353,27 @@ impl<'a> State<'a> { #[inline(always)] pub(crate) fn quick_insert_string(&mut self, string: usize) -> u16 { match self.hash_calc_variant { - HashCalcVariant::Standard => StandardHashCalc::quick_insert_string(self, string), - // SAFETY: self.hash_calc_variant is set by HashCalcVariant::for_max_chain_length, - // which avoids choosing Crc32 if the system doesn't have support. - HashCalcVariant::Crc32 => unsafe { Crc32HashCalc::quick_insert_string(self, string) }, HashCalcVariant::Roll => RollHashCalc::quick_insert_string(self, string), + _ => { + // Standard and Crc32 both can process 4 bytes at a time. + let slice = &self.window.filled()[string..]; + let val = u32::from_le_bytes(slice[..4].try_into().unwrap()); + + let hm = match self.hash_calc_variant { + // SAFETY: self.hash_calc_variant is set by HashCalcVariant::for_max_chain_length, + // which avoids choosing Crc32 if the system doesn't have support. + HashCalcVariant::Crc32 => unsafe { Crc32HashCalc::update_hash(0, val) } + _ => StandardHashCalc::update_hash(0, val) + } as usize; + + let head = self.head.as_slice()[hm]; + if head != string as u16 { + self.prev.as_mut_slice()[string & self.w_mask] = head; + self.head.as_mut_slice()[hm] = string as u16; + } + + head + } } } diff --git a/zlib-rs/src/deflate/algorithm/quick.rs b/zlib-rs/src/deflate/algorithm/quick.rs index c7b3526..7eb20d8 100644 --- a/zlib-rs/src/deflate/algorithm/quick.rs +++ b/zlib-rs/src/deflate/algorithm/quick.rs @@ -93,6 +93,8 @@ pub fn deflate_quick(stream: &mut DeflateStream, flush: DeflateFlush) -> BlockSt } if state.lookahead >= WANT_MIN_MATCH { + let slice = &state.window.filled()[state.strstart..]; + let str_prefetch = u32::from_le_bytes(slice[..4].try_into().unwrap()); let hash_head = state.quick_insert_string(state.strstart); let dist = state.strstart as isize - hash_head as isize; @@ -105,7 +107,7 @@ pub fn deflate_quick(stream: &mut DeflateStream, flush: DeflateFlush) -> BlockSt $slice[$offset] as u16 | ($slice[$offset + 1] as u16) << 8 } } - if first_two_bytes!(str_start, 0) == first_two_bytes!(match_start, 0) { + if str_prefetch as u16 == first_two_bytes!(match_start, 0) { let mut match_len = crate::deflate::compare256::compare256_slice( &str_start[2..], &match_start[2..],