diff --git a/src/complevel_estimator.rs b/src/complevel_estimator.rs index 6103e84..04370e5 100644 --- a/src/complevel_estimator.rs +++ b/src/complevel_estimator.rs @@ -10,7 +10,7 @@ use crate::{ hash_algorithm::HashAlgorithm, hash_chain::DictionaryAddPolicy, - hash_chain_holder::{new_hash_chain_holder, HashChainHolderTrait}, + hash_chain_holder::{new_hash_chain_holder, HashChainHolder}, preflate_constants, preflate_input::PreflateInput, preflate_parameter_estimator::PreflateStrategy, @@ -40,7 +40,7 @@ pub struct CompLevelInfo { struct CandidateInfo { hash_algorithm: HashAlgorithm, add_policy: DictionaryAddPolicy, - hash_chain: Box, + hash_chain: Box, longest_dist_at_hop_0: u32, longest_dist_at_hop_1_plus: u32, @@ -207,6 +207,13 @@ impl<'a> CompLevelEstimatorState<'a> { wbits, ))); + // RandomVector candidate + candidates.push(Box::new(CandidateInfo::new( + add_policy, + HashAlgorithm::RandomVector, + wbits, + ))); + CompLevelEstimatorState { input, candidates, diff --git a/src/hash_algorithm.rs b/src/hash_algorithm.rs index c2894b6..2a2cf8e 100644 --- a/src/hash_algorithm.rs +++ b/src/hash_algorithm.rs @@ -1,4 +1,4 @@ -use crate::hash_chain::{HashChain, HashChainNormalize, HashChainNormalizeLibflate4}; +use crate::hash_chain::{HashChain, HashChainAbs, HashChainNormalize, HashChainNormalizeLibflate4}; #[derive(Debug, Copy, Clone, Eq, PartialEq, Default)] pub enum HashAlgorithm { @@ -207,7 +207,7 @@ const RANDOM_VECTOR: [u16; 768] = [ ]; impl HashImplementation for RandomVectorHash { - type HashChainType = HashChainNormalize; + type HashChainType = HashChainAbs; fn get_hash(&self, b: &[u8]) -> usize { (RANDOM_VECTOR[b[0] as usize] @@ -220,6 +220,6 @@ impl HashImplementation for RandomVectorHash { } fn new_hash_chain(self) -> Self::HashChainType { - crate::hash_chain::HashChainNormalize::::new(self) + crate::hash_chain::HashChainAbs::::new(self) } } diff --git a/src/hash_chain.rs b/src/hash_chain.rs index 53986c6..f79b82a 100644 --- a/src/hash_chain.rs +++ 
b/src/hash_chain.rs @@ -31,12 +31,22 @@ pub enum DictionaryAddPolicy { /// Add only the first and last substring of a match to the dictionary that are larger than the limit AddFirstAndLast(u16), } + +trait InternalPosition: Copy + Clone + Eq + PartialEq + Default + std::fmt::Debug { + fn saturating_sub(&self, other: u16) -> Self; + fn to_index(self) -> usize; + fn inc(&self) -> Self; + fn from_absolute(pos: u32, total_shift: i32) -> Self; + fn is_valid(&self) -> bool; + fn dist(&self, pos: Self) -> u32; +} + #[derive(Default, Copy, Clone, Eq, PartialEq, Debug)] -struct InternalPosition { +struct InternalPositionRel { pos: u16, } -impl InternalPosition { +impl InternalPosition for InternalPositionRel { fn saturating_sub(&self, other: u16) -> Self { Self { pos: self.pos.saturating_sub(other), @@ -61,16 +71,47 @@ impl InternalPosition { self.pos > 0 } - fn dist(&self, pos: InternalPosition) -> u32 { + fn dist(&self, pos: InternalPositionRel) -> u32 { + u32::from(self.pos - pos.pos) + } +} + +#[derive(Default, Copy, Clone, Eq, PartialEq, Debug)] +struct InternalPositionAbs { + pos: u32, +} + +impl InternalPosition for InternalPositionAbs { + fn saturating_sub(&self, _other: u16) -> Self { + unimplemented!() + } + + fn to_index(self) -> usize { + (self.pos & 0x7fff) as usize + } + + fn inc(&self) -> Self { + Self { pos: self.pos + 1 } + } + + fn from_absolute(pos: u32, _total_shift: i32) -> Self { + Self { pos } + } + + fn is_valid(&self) -> bool { + self.pos > 0 + } + + fn dist(&self, pos: Self) -> u32 { u32::from(self.pos - pos.pos) } } #[derive(DefaultBoxed)] -struct HashTable { +struct HashTable { /// Represents the head of the hash chain for a given hash value. In order /// to find additional matches, you follow the prev chain from the head. - head: [InternalPosition; 65536], + head: [I; 65536], /// Represents the number of following nodes in the chain for a given /// position. 
For example, if chainDepth[100] == 5, then there are 5 more @@ -90,25 +131,25 @@ struct HashTable { /// all the potential matches for a given hash. The value points to previous /// position in the chain, or 0 if there are no more matches. (We start /// with an offset of 8 to avoid confusion with the end of the chain) - prev: [InternalPosition; 65536], + prev: [I; 65536], /// hash function used to calculate the hash hash: H, } -impl HashTable { - fn get_head(&self, h: usize) -> InternalPosition { +impl HashTable { + fn get_head(&self, h: usize) -> I { self.head[h] } - fn get_node_depth(&self, node: InternalPosition) -> i32 { + fn get_node_depth(&self, node: I) -> i32 { self.chain_depth[node.to_index()] } fn update_chain( &mut self, chars: &[u8], - mut pos: InternalPosition, + mut pos: I, length: u32, ) { let offset = H::num_hash_bytes() as usize - 1; @@ -157,7 +198,7 @@ impl HashTable { } } - pub fn match_depth(&self, end_pos: InternalPosition, input: &PreflateInput) -> u32 { + pub fn match_depth(&self, end_pos: I, input: &PreflateInput) -> u32 { let h = self.hash.get_hash(input.cur_chars(0)); let head = self.get_head(h); @@ -195,38 +236,11 @@ pub trait HashChain { ) -> u32; fn checksum(&self, checksum: &mut DebugHash); - - fn update_hash_with_policy( - &mut self, - length: u32, - input: &PreflateInput, - add_policy: DictionaryAddPolicy, - ) { - match add_policy { - DictionaryAddPolicy::AddAll => { - self.update_hash::(length, input); - } - DictionaryAddPolicy::AddFirst(limit) => { - if length > limit.into() { - self.update_hash::(length, input); - } else { - self.update_hash::(length, input); - } - } - DictionaryAddPolicy::AddFirstAndLast(limit) => { - if length > limit.into() { - self.update_hash::(length, input); - } else { - self.update_hash::(length, input); - } - } - } - } } /// This hash chain algorithm periodically normalizes the hash table pub struct HashChainNormalize { - hash_table: Box>, + hash_table: Box>, total_shift: i32, } @@ -248,7 +262,7 @@ impl 
HashChainNormalize { impl HashChain for HashChainNormalize { fn iterate<'a>(&'a self, input: &PreflateInput, offset: u32) -> impl Iterator + 'a { - let ref_pos = InternalPosition::from_absolute(input.pos() + offset, self.total_shift); + let ref_pos = InternalPositionRel::from_absolute(input.pos() + offset, self.total_shift); // if we have a match that needs to be inserted at the head first before // we start walking the chain @@ -311,7 +325,7 @@ impl HashChain for HashChainNormalize { } let end_pos = - InternalPosition::from_absolute(cur_pos - target_reference.dist(), self.total_shift); + InternalPositionRel::from_absolute(cur_pos - target_reference.dist(), self.total_shift); self.hash_table.match_depth(end_pos, input) } @@ -341,7 +355,7 @@ impl HashChain for HashChainNormalize { self.total_shift += DELTA as i32; } - let pos = InternalPosition::from_absolute(input.pos(), self.total_shift); + let pos = InternalPositionRel::from_absolute(input.pos(), self.total_shift); let chars = input.cur_chars(0); self.hash_table @@ -352,8 +366,8 @@ impl HashChain for HashChainNormalize { /// implementation of the hash chain that uses the libdeflate rotating hash. /// This consists of two hash tables, one for length 3 and one for length 4. 
pub struct HashChainNormalizeLibflate4 { - hash_table: Box>, - hash_table_3: Box>, + hash_table: Box>, + hash_table_3: Box>, total_shift: i32, } @@ -372,7 +386,7 @@ impl HashChainNormalizeLibflate4 { impl HashChain for HashChainNormalizeLibflate4 { fn iterate<'a>(&'a self, input: &PreflateInput, offset: u32) -> impl Iterator + 'a { - let ref_pos = InternalPosition::from_absolute(input.pos() + offset, self.total_shift); + let ref_pos = InternalPositionRel::from_absolute(input.pos() + offset, self.total_shift); // if we have a match that needs to be inserted at the head first before // we start walking the chain @@ -444,7 +458,7 @@ impl HashChain for HashChainNormalizeLibflate4 { } let end_pos = - InternalPosition::from_absolute(cur_pos - target_reference.dist(), self.total_shift); + InternalPositionRel::from_absolute(cur_pos - target_reference.dist(), self.total_shift); if target_reference.len() == 3 { // libdeflate uses the 3 byte hash table only for a single match attempt @@ -494,7 +508,7 @@ impl HashChain for HashChainNormalizeLibflate4 { self.total_shift += DELTA as i32; } - let pos = InternalPosition::from_absolute(input.pos(), self.total_shift); + let pos = InternalPositionRel::from_absolute(input.pos(), self.total_shift); let chars = input.cur_chars(0); self.hash_table @@ -630,3 +644,120 @@ impl HashChain for HashChainAbs { } } */ + +/// This hash chain algorithm stores absolute positions and, unlike HashChainNormalize, does not periodically normalize the hash table +pub struct HashChainAbs { + hash_table: Box>, + total_shift: i32, +} + +impl HashChainAbs { + pub fn new(hash: H) -> Self { + // Important: total_shift starts at -8 since 0 indicates the end of the hash chain + // so this means that all valid values will be >= 8, otherwise the very first hash + // offset would be zero and so it would get missed + let mut c = HashChainAbs { + total_shift: -8, + hash_table: HashTable::default_boxed(), + }; + + c.hash_table.hash = hash; + + c + } +} + +impl HashChain for HashChainAbs { + fn iterate<'a>(&'a self, input: 
&PreflateInput, offset: u32) -> impl Iterator + 'a { + let ref_pos = InternalPositionAbs::from_absolute(input.pos() + offset, self.total_shift); + + // if we have a match that needs to be inserted at the head first before + // we start walking the chain + let mut first_match = None; + + let h1 = self.hash_table.hash.get_hash(input.cur_chars(0)); + + let curr_hash; + + if offset == 0 { + curr_hash = h1; + } else { + assert_eq!(offset, 1); + + // current hash is the next hash since we are starting at offset 1 + curr_hash = self.hash_table.hash.get_hash(input.cur_chars(1)); + + // if we are a lazy match, then we haven't added the last byte to the hash yet + // which is a problem if that hash should have been part of this hash chain + // (ie the same hash chain) and we have a limited number of enumerations + // through the hash chain. + // + // In order to fix this, we see if the hashes are the same, and then add + // a distance 1 item to the iterator that we return. + if h1 == curr_hash { + first_match = Some(1); + } + } + + let mut cur_pos = self.hash_table.get_head(curr_hash); + + std::iter::from_fn(move || { + if let Some(d) = first_match { + first_match = None; + Some(d) + } else { + if cur_pos.is_valid() { + let d = ref_pos.dist(cur_pos); + cur_pos = self.hash_table.prev[cur_pos.to_index()]; + Some(d) + } else { + None + } + } + }) + } + + fn match_depth( + &self, + target_reference: &PreflateTokenReference, + window_size: u32, + input: &PreflateInput, + ) -> u32 { + let cur_pos = input.pos(); + let cur_max_dist = std::cmp::min(cur_pos, window_size); + + if target_reference.dist() > cur_max_dist { + //println!("dtl {:?} > {}", target_reference, cur_max_dist); + return 0xffff; + } + + let end_pos = + InternalPositionAbs::from_absolute(cur_pos - target_reference.dist(), self.total_shift); + + self.hash_table.match_depth(end_pos, input) + } + + #[allow(dead_code)] + fn checksum(&self, checksum: &mut DebugHash) { + checksum.update_slice(&self.hash_table.chain_depth); + 
//checksum.update_slice(&self.hash_table.head); + //checksum.update_slice(&self.hash_table.prev); + //checksum.update(self.hash_shift); + //checksum.update(self.running_hash.hash(self.hash_mask)); + //checksum.update(self.total_shift); + } + + fn update_hash( + &mut self, + length: u32, + input: &PreflateInput, + ) { + assert!(length <= MAX_UPDATE_HASH_BATCH); + + let pos = InternalPositionAbs::from_absolute(input.pos(), self.total_shift); + let chars = input.cur_chars(0); + + self.hash_table + .update_chain::(chars, pos, length); + } +} diff --git a/src/hash_chain_holder.rs b/src/hash_chain_holder.rs index 70c81e0..0584135 100644 --- a/src/hash_chain_holder.rs +++ b/src/hash_chain_holder.rs @@ -9,7 +9,10 @@ use crate::hash_algorithm::{ HashAlgorithm, HashImplementation, LibdeflateRotatingHash4, MiniZHash, RandomVectorHash, ZlibNGHash, ZlibRotatingHash, }; -use crate::hash_chain::{DictionaryAddPolicy, HashChain, MAX_UPDATE_HASH_BATCH}; +use crate::hash_chain::{ + DictionaryAddPolicy, HashChain, MAX_UPDATE_HASH_BATCH, UPDATE_MODE_ALL, UPDATE_MODE_FIRST, + UPDATE_MODE_FIRST_AND_LAST, +}; use crate::preflate_constants::{MAX_MATCH, MIN_LOOKAHEAD, MIN_MATCH}; use crate::preflate_input::PreflateInput; use crate::preflate_parameter_estimator::PreflateStrategy; @@ -26,22 +29,16 @@ pub enum MatchResult { MaxChainExceeded(u32), } -struct HashChainHolder { - hash: H::HashChainType, - params: TokenPredictorParameters, - window_bytes: u32, -} - /// Factory function to create a new HashChainHolder based on the parameters and returns /// a boxed trait object. 
The reason for this is that this lets the compiler optimize the -pub fn new_hash_chain_holder(params: &TokenPredictorParameters) -> Box { - let predictor_state: Box; +pub fn new_hash_chain_holder(params: &TokenPredictorParameters) -> Box { + let predictor_state: Box; match params.hash_algorithm { HashAlgorithm::Zlib { hash_mask, hash_shift, } => { - predictor_state = Box::new(HashChainHolder::new( + predictor_state = Box::new(HashChainHolderImpl::new( params, ZlibRotatingHash { hash_mask, @@ -50,24 +47,24 @@ pub fn new_hash_chain_holder(params: &TokenPredictorParameters) -> Box { - predictor_state = Box::new(HashChainHolder::new(params, MiniZHash {})) + predictor_state = Box::new(HashChainHolderImpl::new(params, MiniZHash {})) } HashAlgorithm::Libdeflate4 => { - predictor_state = Box::new(HashChainHolder::new(params, LibdeflateRotatingHash4 {})) + predictor_state = Box::new(HashChainHolderImpl::new(params, LibdeflateRotatingHash4 {})) } HashAlgorithm::ZlibNG => { - predictor_state = Box::new(HashChainHolder::new(params, ZlibNGHash {})) + predictor_state = Box::new(HashChainHolderImpl::new(params, ZlibNGHash {})) } HashAlgorithm::RandomVector => { - predictor_state = Box::new(HashChainHolder::new(params, RandomVectorHash {})) + predictor_state = Box::new(HashChainHolderImpl::new(params, RandomVectorHash {})) } } predictor_state } /// trait that is not dependent on the HashImplementation so it can -/// be used in a boxed type by the TokenPredictor -pub trait HashChainHolderTrait { +/// be used in a concrete boxed type by the TokenPredictor +pub trait HashChainHolder { /// updates the hash dictionary for a given length of matches. /// /// If this is a literal, then the update policy is to add all the bytes to the dictionary. 
@@ -118,46 +115,36 @@ pub trait HashChainHolderTrait { fn checksum(&self, checksum: &mut DebugHash); } -impl HashChainHolderTrait for HashChainHolder { - fn update_hash(&mut self, mut length: u32, input: &mut PreflateInput, is_literal: bool) { - while length > 0 { - let batch_len = cmp::min(length, MAX_UPDATE_HASH_BATCH); +/// implementation of HashChainHolder that depends on the type of hash implementation +struct HashChainHolderImpl { + hash: H::HashChainType, + params: TokenPredictorParameters, + window_bytes: u32, +} - self.hash.update_hash_with_policy::( - batch_len, - input, - if is_literal { - DictionaryAddPolicy::AddAll - } else { - self.params.add_policy - }, - ); - input.advance(batch_len); - length -= batch_len; - } +impl HashChainHolder for HashChainHolderImpl { + fn update_hash(&mut self, length: u32, input: &mut PreflateInput, is_literal: bool) { + self.update_hash_with_policy::( + length, + input, + if is_literal { + DictionaryAddPolicy::AddAll + } else { + self.params.add_policy + }, + ); } - fn update_hash_with_depth( - &mut self, - mut length: u32, - input: &mut PreflateInput, - is_literal: bool, - ) { - while length > 0 { - let batch_len = cmp::min(length, MAX_UPDATE_HASH_BATCH); - - self.hash.update_hash_with_policy::( - batch_len, - input, - if is_literal { - DictionaryAddPolicy::AddAll - } else { - self.params.add_policy - }, - ); - input.advance(batch_len); - length -= batch_len; - } + fn update_hash_with_depth(&mut self, length: u32, input: &mut PreflateInput, is_literal: bool) { + self.update_hash_with_policy::( + length, + input, + if is_literal { + DictionaryAddPolicy::AddAll + } else { + self.params.add_policy + }, + ); } fn match_depth( @@ -369,7 +356,7 @@ impl HashChainHolderTrait for HashChainHolder { } } -impl HashChainHolder { +impl HashChainHolderImpl { pub fn new(params: &TokenPredictorParameters, hash: H) -> Self { Self { hash: hash.new_hash_chain(), @@ -378,6 +365,49 @@ impl HashChainHolder { } } + fn update_hash_with_policy( + &mut self, + 
mut length: u32, + input: &mut PreflateInput, + add_policy: DictionaryAddPolicy, + ) { + while length > 0 { + let batch_len = cmp::min(length, MAX_UPDATE_HASH_BATCH); + + match add_policy { + DictionaryAddPolicy::AddAll => { + self.hash + .update_hash::(batch_len, input); + } + DictionaryAddPolicy::AddFirst(limit) => { + debug_assert_eq!(batch_len, length); + if length > limit.into() { + self.hash + .update_hash::(batch_len, input); + } else { + self.hash + .update_hash::(batch_len, input); + } + } + DictionaryAddPolicy::AddFirstAndLast(limit) => { + debug_assert_eq!(batch_len, length); + if length > limit.into() { + self.hash + .update_hash::( + batch_len, input, + ); + } else { + self.hash + .update_hash::(batch_len, input); + } + } + } + + input.advance(batch_len); + length -= batch_len; + } + } + fn prefix_compare(s1: &[u8], s2: &[u8], best_len: u32, max_len: u32) -> u32 { assert!(max_len >= 3 && s1.len() >= max_len as usize && s2.len() >= max_len as usize); diff --git a/src/preflate_parameter_estimator.rs b/src/preflate_parameter_estimator.rs index 14520b5..17d8bc1 100644 --- a/src/preflate_parameter_estimator.rs +++ b/src/preflate_parameter_estimator.rs @@ -122,6 +122,7 @@ impl PreflateParameters { HASH_ALGORITHM_MINIZ_FAST => HashAlgorithm::MiniZFast, HASH_ALGORITHM_LIBDEFLATE4 => HashAlgorithm::Libdeflate4, HASH_ALGORITHM_ZLIBNG => HashAlgorithm::ZlibNG, + HASH_ALGORITHM_RANDOMVECTOR => HashAlgorithm::RandomVector, _ => panic!("invalid hash algorithm"), }, }, diff --git a/src/process.rs b/src/process.rs index e77ad99..beb1839 100644 --- a/src/process.rs +++ b/src/process.rs @@ -368,6 +368,7 @@ fn verify_longmatch() { fn test_treepngdeflate() { use crate::hash_algorithm::{HashImplementation, RandomVectorHash}; use crate::hash_chain::HashChain; + use crate::hash_chain::UPDATE_MODE_ALL; let compressed_data: &[u8] = &read_file("treepng.deflate"); @@ -389,11 +390,7 @@ fn test_treepngdeflate() { let t = &b.tokens[i]; match t { 
crate::preflate_token::PreflateToken::Literal => { - chain.update_hash_with_policy::( - 1, - &input, - crate::hash_chain::DictionaryAddPolicy::AddAll, - ); + chain.update_hash::(1, &input); input.advance(1); } crate::preflate_token::PreflateToken::Reference(r) => { @@ -413,11 +410,8 @@ fn test_treepngdeflate() { return; } - chain.update_hash_with_policy::( - r.len(), - &input, - crate::hash_chain::DictionaryAddPolicy::AddAll, - ); + chain.update_hash::(r.len(), &input); + input.advance(r.len()); } } @@ -433,6 +427,12 @@ fn test_tree_paintnet() { do_analyze(None, &read_file("tree.paintnet.deflate"), true); } +#[test] +#[ignore = "doesn't work yet due to excessive hash chain length"] +fn test_tree_treepng() { + do_analyze(None, &read_file("treepng.deflate"), true); +} + // test binary deflate generated by MS Office #[test] fn verify_docx() { diff --git a/src/token_predictor.rs b/src/token_predictor.rs index baba4b4..5dc65f8 100644 --- a/src/token_predictor.rs +++ b/src/token_predictor.rs @@ -11,7 +11,7 @@ use crate::{ cabac_codec::{decode_difference, encode_difference}, hash_algorithm::HashAlgorithm, hash_chain::DictionaryAddPolicy, - hash_chain_holder::{new_hash_chain_holder, HashChainHolderTrait, MatchResult}, + hash_chain_holder::{new_hash_chain_holder, HashChainHolder, MatchResult}, preflate_constants::MIN_MATCH, preflate_input::PreflateInput, preflate_parameter_estimator::PreflateStrategy, @@ -24,7 +24,7 @@ use crate::{ const VERIFY: bool = false; pub struct TokenPredictor<'a> { - state: Box, + state: Box, params: TokenPredictorParameters, pending_reference: Option, current_token_count: u32,