From 01dddc6a7f7a9705335c269e4f8ab02d3561bddc Mon Sep 17 00:00:00 2001 From: Chielo Newctle Date: Wed, 27 Mar 2024 10:09:12 +0800 Subject: [PATCH] feat!: api adjustment --- README.md | 29 +++++----- benches/tokenize.rs | 8 +-- src/lib.rs | 34 +++++++----- src/sam/mod.rs | 84 ++++++++++++++-------------- src/sam/state.rs | 92 ++++++++++++++----------------- src/table.rs | 119 +++++++++++++++++++++------------------- src/tests/mod.rs | 70 +++++++++++------------ src/tests/trie.rs | 37 +++++++------ src/tests/utils.rs | 30 +++++----- src/trie.rs | 51 ++++++++++------- src/trie_alike.rs | 11 ++-- src/utils/suffixwise.rs | 14 ++--- src/utils/tokenize.rs | 54 +++++++++--------- 13 files changed, 326 insertions(+), 307 deletions(-) diff --git a/README.md b/README.md index f42bf7c..0b632bf 100644 --- a/README.md +++ b/README.md @@ -37,9 +37,9 @@ flowchart LR ## Examples ```rust -use general_sam::{GeneralSAM, BTreeTransTable}; +use general_sam::{GeneralSam, BTreeTransTable}; -let sam = GeneralSAM::>::from_bytes("abcbc"); +let sam = GeneralSam::>::from_bytes("abcbc"); // "cbc" is a suffix of "abcbc" assert!(sam.get_root_state().feed_bytes("cbc").is_accepting()); @@ -49,38 +49,38 @@ assert!(!sam.get_root_state().feed_bytes("bcb").is_accepting()); ``` ```rust -use general_sam::{GeneralSAM, BTreeTransTable}; +use general_sam::{GeneralSam, BTreeTransTable}; -let sam = GeneralSAM::>::from_chars("abcbc".chars()); +let sam = GeneralSam::>::from_chars("abcbc"); -let state = sam.get_root_state(); +let mut state = sam.get_root_state(); // "b" is not a suffix but at least a substring of "abcbc" -let state = state.feed_chars("b"); +state.feed_chars("b"); assert!(!state.is_accepting()); // "bc" is a suffix of "abcbc" -let state = state.feed_chars("c"); +state.feed_chars("c"); assert!(state.is_accepting()); // "bcbc" is a suffix of "abcbc" -let state = state.feed_chars("bc"); +state.feed_chars("bc"); assert!(state.is_accepting()); // "bcbcbc" is not a substring, much less a suffix of "abcbc" -let state = state.feed_chars("bc"); +state.feed_chars("bc"); assert!(!state.is_accepting() && state.is_nil()); ``` ```rust -// NOTE: This example requires the `trie` feature. -use general_sam::{GeneralSAM, Trie, BTreeTransTable}; +# #[cfg(feature = "trie")] { +use general_sam::{GeneralSam, Trie, BTreeTransTable}; let mut trie = Trie::>::default(); -trie.insert_iter("hello".chars()); -trie.insert_iter("Chielo".chars()); +trie.insert("hello".chars()); +trie.insert("Chielo".chars()); -let sam = GeneralSAM::>::from_trie(trie.get_root_state()); +let sam = GeneralSam::>::from_trie(trie.get_root_state()); assert!(sam.get_root_state().feed_chars("lo").is_accepting()); assert!(sam.get_root_state().feed_chars("ello").is_accepting()); @@ -91,6 +91,7 @@ assert!(!sam.get_root_state().feed_chars("el").is_nil()); assert!(!sam.get_root_state().feed_chars("bye").is_accepting()); assert!(sam.get_root_state().feed_chars("bye").is_nil()); +# } ``` ## References diff --git a/benches/tokenize.rs b/benches/tokenize.rs index e72c1dd..0b62b58 100644 --- a/benches/tokenize.rs +++ b/benches/tokenize.rs @@ -4,7 +4,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; use general_sam::{ table::{BoxBisectTable, HashTransTable, VecBisectTable}, tokenize::{trie::greedy_tokenize_with_trie, GreedyTokenizer}, - BTreeTransTable, GeneralSAM, TransitionTable, Trie, + BTreeTransTable, GeneralSam, TransitionTable, Trie, }; use rand::{ distributions::{Alphanumeric, DistString}, @@ -128,7 +128,7 @@ fn tokenize_with_hf(tokenizer: &HFTokenizer, seq: &str) -> Vec { } fn tokenize_with_sam>( - tokenizer: &GreedyTokenizer>, + tokenizer: &GreedyTokenizer>, seq: &str, ) -> Vec { tokenizer @@ -153,7 +153,7 @@ fn build_trie>(vocab: &Vocab) -> (Trie, Ve let mut trie = Trie::>::default(); let mut trie_id_and_token_id = Vec::new(); for (k, v) in vocab.iter() { - let node_id = trie.insert_iter(k.chars()); + let node_id = trie.insert_chars(k); trie_id_and_token_id.push((node_id, *v)); } let mut trie_to_token = vec![0; trie.num_of_nodes()]; @@ -178,7 +178,7 @@ fn criterion_benchmark>(c: &mut Crit println!("building trie..."); let (trie, trie_to_token) = build_trie::(&vocab); println!("building sam..."); - let sam = GeneralSAM::>::from_trie(trie.get_root_state()) + let sam = GeneralSam::>::from_trie(trie.get_root_state()) .alter_trans_table_into::(); println!("building greedy tokenizer..."); let tokenizer = diff --git a/src/lib.rs b/src/lib.rs index e360c16..d369a26 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,9 +27,9 @@ //! # Examples //! //! ```rust -//! use general_sam::{GeneralSAM, BTreeTransTable}; +//! use general_sam::{GeneralSam, BTreeTransTable}; //! -//! let sam = GeneralSAM::>::from_bytes("abcbc"); +//! let sam = GeneralSam::>::from_bytes("abcbc"); //! //! // "cbc" is a suffix of "abcbc" //! assert!(sam.get_root_state().feed_bytes("cbc").is_accepting()); @@ -39,38 +39,38 @@ //! ``` //! //! ```rust -//! use general_sam::{GeneralSAM, BTreeTransTable}; +//! use general_sam::{GeneralSam, BTreeTransTable}; //! -//! let sam = GeneralSAM::>::from_chars("abcbc".chars()); +//! let sam = GeneralSam::>::from_chars("abcbc"); //! -//! let state = sam.get_root_state(); +//! let mut state = sam.get_root_state(); //! //! // "b" is not a suffix but at least a substring of "abcbc" -//! let state = state.feed_chars("b"); +//! state.feed_chars("b"); //! assert!(!state.is_accepting()); //! //! // "bc" is a suffix of "abcbc" -//! let state = state.feed_chars("c"); +//! state.feed_chars("c"); //! assert!(state.is_accepting()); //! //! // "bcbc" is a suffix of "abcbc" -//! let state = state.feed_chars("bc"); +//! state.feed_chars("bc"); //! assert!(state.is_accepting()); //! //! // "bcbcbc" is not a substring, much less a suffix of "abcbc" -//! let state = state.feed_chars("bc"); +//! state.feed_chars("bc"); //! assert!(!state.is_accepting() && state.is_nil()); //! ``` //! //! ```rust //! # #[cfg(feature = "trie")] { -//! use general_sam::{GeneralSAM, Trie, BTreeTransTable}; +//! use general_sam::{GeneralSam, Trie, BTreeTransTable}; //! //! let mut trie = Trie::>::default(); -//! trie.insert_iter("hello".chars()); -//! trie.insert_iter("Chielo".chars()); +//! trie.insert("hello".chars()); +//! trie.insert("Chielo".chars()); //! -//! let sam = GeneralSAM::>::from_trie(trie.get_root_state()); +//! let sam = GeneralSam::>::from_trie(trie.get_root_state()); //! //! assert!(sam.get_root_state().feed_chars("lo").is_accepting()); //! assert!(sam.get_root_state().feed_chars("ello").is_accepting()); @@ -101,7 +101,7 @@ pub mod trie_alike; pub use { sam::{ - GeneralSAM, GeneralSAMNode, GeneralSAMNodeID, GeneralSAMState, SAM_NIL_NODE_ID, + GeneralSam, GeneralSamNode, GeneralSamNodeID, GeneralSamState, SAM_NIL_NODE_ID, SAM_ROOT_NODE_ID, }, table::{ @@ -127,3 +127,9 @@ pub use utils::{rope, suffixwise, tokenize, tokenize::GreedyTokenizer}; #[cfg(test)] mod tests; + +#[cfg(doctest)] +mod _doctest_readme { + #[doc = include_str!("../README.md")] + struct ReadMe; +} diff --git a/src/sam/mod.rs b/src/sam/mod.rs index bf290a0..70b0424 100644 --- a/src/sam/mod.rs +++ b/src/sam/mod.rs @@ -1,7 +1,7 @@ //! A general suffix automaton implementation. mod state; -pub use state::GeneralSAMState; +pub use state::GeneralSamState; use std::convert::Infallible; @@ -9,27 +9,27 @@ use crate::{ ConstructiveTransitionTable, IterAsChain, TransitionTable, TravelEvent, TrieNodeAlike, }; -pub type GeneralSAMNodeID = usize; -pub const SAM_NIL_NODE_ID: GeneralSAMNodeID = 0; -pub const SAM_ROOT_NODE_ID: GeneralSAMNodeID = 1; +pub type GeneralSamNodeID = usize; +pub const SAM_NIL_NODE_ID: GeneralSamNodeID = 0; +pub const SAM_ROOT_NODE_ID: GeneralSamNodeID = 1; #[derive(Clone, Debug)] -pub struct GeneralSAMNode { +pub struct GeneralSamNode { trans: TransTable, accept: bool, len: usize, - link: GeneralSAMNodeID, + link: GeneralSamNodeID, } /// A general suffix automaton. #[derive(Clone, Debug)] -pub struct GeneralSAM { - node_pool: Vec>, - topo_and_suf_len_sorted_order: Vec, +pub struct GeneralSam { + node_pool: Vec>, + topo_and_suf_len_sorted_order: Vec, } -impl GeneralSAMNode { - fn new(accept: bool, len: usize, link: GeneralSAMNodeID) -> Self { +impl GeneralSamNode { + fn new(accept: bool, len: usize, link: GeneralSamNodeID) -> Self { Self { trans: Default::default(), accept, @@ -39,7 +39,7 @@ impl GeneralSAMNode { } } -impl GeneralSAMNode { +impl GeneralSamNode { pub fn is_accepting(&self) -> bool { self.accept } @@ -48,7 +48,7 @@ impl GeneralSAMNode { self.len } - pub fn get_suffix_parent_id(&self) -> GeneralSAMNodeID { + pub fn get_suffix_parent_id(&self) -> GeneralSamNodeID { self.link } @@ -58,8 +58,8 @@ impl GeneralSAMNode { fn alter_trans_table>( &self, - ) -> GeneralSAMNode { - GeneralSAMNode { + ) -> GeneralSamNode { + GeneralSamNode { trans: NewTableType::from_kv_iter(self.trans.iter()), accept: self.accept, len: self.len, @@ -68,78 +68,78 @@ impl GeneralSAMNode { } } -impl> GeneralSAM { +impl> GeneralSam { pub fn from_bytes>(s: S) -> Self { let iter = IterAsChain::from(s.as_ref().iter().copied()); Self::from_trie(iter) } } -impl> GeneralSAM { +impl> GeneralSam { pub fn from_utf32>(s: S) -> Self { let iter = IterAsChain::from(s.as_ref().iter().copied()); Self::from_trie(iter) } } -impl> GeneralSAM { - pub fn from_chars>(s: S) -> Self { - let iter = IterAsChain::from(s); +impl> GeneralSam { + pub fn from_chars>(s: S) -> Self { + let iter = IterAsChain::from(s.as_ref().chars()); Self::from_trie(iter) } } -impl Default for GeneralSAM { +impl Default for GeneralSam { fn default() -> Self { Self { node_pool: vec![ - GeneralSAMNode::new(false, 0, SAM_NIL_NODE_ID), - GeneralSAMNode::new(true, 0, SAM_NIL_NODE_ID), + GeneralSamNode::new(false, 0, SAM_NIL_NODE_ID), + GeneralSamNode::new(true, 0, SAM_NIL_NODE_ID), ], topo_and_suf_len_sorted_order: Default::default(), } } } -impl GeneralSAM { +impl GeneralSam { pub fn num_of_nodes(&self) -> usize { self.node_pool.len() } - pub fn get_root_node(&self) -> &GeneralSAMNode { + pub fn get_root_node(&self) -> &GeneralSamNode { self.get_node(SAM_ROOT_NODE_ID).unwrap() } - pub fn get_node(&self, node_id: GeneralSAMNodeID) -> Option<&GeneralSAMNode> { + pub fn get_node(&self, node_id: GeneralSamNodeID) -> Option<&GeneralSamNode> { self.node_pool.get(node_id) } - pub fn get_root_state(&self) -> GeneralSAMState> { + pub fn get_root_state(&self) -> GeneralSamState> { self.get_state(SAM_ROOT_NODE_ID) } pub fn get_state( &self, - node_id: GeneralSAMNodeID, - ) -> GeneralSAMState> { + node_id: GeneralSamNodeID, + ) -> GeneralSamState> { if node_id < self.node_pool.len() { - GeneralSAMState::new(self, node_id) + GeneralSamState::new(self, node_id) } else { - GeneralSAMState::new(self, SAM_NIL_NODE_ID) + GeneralSamState::new(self, SAM_NIL_NODE_ID) } } /// Returns topological sorted, maximum suffix length sorted /// and suffix parent depth sorted node id sequence, /// which is generated by topological sorting with a queue. - pub fn get_topo_and_suf_len_sorted_node_ids(&self) -> &Vec { + pub fn get_topo_and_suf_len_sorted_node_ids(&self) -> &Vec { &self.topo_and_suf_len_sorted_order } pub fn alter_trans_table>( &self, - ) -> GeneralSAM { - GeneralSAM { + ) -> GeneralSam { + GeneralSam { node_pool: self .node_pool .iter() @@ -151,8 +151,8 @@ impl GeneralSAM { pub fn alter_trans_table_into>( self, - ) -> GeneralSAM { - GeneralSAM { + ) -> GeneralSam { + GeneralSam { node_pool: self .node_pool .iter() @@ -163,7 +163,7 @@ impl GeneralSAM { } } -impl GeneralSAM { +impl GeneralSam { pub fn from_trie(node: TN) -> Self where TN::InnerType: Into, @@ -185,7 +185,7 @@ impl GeneralSAM { where TN::InnerType: Into, { - node.bfs_travel(|event| -> Result { + node.bfs_travel(|event| -> Result { match event { TravelEvent::PushRoot(_) => Ok(SAM_ROOT_NODE_ID), TravelEvent::Push(cur_tn, cur_node_id, key) => { @@ -235,7 +235,7 @@ impl GeneralSAM { self.node_pool[SAM_NIL_NODE_ID].accept = false; } - fn alloc_node(&mut self, node: GeneralSAMNode) -> GeneralSAMNodeID { + fn alloc_node(&mut self, node: GeneralSamNode) -> GeneralSamNodeID { let id = self.node_pool.len(); self.node_pool.push(node); id @@ -243,15 +243,15 @@ impl GeneralSAM { fn insert_node_trans>( &mut self, - last_node_id: GeneralSAMNodeID, + last_node_id: GeneralSamNodeID, key: Key, accept: bool, - ) -> GeneralSAMNodeID { + ) -> GeneralSamNodeID { let key: TransTable::KeyType = key.into(); let new_node_id = { let last_node = &self.node_pool[last_node_id]; - self.alloc_node(GeneralSAMNode::new( + self.alloc_node(GeneralSamNode::new( accept, last_node.len + 1, SAM_NIL_NODE_ID, diff --git a/src/sam/state.rs b/src/sam/state.rs index a74fe0e..8e5db4b 100644 --- a/src/sam/state.rs +++ b/src/sam/state.rs @@ -4,17 +4,17 @@ use std::{borrow::Borrow, marker::PhantomData}; use crate::{TravelEvent, TrieNodeAlike}; -use super::{GeneralSAM, GeneralSAMNode, TransitionTable, SAM_NIL_NODE_ID, SAM_ROOT_NODE_ID}; +use super::{GeneralSam, GeneralSamNode, TransitionTable, SAM_NIL_NODE_ID, SAM_ROOT_NODE_ID}; #[derive(Debug)] -pub struct GeneralSAMState>> { - pub sam: SAMRef, +pub struct GeneralSamState>> { + pub sam: SamRef, pub node_id: usize, phantom: PhantomData, } -impl> + Clone> Clone - for GeneralSAMState +impl> + Clone> Clone + for GeneralSamState { fn clone(&self) -> Self { Self { @@ -25,26 +25,26 @@ impl> + Clone } } -impl, SAMRef: Borrow>> - GeneralSAMState +impl, SamRef: Borrow>> + GeneralSamState { - pub fn feed_bytes(self, seq: &str) -> Self { - self.feed_ref(seq.as_bytes()) + pub fn feed_bytes>(&mut self, seq: S) -> &mut Self { + self.feed_ref(seq.as_ref()) } } -impl, SAMRef: Borrow>> - GeneralSAMState +impl, SamRef: Borrow>> + GeneralSamState { - pub fn feed_chars(self, seq: &str) -> Self { - self.feed(seq.chars()) + pub fn feed_chars>(&mut self, seq: S) -> &mut Self { + self.feed(seq.as_ref().chars()) } } -impl>> - GeneralSAMState +impl>> + GeneralSamState { - pub fn new(sam: SAMRef, node_id: usize) -> Self { + pub fn new(sam: SamRef, node_id: usize) -> Self { Self { sam, node_id, @@ -52,8 +52,8 @@ impl>> } } - pub fn inner_as_ref(&self) -> GeneralSAMState> { - GeneralSAMState { + pub fn inner_as_ref(&self) -> GeneralSamState> { + GeneralSamState { sam: self.sam.borrow(), node_id: self.node_id, phantom: PhantomData, @@ -74,71 +74,63 @@ impl>> .unwrap_or(false) } - pub fn get_sam_ref(&self) -> &GeneralSAM { + pub fn get_sam_ref(&self) -> &GeneralSam { self.sam.borrow() } - pub fn get_node(&self) -> Option<&GeneralSAMNode> { + pub fn get_node(&self) -> Option<&GeneralSamNode> { self.sam.borrow().get_node(self.node_id) } - pub fn goto_suffix_parent(&mut self) { + pub fn goto_suffix_parent(&mut self) -> &mut Self { if let Some(node) = self.get_node() { self.node_id = node.link; } else { self.node_id = SAM_NIL_NODE_ID; } + self } - pub fn goto(&mut self, t: &TransTable::KeyType) { - self.node_id = - if let Some(next_node_id) = self.get_node().and_then(|node| node.trans.get(t)) { - *next_node_id - } else { - SAM_NIL_NODE_ID - } - } - - pub fn feed>(self, seq: Seq) -> Self { - self.feed_iter(seq.into_iter()) + pub fn goto>(&mut self, t: &K) -> &mut Self { + self.node_id = if let Some(next_node_id) = + self.get_node().and_then(|node| node.trans.get(t.borrow())) + { + *next_node_id + } else { + SAM_NIL_NODE_ID + }; + self } - pub fn feed_iter>(mut self, iter: Iter) -> Self { - for t in iter { + pub fn feed>(&mut self, seq: Seq) -> &mut Self { + for t in seq { if self.is_nil() { break; } - self.goto(&t) + self.goto(&t); } self } - pub fn feed_ref<'s, Seq: IntoIterator>(self, seq: Seq) -> Self - where - ::KeyType: 's, - { - self.feed_ref_iter(seq.into_iter()) - } - - pub fn feed_ref_iter<'s, Iter: Iterator>( - mut self, - iter: Iter, - ) -> Self + pub fn feed_ref<'s, Seq: IntoIterator>( + &mut self, + seq: Seq, + ) -> &mut Self where ::KeyType: 's, { - for t in iter { + for t in seq { if self.is_nil() { break; } - self.goto(t) + self.goto(t); } self } } -impl> + Clone> - GeneralSAMState +impl> + Clone> + GeneralSamState { pub fn get_non_nil_trans(&self, key: &TransTable::KeyType) -> Option { self.get_node() diff --git a/src/table.rs b/src/table.rs index 491fd85..8ae2eed 100644 --- a/src/table.rs +++ b/src/table.rs @@ -6,21 +6,21 @@ use std::{ marker::PhantomData, }; -use crate::GeneralSAMNodeID; +use crate::GeneralSamNodeID; #[derive(Clone, Debug)] pub struct WithKeyDerefedIter< 'a, KeyType: 'a + Clone, - IterType: Iterator, + IterType: Iterator, > { inner: IterType, } -impl<'a, KeyType: 'a + Clone, IterType: Iterator> +impl<'a, KeyType: 'a + Clone, IterType: Iterator> Iterator for WithKeyDerefedIter<'a, KeyType, IterType> { - type Item = (KeyType, &'a GeneralSAMNodeID); + type Item = (KeyType, &'a GeneralSamNodeID); fn next(&mut self) -> Option { self.inner.next().map(|x| (x.0.clone(), x.1)) @@ -31,15 +31,15 @@ impl<'a, KeyType: 'a + Clone, IterType: Iterator, + IterType: Iterator, > { inner: IterType, } -impl<'a, KeyType: 'a, IterType: Iterator> Iterator +impl<'a, KeyType: 'a, IterType: Iterator> Iterator for TransitionIter<'a, KeyType, IterType> { - type Item = &'a GeneralSAMNodeID; + type Item = &'a GeneralSamNodeID; fn next(&mut self) -> Option { self.inner.next().map(|x| x.1) @@ -48,18 +48,18 @@ impl<'a, KeyType: 'a, IterType: Iterator pub trait TransitionTable { type KeyType: Clone; - type IterType<'a>: Iterator + type IterType<'a>: Iterator where Self: 'a, Self::KeyType: 'a; - fn from_kv_iter<'b, Iter: Iterator>( + fn from_kv_iter<'b, Iter: IntoIterator>( iter: Iter, ) -> Self where Self::KeyType: 'b; - fn get(&self, key: &Self::KeyType) -> Option<&GeneralSAMNodeID>; - fn get_mut(&mut self, key: &Self::KeyType) -> Option<&mut GeneralSAMNodeID>; + fn get(&self, key: &Self::KeyType) -> Option<&GeneralSamNodeID>; + fn get_mut(&mut self, key: &Self::KeyType) -> Option<&mut GeneralSamNodeID>; fn iter(&self) -> Self::IterType<'_>; fn contains_key(&self, key: &Self::KeyType) -> bool { @@ -72,9 +72,9 @@ pub trait TransitionTable { } pub trait ConstructiveTransitionTable: TransitionTable + Clone + Default { - fn insert(&mut self, key: Self::KeyType, trans: GeneralSAMNodeID); + fn insert(&mut self, key: Self::KeyType, trans: GeneralSamNodeID); - fn from_kv_iter<'b, Iter: Iterator>( + fn from_kv_iter<'b, Iter: IntoIterator>( iter: Iter, ) -> Self where @@ -88,23 +88,23 @@ pub trait ConstructiveTransitionTable: TransitionTable + Clone + Default { } } -pub type BTreeTransTable = BTreeMap; +pub type BTreeTransTable = BTreeMap; impl ConstructiveTransitionTable for BTreeTransTable { - fn insert(&mut self, key: KeyType, trans: GeneralSAMNodeID) { + fn insert(&mut self, key: KeyType, trans: GeneralSamNodeID) { BTreeMap::insert(self, key, trans); } } impl TransitionTable for BTreeTransTable { type KeyType = KeyType; - type IterType<'a> = WithKeyDerefedIter<'a, KeyType, std::collections::btree_map::Iter<'a, KeyType, GeneralSAMNodeID>> where Self: 'a, Self::KeyType: 'a; + type IterType<'a> = WithKeyDerefedIter<'a, KeyType, std::collections::btree_map::Iter<'a, KeyType, GeneralSamNodeID>> where Self: 'a, Self::KeyType: 'a; - fn get(&self, key: &KeyType) -> Option<&GeneralSAMNodeID> { + fn get(&self, key: &KeyType) -> Option<&GeneralSamNodeID> { BTreeMap::get(self, key) } - fn get_mut(&mut self, key: &KeyType) -> Option<&mut GeneralSAMNodeID> { + fn get_mut(&mut self, key: &KeyType) -> Option<&mut GeneralSamNodeID> { BTreeMap::get_mut(self, key) } @@ -114,7 +114,9 @@ impl TransitionTable for BTreeTransTable { } } - fn from_kv_iter<'b, Iter: Iterator>(iter: Iter) -> Self + fn from_kv_iter<'b, Iter: IntoIterator>( + iter: Iter, + ) -> Self where Self::KeyType: 'b, { @@ -122,25 +124,25 @@ impl TransitionTable for BTreeTransTable { } } -pub type HashTransTable = HashMap; +pub type HashTransTable = HashMap; impl ConstructiveTransitionTable for HashTransTable { - fn insert(&mut self, key: KeyType, trans: GeneralSAMNodeID) { + fn insert(&mut self, key: KeyType, trans: GeneralSamNodeID) { HashMap::insert(self, key, trans); } } impl TransitionTable for HashTransTable { type KeyType = KeyType; - type IterType<'a> = WithKeyDerefedIter<'a, KeyType, std::collections::hash_map::Iter<'a, KeyType, GeneralSAMNodeID>> where Self: 'a, Self::KeyType: 'a; + type IterType<'a> = WithKeyDerefedIter<'a, KeyType, std::collections::hash_map::Iter<'a, KeyType, GeneralSamNodeID>> where Self: 'a, Self::KeyType: 'a; - fn get(&self, key: &KeyType) -> Option<&GeneralSAMNodeID> { + fn get(&self, key: &KeyType) -> Option<&GeneralSamNodeID> { HashMap::get(self, key) } - fn get_mut(&mut self, key: &KeyType) -> Option<&mut GeneralSAMNodeID> { + fn get_mut(&mut self, key: &KeyType) -> Option<&mut GeneralSamNodeID> { HashMap::get_mut(self, key) } @@ -150,7 +152,9 @@ impl TransitionTable for HashTransTable>(iter: Iter) -> Self + fn from_kv_iter<'b, Iter: IntoIterator>( + iter: Iter, + ) -> Self where Self::KeyType: 'b, { @@ -185,9 +189,9 @@ fn bisect_unstable>(container: C, key: &K) -> Opti #[derive(Clone, Debug)] pub struct BisectTable< K: Clone + Ord, - C: AsRef<[(K, GeneralSAMNodeID)]> - + AsMut<[(K, GeneralSAMNodeID)]> - + FromIterator<(K, GeneralSAMNodeID)>, + C: AsRef<[(K, GeneralSamNodeID)]> + + AsMut<[(K, GeneralSamNodeID)]> + + FromIterator<(K, GeneralSamNodeID)>, > { inner: C, phantom: PhantomData, @@ -195,11 +199,11 @@ pub struct BisectTable< #[derive(Clone, Debug)] pub struct BisectTableIter<'s, K: Clone + Ord> { - inner: core::slice::Iter<'s, (K, GeneralSAMNodeID)>, + inner: core::slice::Iter<'s, (K, GeneralSamNodeID)>, } impl<'s, K: Clone + Ord> Iterator for BisectTableIter<'s, K> { - type Item = (K, &'s GeneralSAMNodeID); + type Item = (K, &'s GeneralSamNodeID); fn next(&mut self) -> Option { self.inner.next().map(|x| (x.0.clone(), &x.1)) @@ -208,19 +212,19 @@ impl<'s, K: Clone + Ord> Iterator for BisectTableIter<'s, K> { impl< K: Clone + Ord, - C: AsRef<[(K, GeneralSAMNodeID)]> - + AsMut<[(K, GeneralSAMNodeID)]> - + FromIterator<(K, GeneralSAMNodeID)>, + C: AsRef<[(K, GeneralSamNodeID)]> + + AsMut<[(K, GeneralSamNodeID)]> + + FromIterator<(K, GeneralSamNodeID)>, > TransitionTable for BisectTable { type KeyType = K; type IterType<'a> = BisectTableIter<'a, K> where Self: 'a, Self::KeyType: 'a; - fn get(&self, key: &Self::KeyType) -> Option<&GeneralSAMNodeID> { + fn get(&self, key: &Self::KeyType) -> Option<&GeneralSamNodeID> { bisect_unstable(&self.inner, key).map(|i| &self.inner.as_ref()[i].1) } - fn get_mut(&mut self, key: &K) -> Option<&mut GeneralSAMNodeID> { + fn get_mut(&mut self, key: &K) -> Option<&mut GeneralSamNodeID> { bisect_unstable(&self.inner, key).map(|i| &mut self.inner.as_mut()[i].1) } @@ -230,11 +234,12 @@ impl< } } - fn from_kv_iter<'b, Iter: Iterator>(iter: Iter) -> Self + fn from_kv_iter<'b, Iter: IntoIterator>(iter: Iter) -> Self where Self::KeyType: 'b, { - let mut inner: Box<[(K, GeneralSAMNodeID)]> = iter.map(|(u, v)| (u.clone(), *v)).collect(); + let mut inner: Box<[(K, GeneralSamNodeID)]> = + iter.into_iter().map(|(u, v)| (u.clone(), *v)).collect(); inner.sort_unstable_by(|a, b| a.0.cmp(&b.0)); Self { inner: inner.iter().map(|x| (x.0.clone(), x.1)).collect(), @@ -243,8 +248,8 @@ impl< } } -pub type VecBisectTable = BisectTable>; -pub type BoxBisectTable = BisectTable>; +pub type VecBisectTable = BisectTable>; +pub type BoxBisectTable = BisectTable>; pub trait SmallAlphabet: Copy + Ord + Into { const SIZE_LOG_2: usize; @@ -272,9 +277,9 @@ impl SmallAlphabet for u8 { #[derive(Clone, Debug)] pub struct WholeAlphabetTable< K: SmallAlphabet, - C: AsRef<[Option]> - + AsMut<[Option]> - + FromIterator> + C: AsRef<[Option]> + + AsMut<[Option]> + + FromIterator> + Clone, > { inner: C, @@ -283,12 +288,12 @@ pub struct WholeAlphabetTable< #[derive(Clone, Debug)] pub struct WholeAlphabetTableIter<'s, K: SmallAlphabet> { - inner: std::iter::Enumerate>>, + inner: std::iter::Enumerate>>, phantom: PhantomData, } impl<'s, K: SmallAlphabet> Iterator for WholeAlphabetTableIter<'s, K> { - type Item = (K, &'s GeneralSAMNodeID); + type Item = (K, &'s GeneralSamNodeID); fn next(&mut self) -> Option { for (k, ref v) in self.inner.by_ref() { @@ -302,9 +307,9 @@ impl<'s, K: SmallAlphabet> Iterator for WholeAlphabetTableIter<'s, K> { impl< K: SmallAlphabet, - C: AsRef<[Option]> - + AsMut<[Option]> - + FromIterator> + C: AsRef<[Option]> + + AsMut<[Option]> + + FromIterator> + Clone, > Default for WholeAlphabetTable { @@ -318,13 +323,13 @@ impl< impl< K: SmallAlphabet, - C: AsRef<[Option]> - + AsMut<[Option]> - + FromIterator> + C: AsRef<[Option]> + + AsMut<[Option]> + + FromIterator> + Clone, > ConstructiveTransitionTable for WholeAlphabetTable { - fn insert(&mut self, key: Self::KeyType, trans: GeneralSAMNodeID) { + fn insert(&mut self, key: Self::KeyType, trans: GeneralSamNodeID) { let k: usize = key.into(); self.inner.as_mut()[k] = Some(trans) } @@ -332,21 +337,21 @@ impl< impl< K: SmallAlphabet, - C: AsRef<[Option]> - + AsMut<[Option]> - + FromIterator> + C: AsRef<[Option]> + + AsMut<[Option]> + + FromIterator> + Clone, > TransitionTable for WholeAlphabetTable { type KeyType = K; type IterType<'a> = WholeAlphabetTableIter<'a, K> where Self: 'a, Self::KeyType: 'a; - fn get(&self, key: &Self::KeyType) -> Option<&GeneralSAMNodeID> { + fn get(&self, key: &Self::KeyType) -> Option<&GeneralSamNodeID> { let k: usize = (*key).into(); self.inner.as_ref().get(k).and_then(|x| x.as_ref()) } - fn get_mut(&mut self, key: &Self::KeyType) -> Option<&mut GeneralSAMNodeID> { + fn get_mut(&mut self, key: &Self::KeyType) -> Option<&mut GeneralSamNodeID> { let k: usize = (*key).into(); self.inner.as_mut().get_mut(k).and_then(|x| x.as_mut()) } @@ -358,7 +363,7 @@ impl< } } - fn from_kv_iter<'b, Iter: Iterator>( + fn from_kv_iter<'b, Iter: IntoIterator>( iter: Iter, ) -> Self where diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 428ec4b..3ef25a5 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,4 +1,4 @@ -use crate::{BTreeTransTable, GeneralSAM}; +use crate::{BTreeTransTable, GeneralSam}; #[cfg(feature = "utils")] mod utils; @@ -8,86 +8,86 @@ mod trie; #[test] fn test_example_from_chars() { - let sam_from_chars = GeneralSAM::>::from_chars("abcbc".chars()); - // => GeneralSAM + let sam_from_chars = GeneralSam::>::from_chars("abcbc"); + // => GeneralSam - let state = sam_from_chars.get_root_state(); + let mut state = sam_from_chars.get_root_state(); assert!(state.is_root()); - let state = state.feed_chars("b"); + state.feed_chars("b"); assert!(!state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_chars("c"); + state.feed_chars("c"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_chars("bc"); + state.feed_chars("bc"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_chars("bc"); + state.feed_chars("bc"); assert!(!state.is_accepting() && state.is_nil() && !state.is_root()); } #[test] fn test_example_from_bytes() { - let sam_from_bytes = GeneralSAM::>::from_bytes("abcbc"); - // => GeneralSAM + let sam_from_bytes = GeneralSam::>::from_bytes("abcbc"); + // => GeneralSam - let state = sam_from_bytes.get_root_state(); + let mut state = sam_from_bytes.get_root_state(); assert!(state.is_root()); - let state = state.feed_bytes("b"); + state.feed_bytes("b"); assert!(!state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_bytes("c"); + state.feed_bytes("c"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_bytes("bc"); + state.feed_bytes("bc"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_bytes("bc"); + state.feed_bytes("bc"); assert!(!state.is_accepting() && state.is_nil() && !state.is_root()); } #[test] fn test_simple_bytes() { - let sam = GeneralSAM::>::from_bytes("abcbc".as_bytes().iter()); - let state = sam.get_root_state(); + let sam = GeneralSam::>::from_bytes("abcbc".as_bytes()); + let mut state = sam.get_root_state(); assert!(!state.is_accepting() && !state.is_nil() && state.is_root()); - let state = state.feed_bytes("bc"); + state.feed_bytes("bc"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_bytes("b"); + state.feed_bytes("b"); assert!(!state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_bytes("c"); + state.feed_bytes("c"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_bytes("a"); + state.feed_bytes("a"); assert!(!state.is_accepting() && state.is_nil() && !state.is_root()); - let state = state.feed_bytes("a"); + state.feed_bytes("a"); assert!(!state.is_accepting() && state.is_nil() && !state.is_root()); } #[test] fn test_simple_chars() { - let sam = GeneralSAM::>::from_chars("abcbc".chars()); - let state = sam.get_root_state(); + let sam = GeneralSam::>::from_chars("abcbc"); + let mut state = sam.get_root_state(); assert!(!state.is_accepting() && !state.is_nil() && state.is_root()); - let state = state.feed_chars("bc"); + state.feed_chars("bc"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_chars("b"); + state.feed_chars("b"); assert!(!state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_chars("c"); + state.feed_chars("c"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_chars("a"); + state.feed_chars("a"); assert!(!state.is_accepting() && state.is_nil() && !state.is_root()); - let state = state.feed_chars("a"); + state.feed_chars("a"); assert!(!state.is_accepting() && state.is_nil() && !state.is_root()); } #[test] fn test_chinese_bytes() { - let sam = GeneralSAM::>::from_bytes("你好".as_bytes().iter()); - let state = sam.get_root_state(); + let sam = GeneralSam::>::from_bytes("你好".as_bytes()); + let mut state = sam.get_root_state(); assert!(!state.is_accepting() && !state.is_nil() && state.is_root()); - let state = state.feed_bytes("你好"); + state.feed_bytes("你好"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); } #[test] fn test_chinese_chars() { - let sam = GeneralSAM::>::from_chars("你好".chars()); - let state = sam.get_root_state(); + let sam = GeneralSam::>::from_chars("你好"); + let mut state = sam.get_root_state(); assert!(!state.is_accepting() && !state.is_nil() && state.is_root()); - let state = state.feed_chars("你好"); + state.feed_chars("你好"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); } diff --git a/src/tests/trie.rs b/src/tests/trie.rs index c13e27f..0a41f91 100644 --- a/src/tests/trie.rs +++ b/src/tests/trie.rs @@ -4,39 +4,39 @@ use rand::{ Rng, SeedableRng, }; -use crate::{BTreeTransTable, GeneralSAM, Trie, SAM_ROOT_NODE_ID}; +use crate::{BTreeTransTable, GeneralSam, Trie, SAM_ROOT_NODE_ID}; #[test] fn test_example_from_trie() { - let mut trie = Trie::>::default(); + let mut trie = Trie::>::default(); - trie.insert_iter("hello".chars()); - trie.insert_iter("Chielo".chars()); + trie.insert_chars("hello"); + trie.insert_chars("Chielo"); - let sam = GeneralSAM::>::from_trie(trie.get_root_state()); + let sam = GeneralSam::>::from_trie(trie.get_root_state()); - let state = sam.get_root_state(); + let mut state = sam.get_root_state(); assert!(state.is_root()); - let state = state.feed_chars("l"); + state.feed_chars("l"); assert!(!state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_chars("o"); + state.feed_chars("o"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = sam.get_root_state(); + let mut state = sam.get_root_state(); assert!(state.is_root()); - let state = state.feed_chars("Chie"); + state.feed_chars("Chie"); assert!(!state.is_accepting() && !state.is_nil() && !state.is_root()); - let state = state.feed_chars("lo"); + state.feed_chars("lo"); assert!(state.is_accepting() && !state.is_nil() && !state.is_root()); } fn case_trie_suffix(vocab: &[&str]) { let mut trie = Trie::>::default(); - vocab.iter().for_each(|word| { - trie.insert_iter(word.chars()); + vocab.iter().for_each(|&word| { + trie.insert_chars(word); }); - let sam = GeneralSAM::>::from_trie(trie.get_root_state()); + let sam = GeneralSam::>::from_trie(trie.get_root_state()); let is_suffix = |word_slice: &str| vocab.iter().any(|word| word.ends_with(word_slice)); @@ -46,7 +46,8 @@ fn case_trie_suffix(vocab: &[&str]) { .chain(Some((word.len(), '\0'))) .for_each(|(j, _)| { if i < j { - let state = sam.get_root_state().feed_iter(word[i..j].chars()); + let mut state = sam.get_root_state(); + state.feed_chars(&word[i..j]); assert!(!state.is_nil()); assert!(is_suffix(&word[i..j]) ^ !(state.is_accepting())); } @@ -71,14 +72,14 @@ fn test_simple_trie_suffix() { fn test_topo_and_suf_len_sorted_order() { let mut rng = StdRng::seed_from_u64(1134759173975); for _ in 0..10000 { - let mut trie = Trie::>::default(); + let mut trie = Trie::>::default(); for _ in 0..rng.gen_range(0..32) { let len = rng.gen_range(0..9); let string = Alphanumeric.sample_string(&mut rng, len); - trie.insert_ref_iter(string.as_bytes().iter()); + trie.insert_bytes(string.as_bytes()); } - let sam = GeneralSAM::>::from_trie(trie.get_root_state()); + let sam = GeneralSam::>::from_trie(trie.get_root_state()); let order = sam.get_topo_and_suf_len_sorted_node_ids(); let rank = { diff --git a/src/tests/utils.rs b/src/tests/utils.rs index 535a37e..7be47d7 100644 --- a/src/tests/utils.rs +++ b/src/tests/utils.rs @@ -88,7 +88,7 @@ mod trie { suffixwise::{SuffixInTrie, SuffixInTrieData}, tokenize::GreedyTokenizer, }, - BTreeTransTable, GeneralSAM, TransitionTable, Trie, + BTreeTransTable, GeneralSam, TransitionTable, Trie, }; #[test] @@ -99,10 +99,10 @@ mod trie { let mut trie = Trie::>::default(); let mut id_to_word = BTreeMap::new(); for word in vocab { - id_to_word.insert(trie.insert_iter(word.chars()), word); + id_to_word.insert(trie.insert(word.chars()), word); } - let sam = GeneralSAM::>::from_trie(trie.get_root_state()); + let sam = GeneralSam::>::from_trie(trie.get_root_state()); let data = SuffixInTrieData::build(&sam, trie.get_root_state(), |tn| tn.clone()); for i in data.iter().skip(1) { @@ -128,14 +128,14 @@ mod trie { fn case_tokenizer< T: Clone, TransTable: TransitionTable, - Iter: Iterator, - SAMRef: Deref>, + Iter: IntoIterator, + SamRef: Deref>, >( - tokenizer: &GreedyTokenizer, + tokenizer: &GreedyTokenizer, trie: &Trie, seq: Iter, ) { - let seq: Box<[_]> = seq.collect(); + let seq: Box<[_]> = seq.into_iter().collect(); let output = tokenizer.tokenize(seq.iter().cloned(), &trie.num_of_nodes()); let expected = greedy_tokenize_with_trie(trie, seq.iter().cloned()); output.iter().zip(expected.iter()).for_each(|(o, e)| { @@ -151,10 +151,10 @@ mod trie { let mut trie = Trie::>::default(); let mut id_to_word = BTreeMap::new(); for word in vocab { - id_to_word.insert(trie.insert_iter(word.chars()), word); + id_to_word.insert(trie.insert(word.chars()), word); } - let sam = GeneralSAM::>::from_trie(trie.get_root_state()); + let sam = GeneralSam::>::from_trie(trie.get_root_state()); let tokenizer = GreedyTokenizer::build_from_trie(&sam, trie.get_root_state()); @@ -174,10 +174,10 @@ mod trie { let mut trie = Trie::>::default(); let mut id_to_word = BTreeMap::new(); for word in vocab { - id_to_word.insert(trie.insert_iter(word.bytes()), word); + id_to_word.insert(trie.insert(word.bytes()), word); } - let sam = GeneralSAM::>::from_trie(trie.get_root_state()); + let sam = GeneralSam::>::from_trie(trie.get_root_state()); let tokenizer = GreedyTokenizer::build_from_trie(&sam, trie.get_root_state()); @@ -197,10 +197,10 @@ mod trie { let mut trie = Trie::>::default(); let mut id_to_word = BTreeMap::new(); for word in vocab { - id_to_word.insert(trie.insert_iter(word.bytes()), word); + id_to_word.insert(trie.insert(word.bytes()), word); } - let sam = GeneralSAM::>::from_trie(trie.get_root_state()); + let sam = GeneralSam::>::from_trie(trie.get_root_state()); let tokenizer = GreedyTokenizer::, _, _>::build_from_sam_and_trie( sam, @@ -231,12 +231,12 @@ mod trie { for _ in 0..rng.gen_range(0..vocab_size) { let len = rng.gen_range(0..token_len); let string = Alphanumeric.sample_string(&mut rng, len); - trie.insert_ref_iter(f(string).iter()); + trie.insert(f(string)); } let trie = trie.alter_trans_table::(); let sam = - GeneralSAM::>::from_trie(trie.get_root_state()) + GeneralSam::>::from_trie(trie.get_root_state()) .alter_trans_table_into::(); let tokenizer = GreedyTokenizer::build_from_trie(&sam, trie.get_root_state()); diff --git a/src/trie.rs b/src/trie.rs index 25c254d..363ac59 100644 --- a/src/trie.rs +++ b/src/trie.rs @@ -1,10 +1,10 @@ //! Trie, supporting `TrieNodeAlike`. -use std::ops::Deref; +use std::{borrow::Borrow, ops::Deref}; -use crate::{ConstructiveTransitionTable, GeneralSAMNodeID, TransitionTable, TrieNodeAlike}; +use crate::{ConstructiveTransitionTable, GeneralSamNodeID, TransitionTable, TrieNodeAlike}; -pub type TrieNodeID = GeneralSAMNodeID; +pub type TrieNodeID = GeneralSamNodeID; pub const TRIE_NIL_NODE_ID: TrieNodeID = 0; pub const TRIE_ROOT_NODE_ID: TrieNodeID = 1; @@ -128,19 +128,12 @@ impl Trie { node_id } - pub fn insert_ref_iter<'s, Iter: Iterator>( - &'s mut self, - iter: Iter, - ) -> TrieNodeID { - self.insert_iter(iter.cloned()) - } - - pub fn insert_iter>( + pub fn insert>( &mut self, iter: Iter, ) -> TrieNodeID { let mut current = TRIE_ROOT_NODE_ID; - iter.for_each(|t| { + iter.into_iter().for_each(|t| { current = match self.node_pool[current].trans.get(&t) { Some(v) => *v, None => { @@ -155,6 +148,18 @@ impl Trie { } } +impl> Trie { + pub fn insert_bytes>(&mut self, bytes: S) -> TrieNodeID { + self.insert(bytes.as_ref().iter().copied()) + } +} + +impl> Trie { + pub fn insert_chars>(&mut self, s: S) -> TrieNodeID { + self.insert(s.as_ref().chars()) + } +} + impl>> TrieState { @@ -185,23 +190,31 @@ impl>> } } - pub fn goto(&mut self, t: &TransTable::KeyType) { + pub fn goto>(&mut self, t: K) { if let Some(node) = self.get_node() { - self.node_id = node.trans.get(t).copied().unwrap_or(TRIE_NIL_NODE_ID) + self.node_id = node + .trans + .get(t.borrow()) + .copied() + .unwrap_or(TRIE_NIL_NODE_ID) } else { self.node_id = TRIE_NIL_NODE_ID; } } - pub fn feed_iter>(&mut self, iter: Iter) { - iter.for_each(|x| self.goto(&x)); + pub fn feed>(&mut self, iter: Iter) { + iter.into_iter().for_each(|x| self.goto(&x)); } - pub fn feed_ref_iter<'s, Iter: Iterator>( - &'s mut self, + pub fn feed_ref, Iter: IntoIterator>( + &mut self, iter: Iter, ) { - iter.for_each(|x| self.goto(x)); + iter.into_iter().for_each(|x| self.goto(x)); + } + + pub fn feed_slice>(&mut self, slice: S) { + self.feed_ref(slice.as_ref().iter()) } } diff --git a/src/trie_alike.rs b/src/trie_alike.rs index a4cc595..7a04dd0 100644 --- a/src/trie_alike.rs +++ b/src/trie_alike.rs @@ -1,5 +1,5 @@ -//! A trait for constructing `GeneralSAM` from structures that form a trie, -//! and some utilities to construct `GeneralSAM` from iterators. +//! A trait for constructing `GeneralSam` from structures that form a trie, +//! and some utilities to construct `GeneralSam` from iterators. use std::collections::VecDeque; @@ -10,7 +10,7 @@ pub enum TravelEvent<'s, NodeType, ExtraType, KeyType> { Pop(NodeType, ExtraType), } -/// This trait provides the essential interfaces required by `GeneralSAM` +/// This trait provides the essential interfaces required by `GeneralSam` /// to construct a suffix automaton from structures that form a trie (prefix tree). pub trait TrieNodeAlike { type InnerType; @@ -90,8 +90,9 @@ pub struct IterAsChainNextStateIter { pub state: Option>, } -impl From for IterAsChain { - fn from(mut iter: Iter) -> Self { +impl From for IterAsChain { + fn from(iter: Iter) -> Self { + let mut iter = iter.into_iter(); let val = iter.next(); Self { iter, val } } diff --git a/src/utils/suffixwise.rs b/src/utils/suffixwise.rs index 93bb3da..e49dc35 100644 --- a/src/utils/suffixwise.rs +++ b/src/utils/suffixwise.rs @@ -4,7 +4,7 @@ use std::{collections::LinkedList, convert::Infallible, ops::Deref}; use crate::{ rope::{Rope, RopeBase, RopeData, RopeUntaggedInner, TreapBasedRopeBase}, - GeneralSAM, GeneralSAMState, TransitionTable, TravelEvent, TrieNodeAlike, SAM_NIL_NODE_ID, + GeneralSam, GeneralSamState, TransitionTable, TravelEvent, TrieNodeAlike, SAM_NIL_NODE_ID, SAM_ROOT_NODE_ID, }; @@ -60,17 +60,17 @@ impl SuffixwiseData { pub fn build_from_sam< TransTable: TransitionTable, - Iter: Iterator, + Iter: IntoIterator, FInit: FnMut(usize) -> Iter, >( - sam: &GeneralSAM, + sam: &GeneralSam, mut f_init: FInit, ) -> Vec { let mut res = vec![Self::default(); sam.num_of_nodes()]; for node_id in sam.get_topo_and_suf_len_sorted_node_ids().iter().copied() { assert_ne!(node_id, SAM_NIL_NODE_ID); - let node = sam.get_node(node_id).expect("invalid GeneralSAM"); + let node = sam.get_node(node_id).expect("invalid GeneralSam"); let node_data = res .get_mut(node_id) .unwrap_or_else(|| panic!("invalid node id: {}", node_id)); @@ -83,7 +83,7 @@ impl SuffixwiseData { node_data.data = Rope::new(Inner::default()); } else { let parent_id = node.get_suffix_parent_id(); - let parent = sam.get_node(parent_id).expect("invalid GeneralSAM"); + let parent = sam.get_node(parent_id).expect("invalid GeneralSam"); node_data.min_suf_len = parent.max_suffix_len() + 1; @@ -131,13 +131,13 @@ impl SuffixInTrieData { TN: TrieNodeAlike, F: FnMut(&TN) -> Digested, >( - sam: &GeneralSAM, + sam: &GeneralSam, trie_node: TN, mut f: F, ) -> Vec { let mut sam_to_data = vec![LinkedList::>::new(); sam.num_of_nodes()]; let callback = - |event: TravelEvent<(&GeneralSAMState<_, &GeneralSAM<_>>, &TN), _, _>| -> Result<_, Infallible> { + |event: TravelEvent<(&GeneralSamState<_, &GeneralSam<_>>, &TN), _, _>| -> Result<_, Infallible> { match event { crate::TravelEvent::Pop((sam_state, trie_state), len) => { if trie_state.is_accepting() { diff --git a/src/utils/tokenize.rs b/src/utils/tokenize.rs index 6efec1e..9611fd7 100644 --- a/src/utils/tokenize.rs +++ b/src/utils/tokenize.rs @@ -2,7 +2,7 @@ use std::ops::{AddAssign, Deref, SubAssign}; -use crate::{GeneralSAM, GeneralSAMState, TransitionTable, TrieNodeAlike}; +use crate::{GeneralSam, GeneralSamState, TransitionTable, TrieNodeAlike}; use super::suffixwise::SuffixInTrieData; @@ -22,19 +22,19 @@ use super::suffixwise::SuffixInTrieData; pub struct GreedyTokenizer< TransTable: TransitionTable, TokenIDType: Clone + Default + PartialEq, - SAMRef: Deref>, + SamRef: Deref>, > { - sam: SAMRef, + sam: SamRef, suffix_data: Vec>, } #[derive(Clone, Debug)] -pub struct OwnedGeneralSAM { - pub sam: GeneralSAM, +pub struct OwnedGeneralSam { + pub sam: GeneralSam, } -impl Deref for OwnedGeneralSAM { - type Target = GeneralSAM; +impl Deref for OwnedGeneralSam { + type Target = GeneralSam; fn deref(&self) -> &Self::Target { &self.sam @@ -42,19 +42,19 @@ impl Deref for OwnedGeneralSAM { } impl - GreedyTokenizer> + GreedyTokenizer> { pub fn build_from_sam< TN: TrieNodeAlike, F: FnMut(&TN) -> TokenIDType, >( - sam: GeneralSAM, + sam: GeneralSam, trie_node: TN, f: F, ) -> Self { Self { suffix_data: SuffixInTrieData::build(&sam, trie_node, f), - sam: OwnedGeneralSAM { sam }, + sam: OwnedGeneralSam { sam }, } } } @@ -62,14 +62,14 @@ impl impl< TransTable: TransitionTable, TokenIDType: Clone + Default + PartialEq, - SAMRef: Deref>, - > GreedyTokenizer + SamRef: Deref>, + > GreedyTokenizer { - pub fn get_sam(&self) -> &SAMRef { + pub fn get_sam(&self) -> &SamRef { &self.sam } - pub fn get_sam_ref(&self) -> &GeneralSAM { + pub fn get_sam_ref(&self) -> &GeneralSam { &self.sam } @@ -79,7 +79,7 @@ impl< pub fn inner_as_ref( &self, - ) -> GreedyTokenizer> { + ) -> GreedyTokenizer> { GreedyTokenizer { sam: &self.sam, suffix_data: self.suffix_data.clone(), @@ -90,7 +90,7 @@ impl< TN: TrieNodeAlike, F: FnMut(&TN) -> TokenIDType, >( - sam: SAMRef, + sam: SamRef, trie_node: TN, f: F, ) -> Self { @@ -100,7 +100,7 @@ impl< } } - pub fn tokenize>( + pub fn tokenize>( &self, iter: Iter, unk_token_id: &TokenIDType, @@ -118,7 +118,7 @@ impl< }; let pop_buffer = |cur_len: &mut usize, - cur_state: &mut GeneralSAMState>, + cur_state: &mut GeneralSamState>, res: &mut Vec<_>| { let inner_data = self.suffix_data[cur_state.node_id] .get(*cur_len) @@ -178,15 +178,15 @@ impl< pub mod trie { use std::ops::Deref; - use crate::{GeneralSAM, TransitionTable, Trie, TrieNodeAlike, TrieNodeID, TrieState}; + use crate::{GeneralSam, TransitionTable, Trie, TrieNodeAlike, TrieNodeID, TrieState}; - use super::OwnedGeneralSAM; + use super::OwnedGeneralSam; - impl>> - super::GreedyTokenizer + impl>> + super::GreedyTokenizer { pub fn build_from_trie>( - sam: SAMRef, + sam: SamRef, trie_state: TrieState>, ) -> Self { Self::build(sam, trie_state, |tn| tn.node_id) @@ -194,10 +194,10 @@ pub mod trie { } impl - super::GreedyTokenizer> + super::GreedyTokenizer> { pub fn build_from_sam_and_trie>( - sam: GeneralSAM, + sam: GeneralSam, trie_state: TrieState>, ) -> Self { Self::build_from_sam(sam, trie_state, |tn| tn.node_id) @@ -212,7 +212,7 @@ pub mod trie { /// $\mathcal{O}\left( n \cdot l \cdot \log{\Sigma} \right)$. pub fn greedy_tokenize_with_trie< TransTable: TransitionTable, - Iter: Iterator, + Iter: IntoIterator, >( trie: &Trie, seq: Iter, @@ -231,7 +231,7 @@ pub mod trie { res.push((token_id, token_len)) }; - let seq: Box<[_]> = seq.collect(); + let seq: Box<[_]> = seq.into_iter().collect(); let mut cur = 0; while cur < seq.len() { let mut best: Option<(usize, usize)> = None;