Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: api adjustment #48

Merged
merged 1 commit into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 15 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ flowchart LR
## Examples

```rust
use general_sam::{GeneralSAM, BTreeTransTable};
use general_sam::{GeneralSam, BTreeTransTable};

let sam = GeneralSAM::<BTreeTransTable<_>>::from_bytes("abcbc");
let sam = GeneralSam::<BTreeTransTable<_>>::from_bytes("abcbc");

// "cbc" is a suffix of "abcbc"
assert!(sam.get_root_state().feed_bytes("cbc").is_accepting());
Expand All @@ -49,38 +49,38 @@ assert!(!sam.get_root_state().feed_bytes("bcb").is_accepting());
```

```rust
use general_sam::{GeneralSAM, BTreeTransTable};
use general_sam::{GeneralSam, BTreeTransTable};

let sam = GeneralSAM::<BTreeTransTable<_>>::from_chars("abcbc".chars());
let sam = GeneralSam::<BTreeTransTable<_>>::from_chars("abcbc");

let state = sam.get_root_state();
let mut state = sam.get_root_state();

// "b" is not a suffix but at least a substring of "abcbc"
let state = state.feed_chars("b");
state.feed_chars("b");
assert!(!state.is_accepting());

// "bc" is a suffix of "abcbc"
let state = state.feed_chars("c");
state.feed_chars("c");
assert!(state.is_accepting());

// "bcbc" is a suffix of "abcbc"
let state = state.feed_chars("bc");
state.feed_chars("bc");
assert!(state.is_accepting());

// "bcbcbc" is not a substring, much less a suffix of "abcbc"
let state = state.feed_chars("bc");
state.feed_chars("bc");
assert!(!state.is_accepting() && state.is_nil());
```

```rust
// NOTE: This example requires the `trie` feature.
use general_sam::{GeneralSAM, Trie, BTreeTransTable};
# #[cfg(feature = "trie")] {
use general_sam::{GeneralSam, Trie, BTreeTransTable};

let mut trie = Trie::<BTreeTransTable<_>>::default();
trie.insert_iter("hello".chars());
trie.insert_iter("Chielo".chars());
trie.insert("hello".chars());
trie.insert("Chielo".chars());

let sam = GeneralSAM::<BTreeTransTable<_>>::from_trie(trie.get_root_state());
let sam = GeneralSam::<BTreeTransTable<_>>::from_trie(trie.get_root_state());

assert!(sam.get_root_state().feed_chars("lo").is_accepting());
assert!(sam.get_root_state().feed_chars("ello").is_accepting());
Expand All @@ -91,6 +91,7 @@ assert!(!sam.get_root_state().feed_chars("el").is_nil());

assert!(!sam.get_root_state().feed_chars("bye").is_accepting());
assert!(sam.get_root_state().feed_chars("bye").is_nil());
# }
```

## References
Expand Down
8 changes: 4 additions & 4 deletions benches/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
use general_sam::{
table::{BoxBisectTable, HashTransTable, VecBisectTable},
tokenize::{trie::greedy_tokenize_with_trie, GreedyTokenizer},
BTreeTransTable, GeneralSAM, TransitionTable, Trie,
BTreeTransTable, GeneralSam, TransitionTable, Trie,
};
use rand::{
distributions::{Alphanumeric, DistString},
Expand Down Expand Up @@ -128,7 +128,7 @@ fn tokenize_with_hf(tokenizer: &HFTokenizer, seq: &str) -> Vec<u32> {
}

fn tokenize_with_sam<T: TransitionTable<KeyType = char>>(
tokenizer: &GreedyTokenizer<T, u32, &GeneralSAM<T>>,
tokenizer: &GreedyTokenizer<T, u32, &GeneralSam<T>>,
seq: &str,
) -> Vec<u32> {
tokenizer
Expand All @@ -153,7 +153,7 @@ fn build_trie<T: TransitionTable<KeyType = char>>(vocab: &Vocab) -> (Trie<T>, Ve
let mut trie = Trie::<BTreeTransTable<_>>::default();
let mut trie_id_and_token_id = Vec::new();
for (k, v) in vocab.iter() {
let node_id = trie.insert_iter(k.chars());
let node_id = trie.insert_chars(k);
trie_id_and_token_id.push((node_id, *v));
}
let mut trie_to_token = vec![0; trie.num_of_nodes()];
Expand All @@ -178,7 +178,7 @@ fn criterion_benchmark<TransTable: TransitionTable<KeyType = char>>(c: &mut Crit
println!("building trie...");
let (trie, trie_to_token) = build_trie::<TransTable>(&vocab);
println!("building sam...");
let sam = GeneralSAM::<BTreeTransTable<_>>::from_trie(trie.get_root_state())
let sam = GeneralSam::<BTreeTransTable<_>>::from_trie(trie.get_root_state())
.alter_trans_table_into::<TransTable>();
println!("building greedy tokenizer...");
let tokenizer =
Expand Down
34 changes: 20 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
//! # Examples
//!
//! ```rust
//! use general_sam::{GeneralSAM, BTreeTransTable};
//! use general_sam::{GeneralSam, BTreeTransTable};
//!
//! let sam = GeneralSAM::<BTreeTransTable<_>>::from_bytes("abcbc");
//! let sam = GeneralSam::<BTreeTransTable<_>>::from_bytes("abcbc");
//!
//! // "cbc" is a suffix of "abcbc"
//! assert!(sam.get_root_state().feed_bytes("cbc").is_accepting());
Expand All @@ -39,38 +39,38 @@
//! ```
//!
//! ```rust
//! use general_sam::{GeneralSAM, BTreeTransTable};
//! use general_sam::{GeneralSam, BTreeTransTable};
//!
//! let sam = GeneralSAM::<BTreeTransTable<_>>::from_chars("abcbc".chars());
//! let sam = GeneralSam::<BTreeTransTable<_>>::from_chars("abcbc");
//!
//! let state = sam.get_root_state();
//! let mut state = sam.get_root_state();
//!
//! // "b" is not a suffix but at least a substring of "abcbc"
//! let state = state.feed_chars("b");
//! state.feed_chars("b");
//! assert!(!state.is_accepting());
//!
//! // "bc" is a suffix of "abcbc"
//! let state = state.feed_chars("c");
//! state.feed_chars("c");
//! assert!(state.is_accepting());
//!
//! // "bcbc" is a suffix of "abcbc"
//! let state = state.feed_chars("bc");
//! state.feed_chars("bc");
//! assert!(state.is_accepting());
//!
//! // "bcbcbc" is not a substring, much less a suffix of "abcbc"
//! let state = state.feed_chars("bc");
//! state.feed_chars("bc");
//! assert!(!state.is_accepting() && state.is_nil());
//! ```
//!
//! ```rust
//! # #[cfg(feature = "trie")] {
//! use general_sam::{GeneralSAM, Trie, BTreeTransTable};
//! use general_sam::{GeneralSam, Trie, BTreeTransTable};
//!
//! let mut trie = Trie::<BTreeTransTable<_>>::default();
//! trie.insert_iter("hello".chars());
//! trie.insert_iter("Chielo".chars());
//! trie.insert("hello".chars());
//! trie.insert("Chielo".chars());
//!
//! let sam = GeneralSAM::<BTreeTransTable<_>>::from_trie(trie.get_root_state());
//! let sam = GeneralSam::<BTreeTransTable<_>>::from_trie(trie.get_root_state());
//!
//! assert!(sam.get_root_state().feed_chars("lo").is_accepting());
//! assert!(sam.get_root_state().feed_chars("ello").is_accepting());
Expand Down Expand Up @@ -101,7 +101,7 @@ pub mod trie_alike;

pub use {
sam::{
GeneralSAM, GeneralSAMNode, GeneralSAMNodeID, GeneralSAMState, SAM_NIL_NODE_ID,
GeneralSam, GeneralSamNode, GeneralSamNodeID, GeneralSamState, SAM_NIL_NODE_ID,
SAM_ROOT_NODE_ID,
},
table::{
Expand All @@ -127,3 +127,9 @@ pub use utils::{rope, suffixwise, tokenize, tokenize::GreedyTokenizer};

#[cfg(test)]
mod tests;

// Compiled only when rustdoc is collecting doctests (`cfg(doctest)`), so this
// module is absent from regular builds and from the test harness.
#[cfg(doctest)]
mod _doctest_readme {
// Attach the README text as this item's documentation so that the Rust code
// blocks in README.md are picked up as doctests alongside the crate docs.
// A plain `//` comment is used here on purpose: a `///` comment would be
// appended to the item's documentation.
#[doc = include_str!("../README.md")]
struct ReadMe;
}
Loading