
Commit ad0448a: allow vocab size override

mmoskal committed Aug 6, 2024
1 parent 022b496
Showing 1 changed file with 16 additions and 5 deletions.
21 changes: 16 additions & 5 deletions hf_tokenizers/src/lib.rs
@@ -223,17 +223,28 @@ pub struct ByteTokenizerEnv {
 }

 impl ByteTokenizerEnv {
-    pub fn from_name(name: &str) -> Result<ByteTokenizerEnv> {
+    pub fn from_name(name: &str, n_vocab: Option<usize>) -> Result<ByteTokenizerEnv> {
         let tokenizer = ByteTokenizer::from_name(name)?;
-        Ok(ByteTokenizerEnv::new(tokenizer))
+        ByteTokenizerEnv::new(tokenizer, n_vocab)
     }

-    pub fn new(tokenizer: ByteTokenizer) -> ByteTokenizerEnv {
+    pub fn new(tokenizer: ByteTokenizer, n_vocab: Option<usize>) -> Result<ByteTokenizerEnv> {
+        let mut info = tokenizer.tokrx_info();
+        let mut token_bytes = tokenizer.token_bytes();
+        if let Some(n_vocab) = n_vocab {
+            if n_vocab < token_bytes.len() {
+                bail!("vocab size too small; {} vs {}", n_vocab, token_bytes.len());
+            }
+            while n_vocab > token_bytes.len() {
+                token_bytes.push(Vec::new());
+            }
+            info.vocab_size = n_vocab as u32;
+        }
         let tok_trie = TokTrie::from(&tokenizer.tokrx_info(), &tokenizer.token_bytes());
-        ByteTokenizerEnv {
+        Ok(ByteTokenizerEnv {
             tokenizer,
             tok_trie,
-        }
+        })
     }

     pub fn to_env(self) -> TokEnv {
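For context, a minimal sketch of how the updated constructors might be called after this change. The crate path, model name, and padded size below are illustrative assumptions, not taken from the commit; only `ByteTokenizerEnv::from_name`, `new`, and `to_env` come from the diff above.

```rust
// Sketch only: assumes the crate is importable as `hf_tokenizers` (matching the
// hf_tokenizers/src/lib.rs path in the diff) and that ByteTokenizer::from_name
// accepts a Hugging Face tokenizer name such as "gpt2".
use anyhow::Result;
use hf_tokenizers::ByteTokenizerEnv;

fn main() -> Result<()> {
    // No override: the tokenizer's own vocabulary size is used unchanged.
    let plain = ByteTokenizerEnv::from_name("gpt2", None)?;

    // Override: pad the vocabulary up to a larger size (50_304 is a hypothetical
    // padded logit count); the extra token ids are given empty byte sequences.
    // An override smaller than the tokenizer's real vocab is rejected via bail!().
    let padded = ByteTokenizerEnv::from_name("gpt2", Some(50_304))?;

    // Either environment converts to a shared TokEnv, as before this commit.
    let _tok_env = padded.to_env();
    let _ = plain;
    Ok(())
}
```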
