From e22222c1a4b5b3b1c5cd7e20858b9592604cc590 Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Mon, 25 Nov 2024 13:42:15 -0800
Subject: [PATCH] add tokenize_is_approximate() method to TokenizerEnv trait

---
 core/src/toktree.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/core/src/toktree.rs b/core/src/toktree.rs
index 07692e1..37c8fe4 100644
--- a/core/src/toktree.rs
+++ b/core/src/toktree.rs
@@ -161,6 +161,13 @@ pub trait TokenizerEnv: Send {
     fn eos_token(&self) -> TokenId {
         self.tok_trie().eos_token()
     }
+
+    /// If this returns true, this tokenizer may return non-canonical tokenizations
+    /// and should generally not be used for forcing tokens.
+    /// Typically, it will just use TokTrie::greedy_tokenize().
+    fn tokenize_is_approximate(&self) -> bool {
+        false
+    }
 }
 
 pub type TokEnv = Arc;