diff --git a/core/src/toktree.rs b/core/src/toktree.rs index ecdf707..e518f3f 100644 --- a/core/src/toktree.rs +++ b/core/src/toktree.rs @@ -267,6 +267,7 @@ impl TrieNode { const LEN_BITS: u32 = 10; impl TokTrie { + // see https://github.com/microsoft/toktrie/blob/main/special_tokens.md pub const SPECIAL_TOKEN_PREFIX_BYTE: u8 = 0xff; pub fn from(info: &TokRxInfo, words: &Vec>) -> Self { diff --git a/special_tokens.md b/special_tokens.md new file mode 100644 index 0000000..b69e23a --- /dev/null +++ b/special_tokens.md @@ -0,0 +1,27 @@ +# Support for special tokens + +Tokenizers typically include special tokens, such as +`<|end_of_text|>`, `<|eot_id|>`, `<|python_tag|>`, `<|start_header_id|>`, etc. +This library is tasked with translating between the byte sequences +and tokens. +If you see bytes `<|eot_id|>` in the input, you may or may not want to treat them +as a special token. + +The library assumes that by default you want to treat them as bytes +(so they would be tokenized as `<|`, `eot`, `_`, `id`, `|>` or similar). +To indicate that you want to treat them as a special token, you need to +prefix them with byte 0xFF (255) (`TokTrie::SPECIAL_TOKEN_PREFIX_BYTE`). + +Byte FF is chosen because it is not a valid UTF-8 byte, so it should not normally +occur in regular inputs. +In Rust, you cannot have byte FF in `&str`, only in `&[u8]`. +In Python, note the difference between `b"\xFF"` and `"\xFF".encode("utf-8")` +(or equivalently `"\u00FF".encode("utf-8")`), which is `b"\xC3\xBF"`. + +If you're constructing the trie manually, +the token array passed to the `TokTrie` constructor should include the special tokens +with the prefix byte FF. + +The llguidance library does not expose the FF bytes externally +(except for special `tokenize_bytes_prefix` methods), so you +generally don't need to worry about them, except when building the `TokTrie`.