diff --git a/core/src/toktree.rs b/core/src/toktree.rs
index ecdf707..e518f3f 100644
--- a/core/src/toktree.rs
+++ b/core/src/toktree.rs
@@ -267,6 +267,7 @@ impl TrieNode {
 const LEN_BITS: u32 = 10;
 
 impl TokTrie {
+    // see https://github.com/microsoft/toktrie/blob/main/special_tokens.md
     pub const SPECIAL_TOKEN_PREFIX_BYTE: u8 = 0xff;
 
     pub fn from(info: &TokRxInfo, words: &Vec<Vec<u8>>) -> Self {
diff --git a/special_tokens.md b/special_tokens.md
new file mode 100644
index 0000000..b69e23a
--- /dev/null
+++ b/special_tokens.md
@@ -0,0 +1,27 @@
+# Support for special tokens
+
+Tokenizers typically include special tokens, such as 
+`<|end_of_text|>`, `<|eot_id|>`, `<|python_tag|>`, `<|start_header_id|>`, etc.
+This library is tasked with translating between byte sequences
+and tokens.
+If you see bytes `<|eot_id|>` in the input, you may or may not want to treat them
+as a special token.
+
+The library assumes that by default you want to treat them as bytes
+(so they would be tokenized as `<|`, `eot`, `_`, `id`, `|>` or similar).
+To indicate that you want to treat them as a special token, you need to
+prefix them with byte 0xFF (255) (`TokTrie::SPECIAL_TOKEN_PREFIX_BYTE`).
+
+Byte FF is chosen because it is not a valid UTF-8 byte, so it should not normally
+occur in regular inputs.
+In Rust, you cannot have byte FF in `&str`, only in `&[u8]`.
+In Python, note the difference between `b"\xFF"` and `"\xFF".encode("utf-8")`
+(or equivalently `"\u00FF".encode("utf-8")`), which is `b"\xC3\xBF"`.
+
+If you're constructing the `TokTrie` manually,
+the token array passed to its constructor should include the special tokens
+with the prefix byte FF.
+
+The llguidance library does not expose the FF bytes externally
+(except for special `tokenize_bytes_prefix` methods), so you
+generally don't need to worry about them, except when building the `TokTrie`.