diff --git a/c_sample/Makefile b/c_sample/Makefile
new file mode 100644
index 00000000..fa09e982
--- /dev/null
+++ b/c_sample/Makefile
@@ -0,0 +1,11 @@
+ifeq ($(wildcard ../../target),)
+  TARGET = ../target/release
+else
+  TARGET = ../../target/release
+endif
+
+all:
+	cd ../parser && cargo build --release
+	c++ -W -Wall -std=c++20 -o $(TARGET)/c_sample c_sample.cpp -I../parser -L$(TARGET) -lllguidance_parser
+	$(TARGET)/c_sample ../sample_parser/data/blog.schema.ll.json ../sample_parser/data/blog.sample.json
+
diff --git a/c_sample/README.md b/c_sample/README.md
new file mode 100644
index 00000000..5565bb3b
--- /dev/null
+++ b/c_sample/README.md
@@ -0,0 +1,22 @@
+# llguidance C++ sample
+
+This is a simple example of how to use the llguidance library in C++.
+
+It reads a Guidance grammar from a JSON file, along with text that we
+pretend the LLM has generated, and then checks that the text conforms to
+the grammar.
+
+For a real integration:
+
+- replace `bogus_tokenize()` with a real tokenizer for your LLM
+- make sure you pass your LLM's list of tokens to `create_tokenizer()`
+- for an incoming request, create a constraint based on data in the
+  request; make sure to handle errors returned by `llg_get_error()`
+- while computing logits, run `llg_compute_mask()`
+- sample with the returned mask
+- pass the sampled token to `llg_commit_token()`
+
+## TODO
+
+- [ ] extend to read JSON schema
+- [ ] extend to allow a simple regex as constraint
diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp
new file mode 100644
index 00000000..cf525183
--- /dev/null
+++ b/c_sample/c_sample.cpp
@@ -0,0 +1,152 @@
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "llguidance.h"
+
+// Create an LlgTokenizer; tokens[token_id] is the byte sequence corresponding
+// to the given token_id; see below for tokenize_fn
+LlgTokenizer *create_tokenizer(std::vector<std::vector<uint8_t>> &tokens,
+                               uint32_t tok_eos, LlgTokenizeFn tokenize_fn,
+                               const void *tokenize_user_data) {
+  auto token_lens = new uint32_t[tokens.size()];
+  size_t total_size = 0;
+  for (size_t i = 0; i < tokens.size(); i++) {
+    token_lens[i] = tokens[i].size();
+    total_size += token_lens[i];
+  }
+  auto token_bytes = new uint8_t[total_size];
+  size_t offset = 0;
+  for (size_t i = 0; i < tokens.size(); i++) {
+    memcpy(token_bytes + offset, tokens[i].data(), token_lens[i]);
+    offset += token_lens[i];
+  }
+  LlgTokenizerInit tok_init = {
+      .vocab_size = (uint32_t)tokens.size(),
+      .tok_eos = tok_eos,
+      .token_lens = token_lens,
+      .token_bytes = token_bytes,
+      .tokenize_assumes_string = false,
+      .tokenize_fn = tokenize_fn,
+      .tokenize_user_data = tokenize_user_data,
+  };
+  return llg_new_tokenizer(&tok_init);
+}
+
+// This function assumes that each byte is a single token.
+// You will want to replace this. It has to be thread-safe!
+std::vector<uint32_t> bogus_tokenize(const uint8_t *bytes_ptr, size_t nbytes) {
+  std::vector<uint32_t> token_ids;
+  for (size_t i = 0; i < nbytes; i++) {
+    token_ids.push_back(bytes_ptr[i]);
+  }
+  return token_ids;
+}
+
+// This wraps the C++-style bogus_tokenize() in the signature llg expects.
+size_t tokenize_callback(const void *user_data, const uint8_t *bytes,
+                         size_t bytes_len, uint32_t *output_tokens,
+                         size_t output_tokens_len) {
+  (void)user_data;
+  auto tokens = bogus_tokenize(bytes, bytes_len);
+  if (output_tokens_len > 0) {
+    memcpy(output_tokens, tokens.data(),
+           std::min(output_tokens_len, tokens.size()) * sizeof(uint32_t));
+  }
+  return tokens.size();
+}
+
+// This creates a tokenizer that treats each byte as a token.
+LlgTokenizer *create_byte_tokenizer(void) {
+  std::vector<std::vector<uint8_t>> tokens;
+  // every byte is a token
+  for (size_t i = 0; i < 256; i++) {
+    tokens.push_back({(uint8_t)i});
+  }
+  // plus one artificial end-of-sequence token
+  const char *eos = "<EOS>";
+  tokens.push_back(std::vector<uint8_t>(eos, eos + strlen(eos)));
+  return create_tokenizer(tokens, tokens.size() - 1, tokenize_callback,
+                          nullptr);
+}
+
+std::string read_file(const std::string &filePath) {
+  std::ifstream file(filePath);
+  std::stringstream buffer;
+  buffer << file.rdbuf();
+  return buffer.str();
+}
+
+void fail_constraint(LlgConstraint *c) {
+  printf("Error: %s\n", llg_get_error(c));
+  llg_free_constraint(c);
+  exit(1);
+}
+
+int main(int argc, const char *argv[]) {
+  // the tokenizer can (and should) be shared between constraints
+  LlgTokenizer *tokenizer = create_byte_tokenizer();
+
+  if (argc != 3) {
+    printf("Usage: %s <schema.ll.json> <sample.json>\n", argv[0]);
+    return 1;
+  }
+
+  auto schema_json = read_file(argv[1]);
+  auto sample_json = read_file(argv[2]);
+
+  LlgConstraintInit init;
+  llg_constraint_init_set_defaults(&init, tokenizer);
+  init.log_stderr_level = 0; // the default is 1 (warnings only)
+
+  LlgConstraint *c = llg_new_constraint(&init, schema_json.c_str());
+  // this is a very common place for errors - for example, the schema
+  // may be invalid
+  if (llg_get_error(c)) {
+    fail_constraint(c);
+  }
+
+  // we assume our "LLM" will generate these tokens
+  auto tokens =
+      bogus_tokenize((const uint8_t *)sample_json.c_str(), sample_json.size());
+
+  LlgMaskResult mask_res;
+  for (size_t i = 0; i < tokens.size(); i++) {
+    // compute the mask - this can be done in parallel with logit generation
+    if (llg_compute_mask(c, &mask_res) != 0) {
+      fail_constraint(c);
+    }
+
+    // here we would normally sample, constrained to mask_res.sample_mask
+    // and using mask_res.temperature
+    uint32_t token = tokens[i];
+
+    // make sure the token is allowed by the mask
+    assert(mask_res.sample_mask[token / 32] & (1 << (token % 32)));
+
+    // commit the token; if "ff_tokens" are enabled, this can return more
+    // than one token, to fast-forward
+    LlgCommitResult commit_res;
+    if (llg_commit_token(c, tokens[i], &commit_res) != 0) {
+      fail_constraint(c);
+    }
+
+    // we didn't enable ff_tokens, so exactly the token we passed should be
+    // returned
+    assert(commit_res.n_tokens == 1);
+    assert(commit_res.tokens[0] == token);
+  }
+
+  if (llg_compute_mask(c, &mask_res) != 0) {
+    fail_constraint(c);
+  }
+  // we expect the constraint to force EOS at the end of the input
+  assert(mask_res.is_stop);
+
+  printf("OK!\n");
+  return 0;
+}
diff --git a/parser/llguidance.h b/parser/llguidance.h
index 507b9185..3fb1191d 100644
--- a/parser/llguidance.h
+++ b/parser/llguidance.h
@@ -112,8 +112,10 @@ typedef struct LlgCommitResult {
  * Tokenization function
  * Will not write more than output_tokens_len tokens (which can be 0)
  * Returns the total number of tokens (which can be more than output_tokens_len)
+ * This function has to be thread-safe!
  */
-typedef size_t (*LlgTokenizeFn)(const uint8_t *bytes,
+typedef size_t (*LlgTokenizeFn)(const void *user_data,
+                                const uint8_t *bytes,
                                 size_t bytes_len,
                                 uint32_t *output_tokens,
                                 size_t output_tokens_len);
@@ -144,13 +146,17 @@ typedef struct LlgTokenizerInit {
    */
   bool tokenize_assumes_string;
   /**
-   * Tokenization function, see TokenizeFn docs.
+   * Tokenization function, see LlgTokenizeFn docs.
    * It should only tokenize the bytes and not add
    * any <BOS> etc. It should also work on any byte sequence, including
    * invalid UTF-8. If this is not the case, set tokenize_assumes_string to true.
   * Either way, this function has to be thread-safe!
    */
   LlgTokenizeFn tokenize_fn;
+  /**
+   * User data to pass to the tokenize_fn
+   */
+  const void *tokenize_user_data;
 } LlgTokenizerInit;
 
 #ifdef __cplusplus
@@ -163,7 +169,8 @@ extern "C" {
  * and all logging to the buffer (get with llg_flush_logs()).
  * You need to set the tokenizer field manually.
  */
-void llg_constraint_init_set_defaults(struct LlgConstraintInit *init);
+void llg_constraint_init_set_defaults(struct LlgConstraintInit *init,
+                                      const struct LlgTokenizer *tokenizer);
 
 /**
  * Create a new constraint from a grammar JSON string
diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs
index 061c2fbb..2fdda5e5 100644
--- a/parser/src/ffi.rs
+++ b/parser/src/ffi.rs
@@ -1,9 +1,9 @@
 use std::{
-    ffi::{c_char, CStr},
+    ffi::{c_char, c_void, CStr},
     sync::Arc,
 };
 
-use anyhow::Result;
+use anyhow::{bail, Result};
 use toktrie::{InferenceCapabilities, TokEnv, TokRxInfo, TokTrie, TokenizerEnv};
 
 use crate::{
@@ -14,17 +14,32 @@ use crate::{
 struct CTokenizerInner {
     trie: TokTrie,
     tokenize_fn: LlgTokenizeFn,
+    tokenize_user_data: *const c_void,
     tokenize_assumes_string: bool,
 }
+unsafe impl Send for CTokenizerInner {}
+unsafe impl Sync for CTokenizerInner {}
 
 impl CTokenizerInner {
     fn raw_tokenize(&self, s: &[u8]) -> Vec<u32> {
         let mut res_toks = vec![0; s.len() / 4 + 5];
-        let n_toks = (self.tokenize_fn)(s.as_ptr(), s.len(), res_toks.as_mut_ptr(), res_toks.len());
+        let n_toks = (self.tokenize_fn)(
+            self.tokenize_user_data,
+            s.as_ptr(),
+            s.len(),
+            res_toks.as_mut_ptr(),
+            res_toks.len(),
+        );
 
         if n_toks > res_toks.len() {
             res_toks.resize(n_toks, 0);
-            (self.tokenize_fn)(s.as_ptr(), s.len(), res_toks.as_mut_ptr(), res_toks.len());
+            (self.tokenize_fn)(
+                self.tokenize_user_data,
+                s.as_ptr(),
+                s.len(),
+                res_toks.as_mut_ptr(),
+                res_toks.len(),
+            );
         }
 
         res_toks.truncate(n_toks);
@@ -77,6 +92,7 @@ impl LlgTokenizer {
                 trie,
                 tokenize_assumes_string: init.tokenize_assumes_string,
                 tokenize_fn: init.tokenize_fn,
+                tokenize_user_data: init.tokenize_user_data,
             }),
         }
     }
@@ -91,7 +107,9 @@ pub type LlgToken = u32;
 /// Tokenization function
 /// Will not write more than output_tokens_len tokens (which can be 0)
 /// Returns the total number of tokens (which can be more than output_tokens_len)
+/// This function has to be thread-safe!
 pub type LlgTokenizeFn = extern "C" fn(
+    user_data: *const c_void,
     bytes: *const u8,
     bytes_len: usize,
     output_tokens: *mut u32,
@@ -119,12 +137,15 @@ pub struct LlgTokenizerInit {
     /// TODO: the bit not implemented yet
     pub tokenize_assumes_string: bool,
 
-    /// Tokenization function, see TokenizeFn docs.
+    /// Tokenization function, see LlgTokenizeFn docs.
     /// It should only tokenize the bytes and not add
     /// any <BOS> etc. It should also work on any byte sequence, including
     /// invalid UTF-8. If this is not the case, set tokenize_assumes_string to true.
     /// Either way, this function has to be thread-safe!
     pub tokenize_fn: LlgTokenizeFn,
+
+    /// User data to pass to the tokenize_fn
+    pub tokenize_user_data: *const c_void,
 }
 
 #[repr(C)]
@@ -193,6 +214,10 @@ impl LlgCommitResult {
 }
 
 fn new_constraint(init: &LlgConstraintInit, grammar_json: *const c_char) -> Result<Constraint> {
+    if init.tokenizer.is_null() {
+        bail!("Tokenizer is null");
+    }
+
     let grammar_json = unsafe { CStr::from_ptr(grammar_json) }
         .to_str()
         .map_err(|_| anyhow::anyhow!("Invalid UTF-8 in grammar_json"))?;
@@ -244,10 +269,13 @@ impl LlgConstraint {
 /// and all logging to the buffer (get with llg_flush_logs()).
 /// You need to set the tokenizer field manually.
 #[no_mangle]
-pub extern "C" fn llg_constraint_init_set_defaults(init: &mut LlgConstraintInit) {
+pub extern "C" fn llg_constraint_init_set_defaults(
+    init: &mut LlgConstraintInit,
+    tokenizer: *const LlgTokenizer,
+) {
     *init = LlgConstraintInit {
-        tokenizer: std::ptr::null(),
-        log_buffer_level: 2,
+        tokenizer,
+        log_buffer_level: 0,
         log_stderr_level: 1,
         ff_tokens_ok: false,
         backtrack_ok: false,
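
A note on the `tokenize_user_data` plumbing introduced above: the sample passes `nullptr` because `bogus_tokenize()` keeps no state, but the parameter exists so that a real integration can route the C callback to a per-instance tokenizer object instead of a global. The sketch below illustrates that pattern; `MyTokenizer`, its `encode()` method, `token_byte_table`, and `eos_id` are hypothetical placeholders standing in for your own code, while `LlgTokenizeFn`, `create_tokenizer()` (the helper from `c_sample.cpp`), and `llg_constraint_init_set_defaults()` are the pieces shown in the diff.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

#include "llguidance.h"

// Hypothetical stand-in for a real tokenizer; encode() just maps each byte
// to a token id so that this sketch is self-contained.
struct MyTokenizer {
  std::vector<uint32_t> encode(const uint8_t *bytes, size_t len) const {
    return std::vector<uint32_t>(bytes, bytes + len);
  }
};

// Matches the new LlgTokenizeFn signature: llguidance passes back whatever
// pointer was stored in LlgTokenizerInit::tokenize_user_data.
size_t my_tokenize_cb(const void *user_data, const uint8_t *bytes,
                      size_t bytes_len, uint32_t *output_tokens,
                      size_t output_tokens_len) {
  auto *tok = static_cast<const MyTokenizer *>(user_data);
  auto ids = tok->encode(bytes, bytes_len);
  if (output_tokens_len > 0) {
    memcpy(output_tokens, ids.data(),
           std::min(output_tokens_len, ids.size()) * sizeof(uint32_t));
  }
  // return the total count, which may exceed output_tokens_len
  return ids.size();
}

// Usage sketch (token table construction omitted):
//   MyTokenizer my_tok;
//   LlgTokenizer *llg_tok =
//       create_tokenizer(token_byte_table, eos_id, my_tokenize_cb, &my_tok);
//   LlgConstraintInit init;
//   llg_constraint_init_set_defaults(&init, llg_tok);
```

Whatever object is passed as `tokenize_user_data` has to outlive the `LlgTokenizer` and, per the updated docs, be safe to call from multiple threads, since llguidance hands that pointer back to the callback on every tokenization.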