forked from guidance-ai/llguidance
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathllguidance.h
366 lines (330 loc) · 11.3 KB
/
llguidance.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#ifndef LLGUIDANCE_H
#define LLGUIDANCE_H
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
/**
* Opaque constraint handle. The struct is never defined in this header,
* so it can only be used through pointers returned by the
* llg_new_constraint*() / llg_clone_constraint() functions and must be
* released with llg_free_constraint().
*/
typedef struct LlgConstraint LlgConstraint;
/**
* Opaque tokenizer handle. The struct is never defined in this header,
* so it can only be used through pointers returned by llg_new_tokenizer() /
* llg_clone_tokenizer() and must be released with llg_free_tokenizer().
*/
typedef struct LlgTokenizer LlgTokenizer;
/**
* Resource limits for the parser.
* Embedded in LlgConstraintInit as the `limits` field; there, every field
* left at 0 is replaced by its default value (listed per field below).
* The "Default:" numbers use `_` purely as a digit separator.
*/
typedef struct LlgParserLimits {
/**
* For non-ambiguous grammars, this is the maximum "branching factor" of the grammar.
* For ambiguous grammars, this limit might be hit much sooner.
* Default: 2000
*/
size_t max_items_in_row;
/**
* How much "fuel" we are willing to spend to build the initial lexer regex AST nodes.
* Default: 1_000_000
* Speed: 50k/ms
*/
uint64_t initial_lexer_fuel;
/**
* Maximum lexer fuel for the computation of one whole token mask.
* Default: 200_000
* Speed: 14k/ms
*/
uint64_t step_lexer_fuel;
/**
* Maximum number of Earley items created while computing one whole token mask.
* Default: 50_000
* Speed: 20k/ms
*/
size_t step_max_items;
/**
* Maximum number of lexer states.
* Default: 50_000
*/
size_t max_lexer_states;
/**
* Maximum size of the grammar (counted in symbols in productions).
* Default: 500_000 (a few megabytes of JSON)
*/
size_t max_grammar_size;
} LlgParserLimits;
/**
* Parameters used to create a constraint.
* Passed (by const pointer) to every llg_new_constraint*() function.
* llg_constraint_init_set_defaults() can be used to fill in default values.
*/
typedef struct LlgConstraintInit {
/**
* The tokenizer to use, created with llg_new_tokenizer()
*/
const struct LlgTokenizer *tokenizer;
/**
* The log level for the buffer that is kept inside of the constraint
* (its contents are retrieved with llg_flush_logs()).
* 0 - no logging, 1 - warnings only, 2 - info
*/
uint32_t log_buffer_level;
/**
* The log level for writing to stderr.
* NOTE(review): presumably uses the same 0/1/2 scale as log_buffer_level - confirm.
*/
uint32_t log_stderr_level;
/**
* Does the engine support fast-forward tokens?
* (Appending more than one token to output at once)
*/
bool ff_tokens_ok;
/**
* Does the engine support backtracking?
* (Removing tokens from the output)
*/
bool backtrack_ok;
/**
* The resource limits for the parser.
* Default values will be used for all fields that are 0
* (see LlgParserLimits for the per-field defaults).
*/
struct LlgParserLimits limits;
} LlgConstraintInit;
/**
* Result of llg_compute_mask(); filled in through its res_p argument
* when that function returns 0.
*/
typedef struct LlgMaskResult {
/**
* The sampling mask: one bit per vocab token.
* The pointer is only valid until the next call to any llg_*() function
* on the same constraint, so copy the mask if it is needed for longer.
* NOTE(review): the number of uint32_t words and the bit order within a
* word are not stated in this header - confirm before indexing.
*/
const uint32_t *sample_mask;
/**
* Temperature to use for sampling
*/
float temperature;
/**
* Should the sequence stop?
*/
bool is_stop;
} LlgMaskResult;
/**
* Token ID as used by this API (for example LlgTokenizerInit.tok_eos and the
* token argument of llg_commit_token()). It is a plain 32-bit unsigned integer.
*/
typedef uint32_t LlgToken;
/**
* Result of llg_commit_token(); filled in through its res_p argument
* when that function returns 0.
*/
typedef struct LlgCommitResult {
/**
* The tokens to append to the output, if any (n_tokens entries).
* The pointer is only valid until the next call to any llg_*() function
* on the same constraint, so copy the tokens if they are needed for longer.
*/
const uint32_t *tokens;
/**
* The number of tokens in the tokens array (can be 0)
*/
uint32_t n_tokens;
/**
* Should the sequence stop?
*/
bool is_stop;
} LlgCommitResult;
/**
* One work item for llg_par_compute_mask(): a constraint together with the
* caller-provided buffer that receives its mask.
*/
typedef struct LlgConstraintStep {
/**
* The constraint to compute the mask for.
*/
struct LlgConstraint *constraint;
/**
* Pointer to caller-owned memory where the mask should be written.
*/
uint32_t *mask_dest;
/**
* The length of the mask_dest array in bytes (not in uint32_t elements).
*/
size_t mask_byte_len;
} LlgConstraintStep;
/**
* Completion callback: a function which llg calls when an operation is done.
* It receives a single user_data pointer; for llg_par_compute_mask() the
* callback is passed as done_cb next to a user_data argument.
* NOTE(review): which thread invokes the callback is not stated here - confirm.
*/
typedef void (*LlgCallback)(const void *user_data);
/**
* Tokenization callback, supplied by the caller in LlgTokenizerInit.tokenize_fn.
*
* user_data          - the LlgTokenizerInit.tokenize_user_data pointer
* bytes, bytes_len   - the input to tokenize (a pointer plus a length)
* output_tokens      - destination array for the resulting token IDs
* output_tokens_len  - capacity of output_tokens, in tokens (can be 0)
*
* Must not write more than output_tokens_len tokens.
* Returns the total number of tokens, which can be more than
* output_tokens_len (i.e. the output was truncated).
* This function has to be thread-safe!
*/
typedef size_t (*LlgTokenizeFn)(const void *user_data,
const uint8_t *bytes,
size_t bytes_len,
uint32_t *output_tokens,
size_t output_tokens_len);
/**
* Parameters used to create a tokenizer with llg_new_tokenizer().
* The vocabulary is given either as token_lens + token_bytes or as
* tokenizer_json; tokenization itself is done by tokenize_fn unless
* use_approximate_greedy_tokenize_fn is set.
*/
typedef struct LlgTokenizerInit {
/**
* The number of tokens in the vocabulary
*/
uint32_t vocab_size;
/**
* The token ID for the end of sentence token
* For chat mode, set it to end-of-turn token
*/
LlgToken tok_eos;
/**
* An array of the lengths of the token strings (vocab_size elements)
*/
const uint32_t *token_lens;
/**
* A pointer to the token strings (the bytes of all tokens, back to back).
* The length of this is the sum of all token_lens
*/
const uint8_t *token_bytes;
/**
* Instead of passing token_lens and token_bytes, this can be set to
* the contents of HF tokenizer.json file.
*/
const char *tokenizer_json;
/**
* Set to true to enable a hack that works around the tokenize_fn only
* accepting valid UTF-8 strings and possibly adding <BOS> etc.
* TODO: the <BOS> bit is not implemented yet
*/
bool tokenize_assumes_string;
/**
* Tokenization function, see LlgTokenizeFn docs.
* It should only tokenize the bytes and not add
* any <BOS> etc. It should also work on any byte sequence, including
* invalid UTF-8. If this is not the case, set tokenize_assumes_string to true.
* Either way, this function has to be thread-safe!
*/
LlgTokenizeFn tokenize_fn;
/**
* Set to true to not use tokenize_fn and instead tokenize greedily,
* which is often incorrect and may reduce accuracy.
*/
bool use_approximate_greedy_tokenize_fn;
/**
* User data to pass to the tokenize_fn (as its user_data argument)
*/
const void *tokenize_user_data;
} LlgTokenizerInit;
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/**
* Set the default values for the ConstraintInit pointed to by init.
* Disables ff_tokens and backtracking, enables warnings on stderr
* and all logging to the buffer (get with llg_flush_logs()).
* NOTE(review): this comment used to say "You need to set the tokenizer field
* manually", yet the function takes a tokenizer argument - presumably it is
* stored in init->tokenizer; confirm against the implementation.
*/
void llg_constraint_init_set_defaults(struct LlgConstraintInit *init,
const struct LlgTokenizer *tokenizer);
/**
* Create a new constraint from a grammar JSON string.
* init supplies the tokenizer, logging and limit settings; grammar_json is
* the grammar as a C string.
* Always returns a non-null value, even on failure: call llg_get_error() on
* the result to check for errors. Release it with llg_free_constraint().
*/
struct LlgConstraint *llg_new_constraint(const struct LlgConstraintInit *init,
const char *grammar_json);
/**
* Create a new constraint from a given regular expression.
* init supplies the tokenizer, logging and limit settings; regex is the
* regular expression as a C string.
* Always returns a non-null value, even on failure: call llg_get_error() on
* the result to check for errors. Release it with llg_free_constraint().
*/
struct LlgConstraint *llg_new_constraint_regex(const struct LlgConstraintInit *init,
const char *regex);
/**
* Create a new constraint from a given JSON schema.
* init supplies the tokenizer, logging and limit settings; json_schema is
* the schema as a C string.
* Always returns a non-null value, even on failure: call llg_get_error() on
* the result to check for errors. Release it with llg_free_constraint().
*/
struct LlgConstraint *llg_new_constraint_json(const struct LlgConstraintInit *init,
const char *json_schema);
/**
* Create a new constraint from a given lark grammar.
* init supplies the tokenizer, logging and limit settings; lark is the
* grammar text as a C string.
* Always returns a non-null value, even on failure: call llg_get_error() on
* the result to check for errors. Release it with llg_free_constraint().
*/
struct LlgConstraint *llg_new_constraint_lark(const struct LlgConstraintInit *init,
const char *lark);
/**
* Create a new constraint with the specified type.
* constraint_type selects how data is interpreted and can be one of
* "regex", "json_schema" (or "json"), "lark", "llguidance" (or "guidance").
* data is the regex / schema / grammar text for that type.
* Always returns a non-null value, even on failure: call llg_get_error() on
* the result to check for errors. Release it with llg_free_constraint().
*/
struct LlgConstraint *llg_new_constraint_any(const struct LlgConstraintInit *init,
const char *constraint_type,
const char *data);
/**
* Get the error message from the constraint, or null if there is no error.
* Once this returns a non-null value, it keeps returning that value until
* the constraint is freed using llg_free_constraint() (at which point the
* pointer becomes invalid). The caller must not free the returned string.
*/
const char *llg_get_error(const struct LlgConstraint *cc);
/**
* Get the current temperature of the constraint.
* The value is updated by mask computation (see llg_compute_mask()).
*/
float llg_get_temperature(const struct LlgConstraint *cc);
/**
* Check if the constraint is stopped (cannot be extended further).
* Returns true when it is stopped.
*/
bool llg_is_stopped(const struct LlgConstraint *cc);
/**
* Compute the mask for the next token sampling.
* It typically takes up to a millisecond for a 100k tokenizer, so it should
* be called in the background.
* Returns 0 on success and -1 on error (use llg_get_error() to get the exact error).
* When 0 is returned, the result is written to *res_p; the sample_mask
* pointer inside it is only valid until the next llg_*() call on cc.
*/
int32_t llg_compute_mask(struct LlgConstraint *cc, struct LlgMaskResult *res_p);
/**
* Commit the token sampled with the mask returned from llg_compute_mask().
* Can be run on the critical path of sampling (it is fast).
* Returns 0 on success and -1 on error (use llg_get_error() to get the exact error).
* When 0 is returned, the result is written to *res_p; the tokens pointer
* inside it is only valid until the next llg_*() call on cc.
*/
int32_t llg_commit_token(struct LlgConstraint *cc, LlgToken token, struct LlgCommitResult *res_p);
/**
* Compute masks for several constraints in parallel.
* steps points to n_steps LlgConstraintStep entries, each naming a
* constraint and the caller-owned buffer (mask_dest / mask_byte_len) that
* receives its mask.
* done_cb is an LlgCallback ("called when an operation is done") and
* user_data is the pointer handed to it.
* NOTE(review): presumably this returns before the work finishes and done_cb
* signals completion, in which case the steps array and mask buffers must
* stay alive until then; per-step errors are presumably reported through
* llg_get_error() on each constraint - confirm both against the implementation.
*/
void llg_par_compute_mask(const struct LlgConstraintStep *steps,
size_t n_steps,
const void *user_data,
LlgCallback done_cb);
/**
* Clone the constraint and return the new handle.
* NOTE(review): presumably the clone is independent of cc and has to be
* released separately with llg_free_constraint() - confirm.
*/
struct LlgConstraint *llg_clone_constraint(const struct LlgConstraint *cc);
/**
* Construct a new tokenizer from the given TokenizerInit.
* Release the result with llg_free_tokenizer().
* NOTE(review): error_string / error_string_len look like a caller-provided
* buffer (and its size in bytes) that receives an error message, with a null
* return value signalling failure - this header does not say so; confirm
* against the implementation before relying on it.
*/
struct LlgTokenizer *llg_new_tokenizer(const struct LlgTokenizerInit *tok_init,
char *error_string,
size_t error_string_len);
/**
* Clone a tokenizer.
* This increments a reference count and does a small allocation, so it is
* cheap compared to building a tokenizer from scratch.
* NOTE(review): presumably each clone must be released with its own
* llg_free_tokenizer() call - confirm.
*/
struct LlgTokenizer *llg_clone_tokenizer(const struct LlgTokenizer *tok);
/**
* Tokenize the given bytes (bytes, bytes_len) and store the resulting tokens
* in output_tokens, which has room for output_tokens_len tokens.
* Always returns the number of tokens that would be written to output_tokens
* if output_tokens_len were large enough, so a return value greater than
* output_tokens_len means the output was truncated.
*/
size_t llg_tokenize_bytes(const struct LlgTokenizer *tok,
const uint8_t *bytes,
size_t bytes_len,
uint32_t *output_tokens,
size_t output_tokens_len);
/**
* Tokenize the given bytes (bytes, bytes_len) and store the resulting tokens
* in output_tokens, which has room for output_tokens_len tokens.
* Unlike llg_tokenize_bytes(), special tokens will be tokenized if they
* follow a 0xFF byte prefix.
* Always returns the number of tokens that would be written to output_tokens
* if output_tokens_len were large enough, so a return value greater than
* output_tokens_len means the output was truncated.
*/
size_t llg_tokenize_bytes_marker(const struct LlgTokenizer *tok,
const uint8_t *bytes,
size_t bytes_len,
uint32_t *output_tokens,
size_t output_tokens_len);
/**
* Write a string representation of the n_tokens tokens in the tokens array
* to output (capacity output_len bytes); useful for debugging.
* The output is null-terminated.
* Returns the number of bytes that would be written to output if output_len
* were large enough.
* NOTE(review): whether the returned count includes the terminating null
* byte is not stated here - confirm before sizing buffers from it.
*/
size_t llg_stringify_tokens(const struct LlgTokenizer *tok,
const uint32_t *tokens,
size_t n_tokens,
char *output,
size_t output_len);
/**
* Free the tokenizer. Should *NOT* be called while there are still
* constraints using it; free those constraints first.
* The tok pointer must not be used after this call.
*/
void llg_free_tokenizer(struct LlgTokenizer *tok);
/**
* Free the constraint.
* The cc pointer must not be used after this call; this also invalidates
* the error string previously returned by llg_get_error() for it.
*/
void llg_free_constraint(struct LlgConstraint *cc);
/**
* Get the logs accumulated in the constraint since the last call to this
* function (what is logged is controlled by LlgConstraintInit.log_buffer_level).
* The logs are null-terminated.
* The returned string stays valid until the next call to this function
* or until the constraint is freed; the caller must not free it.
*/
const char *llg_flush_logs(struct LlgConstraint *cc);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif /* LLGUIDANCE_H */