lark syntax -> llguidance translator #37

Merged · 12 commits · Nov 4, 2024
Changes from all commits
1 change: 1 addition & 0 deletions parser/Cargo.toml
@@ -14,6 +14,7 @@ instant = "0.1.13"
jsonschema = { version = "0.24.0", default-features = false }
url = "2.5.2"
lazy_static = "1.5.0"
regex-syntax = "0.8.5"

[features]
default = []
16 changes: 16 additions & 0 deletions parser/llguidance.h
@@ -203,6 +203,22 @@ struct LlgConstraint *llg_new_constraint_regex(const struct LlgConstraintInit *i
struct LlgConstraint *llg_new_constraint_json(const struct LlgConstraintInit *init,
const char *json_schema);

/**
* Create a new constraint from a given lark grammar
* Always returns a non-null value. Call llg_get_error() on the result to check for errors.
*/
struct LlgConstraint *llg_new_constraint_lark(const struct LlgConstraintInit *init,
const char *lark);

/**
* Create a new constraint with specified type
* Type can be one of "regex", "json_schema" (or "json"), "lark", "llguidance" (or "guidance")
* Always returns a non-null value. Call llg_get_error() on the result to check for errors.
*/
struct LlgConstraint *llg_new_constraint_any(const struct LlgConstraintInit *init,
const char *constraint_type,
const char *data);

/**
* Get the error message from the constraint or null if there is no error.
* After it returns a non-null value, it will always return it until the constraint is freed
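For orientation, here is a minimal caller-side sketch of the new entry point. It is written in Rust (the calls map one-to-one onto the C declarations above); building a valid LlgConstraintInit is assumed rather than shown:

use std::ffi::CString;

// Assumed: `init` is a fully populated LlgConstraintInit (tokenizer set,
// limits configured). The CStrings must stay bound so the pointers remain valid.
let ctype = CString::new("lark").unwrap();
let data = CString::new("start: /[0-9]+/").unwrap();
let c = llg_new_constraint_any(&init, ctype.as_ptr(), data.as_ptr());
// The result is never null; a construction failure is surfaced via llg_get_error(c).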
16 changes: 15 additions & 1 deletion parser/src/api.rs
@@ -1,6 +1,7 @@
use std::fmt::Debug;

use serde::{Deserialize, Serialize};
use serde_json::Value;

/// This represents a collection of grammars, with a designated
/// "start" grammar at first position.
@@ -19,8 +20,19 @@ pub const DEFAULT_CONTEXTUAL: bool = true;
#[derive(Serialize, Deserialize, Clone, Default)]
pub struct GrammarWithLexer {
/// The start symbol is at nodes[0]
/// When nodes is empty, one of json_schema or lark_grammar must be set.
#[serde(default)]
pub nodes: Vec<Node>,

/// The JSON schema that the grammar should generate.
/// When this is set, nodes and rx_nodes must be empty.
pub json_schema: Option<Value>,

/// The Lark grammar that the grammar should generate.
/// When this is set, nodes and rx_nodes must be empty.
pub lark_grammar: Option<String>,

/// This is no longer used.
/// When enabled, the grammar can use `Lexeme` but not `Gen`.
/// When disabled, the grammar can use `Gen` but not `Lexeme`.
/// `String` is allowed in either case as a shorthand for either `Lexeme` or `Gen`.
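
As an illustration of the new fields, a grammar can now be shipped as a single Lark string. A minimal payload sketch (assuming the remaining GrammarWithLexer fields fall back to their serde defaults):

use serde_json::json;

// One sub-grammar given purely as Lark; nodes and rx_nodes stay empty,
// as the doc comments above require.
let payload = json!({
    "grammars": [{
        "lark_grammar": "start: \"a\" | \"b\""
    }]
});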
@@ -196,7 +208,7 @@ pub struct GenGrammarOptions {
pub max_tokens_grm: usize,
}

#[derive(Serialize, Deserialize, Clone, Debug)]
#[derive(Serialize, Deserialize, Clone, Debug, Hash, PartialEq, Eq)]
pub enum RegexNode {
/// Intersection of the regexes
And(Vec<RegexId>),
@@ -373,6 +385,8 @@ impl TopLevelGrammar {
json_allowed_escapes: None,
json_raw: None,
}],
json_schema: None,
lark_grammar: None,
greedy_lexer: true,
greedy_skip_rx: None,
contextual: None,
34 changes: 32 additions & 2 deletions parser/src/earley/from_guidance.rs
@@ -6,7 +6,8 @@ use crate::api::{
GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
DEFAULT_CONTEXTUAL,
};
use crate::{loginfo, Logger};
use crate::lark::{lark_to_llguidance, parse_lark};
use crate::{loginfo, JsonCompileOptions, Logger};
use anyhow::{bail, ensure, Result};
use derivre::{ExprRef, JsonQuoteOptions, RegexAst, RegexBuilder};
use instant::Instant;
@@ -84,8 +85,37 @@ fn map_rx_nodes(
fn grammar_from_json(
tok_env: &TokEnv,
limits: &mut ParserLimits,
input: GrammarWithLexer,
mut input: GrammarWithLexer,
) -> Result<(LexerSpec, Grammar)> {
if input.json_schema.is_some() || input.lark_grammar.is_some() {
ensure!(
input.nodes.is_empty() && input.rx_nodes.is_empty(),
"cannot have both json_schema/lark_grammar and nodes/rx_nodes"
);

let mut new_grm = if let Some(json_schema) = input.json_schema.as_ref() {
ensure!(
input.lark_grammar.is_none(),
"cannot have both json_schema and lark_grammar"
);
let opts = JsonCompileOptions { compact: false };
opts.json_to_llg_no_validate(json_schema)?
} else {
let items = parse_lark(input.lark_grammar.as_ref().unwrap())?;
lark_to_llguidance(items)?
};

let g = new_grm.grammars.pop().unwrap();

input.greedy_skip_rx = g.greedy_skip_rx;
input.nodes = g.nodes;
input.rx_nodes = g.rx_nodes;
input.contextual = g.contextual;

input.lark_grammar = None;
input.json_schema = None;
}

ensure!(input.nodes.len() > 0, "empty grammar");

let (builder, rx_nodes) = map_rx_nodes(limits, input.rx_nodes, input.allow_invalid_utf8)?;
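Both branches produce a TopLevelGrammar holding exactly one sub-grammar, which is what makes the grammars.pop().unwrap() above safe; its nodes, rx_nodes, greedy_skip_rx, and contextual flag are spliced into input in place, after which the regular node-based pipeline below runs unchanged.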
9 changes: 9 additions & 0 deletions parser/src/earley/lexerspec.rs
@@ -99,6 +99,15 @@ impl LexerSpec {
SimpleVob::alloc(self.lexemes.len())
}

pub fn all_lexemes(&self) -> SimpleVob {
let mut v = self.alloc_lexeme_set();
self.lexemes[0..self.lexemes.len() - self.num_extra_lexemes]
.iter()
.enumerate()
.for_each(|(idx, _)| v.set(idx, true));
v
}

pub fn lazy_lexemes(&self) -> SimpleVob {
let mut v = self.alloc_lexeme_set();
for (idx, lex) in self.lexemes.iter().enumerate() {
2 changes: 1 addition & 1 deletion parser/src/earley/mod.rs
@@ -1,6 +1,6 @@
mod from_guidance;
mod grammar;
mod lexer;
pub(crate) mod lexer;
mod parser;

pub mod lexerspec;
49 changes: 48 additions & 1 deletion parser/src/ffi.rs
@@ -8,6 +8,7 @@ use toktrie::{InferenceCapabilities, TokEnv, TokRxInfo, TokTrie, TokenizerEnv};

use crate::{
api::{ParserLimits, RegexNode, TopLevelGrammar},
lark::{lark_to_llguidance, parse_lark},
CommitResult, Constraint, JsonCompileOptions, Logger, TokenParser,
};

Expand Down Expand Up @@ -273,6 +274,14 @@ fn new_constraint_regex(init: &LlgConstraintInit, regex: *const c_char) -> Resul
new_constraint_core(init, grammar)
}

fn new_constraint_lark(init: &LlgConstraintInit, lark: *const c_char) -> Result<Constraint> {
let lark = unsafe { CStr::from_ptr(lark) }
.to_str()
.map_err(|_| anyhow::anyhow!("Invalid UTF-8 in lark"))?;
let grammar = lark_to_llguidance(parse_lark(lark)?)?;
new_constraint_core(init, grammar)
}

fn new_constraint_json(init: &LlgConstraintInit, json_schema: *const c_char) -> Result<Constraint> {
let json_schema = unsafe { CStr::from_ptr(json_schema) }
.to_str()
@@ -295,6 +304,23 @@ fn new_constraint(init: &LlgConstraintInit, grammar_json: *const c_char) -> Resu
new_constraint_core(init, grammar)
}

fn new_constraint_any(
init: &LlgConstraintInit,
constraint_type: *const c_char,
data: *const c_char,
) -> Result<Constraint> {
let tp = unsafe { CStr::from_ptr(constraint_type) }
.to_str()
.map_err(|_| anyhow::anyhow!("Invalid UTF-8 in constraint_type"))?;
match tp {
"regex" => new_constraint_regex(init, data),
"json" | "json_schema" => new_constraint_json(init, data),
"lark" => new_constraint_lark(init, data),
"llguidance" | "guidance" => new_constraint_lark(init, data),
_ => bail!("unknown constraint type: {tp}"),
}
}

fn new_constraint_core(init: &LlgConstraintInit, grammar: TopLevelGrammar) -> Result<Constraint> {
if init.tokenizer.is_null() {
bail!("Tokenizer is null");
@@ -400,6 +426,28 @@ pub extern "C" fn llg_new_constraint_json(
return_constraint(new_constraint_json(init, json_schema))
}

/// Create a new constraint from a given lark grammar
/// Always returns a non-null value. Call llg_get_error() on the result to check for errors.
#[no_mangle]
pub extern "C" fn llg_new_constraint_lark(
init: &LlgConstraintInit,
lark: *const c_char,
) -> *mut LlgConstraint {
return_constraint(new_constraint_lark(init, lark))
}

/// Create a new constraint with specified type
/// Type can be one of "regex", "json_schema" (or "json"), "lark", "llguidance" (or "guidance")
/// Always returns a non-null value. Call llg_get_error() on the result to check for errors.
#[no_mangle]
pub extern "C" fn llg_new_constraint_any(
init: &LlgConstraintInit,
constraint_type: *const c_char,
data: *const c_char,
) -> *mut LlgConstraint {
return_constraint(new_constraint_any(init, constraint_type, data))
}

/// Get the error message from the constraint or null if there is no error.
/// After it returns a non-null value, it will always return it until the constraint is freed
/// using llg_free_constraint() (at which point the pointer will be invalid).
@@ -532,7 +580,6 @@ pub extern "C" fn llg_stringify_tokens(
s.len() + 1
}


/// Free the tokenizer. Should *NOT* be called while there are still constraints using it.
#[no_mangle]
pub extern "C" fn llg_free_tokenizer(tok: *mut LlgTokenizer) {
92 changes: 90 additions & 2 deletions parser/src/grammar_builder.rs
@@ -2,7 +2,9 @@ use std::{collections::HashMap, sync::atomic::AtomicU32};

use anyhow::{ensure, Result};

use crate::api::{GrammarWithLexer, Node, NodeId, NodeProps, RegexSpec, TopLevelGrammar};
use crate::api::{
GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
};

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct NodeRef {
@@ -16,6 +18,81 @@ pub struct GrammarBuilder {
strings: HashMap<String, NodeRef>,
curr_grammar_id: u32,
nodes: Vec<Node>,
pub regex: RegexBuilder,
}

pub struct RegexBuilder {
node_ids: HashMap<RegexNode, RegexId>,
nodes: Vec<RegexNode>,
}

impl RegexBuilder {
pub fn new() -> Self {
Self {
nodes: vec![],
node_ids: HashMap::new(),
}
}

pub fn add_node(&mut self, node: RegexNode) -> RegexId {
if let Some(id) = self.node_ids.get(&node) {
return *id;
}
let id = RegexId(self.nodes.len());
self.nodes.push(node.clone());
self.node_ids.insert(node, id);
id
}

pub fn regex(&mut self, rx: String) -> RegexId {
self.add_node(RegexNode::Regex(rx))
}

pub fn literal(&mut self, s: String) -> RegexId {
self.add_node(RegexNode::Literal(s))
}

pub fn concat(&mut self, nodes: Vec<RegexId>) -> RegexId {
if nodes.len() == 1 {
return nodes[0];
}
if nodes.len() == 0 {
return self.add_node(RegexNode::NoMatch);
}
self.add_node(RegexNode::Concat(nodes))
}

pub fn select(&mut self, nodes: Vec<RegexId>) -> RegexId {
if nodes.len() == 1 {
return nodes[0];
}
if nodes.len() == 0 {
return self.add_node(RegexNode::NoMatch);
}
self.add_node(RegexNode::Or(nodes))
}

pub fn zero_or_more(&mut self, node: RegexId) -> RegexId {
self.repeat(node, 0, None)
}

pub fn one_or_more(&mut self, node: RegexId) -> RegexId {
self.repeat(node, 1, None)
}

pub fn optional(&mut self, node: RegexId) -> RegexId {
self.repeat(node, 0, Some(1))
}

pub fn repeat(&mut self, node: RegexId, min: u32, max: Option<u32>) -> RegexId {
self.add_node(RegexNode::Repeat(node, min, max))
}

fn finalize(&mut self) -> Vec<RegexNode> {
let r = std::mem::take(&mut self.nodes);
*self = Self::new();
r
}
}

impl GrammarBuilder {
@@ -49,6 +126,7 @@ impl GrammarBuilder {
strings: HashMap::new(),
curr_grammar_id: 0,
nodes: vec![],
regex: RegexBuilder::new(),
}
}

@@ -62,6 +140,7 @@
"no nodes added before add_grammar() or finalize()"
);
self.top_grammar.grammars.last_mut().unwrap().nodes = nodes;
self.top_grammar.grammars.last_mut().unwrap().rx_nodes = self.regex.finalize();
}
}

@@ -158,10 +237,19 @@ impl GrammarBuilder {
self.select(&[value, empty])
}

pub fn one_or_more(&mut self, elt: NodeRef) -> NodeRef {
let p = self.placeholder();
let p_elt = self.join(&[p, elt]);
let inner = self.select(&[elt, p_elt]);
self.set_placeholder(p, inner);
p
}
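
In grammar terms, one_or_more allocates a placeholder nonterminal p and ties the knot as p ::= elt | p elt, so p derives one or more repetitions of elt; zero_or_more below (fixed in this PR, per the review comment) uses the same left-recursive shape with an added empty alternative, p ::= empty | p elt.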

pub fn zero_or_more(&mut self, elt: NodeRef) -> NodeRef {
let p = self.placeholder();
let empty = self.empty();
let inner = self.select(&[empty, elt]);

[Review comment, PR author, on the removed line above] @hudson-ai be aware that this was broken; I'll be merging this initial lark stuff soon, so the fix will come in (zero_or_more was really zero_or_one)

let p_elt = self.join(&[p, elt]);
let inner = self.select(&[empty, p_elt]);
self.set_placeholder(p, inner);
p
}
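To see how these primitives compose, here is a minimal sketch (illustrative only; it uses just the builder methods introduced above) assembling the pattern [a-z][a-z0-9]*:

// Illustrative sketch: compose [a-z][a-z0-9]* from RegexBuilder primitives.
let mut rx = RegexBuilder::new();
let head = rx.regex("[a-z]".to_string());
let tail_char = rx.regex("[a-z0-9]".to_string());
let tail = rx.zero_or_more(tail_char);
let ident = rx.concat(vec![head, tail]);
// Requesting an identical pattern yields the same RegexId: add_node
// hash-conses nodes via node_ids (enabled by RegexNode's new Hash/Eq derives).
let head_again = rx.regex("[a-z]".to_string());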