Merge branch 'lark'
mmoskal committed Nov 4, 2024
2 parents 9fd2f92 + bbd8e3c commit 7663176
Showing 18 changed files with 1,427 additions and 7 deletions.
1 change: 1 addition & 0 deletions parser/Cargo.toml
@@ -14,6 +14,7 @@ instant = "0.1.13"
jsonschema = { version = "0.24.0", default-features = false }
url = "2.5.2"
lazy_static = "1.5.0"
regex-syntax = "0.8.5"

[features]
default = []
16 changes: 16 additions & 0 deletions parser/llguidance.h
@@ -203,6 +203,22 @@ struct LlgConstraint *llg_new_constraint_regex(const struct LlgConstraintInit *init,
struct LlgConstraint *llg_new_constraint_json(const struct LlgConstraintInit *init,
const char *json_schema);

/**
* Create a new constraint from a given lark grammar
* Always returns a non-null value. Call llg_get_error() on the result to check for errors.
*/
struct LlgConstraint *llg_new_constraint_lark(const struct LlgConstraintInit *init,
const char *lark);

/**
* Create a new constraint with specified type
* Type can be one of "regex", "json_schema" (or "json"), "lark", "llguidance" (or "guidance")
* Always returns a non-null value. Call llg_get_error() on the result to check for errors.
*/
struct LlgConstraint *llg_new_constraint_any(const struct LlgConstraintInit *init,
const char *constraint_type,
const char *data);

/**
* Get the error message from the constraint or null if there is no error.
* After it returns a non-null value, it will always return it until the constraint is freed
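A minimal Rust-side sketch of driving the new llg_new_constraint_any entry point across this C ABI, mirroring what a C client would do. The extern declarations follow the header above; the opaque struct stubs and the llg_get_error signature are assumptions based on the surrounding declarations, and constructing a valid LlgConstraintInit is elided.

use std::ffi::{CStr, CString};
use std::os::raw::c_char;

// Opaque stand-ins for the C structs (assumed layout-opaque handles).
#[repr(C)]
pub struct LlgConstraintInit { _private: [u8; 0] }
#[repr(C)]
pub struct LlgConstraint { _private: [u8; 0] }

extern "C" {
    fn llg_new_constraint_any(
        init: *const LlgConstraintInit,
        constraint_type: *const c_char,
        data: *const c_char,
    ) -> *mut LlgConstraint;
    // Assumed signature, per the doc comment further down this header.
    fn llg_get_error(cc: *const LlgConstraint) -> *const c_char;
}

unsafe fn constraint_from_lark(
    init: *const LlgConstraintInit,
    lark_src: &str,
) -> Result<*mut LlgConstraint, String> {
    let tp = CString::new("lark").unwrap();
    let data = CString::new(lark_src).unwrap();
    // Always returns a non-null handle; failure is signaled via llg_get_error().
    let cc = llg_new_constraint_any(init, tp.as_ptr(), data.as_ptr());
    let err = llg_get_error(cc);
    if err.is_null() {
        Ok(cc)
    } else {
        // A real caller would also llg_free_constraint() the handle here.
        Err(CStr::from_ptr(err).to_string_lossy().into_owned())
    }
}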
16 changes: 15 additions & 1 deletion parser/src/api.rs
@@ -1,6 +1,7 @@
use std::fmt::Debug;

use serde::{Deserialize, Serialize};
use serde_json::Value;

/// This represents a collection of grammars, with a designated
/// "start" grammar at first position.
@@ -19,8 +20,19 @@ pub const DEFAULT_CONTEXTUAL: bool = true;
#[derive(Serialize, Deserialize, Clone, Default)]
pub struct GrammarWithLexer {
/// The start symbol is at nodes[0]
/// When nodes is empty, then one of json_schema or lark_grammar must be set.
#[serde(default)]
pub nodes: Vec<Node>,

/// The JSON schema that the grammar should generate.
/// When this is set, nodes and rx_nodes must be empty.
pub json_schema: Option<Value>,

/// The Lark grammar that the grammar should generate.
/// When this is set, nodes and rx_nodes must be empty.
pub lark_grammar: Option<String>,

/// This is no longer used.
/// When enabled, the grammar can use `Lexeme` but not `Gen`.
/// When disabled, the grammar can use `Gen` but not `Lexeme`.
/// `String` is allowed in either case as a shorthand for either `Lexeme` or `Gen`.
@@ -196,7 +208,7 @@ pub struct GenGrammarOptions {
pub max_tokens_grm: usize,
}

#[derive(Serialize, Deserialize, Clone, Debug)]
#[derive(Serialize, Deserialize, Clone, Debug, Hash, PartialEq, Eq)]
pub enum RegexNode {
/// Intersection of the regexes
And(Vec<RegexId>),
@@ -373,6 +385,8 @@ impl TopLevelGrammar {
json_allowed_escapes: None,
json_raw: None,
}],
json_schema: None,
lark_grammar: None,
greedy_lexer: true,
greedy_skip_rx: None,
contextual: None,
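For illustration, a sketch of the serialized shape these new fields enable: a top-level grammar whose single GrammarWithLexer carries only a Lark grammar, leaving nodes/rx_nodes empty. The "grammars" wrapper matches the TopLevelGrammar usage in from_guidance.rs below; the Lark text itself is made up.

use serde_json::json;

fn lark_payload() -> serde_json::Value {
    json!({
        "grammars": [{
            // nodes/rx_nodes are omitted (they default to empty);
            // the Lark text is lowered to nodes at construction time.
            "lark_grammar": "start: \"yes\" | \"no\""
        }]
    })
}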
34 changes: 32 additions & 2 deletions parser/src/earley/from_guidance.rs
@@ -6,7 +6,8 @@ use crate::api::{
GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
DEFAULT_CONTEXTUAL,
};
use crate::{loginfo, Logger};
use crate::lark::{lark_to_llguidance, parse_lark};
use crate::{loginfo, JsonCompileOptions, Logger};
use anyhow::{bail, ensure, Result};
use derivre::{ExprRef, JsonQuoteOptions, RegexAst, RegexBuilder};
use instant::Instant;
@@ -84,8 +85,37 @@ fn map_rx_nodes(
fn grammar_from_json(
tok_env: &TokEnv,
limits: &mut ParserLimits,
input: GrammarWithLexer,
mut input: GrammarWithLexer,
) -> Result<(LexerSpec, Grammar)> {
if input.json_schema.is_some() || input.lark_grammar.is_some() {
ensure!(
input.nodes.is_empty() && input.rx_nodes.is_empty(),
"cannot have both json_schema/lark_grammar and nodes/rx_nodes"
);

let mut new_grm = if let Some(json_schema) = input.json_schema.as_ref() {
ensure!(
input.lark_grammar.is_none(),
"cannot have both json_schema and lark_grammar"
);
let opts = JsonCompileOptions { compact: false };
opts.json_to_llg_no_validate(json_schema)?
} else {
let items = parse_lark(input.lark_grammar.as_ref().unwrap())?;
lark_to_llguidance(items)?
};

let g = new_grm.grammars.pop().unwrap();

input.greedy_skip_rx = g.greedy_skip_rx;
input.nodes = g.nodes;
input.rx_nodes = g.rx_nodes;
input.contextual = g.contextual;

input.lark_grammar = None;
input.json_schema = None;
}

ensure!(input.nodes.len() > 0, "empty grammar");

let (builder, rx_nodes) = map_rx_nodes(limits, input.rx_nodes, input.allow_invalid_utf8)?;
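The Lark branch above reduces to a two-step conversion; a standalone sketch, assuming only the parse_lark/lark_to_llguidance signatures used in this hunk:

use crate::api::TopLevelGrammar;
use crate::lark::{lark_to_llguidance, parse_lark};
use anyhow::Result;

fn lark_to_grammar(src: &str) -> Result<TopLevelGrammar> {
    let items = parse_lark(src)?; // lex/parse the Lark source
    lark_to_llguidance(items)     // lower the parsed items to llguidance grammars
}

grammar_from_json then pops the resulting single grammar and splices its nodes, rx_nodes, greedy_skip_rx, and contextual settings back into the input GrammarWithLexer before proceeding down the ordinary path.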
9 changes: 9 additions & 0 deletions parser/src/earley/lexerspec.rs
@@ -99,6 +99,15 @@ impl LexerSpec {
SimpleVob::alloc(self.lexemes.len())
}

pub fn all_lexemes(&self) -> SimpleVob {
let mut v = self.alloc_lexeme_set();
self.lexemes[0..self.lexemes.len() - self.num_extra_lexemes]
.iter()
.enumerate()
.for_each(|(idx, _)| v.set(idx, true));
v
}

pub fn lazy_lexemes(&self) -> SimpleVob {
let mut v = self.alloc_lexeme_set();
for (idx, lex) in self.lexemes.iter().enumerate() {
2 changes: 1 addition & 1 deletion parser/src/earley/mod.rs
@@ -1,6 +1,6 @@
mod from_guidance;
mod grammar;
mod lexer;
pub(crate) mod lexer;
mod parser;

pub mod lexerspec;
49 changes: 48 additions & 1 deletion parser/src/ffi.rs
@@ -8,6 +8,7 @@ use toktrie::{InferenceCapabilities, TokEnv, TokRxInfo, TokTrie, TokenizerEnv};

use crate::{
api::{ParserLimits, RegexNode, TopLevelGrammar},
lark::{lark_to_llguidance, parse_lark},
CommitResult, Constraint, JsonCompileOptions, Logger, TokenParser,
};

@@ -273,6 +274,14 @@ fn new_constraint_regex(init: &LlgConstraintInit, regex: *const c_char) -> Result<Constraint> {
new_constraint_core(init, grammar)
}

fn new_constraint_lark(init: &LlgConstraintInit, lark: *const c_char) -> Result<Constraint> {
let lark = unsafe { CStr::from_ptr(lark) }
.to_str()
.map_err(|_| anyhow::anyhow!("Invalid UTF-8 in lark"))?;
let grammar = lark_to_llguidance(parse_lark(lark)?)?;
new_constraint_core(init, grammar)
}

fn new_constraint_json(init: &LlgConstraintInit, json_schema: *const c_char) -> Result<Constraint> {
let json_schema = unsafe { CStr::from_ptr(json_schema) }
.to_str()
@@ -295,6 +304,23 @@ fn new_constraint(init: &LlgConstraintInit, grammar_json: *const c_char) -> Result<Constraint> {
new_constraint_core(init, grammar)
}

fn new_constraint_any(
init: &LlgConstraintInit,
constraint_type: *const c_char,
data: *const c_char,
) -> Result<Constraint> {
let tp = unsafe { CStr::from_ptr(constraint_type) }
.to_str()
.map_err(|_| anyhow::anyhow!("Invalid UTF-8 in constraint_type"))?;
match tp {
"regex" => new_constraint_regex(init, data),
"json" | "json_schema" => new_constraint_json(init, data),
"lark" => new_constraint_lark(init, data),
"llguidance" | "guidance" => new_constraint_lark(init, data),
_ => bail!("unknown constraint type: {tp}"),
}
}

fn new_constraint_core(init: &LlgConstraintInit, grammar: TopLevelGrammar) -> Result<Constraint> {
if init.tokenizer.is_null() {
bail!("Tokenizer is null");
Expand Down Expand Up @@ -400,6 +426,28 @@ pub extern "C" fn llg_new_constraint_json(
return_constraint(new_constraint_json(init, json_schema))
}

/// Create a new constraint from a given lark grammar
/// Always returns a non-null value. Call llg_get_error() on the result to check for errors.
#[no_mangle]
pub extern "C" fn llg_new_constraint_lark(
init: &LlgConstraintInit,
lark: *const c_char,
) -> *mut LlgConstraint {
return_constraint(new_constraint_lark(init, lark))
}

/// Create a new constraint with specified type
/// Type can be one of "regex", "json_schema" (or "json"), "lark", "llguidance" (or "guidance")
/// Always returns a non-null value. Call llg_get_error() on the result to check for errors.
#[no_mangle]
pub extern "C" fn llg_new_constraint_any(
init: &LlgConstraintInit,
constraint_type: *const c_char,
data: *const c_char,
) -> *mut LlgConstraint {
return_constraint(new_constraint_any(init, constraint_type, data))
}

/// Get the error message from the constraint or null if there is no error.
/// After it returns a non-null value, it will always return it until the constraint is freed
/// using llg_free_constraint() (at which point the pointer will be invalid).
@@ -532,7 +580,6 @@ pub extern "C" fn llg_stringify_tokens(
s.len() + 1
}


/// Free the tokenizer. Should *NOT* be called while there are still constraints using it.
#[no_mangle]
pub extern "C" fn llg_free_tokenizer(tok: *mut LlgTokenizer) {
92 changes: 90 additions & 2 deletions parser/src/grammar_builder.rs
@@ -2,7 +2,9 @@ use std::{collections::HashMap, sync::atomic::AtomicU32};

use anyhow::{ensure, Result};

use crate::api::{GrammarWithLexer, Node, NodeId, NodeProps, RegexSpec, TopLevelGrammar};
use crate::api::{
GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
};

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct NodeRef {
@@ -16,6 +18,81 @@ pub struct GrammarBuilder {
strings: HashMap<String, NodeRef>,
curr_grammar_id: u32,
nodes: Vec<Node>,
pub regex: RegexBuilder,
}

pub struct RegexBuilder {
node_ids: HashMap<RegexNode, RegexId>,
nodes: Vec<RegexNode>,
}

impl RegexBuilder {
pub fn new() -> Self {
Self {
nodes: vec![],
node_ids: HashMap::new(),
}
}

pub fn add_node(&mut self, node: RegexNode) -> RegexId {
if let Some(id) = self.node_ids.get(&node) {
return *id;
}
let id = RegexId(self.nodes.len());
self.nodes.push(node.clone());
self.node_ids.insert(node, id);
id
}

pub fn regex(&mut self, rx: String) -> RegexId {
self.add_node(RegexNode::Regex(rx))
}

pub fn literal(&mut self, s: String) -> RegexId {
self.add_node(RegexNode::Literal(s))
}

pub fn concat(&mut self, nodes: Vec<RegexId>) -> RegexId {
if nodes.len() == 1 {
return nodes[0];
}
if nodes.len() == 0 {
return self.add_node(RegexNode::NoMatch);
}
self.add_node(RegexNode::Concat(nodes))
}

pub fn select(&mut self, nodes: Vec<RegexId>) -> RegexId {
if nodes.len() == 1 {
return nodes[0];
}
if nodes.len() == 0 {
return self.add_node(RegexNode::NoMatch);
}
self.add_node(RegexNode::Or(nodes))
}

pub fn zero_or_more(&mut self, node: RegexId) -> RegexId {
self.repeat(node, 0, None)
}

pub fn one_or_more(&mut self, node: RegexId) -> RegexId {
self.repeat(node, 1, None)
}

pub fn optional(&mut self, node: RegexId) -> RegexId {
self.repeat(node, 0, Some(1))
}

pub fn repeat(&mut self, node: RegexId, min: u32, max: Option<u32>) -> RegexId {
self.add_node(RegexNode::Repeat(node, min, max))
}

fn finalize(&mut self) -> Vec<RegexNode> {
let r = std::mem::take(&mut self.nodes);
*self = Self::new();
r
}
}

impl GrammarBuilder {
@@ -49,6 +126,7 @@ impl GrammarBuilder {
strings: HashMap::new(),
curr_grammar_id: 0,
nodes: vec![],
regex: RegexBuilder::new(),
}
}

Expand All @@ -62,6 +140,7 @@ impl GrammarBuilder {
"no nodes added before add_grammar() or finalize()"
);
self.top_grammar.grammars.last_mut().unwrap().nodes = nodes;
self.top_grammar.grammars.last_mut().unwrap().rx_nodes = self.regex.finalize();
}
}

Expand Down Expand Up @@ -158,10 +237,19 @@ impl GrammarBuilder {
self.select(&[value, empty])
}

pub fn one_or_more(&mut self, elt: NodeRef) -> NodeRef {
let p = self.placeholder();
let p_elt = self.join(&[p, elt]);
let inner = self.select(&[elt, p_elt]);
self.set_placeholder(p, inner);
p
}

pub fn zero_or_more(&mut self, elt: NodeRef) -> NodeRef {
let p = self.placeholder();
let empty = self.empty();
let inner = self.select(&[empty, elt]);
let p_elt = self.join(&[p, elt]);
let inner = self.select(&[empty, p_elt]);
self.set_placeholder(p, inner);
p
}
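Two details in this file are easy to miss. First, RegexBuilder::add_node hash-conses: the new Hash/PartialEq/Eq derives on RegexNode (see the api.rs hunk above) let identical nodes collapse to a single RegexId. Second, the zero_or_more fix routes the recursion through the placeholder (empty | p elt instead of empty | elt), turning what previously accepted at most one elt into a true Kleene star. A short usage sketch, assuming the builder methods defined above:

fn regex_builder_demo() {
    let mut rb = RegexBuilder::new();
    let a = rb.literal("ab".to_string());
    let b = rb.literal("ab".to_string());
    // a and b are the same RegexId: identical nodes are deduplicated
    // through the node_ids map.
    let word = rb.one_or_more(a); // Repeat(a, 1, None)
    let opt = rb.optional(word);  // Repeat(word, 0, Some(1))
    let alt = rb.select(vec![b, opt]); // Or([...]); 1-element inputs short-circuit
    let _ = alt;
}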
(diff truncated: the remaining 10 of the 18 changed files are not shown)
