Merge branch 'lark'
mmoskal committed Nov 4, 2024
2 parents 9fd2f92 + bbd8e3c commit 7663176
Showing 18 changed files with 1,427 additions and 7 deletions.
1 change: 1 addition & 0 deletions parser/Cargo.toml
@@ -14,6 +14,7 @@ instant = "0.1.13"
jsonschema = { version = "0.24.0", default-features = false }
url = "2.5.2"
lazy_static = "1.5.0"
regex-syntax = "0.8.5"

[features]
default = []
16 changes: 16 additions & 0 deletions parser/llguidance.h
@@ -203,6 +203,22 @@ struct LlgConstraint *llg_new_constraint_regex(const struct LlgConstraintInit *init,
struct LlgConstraint *llg_new_constraint_json(const struct LlgConstraintInit *init,
const char *json_schema);

/**
* Create a new constraint from a given lark grammar
* Always returns a non-null value. Call llg_get_error() on the result to check for errors.
*/
struct LlgConstraint *llg_new_constraint_lark(const struct LlgConstraintInit *init,
const char *lark);

/**
* Create a new constraint with specified type
* Type can be one of "regex", "json_schema" (or "json"), "lark", "llguidance" (or "guidance")
* Always returns a non-null value. Call llg_get_error() on the result to check for errors.
*/
struct LlgConstraint *llg_new_constraint_any(const struct LlgConstraintInit *init,
const char *constraint_type,
const char *data);

/**
* Get the error message from the constraint or null if there is no error.
* After it returns a non-null value, it will always return it until the constraint is freed
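A minimal Rust-side sketch of driving the new llg_new_constraint_any entry point across this C ABI, mirroring what a C client would do. The extern declarations follow the header above; the opaque struct stubs and the llg_get_error signature are assumptions based on the surrounding declarations, and constructing a valid LlgConstraintInit is elided.

use std::ffi::{CStr, CString};
use std::os::raw::c_char;

// Opaque stand-ins for the C structs (assumed layout-opaque handles).
#[repr(C)]
pub struct LlgConstraintInit { _private: [u8; 0] }
#[repr(C)]
pub struct LlgConstraint { _private: [u8; 0] }

extern "C" {
    fn llg_new_constraint_any(
        init: *const LlgConstraintInit,
        constraint_type: *const c_char,
        data: *const c_char,
    ) -> *mut LlgConstraint;
    // Assumed signature, per the doc comment further down this header.
    fn llg_get_error(cc: *const LlgConstraint) -> *const c_char;
}

unsafe fn constraint_from_lark(
    init: *const LlgConstraintInit,
    lark_src: &str,
) -> Result<*mut LlgConstraint, String> {
    let tp = CString::new("lark").unwrap();
    let data = CString::new(lark_src).unwrap();
    // Always returns a non-null handle; failure is signaled via llg_get_error().
    let cc = llg_new_constraint_any(init, tp.as_ptr(), data.as_ptr());
    let err = llg_get_error(cc);
    if err.is_null() {
        Ok(cc)
    } else {
        // A real caller would also llg_free_constraint() the handle here.
        Err(CStr::from_ptr(err).to_string_lossy().into_owned())
    }
}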
16 changes: 15 additions & 1 deletion parser/src/api.rs
@@ -1,6 +1,7 @@
use std::fmt::Debug;

use serde::{Deserialize, Serialize};
use serde_json::Value;

/// This represents a collection of grammars, with a designated
/// "start" grammar at first position.
@@ -19,8 +20,19 @@ pub const DEFAULT_CONTEXTUAL: bool = true;
#[derive(Serialize, Deserialize, Clone, Default)]
pub struct GrammarWithLexer {
/// The start symbol is at nodes[0]
/// When nodes is empty, then one of json_schema or lark_grammar must be set.
#[serde(default)]
pub nodes: Vec<Node>,

/// The JSON schema that the grammar should generate.
/// When this is set, nodes and rx_nodes must be empty.
pub json_schema: Option<Value>,

/// The Lark grammar that the grammar should generate.
/// When this is set, nodes and rx_nodes must be empty.
pub lark_grammar: Option<String>,

/// This is no longer used.
/// When enabled, the grammar can use `Lexeme` but not `Gen`.
/// When disabled, the grammar can use `Gen` but not `Lexeme`.
/// `String` is allowed in either case as a shorthand for either `Lexeme` or `Gen`.
@@ -196,7 +208,7 @@ pub struct GenGrammarOptions {
pub max_tokens_grm: usize,
}

#[derive(Serialize, Deserialize, Clone, Debug)]
#[derive(Serialize, Deserialize, Clone, Debug, Hash, PartialEq, Eq)]
pub enum RegexNode {
/// Intersection of the regexes
And(Vec<RegexId>),
@@ -373,6 +385,8 @@ impl TopLevelGrammar {
json_allowed_escapes: None,
json_raw: None,
}],
json_schema: None,
lark_grammar: None,
greedy_lexer: true,
greedy_skip_rx: None,
contextual: None,
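For illustration, a sketch of the serialized shape these new fields enable: a top-level grammar whose single GrammarWithLexer carries only a Lark grammar, leaving nodes/rx_nodes empty. The "grammars" wrapper matches the TopLevelGrammar usage in from_guidance.rs below; the Lark text itself is made up.

use serde_json::json;

fn lark_payload() -> serde_json::Value {
    json!({
        "grammars": [{
            // nodes/rx_nodes are omitted (they default to empty);
            // the Lark text is lowered to nodes at construction time.
            "lark_grammar": "start: \"yes\" | \"no\""
        }]
    })
}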
34 changes: 32 additions & 2 deletions parser/src/earley/from_guidance.rs
@@ -6,7 +6,8 @@ use crate::api::{
GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
DEFAULT_CONTEXTUAL,
};
use crate::{loginfo, Logger};
use crate::lark::{lark_to_llguidance, parse_lark};
use crate::{loginfo, JsonCompileOptions, Logger};
use anyhow::{bail, ensure, Result};
use derivre::{ExprRef, JsonQuoteOptions, RegexAst, RegexBuilder};
use instant::Instant;
@@ -84,8 +85,37 @@ fn map_rx_nodes(
fn grammar_from_json(
tok_env: &TokEnv,
limits: &mut ParserLimits,
input: GrammarWithLexer,
mut input: GrammarWithLexer,
) -> Result<(LexerSpec, Grammar)> {
if input.json_schema.is_some() || input.lark_grammar.is_some() {
ensure!(
input.nodes.is_empty() && input.rx_nodes.is_empty(),
"cannot have both json_schema/lark_grammar and nodes/rx_nodes"
);

let mut new_grm = if let Some(json_schema) = input.json_schema.as_ref() {
ensure!(
input.lark_grammar.is_none(),
"cannot have both json_schema and lark_grammar"
);
let opts = JsonCompileOptions { compact: false };
opts.json_to_llg_no_validate(json_schema)?
} else {
let items = parse_lark(input.lark_grammar.as_ref().unwrap())?;
lark_to_llguidance(items)?
};

let g = new_grm.grammars.pop().unwrap();

input.greedy_skip_rx = g.greedy_skip_rx;
input.nodes = g.nodes;
input.rx_nodes = g.rx_nodes;
input.contextual = g.contextual;

input.lark_grammar = None;
input.json_schema = None;
}

ensure!(input.nodes.len() > 0, "empty grammar");

let (builder, rx_nodes) = map_rx_nodes(limits, input.rx_nodes, input.allow_invalid_utf8)?;
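The Lark branch above reduces to a two-step conversion; a standalone sketch, assuming only the parse_lark/lark_to_llguidance signatures used in this hunk:

use crate::api::TopLevelGrammar;
use crate::lark::{lark_to_llguidance, parse_lark};
use anyhow::Result;

fn lark_to_grammar(src: &str) -> Result<TopLevelGrammar> {
    let items = parse_lark(src)?; // lex/parse the Lark source
    lark_to_llguidance(items)     // lower the parsed items to llguidance grammars
}

grammar_from_json then pops the resulting single grammar and splices its nodes, rx_nodes, greedy_skip_rx, and contextual settings back into the input GrammarWithLexer before proceeding down the ordinary path.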
9 changes: 9 additions & 0 deletions parser/src/earley/lexerspec.rs
@@ -99,6 +99,15 @@ impl LexerSpec {
SimpleVob::alloc(self.lexemes.len())
}

pub fn all_lexemes(&self) -> SimpleVob {
let mut v = self.alloc_lexeme_set();
self.lexemes[0..self.lexemes.len() - self.num_extra_lexemes]
.iter()
.enumerate()
.for_each(|(idx, _)| v.set(idx, true));
v
}

pub fn lazy_lexemes(&self) -> SimpleVob {
let mut v = self.alloc_lexeme_set();
for (idx, lex) in self.lexemes.iter().enumerate() {
2 changes: 1 addition & 1 deletion parser/src/earley/mod.rs
@@ -1,6 +1,6 @@
mod from_guidance;
mod grammar;
mod lexer;
pub(crate) mod lexer;
mod parser;

pub mod lexerspec;
49 changes: 48 additions & 1 deletion parser/src/ffi.rs
@@ -8,6 +8,7 @@ use toktrie::{InferenceCapabilities, TokEnv, TokRxInfo, TokTrie, TokenizerEnv};

use crate::{
api::{ParserLimits, RegexNode, TopLevelGrammar},
lark::{lark_to_llguidance, parse_lark},
CommitResult, Constraint, JsonCompileOptions, Logger, TokenParser,
};

@@ -273,6 +274,14 @@ fn new_constraint_regex(init: &LlgConstraintInit, regex: *const c_char) -> Result<Constraint> {
new_constraint_core(init, grammar)
}

fn new_constraint_lark(init: &LlgConstraintInit, lark: *const c_char) -> Result<Constraint> {
let lark = unsafe { CStr::from_ptr(lark) }
.to_str()
.map_err(|_| anyhow::anyhow!("Invalid UTF-8 in lark"))?;
let grammar = lark_to_llguidance(parse_lark(lark)?)?;
new_constraint_core(init, grammar)
}

fn new_constraint_json(init: &LlgConstraintInit, json_schema: *const c_char) -> Result<Constraint> {
let json_schema = unsafe { CStr::from_ptr(json_schema) }
.to_str()
@@ -295,6 +304,23 @@ fn new_constraint(init: &LlgConstraintInit, grammar_json: *const c_char) -> Result<Constraint> {
new_constraint_core(init, grammar)
}

fn new_constraint_any(
init: &LlgConstraintInit,
constraint_type: *const c_char,
data: *const c_char,
) -> Result<Constraint> {
let tp = unsafe { CStr::from_ptr(constraint_type) }
.to_str()
.map_err(|_| anyhow::anyhow!("Invalid UTF-8 in constraint_type"))?;
match tp {
"regex" => new_constraint_regex(init, data),
"json" | "json_schema" => new_constraint_json(init, data),
"lark" => new_constraint_lark(init, data),
"llguidance" | "guidance" => new_constraint_lark(init, data),
_ => bail!("unknown constraint type: {tp}"),
}
}

fn new_constraint_core(init: &LlgConstraintInit, grammar: TopLevelGrammar) -> Result<Constraint> {
if init.tokenizer.is_null() {
bail!("Tokenizer is null");
Expand Down Expand Up @@ -400,6 +426,28 @@ pub extern "C" fn llg_new_constraint_json(
return_constraint(new_constraint_json(init, json_schema))
}

/// Create a new constraint from a given lark grammar
/// Always returns a non-null value. Call llg_get_error() on the result to check for errors.
#[no_mangle]
pub extern "C" fn llg_new_constraint_lark(
init: &LlgConstraintInit,
lark: *const c_char,
) -> *mut LlgConstraint {
return_constraint(new_constraint_lark(init, lark))
}

/// Create a new constraint with specified type
/// Type can be one of "regex", "json_schema" (or "json"), "lark", "llguidance" (or "guidance")
/// Always returns a non-null value. Call llg_get_error() on the result to check for errors.
#[no_mangle]
pub extern "C" fn llg_new_constraint_any(
init: &LlgConstraintInit,
constraint_type: *const c_char,
data: *const c_char,
) -> *mut LlgConstraint {
return_constraint(new_constraint_any(init, constraint_type, data))
}

/// Get the error message from the constraint or null if there is no error.
/// After it returns a non-null value, it will always return it until the constraint is freed
/// using llg_free_constraint() (at which point the pointer will be invalid).
@@ -532,7 +580,6 @@ pub extern "C" fn llg_stringify_tokens(
s.len() + 1
}


/// Free the tokenizer. Should *NOT* be called while there are still constraints using it.
#[no_mangle]
pub extern "C" fn llg_free_tokenizer(tok: *mut LlgTokenizer) {
92 changes: 90 additions & 2 deletions parser/src/grammar_builder.rs
@@ -2,7 +2,9 @@ use std::{collections::HashMap, sync::atomic::AtomicU32};

use anyhow::{ensure, Result};

use crate::api::{GrammarWithLexer, Node, NodeId, NodeProps, RegexSpec, TopLevelGrammar};
use crate::api::{
GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
};

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct NodeRef {
@@ -16,6 +18,81 @@ pub struct GrammarBuilder {
strings: HashMap<String, NodeRef>,
curr_grammar_id: u32,
nodes: Vec<Node>,
pub regex: RegexBuilder,
}

pub struct RegexBuilder {
node_ids: HashMap<RegexNode, RegexId>,
nodes: Vec<RegexNode>,
}

impl RegexBuilder {
pub fn new() -> Self {
Self {
nodes: vec![],
node_ids: HashMap::new(),
}
}

pub fn add_node(&mut self, node: RegexNode) -> RegexId {
if let Some(id) = self.node_ids.get(&node) {
return *id;
}
let id = RegexId(self.nodes.len());
self.nodes.push(node.clone());
self.node_ids.insert(node, id);
id
}

pub fn regex(&mut self, rx: String) -> RegexId {
self.add_node(RegexNode::Regex(rx))
}

pub fn literal(&mut self, s: String) -> RegexId {
self.add_node(RegexNode::Literal(s))
}

pub fn concat(&mut self, nodes: Vec<RegexId>) -> RegexId {
if nodes.len() == 1 {
return nodes[0];
}
if nodes.len() == 0 {
return self.add_node(RegexNode::NoMatch);
}
self.add_node(RegexNode::Concat(nodes))
}

pub fn select(&mut self, nodes: Vec<RegexId>) -> RegexId {
if nodes.len() == 1 {
return nodes[0];
}
if nodes.len() == 0 {
return self.add_node(RegexNode::NoMatch);
}
self.add_node(RegexNode::Or(nodes))
}

pub fn zero_or_more(&mut self, node: RegexId) -> RegexId {
self.repeat(node, 0, None)
}

pub fn one_or_more(&mut self, node: RegexId) -> RegexId {
self.repeat(node, 1, None)
}

pub fn optional(&mut self, node: RegexId) -> RegexId {
self.repeat(node, 0, Some(1))
}

pub fn repeat(&mut self, node: RegexId, min: u32, max: Option<u32>) -> RegexId {
self.add_node(RegexNode::Repeat(node, min, max))
}

fn finalize(&mut self) -> Vec<RegexNode> {
let r = std::mem::take(&mut self.nodes);
*self = Self::new();
r
}
}

impl GrammarBuilder {
@@ -49,6 +126,7 @@ impl GrammarBuilder {
strings: HashMap::new(),
curr_grammar_id: 0,
nodes: vec![],
regex: RegexBuilder::new(),
}
}

Expand All @@ -62,6 +140,7 @@ impl GrammarBuilder {
"no nodes added before add_grammar() or finalize()"
);
self.top_grammar.grammars.last_mut().unwrap().nodes = nodes;
self.top_grammar.grammars.last_mut().unwrap().rx_nodes = self.regex.finalize();
}
}

Expand Down Expand Up @@ -158,10 +237,19 @@ impl GrammarBuilder {
self.select(&[value, empty])
}

pub fn one_or_more(&mut self, elt: NodeRef) -> NodeRef {
let p = self.placeholder();
let p_elt = self.join(&[p, elt]);
let inner = self.select(&[elt, p_elt]);
self.set_placeholder(p, inner);
p
}

pub fn zero_or_more(&mut self, elt: NodeRef) -> NodeRef {
let p = self.placeholder();
let empty = self.empty();
let inner = self.select(&[empty, elt]);
let p_elt = self.join(&[p, elt]);
let inner = self.select(&[empty, p_elt]);
self.set_placeholder(p, inner);
p
}
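Two details in this file are easy to miss. First, RegexBuilder::add_node hash-conses: the new Hash/PartialEq/Eq derives on RegexNode (see the api.rs hunk above) let identical nodes collapse to a single RegexId. Second, the zero_or_more fix routes the recursion through the placeholder (empty | p elt instead of empty | elt), turning what previously accepted at most one elt into a true Kleene star. A short usage sketch, assuming the builder methods defined above:

fn regex_builder_demo() {
    let mut rb = RegexBuilder::new();
    let a = rb.literal("ab".to_string());
    let b = rb.literal("ab".to_string());
    // a and b are the same RegexId: identical nodes are deduplicated
    // through the node_ids map.
    let word = rb.one_or_more(a); // Repeat(a, 1, None)
    let opt = rb.optional(word);  // Repeat(word, 0, Some(1))
    let alt = rb.select(vec![b, opt]); // Or([...]); 1-element inputs short-circuit
    let _ = alt;
}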
(diff truncated: the remaining 10 of the 18 changed files are not shown)
