From e79e53835586bb2483d2dbbc63881e0c4550e4d4 Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Mon, 25 Nov 2024 16:06:56 -0800
Subject: [PATCH 1/4] Do not treat oneOf as anyOf by default (#71)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds coerce_one_of option (false by default)

---------

Co-authored-by: Michał Moskal
---
 parser/src/json/compiler.rs | 16 +++++++++++++---
 python/llguidance/_lib.pyi  |  1 +
 rust/src/py.rs              |  7 +++++--
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/parser/src/json/compiler.rs b/parser/src/json/compiler.rs
index 30d0f0b2..fdb4763c 100644
--- a/parser/src/json/compiler.rs
+++ b/parser/src/json/compiler.rs
@@ -20,6 +20,7 @@ pub struct JsonCompileOptions {
     pub item_separator: String,
     pub key_separator: String,
     pub whitespace_flexible: bool,
+    pub coerce_one_of: bool,
 }
 
 fn json_dumps(target: &serde_json::Value) -> String {
@@ -98,6 +99,7 @@ impl Default for JsonCompileOptions {
             item_separator: ",".to_string(),
             key_separator: ":".to_string(),
             whitespace_flexible: true,
+            coerce_one_of: false,
         }
     }
 }
@@ -244,11 +246,19 @@ impl Compiler {
                 Ok(self.builder.string(if *value { "true" } else { "false" }))
             }
             Schema::AnyOf { options } => self.process_any_of(options.clone()),
-            Schema::OneOf { options } => self.process_any_of(options.clone()),
+            Schema::OneOf { options } => self.process_one_of(options.clone()),
             Schema::Ref { uri, .. } => self.get_definition(uri),
         }
     }
 
+    fn process_one_of(&mut self, options: Vec<Schema>) -> Result<NodeRef> {
+        if self.options.coerce_one_of {
+            self.process_any_of(options)
+        } else {
+            Err(anyhow!("oneOf constraints are not supported. Enable 'coerce_one_of' option to approximate oneOf with anyOf"))
+        }
+    }
+
     fn process_any_of(&mut self, options: Vec<Schema>) -> Result<NodeRef> {
         let mut nodes = vec![];
         let mut errors = vec![];
@@ -265,12 +275,12 @@ impl Compiler {
             Ok(self.builder.select(&nodes))
         } else if let Some(e) = errors.pop() {
             Err(anyhow!(UnsatisfiableSchemaError {
-                message: format!("All options in anyOf/oneOf are unsatisfiable",),
+                message: format!("All options in anyOf are unsatisfiable",),
             })
             .context(e))
         } else {
             Err(anyhow!(UnsatisfiableSchemaError {
-                message: "No options in anyOf/oneOf".to_string(),
+                message: "No options in anyOf".to_string(),
             }))
         }
     }
diff --git a/python/llguidance/_lib.pyi b/python/llguidance/_lib.pyi
index 1a369ab0..72c3d3ed 100644
--- a/python/llguidance/_lib.pyi
+++ b/python/llguidance/_lib.pyi
@@ -135,6 +135,7 @@ class JsonCompiler:
         cls,
         separators: Optional[Tuple[str, str]] = None,
         whitespace_flexible: bool = False,
+        coerce_one_of: bool = False,
     ) -> "JsonCompiler":
         """
         Create a new JSON compiler.
diff --git a/rust/src/py.rs b/rust/src/py.rs
index 397bad06..e0b67628 100644
--- a/rust/src/py.rs
+++ b/rust/src/py.rs
@@ -249,13 +249,14 @@ struct JsonCompiler {
     item_separator: String,
     key_separator: String,
     whitespace_flexible: bool,
+    coerce_one_of: bool,
 }
 
 #[pymethods]
 impl JsonCompiler {
     #[new]
-    #[pyo3(signature = (separators = None, whitespace_flexible = false))]
-    fn py_new(separators: Option<(String, String)>, whitespace_flexible: bool) -> Self {
+    #[pyo3(signature = (separators = None, whitespace_flexible = false, coerce_one_of = false))]
+    fn py_new(separators: Option<(String, String)>, whitespace_flexible: bool, coerce_one_of: bool) -> Self {
         let (item_separator, key_separator) = separators.unwrap_or_else(|| {
             if whitespace_flexible {
                 (",".to_owned(), ":".to_owned())
@@ -267,6 +268,7 @@ impl JsonCompiler {
             item_separator: item_separator,
             key_separator: key_separator,
             whitespace_flexible,
+            coerce_one_of,
         }
     }
     fn compile(&self, schema: &str) -> PyResult<String> {
@@ -275,6 +277,7 @@ impl JsonCompiler {
             item_separator: self.item_separator.clone(),
             key_separator: self.key_separator.clone(),
             whitespace_flexible: self.whitespace_flexible,
+            coerce_one_of: self.coerce_one_of,
         };
         let grammar = compile_options.json_to_llg(schema).map_err(val_error)?;
         serde_json::to_string(&grammar).map_err(val_error)

From 65cbcbe5811801dff83b852a2dc91d34e0edfc0a Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Tue, 26 Nov 2024 09:08:54 -0800
Subject: [PATCH 2/4] Cache `NodeRef`s in `grammar_builder` (#70)

Cache NodeRefs by hashing serialized grammar nodes; add hashing to
repeat function
---
 parser/src/grammar_builder.rs | 40 +++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/parser/src/grammar_builder.rs b/parser/src/grammar_builder.rs
index 1115068c..1882d76a 100644
--- a/parser/src/grammar_builder.rs
+++ b/parser/src/grammar_builder.rs
@@ -20,8 +20,11 @@ pub struct GrammarBuilder {
     placeholder: Node,
     strings: HashMap<String, NodeRef>,
     curr_grammar_id: u32,
+    node_refs: HashMap<String, NodeRef>,
     nodes: Vec<Node>,
     pub regex: RegexBuilder,
+    at_most_cache: HashMap<(NodeRef, usize), NodeRef>,
+    repeat_exact_cache: HashMap<(NodeRef, usize), NodeRef>,
 }
 
 pub struct RegexBuilder {
@@ -136,8 +139,11 @@ impl GrammarBuilder {
             },
             strings: HashMap::new(),
             curr_grammar_id: 0,
+            node_refs: HashMap::new(),
             nodes: vec![],
             regex: RegexBuilder::new(),
+            at_most_cache: HashMap::new(),
+            repeat_exact_cache: HashMap::new(),
         }
     }
 
@@ -174,11 +180,27 @@ impl GrammarBuilder {
     }
 
     pub fn add_node(&mut self, node: Node) -> NodeRef {
+        // Generate a key for the node from its serialized form if it is not the placeholder
+        let key = (node != self.placeholder).then(|| serde_json::to_string(&node).ok()).flatten();
+
+        // Return the node reference if it already exists
+        if let Some(ref key) = key {
+            if let Some(node_ref) = self.node_refs.get(key) {
+                return *node_ref;
+            }
+        }
+
+        // Create new node reference
         let r = NodeRef {
             idx: self.nodes.len(),
             grammar_id: self.curr_grammar_id,
         };
+
+        // Add the node and store the reference (if it's not the placeholder)
         self.nodes.push(node);
+        if let Some(key) = key {
+            self.node_refs.insert(key, r);
+        }
         r
     }
 
@@ -321,7 +343,10 @@ impl GrammarBuilder {
     // at_most() recursively factors the sequence into K-size pieces,
    // in an attempt to keep grammar size O(log(n)).
     fn at_most(&mut self, elt: NodeRef, n: usize) -> NodeRef {
-        if n == 0 {
+        if let Some(r) = self.at_most_cache.get(&(elt, n)) {
+            return *r;
+        }
+        let r = if n == 0 {
             // If the max ('n') is 0, an empty rule
             self.empty()
         } else if n == 1 {
@@ -378,7 +403,9 @@ impl GrammarBuilder {
         // (inclusive) in 'elt_n'. Clearly, the sequences of length at most 'n'
         // are the alternation of 'elt_max_nk' and 'elt_n'.
             self.select(&[elt_n, elt_max_nk])
-        }
+        };
+        self.at_most_cache.insert((elt, n), r);
+        r
     }
 
     // simple_repeat() "simply" repeats the element ('elt') 'n' times.
@@ -393,7 +420,10 @@ impl GrammarBuilder {
 
     // Repeat element 'elt' exactly 'n' times, using factoring
     // in an attempt to keep grammar size O(log(n)).
     fn repeat_exact(&mut self, elt: NodeRef, n: usize) -> NodeRef {
-        if n > 2 * K {
+        if let Some(r) = self.repeat_exact_cache.get(&(elt, n)) {
+            return *r;
+        }
+        let r = if n > 2 * K {
             // For large 'n', try to keep the number of rules O(log(n))
             // by "factoring" the sequence into K-sized pieces
@@ -418,7 +448,9 @@ impl GrammarBuilder {
             // For small 'n' (currently, 8 or less), simply
             // repeat 'elt' 'n' times.
             self.simple_repeat(elt, n)
-        }
+        };
+        self.repeat_exact_cache.insert((elt, n), r);
+        r
     }
 
     // at_least() accepts a sequence of at least 'n' copies of

From cf66a07e5b10eaf2b3bd553e017efa5acc22f49b Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Tue, 26 Nov 2024 09:09:41 -0800
Subject: [PATCH 3/4] Prioritize sibling keys (rather, parent schema) over
 $ref, oneOf, anyOf, allOf subschemas (#74)

---
 parser/src/json/schema.rs | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/parser/src/json/schema.rs b/parser/src/json/schema.rs
index a65b1f79..5ace7afb 100644
--- a/parser/src/json/schema.rs
+++ b/parser/src/json/schema.rs
@@ -423,7 +423,7 @@ fn compile_contents_map(ctx: &Context, mut schemadict: HashMap<&str, &Value>) ->
             .iter()
             .map(|value| compile_resource(&ctx, ctx.as_resource_ref(value)))
             .collect::<Result<Vec<_>>>()?;
-        let merged = intersect(ctx, options.into_iter().chain(vec![siblings]).collect())?;
+        let merged = intersect(ctx, vec![siblings].into_iter().chain(options.into_iter()).collect())?;
         return Ok(merged);
     }
 
@@ -439,7 +439,7 @@ fn compile_contents_map(ctx: &Context, mut schemadict: HashMap<&str, &Value>) ->
         let options = any_of
             .into_iter()
             .map(|value| compile_resource(&ctx, ctx.as_resource_ref(value)))
-            .map(|res| res.and_then(|schema| intersect_two(ctx, schema, siblings.clone())))
+            .map(|res| res.and_then(|schema| intersect_two(ctx, siblings.clone(), schema)))
             .collect::<Result<Vec<_>>>()?;
         return Ok(Schema::AnyOf { options });
     }
@@ -457,7 +457,7 @@ fn compile_contents_map(ctx: &Context, mut schemadict: HashMap<&str, &Value>) ->
         let options = one_of
             .into_iter()
             .map(|value| compile_resource(&ctx, ctx.as_resource_ref(value)))
-            .map(|res| res.and_then(|schema| intersect_two(ctx, schema, siblings.clone())))
+            .map(|res| res.and_then(|schema| intersect_two(ctx, siblings.clone(), schema)))
             .collect::<Result<Vec<_>>>()?;
         return Ok(Schema::OneOf { options });
     }
@@ -474,7 +474,7 @@ fn compile_contents_map(ctx: &Context, mut schemadict: HashMap<&str, &Value>) ->
             define_ref(ctx, &uri)?;
             return Ok(Schema::Ref { uri });
         } else {
-            return intersect_ref(ctx, &uri, siblings);
+            return intersect_ref(ctx, &uri, siblings, false);
         }
     }
 
@@ -511,7 +511,7 @@ fn define_ref(ctx: &Context, ref_uri: &str) -> Result<()> {
     Ok(())
 }
 
-fn intersect_ref(ctx: &Context, ref_uri: &str, schema: Schema) -> Result<Schema> {
+fn intersect_ref(ctx: &Context, ref_uri: &str, schema: Schema, ref_first: bool) -> Result<Schema> {
     define_ref(ctx, ref_uri)?;
     let resolved_schema = ctx
         .get_ref_cloned(ref_uri)
         .ok_or_else(|| {
             anyhow!(
                 "Reference to undefined resource: {}",
@@ -525,7 +525,11 @@ fn intersect_ref(ctx: &Context, ref_uri: &str, schema: Schema, ref_first: bool)
                 ref_uri
             )
         })?;
-    intersect_two(ctx, schema, resolved_schema)
+    if ref_first {
+        intersect_two(ctx, resolved_schema, schema)
+    } else {
+        intersect_two(ctx, schema, resolved_schema)
+    }
 }
 
 fn compile_const(instance: &Value) -> Result<Schema> {
@@ -833,8 +837,8 @@ fn intersect_two(ctx: &Context, schema0: Schema, schema1: Schema) -> Result
         (schema0, Schema::Any) => schema0,
         (Schema::Unsatisfiable { reason }, _) => Schema::Unsatisfiable { reason },
         (_, Schema::Unsatisfiable { reason }) => Schema::Unsatisfiable { reason },
-        (Schema::Ref { uri }, schema1) => intersect_ref(ctx, &uri, schema1)?,
-        (schema0, Schema::Ref { uri }) => intersect_ref(ctx, &uri, schema0)?,
+        (Schema::Ref { uri }, schema1) => intersect_ref(ctx, &uri, schema1, true)?,
+        (schema0, Schema::Ref { uri }) => intersect_ref(ctx, &uri, schema0, false)?,
         (Schema::OneOf { options }, schema1) => Schema::OneOf {
             options: options
                 .into_iter()

From ce3c760f314604a80f2c53642d3c48a8548c8a8a Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Tue, 26 Nov 2024 09:10:29 -0800
Subject: [PATCH 4/4] treat schema's own items as pseudo-prefixItems when
 merging; use closure for lazy cloning (#75)

fix some test issues with prefixItems
---
 parser/src/json/schema.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/parser/src/json/schema.rs b/parser/src/json/schema.rs
index 5ace7afb..db46245b 100644
--- a/parser/src/json/schema.rs
+++ b/parser/src/json/schema.rs
@@ -957,8 +957,8 @@ fn intersect_two(ctx: &Context, schema0: Schema, schema1: Schema) -> Result
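
Note on usage (an illustrative sketch only, not part of the patches above): a
minimal Python example of the coerce_one_of option added in PATCH 1/4,
assuming the JsonCompiler declared in python/llguidance/_lib.pyi is exposed as
llguidance.JsonCompiler:

    import json
    import llguidance

    schema = json.dumps({
        "oneOf": [
            {"type": "string"},
            {"type": "integer"},
        ]
    })

    # Default behaviour after PATCH 1/4: oneOf is rejected instead of being
    # silently treated as anyOf.
    strict = llguidance.JsonCompiler()
    try:
        strict.compile(schema)
    except Exception as e:
        print("rejected:", e)

    # Opting in approximates oneOf with anyOf (at least one branch must match,
    # rather than exactly one), which accepts a superset of what oneOf allows.
    lenient = llguidance.JsonCompiler(coerce_one_of=True)
    grammar = lenient.compile(schema)  # serialized grammar as a JSON string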