add support for json_string in Lexeme
mmoskal committed Aug 13, 2024
1 parent 80deb17 commit b4880ed
Showing 4 changed files with 56 additions and 4 deletions.
2 changes: 1 addition & 1 deletion parser/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"

[dependencies]
toktrie = { git = "https://github.com/microsoft/toktrie", rev = "6934722328ee1d3d679f95fcd5c669d47cee08f2" }
derivre = { git = "https://github.com/microsoft/derivre", rev = "fb0ba7b6307782e0d43a0ca598b237836cb6d304" }
derivre = { git = "https://github.com/microsoft/derivre", rev = "ad363698cc95d7e63c5116aa114596f18dc79385" }
serde = { version = "1.0.192", features = ["derive"] }
serde_json = "1.0.108"
anyhow = "1.0.75"
23 changes: 23 additions & 0 deletions parser/src/api.rs
@@ -86,6 +86,23 @@ pub enum Node {
/// Override sampling temperature.
temperature: Option<f32>,

/// When set, the lexeme will be quoted as a JSON string.
/// For example, /[a-z"]+/ will be quoted as /([a-z]|\\")+/.
json_string: Option<bool>,

/// Lists the allowed escape sequences, typically one of:
/// "nrbtf\\\"u" - allow all JSON escapes, including \u00XX for control characters
///     (this is the default),
/// "nrbtf\\\"" - disallow \u00XX control-character escapes,
/// "nrt\\\"" - additionally disallow the unusual escapes \f and \b,
/// "" - disallow all escapes.
/// Note that \uXXXX escapes for non-control characters (code points above U+001F)
/// are never allowed, as such characters never need to be escaped in JSON.
json_allowed_escapes: Option<String>,

/// When set (and json_string is also set), the surrounding "..." quotes are not added around the regular expression.
json_raw: Option<bool>,

#[serde(flatten)]
props: NodeProps,
},
@@ -115,6 +132,12 @@ pub enum Node {
},
}

pub enum JsonQuoteOptions {
/// Do not allow \uXXXX escapes in strings; \n, \t, \", etc. are still allowed.
NoUnicodeEscapes,
WithUnicodeEscapes,
}

/// Optional fields allowed on any Node
#[derive(Serialize, Deserialize, Default, Clone)]
pub struct NodeProps {
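The escape-set presets described in the json_allowed_escapes doc comment above map directly onto derivre's JsonQuoteOptions, which from_guidance.rs (next file) builds from these fields. A minimal sketch of the presets, assuming only the public allowed_escapes/raw_mode fields used in this commit; the quote_opts helper is hypothetical:

    use derivre::JsonQuoteOptions;

    // Hypothetical helper, not part of this commit.
    fn quote_opts(allowed_escapes: &str, raw_mode: bool) -> JsonQuoteOptions {
        JsonQuoteOptions {
            allowed_escapes: allowed_escapes.to_string(),
            raw_mode,
        }
    }

    fn presets() -> [JsonQuoteOptions; 4] {
        [
            quote_opts("nrbtf\\\"u", false), // default: all JSON escapes, incl. \u00XX
            quote_opts("nrbtf\\\"", false),  // no \u00XX control-character escapes
            quote_opts("nrt\\\"", false),    // additionally no \f and \b
            quote_opts("", false),           // no escapes at all
        ]
    }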
24 changes: 22 additions & 2 deletions parser/src/earley/from_guidance.rs
@@ -6,8 +6,8 @@ use crate::api::{
GrammarWithLexer, Node, RegexId, RegexNode, RegexSpec, TopLevelGrammar, DEFAULT_CONTEXTUAL,
};
use crate::Logger;
use anyhow::{bail, Result};
use derivre::{ExprRef, RegexAst, RegexBuilder};
use anyhow::{bail, ensure, Result};
use derivre::{ExprRef, JsonQuoteOptions, RegexAst, RegexBuilder};

fn resolve_rx(rx_refs: &[ExprRef], node: &RegexSpec) -> Result<RegexAst> {
match node {
@@ -166,12 +166,32 @@ fn grammar_from_json(input: GrammarWithLexer) -> Result<(LexerSpec, Grammar)> {
rx,
contextual,
temperature,
json_allowed_escapes,
json_raw,
json_string,
..
} => {
let json_options = if json_string.unwrap_or(false) {
Some(JsonQuoteOptions {
allowed_escapes: json_allowed_escapes
.as_ref()
.map_or("nrbtf\\\"u", |e| e.as_str())
.to_string(),
raw_mode: json_raw.unwrap_or(false),
})
} else {
ensure!(
json_allowed_escapes.is_none(),
"json_allowed_escapes is only valid for json_string"
);
ensure!(json_raw.is_none(), "json_raw is only valid for json_string");
None
};
let idx = lexer_spec.add_greedy_lexeme(
format!("lex_{}", grm.sym_name(lhs)),
resolve_rx(&rx_nodes, rx)?,
contextual.unwrap_or(input.contextual.unwrap_or(DEFAULT_CONTEXTUAL)),
json_options,
)?;
if let Some(t) = temperature {
let symprops = grm.sym_props_mut(lhs);
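Taken together with the LexerSpec change below, the effect of json_string is: compile the lexeme regex as usual, then run it through derivre's json_quote with the options built above. A minimal end-to-end sketch, assuming derivre also exposes RegexBuilder::new() and the RegexAst::Regex variant (both outside this diff):

    use anyhow::Result;
    use derivre::{ExprRef, JsonQuoteOptions, RegexAst, RegexBuilder};

    fn quote_lexeme_regex() -> Result<ExprRef> {
        let mut builder = RegexBuilder::new();
        // The lexeme regex from the api.rs doc comment: /[a-z"]+/
        let plain = builder.mk(&RegexAst::Regex("[a-z\"]+".to_string()))?;
        // The options from_guidance.rs builds when only json_string is set.
        let opts = JsonQuoteOptions {
            allowed_escapes: "nrbtf\\\"u".to_string(),
            raw_mode: false,
        };
        // Roughly /"([a-z]|\\")+"/; with raw_mode the surrounding quotes are dropped.
        let quoted = builder.json_quote(plain, &opts)?;
        Ok(quoted)
    }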
11 changes: 10 additions & 1 deletion parser/src/earley/lexerspec.rs
@@ -1,5 +1,5 @@
use anyhow::Result;
use derivre::{ExprRef, RegexAst, RegexBuilder};
use derivre::{ExprRef, JsonQuoteOptions, RegexAst, RegexBuilder};
use std::{fmt::Debug, hash::Hash};
use toktrie::{bytes::limit_str, SimpleVob};

@@ -22,6 +22,7 @@ pub struct LexemeSpec {
ends_at_eos: bool,
lazy: bool,
contextual: bool,
json_options: Option<JsonQuoteOptions>,
}

#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
@@ -129,6 +130,11 @@ impl LexerSpec {

fn add_lexeme_spec(&mut self, mut spec: LexemeSpec) -> Result<LexemeIdx> {
let compiled = self.regex_builder.mk(&spec.rx)?;
let compiled = if let Some(ref opts) = spec.json_options {
self.regex_builder.json_quote(compiled, opts)?
} else {
compiled
};
if let Some(idx) = self
.lexemes
.iter()
@@ -152,6 +158,7 @@ impl LexerSpec {
lazy: false,
contextual: false,
ends_at_eos: false,
json_options: None,
}
}

@@ -195,11 +202,13 @@ impl LexerSpec {
name: String,
rx: RegexAst,
contextual: bool,
json_options: Option<JsonQuoteOptions>,
) -> Result<LexemeIdx> {
self.add_lexeme_spec(LexemeSpec {
name,
rx,
contextual,
json_options,
..self.empty_spec()
})
}
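Call sites of add_greedy_lexeme now pass the extra json_options argument; existing lexemes keep their behavior with None. A usage sketch, assuming LexerSpec and LexemeIdx are reachable at this module path and a LexerSpec instance already exists:

    use anyhow::Result;
    use derivre::{JsonQuoteOptions, RegexAst};
    // Assumed module path for the types changed above.
    use crate::earley::lexerspec::{LexemeIdx, LexerSpec};

    fn add_word_lexemes(spec: &mut LexerSpec) -> Result<(LexemeIdx, LexemeIdx)> {
        // Unchanged behavior: no JSON quoting.
        let plain = spec.add_greedy_lexeme(
            "lex_word".to_string(),
            RegexAst::Regex("[a-z\"]+".to_string()),
            false, // contextual
            None,
        )?;
        // Same regex, quoted as a JSON string literal.
        let quoted = spec.add_greedy_lexeme(
            "lex_word_json".to_string(),
            RegexAst::Regex("[a-z\"]+".to_string()),
            false,
            Some(JsonQuoteOptions {
                allowed_escapes: "nrbtf\\\"u".to_string(),
                raw_mode: false,
            }),
        )?;
        Ok((plain, quoted))
    }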
