From e3f1bff9ffa03d44719fc314b8f10b57a04aff24 Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Fri, 1 Nov 2024 14:46:49 -0700
Subject: [PATCH 01/11] start on lark "importer"

---
 parser/src/earley/lark.rs      | 258 +++++++++++++++++++++++++++++++++
 parser/src/earley/lexerspec.rs |   9 ++
 parser/src/earley/mod.rs       |   1 +
 sample_parser/Cargo.toml       |   4 +
 sample_parser/data/lark.lark   |  69 +++++++++
 sample_parser/lark.sh          |   3 +
 sample_parser/src/lark_test.rs |  22 +++
 7 files changed, 366 insertions(+)
 create mode 100644 parser/src/earley/lark.rs
 create mode 100644 sample_parser/data/lark.lark
 create mode 100755 sample_parser/lark.sh
 create mode 100644 sample_parser/src/lark_test.rs

diff --git a/parser/src/earley/lark.rs b/parser/src/earley/lark.rs
new file mode 100644
index 00000000..66a55a5c
--- /dev/null
+++ b/parser/src/earley/lark.rs
@@ -0,0 +1,258 @@
+use std::collections::HashMap;
+
+use anyhow::{bail, Result};
+use derivre::{RegexAst, RegexBuilder};
+
+use crate::earley::lexerspec::LexerSpec;
+
+use super::{
+    lexer::{Lexer, LexerResult},
+    lexerspec::LexemeIdx,
+};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Token {
+    KwIgnore,
+    KwImport,
+    KwOverride,
+    KwDeclare,
+    Colon,
+    Comma,
+    Dot,
+    Percent,
+    Arrow,
+    LParen,
+    RParen,
+    LBrace,
+    RBrace,
+    LBracket,
+    RBracket,
+    Tilde,
+    Question,
+    Star,
+    Plus,
+    // regexps
+    Op,
+    String,
+    Regexp,
+    Rule,
+    Token,
+    Number,
+    Newline,
+    VBar,
+    // special
+    SKIP,
+    EOF,
+}
+
+#[derive(Debug, Clone)]
+pub struct Lexeme {
+    token: Token,
+    value: String,
+    line: usize,
+    column: usize,
+}
+
+impl Token {
+    const LITERAL_TOKENS: &'static [(Token, &'static str)] = &[
+        (Token::Arrow, "->"),
+        (Token::Colon, ":"),
+        (Token::Comma, ","),
+        (Token::Dot, "."),
+        (Token::KwDeclare, "%declare"),
+        (Token::KwIgnore, "%ignore"),
+        (Token::KwImport, "%import"),
+        (Token::KwOverride, "%override"),
+        (Token::LParen, "("),
+        (Token::RParen, ")"),
+        (Token::LBrace, "{"),
+        (Token::RBrace, "}"),
+        (Token::LBracket, "["),
+        (Token::RBracket, "]"),
+        (Token::Percent, "%"),
+        (Token::Tilde, "~"),
+        (Token::Question, "?"),
+        (Token::Star, "*"),
+        (Token::Plus, "+"),
+        (Token::VBar, "|"),
+    ];
+
+    const REGEX_TOKENS: &'static [(Token, &'static str)] = &[
+        (Token::Op, r"[+*?]"),
+        (Token::Rule, r"!?[_?]?[a-z][_a-z0-9]*"),
+        (Token::Token, r"_?[A-Z][_A-Z0-9]*"),
+        // use JSON string syntax
+        (
+            Token::String,
+            r#""(\\([\"\\\/bfnrt]|u[a-fA-F0-9]{4})|[^\"\\\x00-\x1F\x7F])*"(i|)"#,
+        ),
+        (Token::Regexp, r#"/(\\.|[^/\\])+/[imslux]*"#),
+        (Token::Number, r#"[+-]?[0-9]+"#),
+        (Token::Newline, r"(\r?\n)+[ \t]*"),
+    ];
+}
+
+pub fn lex_lark(input: &str) -> Result<Vec<Lexeme>> {
+    let builder = RegexBuilder::new();
+    let comment_or_ws = r"((#|//)[^\n]*)|[ \t]+".to_string();
+    let mut spec = LexerSpec::new(builder, RegexAst::Regex(comment_or_ws)).unwrap();
+    let mut lexeme_idx_to_token = HashMap::new();
+    lexeme_idx_to_token.insert(LexemeIdx::SKIP, Token::SKIP);
+    for (token, literal) in Token::LITERAL_TOKENS {
+        let l = spec
+            .add_simple_literal(format!("{:?}", token), *literal, false)
+            .unwrap();
+        lexeme_idx_to_token.insert(l, *token);
+    }
+    for (token, regexp) in Token::REGEX_TOKENS {
+        let l = spec
+            .add_greedy_lexeme(
+                format!("{:?}", token),
+                RegexAst::Regex(regexp.to_string()),
+                false,
+                None,
+            )
+            .unwrap();
+        lexeme_idx_to_token.insert(l, *token);
+    }
+    let mut lexer = Lexer::from(&spec, &mut Default::default()).unwrap();
+    let all_lexemes = spec.all_lexemes();
+    let state0 = lexer.start_state(&all_lexemes, None);
+    let mut line_no = 1;
+    let mut column_no = 1;
+    let mut curr_lexeme = Lexeme {
+        token: Token::EOF,
+        value: String::new(),
+        line: 1,
+        column: 1,
+    };
+    let mut state = state0;
+    let mut lexemes = Vec::new();
+    let mut start_idx = 0;
+
+    let input_bytes = input.as_bytes();
+    for idx in 0..=input_bytes.len() {
+        let mut b = b'\n';
+        let res = if idx == input_bytes.len() {
+            lexer.force_lexeme_end(state)
+        } else {
+            b = input_bytes[idx];
+            lexer.advance(state, b, false)
+        };
+
+        match res {
+            LexerResult::Error => {
+                bail!("{}({}): lexer error", line_no, column_no);
+            }
+            LexerResult::State(s, _) => {
+                state = s;
+            }
+            LexerResult::Lexeme(p) => {
+                let transition_byte = if p.byte_next_row { p.byte } else { None };
+
+                let token = lexeme_idx_to_token[&p.idx];
+                curr_lexeme.token = token;
+                let end_idx = if p.byte_next_row || p.byte.is_none() {
+                    idx
+                } else {
+                    idx + 1
+                };
+                curr_lexeme.value = input[start_idx..end_idx].to_string();
+                start_idx = end_idx;
+                println!("lex: {:?}", curr_lexeme);
+                lexemes.push(curr_lexeme.clone());
+
+                state = lexer.start_state(&all_lexemes, transition_byte);
+
+                curr_lexeme.value.clear();
+                curr_lexeme.line = line_no;
+                curr_lexeme.column = column_no;
+            }
+        }
+
+        if b == b'\n' {
+            line_no += 1;
+            column_no = 1;
+        } else {
+            column_no += 1;
+        }
+    }
+
+    Ok(lexemes)
+}
+
+pub fn test_lex_lark() {
+    lex_lark(LARK_GRAMMAR).unwrap();
+}
+
+const LARK_GRAMMAR: &str = r#"
+# Lark grammar of Lark's syntax
+# Note: Lark is not bootstrapped, its parser is implemented in load_grammar.py
+
+start: (_item? _NL)* _item?
+
+_item: rule
+     | token
+     | statement
+
+rule: RULE rule_params priority? ":" expansions
+token: TOKEN token_params priority? ":" expansions
+
+rule_params: ["{" RULE ("," RULE)* "}"]
+token_params: ["{" TOKEN ("," TOKEN)* "}"]
+
+priority: "." NUMBER
+
+statement: "%ignore" expansions -> ignore
+         | "%import" import_path ["->" name] -> import
+         | "%import" import_path name_list -> multi_import
+         | "%override" rule -> override_rule
+         | "%declare" name+ -> declare
+
+!import_path: "."? name ("." name)*
+name_list: "(" name ("," name)* ")"
+
+?expansions: alias (_VBAR alias)*
+
+?alias: expansion ["->" RULE]
+
+?expansion: expr*
+
+?expr: atom [OP | "~" NUMBER [".." NUMBER]]
+
+?atom: "(" expansions ")"
+     | "[" expansions "]" -> maybe
+     | value
+
+?value: STRING ".." STRING -> literal_range
+      | name
+      | (REGEXP | STRING) -> literal
+      | name "{" value ("," value)* "}" -> template_usage
+
+name: RULE
+    | TOKEN
+
+_VBAR: _NL? "|"
+OP: /[+*]|[?](?![a-z])/
+RULE: /!?[_?]?[a-z][_a-z0-9]*/
+TOKEN: /_?[A-Z][_A-Z0-9]*/
+STRING: _STRING "i"?
+REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/
+_NL: /(\r?\n)+\s*/
+
+_STRING_INNER: /.*?/
+_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
+"#;
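For reference, `lex_lark` above drives the derivre-based lexer one byte at a time and returns a flat list of `Lexeme`s. A minimal sketch of calling it from inside the crate (illustrative only; the inline grammar and the `demo` wrapper are made up here):

    fn demo() -> anyhow::Result<()> {
        let lexemes = lex_lark("start: TOKEN\nTOKEN: /[a-z]+/\n")?;
        for lx in &lexemes {
            println!("{:?}", lx); // Lexeme derives Debug
        }
        Ok(())
    }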
diff --git a/parser/src/earley/lexerspec.rs b/parser/src/earley/lexerspec.rs
--- a/parser/src/earley/lexerspec.rs
+++ b/parser/src/earley/lexerspec.rs
@@ ... @@
+    pub fn all_lexemes(&self) -> SimpleVob {
+        let mut v = self.alloc_lexeme_set();
+        self.lexemes[0..self.lexemes.len() - self.num_extra_lexemes]
+            .iter()
+            .enumerate()
+            .for_each(|(idx, _)| v.set(idx, true));
+        v
+    }
+
     pub fn lazy_lexemes(&self) -> SimpleVob {
         let mut v = self.alloc_lexeme_set();
         for (idx, lex) in self.lexemes.iter().enumerate() {
diff --git a/parser/src/earley/mod.rs b/parser/src/earley/mod.rs
index 93785092..e7d981c2 100644
--- a/parser/src/earley/mod.rs
+++ b/parser/src/earley/mod.rs
@@ -2,6 +2,7 @@ mod from_guidance;
 mod grammar;
 mod lexer;
 mod parser;
+pub mod lark;
 pub mod lexerspec;
 pub mod regexvec;
 
diff --git a/sample_parser/Cargo.toml b/sample_parser/Cargo.toml
index 69b47d63..3fc91353 100644
--- a/sample_parser/Cargo.toml
+++ b/sample_parser/Cargo.toml
@@ -21,3 +21,7 @@ path = "src/schema_tester.rs"
 [[bin]]
 name = "minimal"
 path = "src/minimal.rs"
+
+[[bin]]
+name = "lark_test"
+path = "src/lark_test.rs"
diff --git a/sample_parser/data/lark.lark b/sample_parser/data/lark.lark
new file mode 100644
index 00000000..fc8c81cc
--- /dev/null
+++ b/sample_parser/data/lark.lark
@@ -0,0 +1,69 @@
+# from https://github.com/lark-parser/lark/blob/master/lark/grammars/lark.lark
+# Lark grammar of Lark's syntax
+# Note: Lark is not bootstrapped, its parser is implemented in load_grammar.py
+
+start: (_item? _NL)* _item?
+
+_item: rule
+     | token
+     | statement
+
+rule: RULE rule_params priority? ":" expansions
+token: TOKEN token_params priority? ":" expansions
+
+rule_params: ["{" RULE ("," RULE)* "}"]
+token_params: ["{" TOKEN ("," TOKEN)* "}"]
+
+priority: "." NUMBER
+
+statement: "%ignore" expansions -> ignore
+         | "%import" import_path ["->" name] -> import
+         | "%import" import_path name_list -> multi_import
+         | "%override" rule -> override_rule
+         | "%declare" name+ -> declare
+
+!import_path: "."? name ("." name)*
+name_list: "(" name ("," name)* ")"
+
+?expansions: alias (_VBAR alias)*
+
+?alias: expansion ["->" RULE]
+
+?expansion: expr*
+
+?expr: atom [OP | "~" NUMBER [".." NUMBER]]
+
+?atom: "(" expansions ")"
+     | "[" expansions "]" -> maybe
+     | value
+
+?value: STRING ".." STRING -> literal_range
+      | name
+      | (REGEXP | STRING) -> literal
+      | name "{" value ("," value)* "}" -> template_usage
+
+name: RULE
+    | TOKEN
+
+_VBAR: _NL? "|"
+OP: /[+*]|[?](?![a-z])/
+RULE: /!?[_?]?[a-z][_a-z0-9]*/
+TOKEN: /_?[A-Z][_A-Z0-9]*/
+STRING: _STRING "i"?
+REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/
+_NL: /(\r?\n)+\s*/
+
+_STRING_INNER: /.*?/
+_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
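The `lark_test` binary registered above can be pointed at this grammar file; presumably the lark.sh helper wraps an invocation along these lines (hypothetical command, not taken from the patch):

    cd sample_parser && cargo run --bin lark_test data/lark.lark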
diff --git a/sample_parser/src/lark_test.rs b/sample_parser/src/lark_test.rs
new file mode 100644
--- /dev/null
+++ b/sample_parser/src/lark_test.rs
@@ -0,0 +1,22 @@
+use std::{env, fs::File, io::Read};
+
+use llguidance_parser::earley::lark::lex_lark;
+
+fn main() {
+    let args: Vec<String> = env::args().collect();
+    if args.len() != 2 {
+        eprintln!("Usage: {} <grammar-file>", args[0]);
+        std::process::exit(1);
+    }
+
+    let grammar_file = read_file_to_string(&args[1]);
+    let _tokens = lex_lark(&grammar_file).unwrap();
+}
+
+fn read_file_to_string(filename: &str) -> String {
+    let mut file = File::open(filename).expect("Unable to open file");
+    let mut content = String::new();
+    file.read_to_string(&mut content)
+        .expect("Unable to read file");
+    content
+}

From ce47dd72d747ca9dc59ee8af9b372e9c9f1184df Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Fri, 1 Nov 2024 15:23:34 -0700
Subject: [PATCH 02/11] draft parsing code

---
 parser/src/earley/lark.rs | 497 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 497 insertions(+)

diff --git a/parser/src/earley/lark.rs b/parser/src/earley/lark.rs
index 66a55a5c..d0473d28 100644
--- a/parser/src/earley/lark.rs
+++ b/parser/src/earley/lark.rs
@@ -19,6 +19,7 @@ enum Token {
     Colon,
     Comma,
     Dot,
+    DotDot,
     Percent,
     Arrow,
@@ -45,6 +46,7 @@ enum Token {
     EOF,
 }
 
+/// Represents a lexeme with its token type, value, and position.
 #[derive(Debug, Clone)]
 pub struct Lexeme {
     token: Token,
     value: String,
     line: usize,
     column: usize,
 }
 
+/// Represents an item in the grammar (rule, token, or statement).
+#[derive(Debug, Clone)]
+pub enum Item {
+    Rule(Rule),
+    Token(TokenDef),
+    Statement(Statement),
+}
+
+/// Represents a grammar rule.
+#[derive(Debug, Clone)]
+pub struct Rule {
+    pub name: String,
+    pub params: Option<RuleParams>,
+    pub priority: Option<i32>,
+    pub expansions: Expansions,
+}
+
+/// Represents a token definition.
+#[derive(Debug, Clone)]
+pub struct TokenDef {
+    pub name: String,
+    pub params: Option<TokenParams>,
+    pub priority: Option<i32>,
+    pub expansions: Expansions,
+}
+
+/// Represents different types of statements.
+#[derive(Debug, Clone)]
+pub enum Statement {
+    Ignore(Expansions),
+    Import {
+        path: ImportPath,
+        alias: Option<String>,
+    },
+    MultiImport {
+        path: ImportPath,
+        names: Vec<String>,
+    },
+    OverrideRule(Box<Rule>),
+    Declare(Vec<String>),
+}
+
+/// Represents an import path.
+#[derive(Debug, Clone)]
+pub struct ImportPath(pub Vec<String>);
+
+/// Represents parameters for a rule.
+#[derive(Debug, Clone)]
+pub struct RuleParams(pub Vec<String>);
+
+/// Represents parameters for a token.
+#[derive(Debug, Clone)]
+pub struct TokenParams(pub Vec<String>);
+
+/// Represents a list of expansions.
+#[derive(Debug, Clone)]
+pub struct Expansions(pub Vec<Alias>);
+
+/// Represents an alias in the grammar.
+#[derive(Debug, Clone)]
+pub struct Alias {
+    pub expansion: Expansion,
+    pub alias: Option<String>,
+}
+
+/// Represents an expansion consisting of expressions.
+#[derive(Debug, Clone)]
+pub struct Expansion(pub Vec<Expr>);
+
+/// Represents an expression.
+#[derive(Debug, Clone)]
+pub struct Expr {
+    pub atom: Atom,
+    pub op: Option<Op>,
+    pub range: Option<(i32, i32)>,
+}
+
+/// Represents an atom in the grammar.
+#[derive(Debug, Clone)]
+pub enum Atom {
+    Group(Expansions),
+    Maybe(Expansions),
+    Value(Value),
+}
+
+/// Represents different values in the grammar.
+#[derive(Debug, Clone)]
+pub enum Value {
+    LiteralRange(String, String),
+    Name(String),
+    Literal(String),
+    TemplateUsage { name: String, values: Vec<Value> },
+}
+
+/// Represents an operator.
+#[derive(Debug, Clone)]
+pub struct Op(pub String);
+
+/// The parser struct that holds the tokens and current position.
+pub struct Parser {
+    tokens: Vec<Lexeme>,
+    pos: usize,
+}
+
+impl Parser {
+    /// Creates a new parser instance.
+    pub fn new(tokens: Vec<Lexeme>) -> Self {
+        Parser { tokens, pos: 0 }
+    }
+
+    /// Parses the start symbol of the grammar.
+    pub fn parse_start(&mut self) -> Result<Vec<Item>> {
+        let mut items = Vec::new();
+        while !self.is_at_end() {
+            self.consume_newlines();
+            if self.is_at_end() {
+                break;
+            }
+            items.push(self.parse_item()?);
+            self.consume_newlines();
+        }
+        Ok(items)
+    }
+
+    /// Parses an item (rule, token, or statement).
+    fn parse_item(&mut self) -> Result<Item> {
+        if self.has_token(Token::Rule) {
+            Ok(Item::Rule(self.parse_rule()?))
+        } else if self.has_token(Token::Token) {
+            Ok(Item::Token(self.parse_token_def()?))
+        } else {
+            Ok(Item::Statement(self.parse_statement()?))
+        }
+    }
+
+    /// Parses a rule definition.
+    fn parse_rule(&mut self) -> Result<Rule> {
+        let name = self.expect_token(Token::Rule)?.value;
+        let params = if self.has_token(Token::LBrace) {
+            Some(self.parse_rule_params()?)
+        } else {
+            None
+        };
+        let priority = if self.has_token(Token::Dot) {
+            Some(self.parse_priority()?)
+        } else {
+            None
+        };
+        self.expect_token(Token::Colon)?;
+        let expansions = self.parse_expansions()?;
+        Ok(Rule {
+            name,
+            params,
+            priority,
+            expansions,
+        })
+    }
+
+    /// Parses a token definition.
+    fn parse_token_def(&mut self) -> Result<TokenDef> {
+        let name = self.expect_token(Token::Token)?.value;
+        let params = if self.has_token(Token::LBrace) {
+            Some(self.parse_token_params()?)
+        } else {
+            None
+        };
+        let priority = if self.has_token(Token::Dot) {
+            Some(self.parse_priority()?)
+        } else {
+            None
+        };
+        self.expect_token(Token::Colon)?;
+        let expansions = self.parse_expansions()?;
+        Ok(TokenDef {
+            name,
+            params,
+            priority,
+            expansions,
+        })
+    }
+
+    /// Parses a statement.
+    fn parse_statement(&mut self) -> Result<Statement> {
+        if self.match_token(Token::KwIgnore) {
+            let expansions = self.parse_expansions()?;
+            Ok(Statement::Ignore(expansions))
+        } else if self.match_token(Token::KwImport) {
+            let import_path = self.parse_import_path()?;
+            if self.match_token(Token::Arrow) {
+                let name = self.parse_name()?;
+                Ok(Statement::Import {
+                    path: import_path,
+                    alias: Some(name),
+                })
+            } else {
+                Ok(Statement::MultiImport {
+                    path: import_path,
+                    names: self.parse_name_list()?,
+                })
+            }
+        } else if self.match_token(Token::KwOverride) {
+            let rule = self.parse_rule()?;
+            Ok(Statement::OverrideRule(Box::new(rule)))
+        } else if self.match_token(Token::KwDeclare) {
+            let mut names = Vec::new();
+            while let Ok(name) = self.parse_name() {
+                names.push(name);
+            }
+            if names.is_empty() {
+                bail!("Expected at least one name after %declare")
+            }
+            Ok(Statement::Declare(names))
+        } else {
+            bail!("expecting rule, token or statement")
+        }
+    }
+
+    /// Parses rule parameters.
+    fn parse_rule_params(&mut self) -> Result<RuleParams> {
+        if !self.match_token(Token::LBrace) {
+            bail!("Expected '{{' in rule parameters")
+        }
+        let mut params = Vec::new();
+        let name = self.expect_token(Token::Rule)?.value;
+        params.push(name);
+        while self.match_token(Token::Comma) {
+            let name = self.expect_token(Token::Rule)?.value;
+            params.push(name);
+        }
+        self.expect_token(Token::RBrace)?;
+        Ok(RuleParams(params))
+    }
+
+    /// Parses token parameters.
+    fn parse_token_params(&mut self) -> Result<TokenParams> {
+        if !self.match_token(Token::LBrace) {
+            bail!("Expected '{{' in token parameters")
+        }
+        let mut params = Vec::new();
+        let name = self.expect_token(Token::Token)?.value;
+        params.push(name);
+        while self.match_token(Token::Comma) {
+            let name = self.expect_token(Token::Token)?.value;
+            params.push(name);
+        }
+        self.expect_token(Token::RBrace)?;
+        Ok(TokenParams(params))
+    }
+
+    /// Parses priority.
+    fn parse_priority(&mut self) -> Result<i32> {
+        if !self.match_token(Token::Dot) {
+            bail!("Expected '.' in priority")
+        }
+        let number = self.expect_token(Token::Number)?.value.parse::<i32>()?;
+        Ok(number)
+    }
+
+    /// Parses expansions.
+    fn parse_expansions(&mut self) -> Result<Expansions> {
+        let mut aliases = Vec::new();
+        aliases.push(self.parse_alias()?);
+        while self.match_vbar() {
+            aliases.push(self.parse_alias()?);
+        }
+        Ok(Expansions(aliases))
+    }
+
+    fn match_vbar(&mut self) -> bool {
+        if self.match_token(Token::VBar) {
+            return true;
+        }
+        let p0 = self.pos;
+        if self.match_token(Token::Newline) {
+            if self.match_token(Token::VBar) {
+                return true;
+            }
+        }
+        self.pos = p0;
+        false
+    }
+
+    /// Parses an alias.
+    fn parse_alias(&mut self) -> Result<Alias> {
+        let expansion = self.parse_expansion()?;
+        let alias = if self.match_token(Token::Arrow) {
+            Some(self.expect_token(Token::Rule)?.value)
+        } else {
+            None
+        };
+        Ok(Alias { expansion, alias })
+    }
+
+    /// Parses an expansion.
+    fn parse_expansion(&mut self) -> Result<Expansion> {
+        let mut exprs = Vec::new();
+        loop {
+            if self.has_token(Token::Newline)
+                || self.has_token(Token::VBar)
+                || self.has_token(Token::Arrow)
+            {
+                break;
+            }
+            exprs.push(self.parse_expr()?);
+        }
+        Ok(Expansion(exprs))
+    }
+
+    /// Parses an expression.
+    fn parse_expr(&mut self) -> Result<Expr> {
+        let atom = self.parse_atom()?;
+        let mut op = None;
+        let mut range = None;
+        if let Some(op_token) = self.match_token_with_value(Token::Op) {
+            op = Some(Op(op_token.value.clone()));
+        } else if self.match_token(Token::Tilde) {
+            let start_num = self.expect_token(Token::Number)?.value.parse::<i32>()?;
+            let end_num = if self.match_token(Token::DotDot) {
+                Some(self.expect_token(Token::Number)?.value.parse::<i32>()?)
+            } else {
+                None
+            };
+            range = Some((start_num, end_num.unwrap_or(start_num)));
+        }
+        Ok(Expr { atom, op, range })
+    }
+
+    /// Parses an atom.
+    fn parse_atom(&mut self) -> Result<Atom> {
+        if self.match_token(Token::LParen) {
+            let expansions = self.parse_expansions()?;
+            self.expect_token(Token::RParen)?;
+            Ok(Atom::Group(expansions))
+        } else if self.match_token(Token::LBracket) {
+            let expansions = self.parse_expansions()?;
+            self.expect_token(Token::RBracket)?;
+            Ok(Atom::Maybe(expansions))
+        } else {
+            Ok(Atom::Value(self.parse_value()?))
+        }
+    }
+
+    /// Parses a value.
+    fn parse_value(&mut self) -> Result<Value> {
+        if let Some(string1) = self.match_token_with_value(Token::String) {
+            if self.match_token(Token::DotDot) {
+                let string2 = self.expect_token(Token::String)?.value;
+                Ok(Value::LiteralRange(string1.value.clone(), string2))
+            } else {
+                Ok(Value::Literal(string1.value.clone()))
+            }
+        } else if let Some(regexp_token) = self.match_token_with_value(Token::Regexp) {
+            Ok(Value::Literal(regexp_token.value.clone()))
+        } else if let Some(name_token) = self
+            .match_token_with_value(Token::Rule)
+            .or_else(|| self.match_token_with_value(Token::Token))
+        {
+            if self.match_token(Token::LBrace) {
+                let mut values = Vec::new();
+                values.push(self.parse_value()?);
+                while self.match_token(Token::Comma) {
+                    values.push(self.parse_value()?);
+                }
+                self.expect_token(Token::RBrace)?;
+                Ok(Value::TemplateUsage {
+                    name: name_token.value.clone(),
+                    values,
+                })
+            } else {
+                Ok(Value::Name(name_token.value.clone()))
+            }
+        } else {
+            bail!("Expected value")
+        }
+    }
+
+    /// Parses an import path.
+    fn parse_import_path(&mut self) -> Result<ImportPath> {
+        let mut names = Vec::new();
+        if self.match_token(Token::Dot) {
+            names.push(".".to_string());
+        }
+        names.push(self.parse_name()?);
+        while self.match_token(Token::Dot) {
+            names.push(self.parse_name()?);
+        }
+        Ok(ImportPath(names))
+    }
+
+    /// Parses a name (RULE or TOKEN).
+    fn parse_name(&mut self) -> Result<String> {
+        if let Some(token) = self.match_token_with_value(Token::Rule) {
+            Ok(token.value.clone())
+        } else if let Some(token) = self.match_token_with_value(Token::Token) {
+            Ok(token.value.clone())
+        } else {
+            bail!("Expected name (RULE or TOKEN)")
+        }
+    }
+
+    /// Parses a list of names.
+    fn parse_name_list(&mut self) -> Result<Vec<String>> {
+        if !self.match_token(Token::LParen) {
+            bail!("Expected '(' in name list")
+        }
+        let mut names = Vec::new();
+        names.push(self.parse_name()?);
+        while self.match_token(Token::Comma) {
+            names.push(self.parse_name()?);
+        }
+        self.expect_token(Token::RParen)?;
+        Ok(names)
+    }
+
+    fn has_token(&self, token: Token) -> bool {
+        if let Some(lexeme) = self.peek_token() {
+            lexeme.token == token
+        } else {
+            false
+        }
+    }
+
+    /// Matches a specific token.
+    fn match_token(&mut self, expected: Token) -> bool {
+        if let Some(token) = self.peek_token() {
+            if token.token == expected {
+                self.advance();
+                true
+            } else {
+                false
+            }
+        } else {
+            false
+        }
+    }
+
+    /// Expects a specific token, or returns an error.
+    fn expect_token(&mut self, expected: Token) -> Result<Lexeme> {
+        if let Some(token) = self.peek_token() {
+            if token.token == expected {
+                let r = token.clone();
+                self.advance();
+                Ok(r)
+            } else {
+                bail!("Expected token {:?}, found {:?}", expected, token.token)
+            }
+        } else {
+            bail!("Expected token {:?}, found end of input", expected)
+        }
+    }
+
+    /// Matches a token and returns it if it matches the expected token.
+    fn match_token_with_value(&mut self, expected: Token) -> Option<Lexeme> {
+        if let Some(token) = self.peek_token() {
+            if token.token == expected {
+                let r = token.clone();
+                self.advance();
+                Some(r)
+            } else {
+                None
+            }
+        } else {
+            None
+        }
+    }
+
+    /// Consumes any newlines.
+    fn consume_newlines(&mut self) {
+        while let Some(token) = self.peek_token() {
+            if token.token == Token::Newline {
+                self.advance();
+            } else {
+                break;
+            }
+        }
+    }
+
+    /// Checks if the parser has reached the end of the tokens.
+    fn is_at_end(&self) -> bool {
+        self.pos >= self.tokens.len()
+    }
+
+    /// Peeks at the next token without advancing.
+    fn peek_token(&self) -> Option<&Lexeme> {
+        self.tokens.get(self.pos)
+    }
+
+    /// Advances to the next token.
+    fn advance(&mut self) {
+        if !self.is_at_end() {
+            self.pos += 1;
+        }
+    }
+}
 
 impl Token {
     const LITERAL_TOKENS: &'static [(Token, &'static str)] = &[
@@ -60,6 +556,7 @@ impl Token {
         (Token::Colon, ":"),
         (Token::Comma, ","),
         (Token::Dot, "."),
+        (Token::DotDot, ".."),
         (Token::KwDeclare, "%declare"),
         (Token::KwIgnore, "%ignore"),
         (Token::KwImport, "%import"),

From b47aea80eaffb4561b48d7f0530557a96e6ecb33 Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Fri, 1 Nov 2024 15:52:06 -0700
Subject: [PATCH 03/11] lark grammar parses

---
 parser/src/earley/lark.rs      | 41 +++++++++++++++++++++++++---------
 sample_parser/src/lark_test.rs |  7 ++++--
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/parser/src/earley/lark.rs b/parser/src/earley/lark.rs
index d0473d28..f43432e2 100644
--- a/parser/src/earley/lark.rs
+++ b/parser/src/earley/lark.rs
@@ -1,6 +1,6 @@
 use std::collections::HashMap;
 
-use anyhow::{bail, Result};
+use anyhow::{anyhow, bail, Result};
 use derivre::{RegexAst, RegexBuilder};
 
 use crate::earley::lexerspec::LexerSpec;
@@ -20,7 +20,6 @@ enum Token {
     Comma,
     Dot,
     DotDot,
-    Percent,
     Arrow,
     LParen,
     RParen,
@@ -29,9 +28,6 @@ enum Token {
     LBracket,
     RBracket,
     Tilde,
-    Question,
-    Star,
-    Plus,
     // regexps
     Op,
     String,
     Regexp,
@@ -355,6 +351,9 @@ impl Parser {
             if self.has_token(Token::Newline)
                 || self.has_token(Token::VBar)
                 || self.has_token(Token::Arrow)
+                || self.has_token(Token::RBrace)
+                || self.has_token(Token::RParen)
+                || self.has_token(Token::RBracket)
             {
                 break;
             }
@@ -567,11 +566,7 @@ impl Token {
         (Token::RBrace, "}"),
         (Token::LBracket, "["),
         (Token::RBracket, "]"),
-        (Token::Percent, "%"),
         (Token::Tilde, "~"),
-        (Token::Question, "?"),
-        (Token::Star, "*"),
-        (Token::Plus, "+"),
         (Token::VBar, "|"),
     ];
 
@@ -628,6 +623,7 @@ pub fn lex_lark(input: &str) -> Result<Vec<Lexeme>> {
     let mut lexemes = Vec::new();
     let mut start_idx = 0;
 
+    let input = format!("{}\n", input);
     let input_bytes = input.as_bytes();
     for idx in 0..=input_bytes.len() {
         let mut b = b'\n';
@@ -657,8 +653,12 @@ pub fn lex_lark(input: &str) -> Result<Vec<Lexeme>> {
                 };
                 curr_lexeme.value = input[start_idx..end_idx].to_string();
                 start_idx = end_idx;
-                println!("lex: {:?}", curr_lexeme);
-                lexemes.push(curr_lexeme.clone());
+
+                // println!("lex: {:?}", curr_lexeme);
+
+                if curr_lexeme.token != Token::SKIP {
+                    lexemes.push(curr_lexeme.clone());
+                }
 
                 state = lexer.start_state(&all_lexemes, transition_byte);
 
@@ -679,6 +679,25 @@ pub fn lex_lark(input: &str) -> Result<Vec<Lexeme>> {
     Ok(lexemes)
 }
 
+pub fn parse_lark(input: &str) -> Result<Vec<Item>> {
+    let tokens = lex_lark(input)?;
+    let mut parser = Parser::new(tokens);
+    parser.parse_start().map_err(|e| {
+        if let Some(tok) = parser.peek_token() {
+            anyhow!(
+                "{}({}): {} (at {:?} ({:?}))",
+                tok.line,
+                tok.column,
+                e,
+                tok.value,
+                tok.token
+            )
+        } else {
+            anyhow!("at EOF: {}", e)
+        }
+    })
+}
+
 pub fn test_lex_lark() {
     lex_lark(LARK_GRAMMAR).unwrap();
 }
diff --git a/sample_parser/src/lark_test.rs b/sample_parser/src/lark_test.rs
index b275cdd1..a74a1940 100644
--- a/sample_parser/src/lark_test.rs
+++ b/sample_parser/src/lark_test.rs
@@ -1,6 +1,6 @@
 use std::{env, fs::File, io::Read};
 
-use llguidance_parser::earley::lark::lex_lark;
+use llguidance_parser::earley::lark::parse_lark;
 
 fn main() {
     let args: Vec<String> = env::args().collect();
@@ -10,7 +10,10 @@ fn main() {
     }
 
     let grammar_file = read_file_to_string(&args[1]);
-    let _tokens = lex_lark(&grammar_file).unwrap();
+    let r = parse_lark(&grammar_file).unwrap();
+    for it in r.iter() {
+        println!("{:?}", it);
+    }
 }
 
 fn read_file_to_string(filename: &str) -> String {
From 
abee01cac36ff65bc9f04c46a40b5eb279af1072 Mon Sep 17 00:00:00 2001 From: Michal Moskal Date: Fri, 1 Nov 2024 15:57:16 -0700 Subject: [PATCH 04/11] move lark to folder --- parser/src/earley/mod.rs | 3 +-- parser/src/{earley/lark.rs => lark/lexer.rs} | 7 +++---- parser/src/lark/mod.rs | 3 +++ parser/src/lib.rs | 2 ++ sample_parser/src/lark_test.rs | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) rename parser/src/{earley/lark.rs => lark/lexer.rs} (99%) create mode 100644 parser/src/lark/mod.rs diff --git a/parser/src/earley/mod.rs b/parser/src/earley/mod.rs index e7d981c2..ad555db2 100644 --- a/parser/src/earley/mod.rs +++ b/parser/src/earley/mod.rs @@ -1,8 +1,7 @@ mod from_guidance; mod grammar; -mod lexer; +pub(crate) mod lexer; mod parser; -pub mod lark; pub mod lexerspec; pub mod regexvec; diff --git a/parser/src/earley/lark.rs b/parser/src/lark/lexer.rs similarity index 99% rename from parser/src/earley/lark.rs rename to parser/src/lark/lexer.rs index f43432e2..b1560f41 100644 --- a/parser/src/earley/lark.rs +++ b/parser/src/lark/lexer.rs @@ -3,11 +3,9 @@ use std::collections::HashMap; use anyhow::{anyhow, bail, Result}; use derivre::{RegexAst, RegexBuilder}; -use crate::earley::lexerspec::LexerSpec; - -use super::{ +use crate::earley::{ lexer::{Lexer, LexerResult}, - lexerspec::LexemeIdx, + lexerspec::{LexemeIdx, LexerSpec}, }; #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -698,6 +696,7 @@ pub fn parse_lark(input: &str) -> Result> { }) } +#[allow(dead_code)] pub fn test_lex_lark() { lex_lark(LARK_GRAMMAR).unwrap(); } diff --git a/parser/src/lark/mod.rs b/parser/src/lark/mod.rs new file mode 100644 index 00000000..a3bd03cd --- /dev/null +++ b/parser/src/lark/mod.rs @@ -0,0 +1,3 @@ +mod lexer; + +pub use lexer::parse_lark; diff --git a/parser/src/lib.rs b/parser/src/lib.rs index dc9c9231..06cddf15 100644 --- a/parser/src/lib.rs +++ b/parser/src/lib.rs @@ -18,6 +18,8 @@ pub use derivre; pub mod ffi; +pub mod lark; + mod grammar_builder; mod json; pub use grammar_builder::{GrammarBuilder, NodeRef}; diff --git a/sample_parser/src/lark_test.rs b/sample_parser/src/lark_test.rs index a74a1940..58d0c8ff 100644 --- a/sample_parser/src/lark_test.rs +++ b/sample_parser/src/lark_test.rs @@ -1,6 +1,6 @@ use std::{env, fs::File, io::Read}; -use llguidance_parser::earley::lark::parse_lark; +use llguidance_parser::lark::parse_lark; fn main() { let args: Vec = env::args().collect(); From 446a29296c24aadab0338794a1a46799244245a7 Mon Sep 17 00:00:00 2001 From: Michal Moskal Date: Fri, 1 Nov 2024 16:01:32 -0700 Subject: [PATCH 05/11] split lark --- parser/src/lark/ast.rs | 97 ++++++ parser/src/lark/lexer.rs | 605 +------------------------------------- parser/src/lark/mod.rs | 4 +- parser/src/lark/parser.rs | 424 ++++++++++++++++++++++++++ 4 files changed, 530 insertions(+), 600 deletions(-) create mode 100644 parser/src/lark/ast.rs create mode 100644 parser/src/lark/parser.rs diff --git a/parser/src/lark/ast.rs b/parser/src/lark/ast.rs new file mode 100644 index 00000000..729612ef --- /dev/null +++ b/parser/src/lark/ast.rs @@ -0,0 +1,97 @@ +/// Represents an item in the grammar (rule, token, or statement). +#[derive(Debug, Clone)] +pub enum Item { + Rule(Rule), + Token(TokenDef), + Statement(Statement), +} + +/// Represents a grammar rule. +#[derive(Debug, Clone)] +pub struct Rule { + pub name: String, + pub params: Option, + pub priority: Option, + pub expansions: Expansions, +} + +/// Represents a token definition. 
+#[derive(Debug, Clone)] +pub struct TokenDef { + pub name: String, + pub params: Option, + pub priority: Option, + pub expansions: Expansions, +} + +/// Represents different types of statements. +#[derive(Debug, Clone)] +pub enum Statement { + Ignore(Expansions), + Import { + path: ImportPath, + alias: Option, + }, + MultiImport { + path: ImportPath, + names: Vec, + }, + OverrideRule(Box), + Declare(Vec), +} + +/// Represents an import path. +#[derive(Debug, Clone)] +pub struct ImportPath(pub Vec); + +/// Represents parameters for a rule. +#[derive(Debug, Clone)] +pub struct RuleParams(pub Vec); + +/// Represents parameters for a token. +#[derive(Debug, Clone)] +pub struct TokenParams(pub Vec); + +/// Represents a list of expansions. +#[derive(Debug, Clone)] +pub struct Expansions(pub Vec); + +/// Represents an alias in the grammar. +#[derive(Debug, Clone)] +pub struct Alias { + pub expansion: Expansion, + pub alias: Option, +} + +/// Represents an expansion consisting of expressions. +#[derive(Debug, Clone)] +pub struct Expansion(pub Vec); + +/// Represents an expression. +#[derive(Debug, Clone)] +pub struct Expr { + pub atom: Atom, + pub op: Option, + pub range: Option<(i32, i32)>, +} + +/// Represents an atom in the grammar. +#[derive(Debug, Clone)] +pub enum Atom { + Group(Expansions), + Maybe(Expansions), + Value(Value), +} + +/// Represents different values in the grammar. +#[derive(Debug, Clone)] +pub enum Value { + LiteralRange(String, String), + Name(String), + Literal(String), + TemplateUsage { name: String, values: Vec }, +} + +/// Represents an operator. +#[derive(Debug, Clone)] +pub struct Op(pub String); diff --git a/parser/src/lark/lexer.rs b/parser/src/lark/lexer.rs index b1560f41..f36f61b5 100644 --- a/parser/src/lark/lexer.rs +++ b/parser/src/lark/lexer.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -use anyhow::{anyhow, bail, Result}; +use anyhow::{bail, Result}; use derivre::{RegexAst, RegexBuilder}; use crate::earley::{ @@ -9,7 +9,7 @@ use crate::earley::{ }; #[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum Token { +pub enum Token { KwIgnore, KwImport, KwOverride, @@ -43,508 +43,10 @@ enum Token { /// Represents a lexeme with its token type, value, and position. #[derive(Debug, Clone)] pub struct Lexeme { - token: Token, - value: String, - line: usize, - column: usize, -} - -/// Represents an item in the grammar (rule, token, or statement). -#[derive(Debug, Clone)] -pub enum Item { - Rule(Rule), - Token(TokenDef), - Statement(Statement), -} - -/// Represents a grammar rule. -#[derive(Debug, Clone)] -pub struct Rule { - pub name: String, - pub params: Option, - pub priority: Option, - pub expansions: Expansions, -} - -/// Represents a token definition. -#[derive(Debug, Clone)] -pub struct TokenDef { - pub name: String, - pub params: Option, - pub priority: Option, - pub expansions: Expansions, -} - -/// Represents different types of statements. -#[derive(Debug, Clone)] -pub enum Statement { - Ignore(Expansions), - Import { - path: ImportPath, - alias: Option, - }, - MultiImport { - path: ImportPath, - names: Vec, - }, - OverrideRule(Box), - Declare(Vec), -} - -/// Represents an import path. -#[derive(Debug, Clone)] -pub struct ImportPath(pub Vec); - -/// Represents parameters for a rule. -#[derive(Debug, Clone)] -pub struct RuleParams(pub Vec); - -/// Represents parameters for a token. -#[derive(Debug, Clone)] -pub struct TokenParams(pub Vec); - -/// Represents a list of expansions. 
-#[derive(Debug, Clone)] -pub struct Expansions(pub Vec); - -/// Represents an alias in the grammar. -#[derive(Debug, Clone)] -pub struct Alias { - pub expansion: Expansion, - pub alias: Option, -} - -/// Represents an expansion consisting of expressions. -#[derive(Debug, Clone)] -pub struct Expansion(pub Vec); - -/// Represents an expression. -#[derive(Debug, Clone)] -pub struct Expr { - pub atom: Atom, - pub op: Option, - pub range: Option<(i32, i32)>, -} - -/// Represents an atom in the grammar. -#[derive(Debug, Clone)] -pub enum Atom { - Group(Expansions), - Maybe(Expansions), - Value(Value), -} - -/// Represents different values in the grammar. -#[derive(Debug, Clone)] -pub enum Value { - LiteralRange(String, String), - Name(String), - Literal(String), - TemplateUsage { name: String, values: Vec }, -} - -/// Represents an operator. -#[derive(Debug, Clone)] -pub struct Op(pub String); - -/// The parser struct that holds the tokens and current position. -pub struct Parser { - tokens: Vec, - pos: usize, -} - -impl Parser { - /// Creates a new parser instance. - pub fn new(tokens: Vec) -> Self { - Parser { tokens, pos: 0 } - } - - /// Parses the start symbol of the grammar. - pub fn parse_start(&mut self) -> Result> { - let mut items = Vec::new(); - while !self.is_at_end() { - self.consume_newlines(); - if self.is_at_end() { - break; - } - items.push(self.parse_item()?); - self.consume_newlines(); - } - Ok(items) - } - - /// Parses an item (rule, token, or statement). - fn parse_item(&mut self) -> Result { - if self.has_token(Token::Rule) { - Ok(Item::Rule(self.parse_rule()?)) - } else if self.has_token(Token::Token) { - Ok(Item::Token(self.parse_token_def()?)) - } else { - Ok(Item::Statement(self.parse_statement()?)) - } - } - - /// Parses a rule definition. - fn parse_rule(&mut self) -> Result { - let name = self.expect_token(Token::Rule)?.value; - let params = if self.has_token(Token::LBrace) { - Some(self.parse_rule_params()?) - } else { - None - }; - let priority = if self.has_token(Token::Dot) { - Some(self.parse_priority()?) - } else { - None - }; - self.expect_token(Token::Colon)?; - let expansions = self.parse_expansions()?; - Ok(Rule { - name, - params, - priority, - expansions, - }) - } - - /// Parses a token definition. - fn parse_token_def(&mut self) -> Result { - let name = self.expect_token(Token::Token)?.value; - let params = if self.has_token(Token::LBrace) { - Some(self.parse_token_params()?) - } else { - None - }; - let priority = if self.has_token(Token::Dot) { - Some(self.parse_priority()?) - } else { - None - }; - self.expect_token(Token::Colon)?; - let expansions = self.parse_expansions()?; - Ok(TokenDef { - name, - params, - priority, - expansions, - }) - } - - /// Parses a statement. 
- fn parse_statement(&mut self) -> Result { - if self.match_token(Token::KwIgnore) { - let expansions = self.parse_expansions()?; - Ok(Statement::Ignore(expansions)) - } else if self.match_token(Token::KwImport) { - let import_path = self.parse_import_path()?; - if self.match_token(Token::Arrow) { - let name = self.parse_name()?; - Ok(Statement::Import { - path: import_path, - alias: Some(name), - }) - } else { - Ok(Statement::MultiImport { - path: import_path, - names: self.parse_name_list()?, - }) - } - } else if self.match_token(Token::KwOverride) { - let rule = self.parse_rule()?; - Ok(Statement::OverrideRule(Box::new(rule))) - } else if self.match_token(Token::KwDeclare) { - let mut names = Vec::new(); - while let Ok(name) = self.parse_name() { - names.push(name); - } - if names.is_empty() { - bail!("Expected at least one name after %declare") - } - Ok(Statement::Declare(names)) - } else { - bail!("expecting rule, token or statement") - } - } - - /// Parses rule parameters. - fn parse_rule_params(&mut self) -> Result { - if !self.match_token(Token::LBrace) { - bail!("Expected '{{' in rule parameters") - } - let mut params = Vec::new(); - let name = self.expect_token(Token::Rule)?.value; - params.push(name); - while self.match_token(Token::Comma) { - let name = self.expect_token(Token::Rule)?.value; - params.push(name); - } - self.expect_token(Token::RBrace)?; - Ok(RuleParams(params)) - } - - /// Parses token parameters. - fn parse_token_params(&mut self) -> Result { - if !self.match_token(Token::LBrace) { - bail!("Expected '{{' in token parameters") - } - let mut params = Vec::new(); - let name = self.expect_token(Token::Token)?.value; - params.push(name); - while self.match_token(Token::Comma) { - let name = self.expect_token(Token::Token)?.value; - params.push(name); - } - self.expect_token(Token::RBrace)?; - Ok(TokenParams(params)) - } - - /// Parses priority. - fn parse_priority(&mut self) -> Result { - if !self.match_token(Token::Dot) { - bail!("Expected '.' in priority") - } - let number = self.expect_token(Token::Number)?.value.parse::()?; - Ok(number) - } - - /// Parses expansions. - fn parse_expansions(&mut self) -> Result { - let mut aliases = Vec::new(); - aliases.push(self.parse_alias()?); - while self.match_vbar() { - aliases.push(self.parse_alias()?); - } - Ok(Expansions(aliases)) - } - - fn match_vbar(&mut self) -> bool { - if self.match_token(Token::VBar) { - return true; - } - let p0 = self.pos; - if self.match_token(Token::Newline) { - if self.match_token(Token::VBar) { - return true; - } - } - self.pos = p0; - false - } - - /// Parses an alias. - fn parse_alias(&mut self) -> Result { - let expansion = self.parse_expansion()?; - let alias = if self.match_token(Token::Arrow) { - Some(self.expect_token(Token::Rule)?.value) - } else { - None - }; - Ok(Alias { expansion, alias }) - } - - /// Parses an expansion. - fn parse_expansion(&mut self) -> Result { - let mut exprs = Vec::new(); - loop { - if self.has_token(Token::Newline) - || self.has_token(Token::VBar) - || self.has_token(Token::Arrow) - || self.has_token(Token::RBrace) - || self.has_token(Token::RParen) - || self.has_token(Token::RBracket) - { - break; - } - exprs.push(self.parse_expr()?); - } - Ok(Expansion(exprs)) - } - - /// Parses an expression. 
- fn parse_expr(&mut self) -> Result { - let atom = self.parse_atom()?; - let mut op = None; - let mut range = None; - if let Some(op_token) = self.match_token_with_value(Token::Op) { - op = Some(Op(op_token.value.clone())); - } else if self.match_token(Token::Tilde) { - let start_num = self.expect_token(Token::Number)?.value.parse::()?; - let end_num = if self.match_token(Token::DotDot) { - Some(self.expect_token(Token::Number)?.value.parse::()?) - } else { - None - }; - range = Some((start_num, end_num.unwrap_or(start_num))); - } - Ok(Expr { atom, op, range }) - } - - /// Parses an atom. - fn parse_atom(&mut self) -> Result { - if self.match_token(Token::LParen) { - let expansions = self.parse_expansions()?; - self.expect_token(Token::RParen)?; - Ok(Atom::Group(expansions)) - } else if self.match_token(Token::LBracket) { - let expansions = self.parse_expansions()?; - self.expect_token(Token::RBracket)?; - Ok(Atom::Maybe(expansions)) - } else { - Ok(Atom::Value(self.parse_value()?)) - } - } - - /// Parses a value. - fn parse_value(&mut self) -> Result { - if let Some(string1) = self.match_token_with_value(Token::String) { - if self.match_token(Token::DotDot) { - let string2 = self.expect_token(Token::String)?.value; - Ok(Value::LiteralRange(string1.value.clone(), string2)) - } else { - Ok(Value::Literal(string1.value.clone())) - } - } else if let Some(regexp_token) = self.match_token_with_value(Token::Regexp) { - Ok(Value::Literal(regexp_token.value.clone())) - } else if let Some(name_token) = self - .match_token_with_value(Token::Rule) - .or_else(|| self.match_token_with_value(Token::Token)) - { - if self.match_token(Token::LBrace) { - let mut values = Vec::new(); - values.push(self.parse_value()?); - while self.match_token(Token::Comma) { - values.push(self.parse_value()?); - } - self.expect_token(Token::RBrace)?; - Ok(Value::TemplateUsage { - name: name_token.value.clone(), - values, - }) - } else { - Ok(Value::Name(name_token.value.clone())) - } - } else { - bail!("Expected value") - } - } - - /// Parses an import path. - fn parse_import_path(&mut self) -> Result { - let mut names = Vec::new(); - if self.match_token(Token::Dot) { - names.push(".".to_string()); - } - names.push(self.parse_name()?); - while self.match_token(Token::Dot) { - names.push(self.parse_name()?); - } - Ok(ImportPath(names)) - } - - /// Parses a name (RULE or TOKEN). - fn parse_name(&mut self) -> Result { - if let Some(token) = self.match_token_with_value(Token::Rule) { - Ok(token.value.clone()) - } else if let Some(token) = self.match_token_with_value(Token::Token) { - Ok(token.value.clone()) - } else { - bail!("Expected name (RULE or TOKEN)") - } - } - - /// Parses a list of names. - fn parse_name_list(&mut self) -> Result> { - if !self.match_token(Token::LParen) { - bail!("Expected '(' in name list") - } - let mut names = Vec::new(); - names.push(self.parse_name()?); - while self.match_token(Token::Comma) { - names.push(self.parse_name()?); - } - self.expect_token(Token::RParen)?; - Ok(names) - } - - fn has_token(&self, token: Token) -> bool { - if let Some(lexeme) = self.peek_token() { - lexeme.token == token - } else { - false - } - } - - /// Matches a specific token. - fn match_token(&mut self, expected: Token) -> bool { - if let Some(token) = self.peek_token() { - if token.token == expected { - self.advance(); - true - } else { - false - } - } else { - false - } - } - - /// Expects a specific token, or returns an error. 
- fn expect_token(&mut self, expected: Token) -> Result { - if let Some(token) = self.peek_token() { - if token.token == expected { - let r = token.clone(); - self.advance(); - Ok(r) - } else { - bail!("Expected token {:?}, found {:?}", expected, token.token) - } - } else { - bail!("Expected token {:?}, found end of input", expected) - } - } - - /// Matches a token and returns it if it matches the expected token. - fn match_token_with_value(&mut self, expected: Token) -> Option { - if let Some(token) = self.peek_token() { - if token.token == expected { - let r = token.clone(); - self.advance(); - Some(r) - } else { - None - } - } else { - None - } - } - - /// Consumes any newlines. - fn consume_newlines(&mut self) { - while let Some(token) = self.peek_token() { - if token.token == Token::Newline { - self.advance(); - } else { - break; - } - } - } - - /// Checks if the parser has reached the end of the tokens. - fn is_at_end(&self) -> bool { - self.pos >= self.tokens.len() - } - - /// Peeks at the next token without advancing. - fn peek_token(&self) -> Option<&Lexeme> { - self.tokens.get(self.pos) - } - - /// Advances to the next token. - fn advance(&mut self) { - if !self.is_at_end() { - self.pos += 1; - } - } + pub token: Token, + pub value: String, + pub line: usize, + pub column: usize, } impl Token { @@ -676,98 +178,3 @@ pub fn lex_lark(input: &str) -> Result> { Ok(lexemes) } - -pub fn parse_lark(input: &str) -> Result> { - let tokens = lex_lark(input)?; - let mut parser = Parser::new(tokens); - parser.parse_start().map_err(|e| { - if let Some(tok) = parser.peek_token() { - anyhow!( - "{}({}): {} (at {:?} ({:?}))", - tok.line, - tok.column, - e, - tok.value, - tok.token - ) - } else { - anyhow!("at EOF: {}", e) - } - }) -} - -#[allow(dead_code)] -pub fn test_lex_lark() { - lex_lark(LARK_GRAMMAR).unwrap(); -} - -const LARK_GRAMMAR: &str = r#" -# Lark grammar of Lark's syntax -# Note: Lark is not bootstrapped, its parser is implemented in load_grammar.py - -start: (_item? _NL)* _item? - -_item: rule - | token - | statement - -rule: RULE rule_params priority? ":" expansions -token: TOKEN token_params priority? ":" expansions - -rule_params: ["{" RULE ("," RULE)* "}"] -token_params: ["{" TOKEN ("," TOKEN)* "}"] - -priority: "." NUMBER - -statement: "%ignore" expansions -> ignore - | "%import" import_path ["->" name] -> import - | "%import" import_path name_list -> multi_import - | "%override" rule -> override_rule - | "%declare" name+ -> declare - -!import_path: "."? name ("." name)* -name_list: "(" name ("," name)* ")" - -?expansions: alias (_VBAR alias)* - -?alias: expansion ["->" RULE] - -?expansion: expr* - -?expr: atom [OP | "~" NUMBER [".." NUMBER]] - -?atom: "(" expansions ")" - | "[" expansions "]" -> maybe - | value - -?value: STRING ".." STRING -> literal_range - | name - | (REGEXP | STRING) -> literal - | name "{" value ("," value)* "}" -> template_usage - -name: RULE - | TOKEN - -_VBAR: _NL? "|" -OP: /[+*]|[?](?![a-z])/ -RULE: /!?[_?]?[a-z][_a-z0-9]*/ -TOKEN: /_?[A-Z][_A-Z0-9]*/ -STRING: _STRING "i"? -REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/ -_NL: /(\r?\n)+\s*/ - -_STRING_INNER: /.*?/ -_STRING_ESC_INNER: _STRING_INNER /(?, + pos: usize, +} + +impl Parser { + /// Creates a new parser instance. + pub fn new(tokens: Vec) -> Self { + Parser { tokens, pos: 0 } + } + + /// Parses the start symbol of the grammar. 
+ pub fn parse_start(&mut self) -> Result> { + let mut items = Vec::new(); + while !self.is_at_end() { + self.consume_newlines(); + if self.is_at_end() { + break; + } + items.push(self.parse_item()?); + self.consume_newlines(); + } + Ok(items) + } + + /// Parses an item (rule, token, or statement). + fn parse_item(&mut self) -> Result { + if self.has_token(Token::Rule) { + Ok(Item::Rule(self.parse_rule()?)) + } else if self.has_token(Token::Token) { + Ok(Item::Token(self.parse_token_def()?)) + } else { + Ok(Item::Statement(self.parse_statement()?)) + } + } + + /// Parses a rule definition. + fn parse_rule(&mut self) -> Result { + let name = self.expect_token(Token::Rule)?.value; + let params = if self.has_token(Token::LBrace) { + Some(self.parse_rule_params()?) + } else { + None + }; + let priority = if self.has_token(Token::Dot) { + Some(self.parse_priority()?) + } else { + None + }; + self.expect_token(Token::Colon)?; + let expansions = self.parse_expansions()?; + Ok(Rule { + name, + params, + priority, + expansions, + }) + } + + /// Parses a token definition. + fn parse_token_def(&mut self) -> Result { + let name = self.expect_token(Token::Token)?.value; + let params = if self.has_token(Token::LBrace) { + Some(self.parse_token_params()?) + } else { + None + }; + let priority = if self.has_token(Token::Dot) { + Some(self.parse_priority()?) + } else { + None + }; + self.expect_token(Token::Colon)?; + let expansions = self.parse_expansions()?; + Ok(TokenDef { + name, + params, + priority, + expansions, + }) + } + + /// Parses a statement. + fn parse_statement(&mut self) -> Result { + if self.match_token(Token::KwIgnore) { + let expansions = self.parse_expansions()?; + Ok(Statement::Ignore(expansions)) + } else if self.match_token(Token::KwImport) { + let import_path = self.parse_import_path()?; + if self.match_token(Token::Arrow) { + let name = self.parse_name()?; + Ok(Statement::Import { + path: import_path, + alias: Some(name), + }) + } else { + Ok(Statement::MultiImport { + path: import_path, + names: self.parse_name_list()?, + }) + } + } else if self.match_token(Token::KwOverride) { + let rule = self.parse_rule()?; + Ok(Statement::OverrideRule(Box::new(rule))) + } else if self.match_token(Token::KwDeclare) { + let mut names = Vec::new(); + while let Ok(name) = self.parse_name() { + names.push(name); + } + if names.is_empty() { + bail!("Expected at least one name after %declare") + } + Ok(Statement::Declare(names)) + } else { + bail!("expecting rule, token or statement") + } + } + + /// Parses rule parameters. + fn parse_rule_params(&mut self) -> Result { + if !self.match_token(Token::LBrace) { + bail!("Expected '{{' in rule parameters") + } + let mut params = Vec::new(); + let name = self.expect_token(Token::Rule)?.value; + params.push(name); + while self.match_token(Token::Comma) { + let name = self.expect_token(Token::Rule)?.value; + params.push(name); + } + self.expect_token(Token::RBrace)?; + Ok(RuleParams(params)) + } + + /// Parses token parameters. + fn parse_token_params(&mut self) -> Result { + if !self.match_token(Token::LBrace) { + bail!("Expected '{{' in token parameters") + } + let mut params = Vec::new(); + let name = self.expect_token(Token::Token)?.value; + params.push(name); + while self.match_token(Token::Comma) { + let name = self.expect_token(Token::Token)?.value; + params.push(name); + } + self.expect_token(Token::RBrace)?; + Ok(TokenParams(params)) + } + + /// Parses priority. 
+ fn parse_priority(&mut self) -> Result { + if !self.match_token(Token::Dot) { + bail!("Expected '.' in priority") + } + let number = self.expect_token(Token::Number)?.value.parse::()?; + Ok(number) + } + + /// Parses expansions. + fn parse_expansions(&mut self) -> Result { + let mut aliases = Vec::new(); + aliases.push(self.parse_alias()?); + while self.match_vbar() { + aliases.push(self.parse_alias()?); + } + Ok(Expansions(aliases)) + } + + fn match_vbar(&mut self) -> bool { + if self.match_token(Token::VBar) { + return true; + } + let p0 = self.pos; + if self.match_token(Token::Newline) { + if self.match_token(Token::VBar) { + return true; + } + } + self.pos = p0; + false + } + + /// Parses an alias. + fn parse_alias(&mut self) -> Result { + let expansion = self.parse_expansion()?; + let alias = if self.match_token(Token::Arrow) { + Some(self.expect_token(Token::Rule)?.value) + } else { + None + }; + Ok(Alias { expansion, alias }) + } + + /// Parses an expansion. + fn parse_expansion(&mut self) -> Result { + let mut exprs = Vec::new(); + loop { + if self.has_token(Token::Newline) + || self.has_token(Token::VBar) + || self.has_token(Token::Arrow) + || self.has_token(Token::RBrace) + || self.has_token(Token::RParen) + || self.has_token(Token::RBracket) + { + break; + } + exprs.push(self.parse_expr()?); + } + Ok(Expansion(exprs)) + } + + /// Parses an expression. + fn parse_expr(&mut self) -> Result { + let atom = self.parse_atom()?; + let mut op = None; + let mut range = None; + if let Some(op_token) = self.match_token_with_value(Token::Op) { + op = Some(Op(op_token.value.clone())); + } else if self.match_token(Token::Tilde) { + let start_num = self.expect_token(Token::Number)?.value.parse::()?; + let end_num = if self.match_token(Token::DotDot) { + Some(self.expect_token(Token::Number)?.value.parse::()?) + } else { + None + }; + range = Some((start_num, end_num.unwrap_or(start_num))); + } + Ok(Expr { atom, op, range }) + } + + /// Parses an atom. + fn parse_atom(&mut self) -> Result { + if self.match_token(Token::LParen) { + let expansions = self.parse_expansions()?; + self.expect_token(Token::RParen)?; + Ok(Atom::Group(expansions)) + } else if self.match_token(Token::LBracket) { + let expansions = self.parse_expansions()?; + self.expect_token(Token::RBracket)?; + Ok(Atom::Maybe(expansions)) + } else { + Ok(Atom::Value(self.parse_value()?)) + } + } + + /// Parses a value. + fn parse_value(&mut self) -> Result { + if let Some(string1) = self.match_token_with_value(Token::String) { + if self.match_token(Token::DotDot) { + let string2 = self.expect_token(Token::String)?.value; + Ok(Value::LiteralRange(string1.value.clone(), string2)) + } else { + Ok(Value::Literal(string1.value.clone())) + } + } else if let Some(regexp_token) = self.match_token_with_value(Token::Regexp) { + Ok(Value::Literal(regexp_token.value.clone())) + } else if let Some(name_token) = self + .match_token_with_value(Token::Rule) + .or_else(|| self.match_token_with_value(Token::Token)) + { + if self.match_token(Token::LBrace) { + let mut values = Vec::new(); + values.push(self.parse_value()?); + while self.match_token(Token::Comma) { + values.push(self.parse_value()?); + } + self.expect_token(Token::RBrace)?; + Ok(Value::TemplateUsage { + name: name_token.value.clone(), + values, + }) + } else { + Ok(Value::Name(name_token.value.clone())) + } + } else { + bail!("Expected value") + } + } + + /// Parses an import path. 
+ fn parse_import_path(&mut self) -> Result { + let mut names = Vec::new(); + if self.match_token(Token::Dot) { + names.push(".".to_string()); + } + names.push(self.parse_name()?); + while self.match_token(Token::Dot) { + names.push(self.parse_name()?); + } + Ok(ImportPath(names)) + } + + /// Parses a name (RULE or TOKEN). + fn parse_name(&mut self) -> Result { + if let Some(token) = self.match_token_with_value(Token::Rule) { + Ok(token.value.clone()) + } else if let Some(token) = self.match_token_with_value(Token::Token) { + Ok(token.value.clone()) + } else { + bail!("Expected name (RULE or TOKEN)") + } + } + + /// Parses a list of names. + fn parse_name_list(&mut self) -> Result> { + if !self.match_token(Token::LParen) { + bail!("Expected '(' in name list") + } + let mut names = Vec::new(); + names.push(self.parse_name()?); + while self.match_token(Token::Comma) { + names.push(self.parse_name()?); + } + self.expect_token(Token::RParen)?; + Ok(names) + } + + fn has_token(&self, token: Token) -> bool { + if let Some(lexeme) = self.peek_token() { + lexeme.token == token + } else { + false + } + } + + /// Matches a specific token. + fn match_token(&mut self, expected: Token) -> bool { + if let Some(token) = self.peek_token() { + if token.token == expected { + self.advance(); + true + } else { + false + } + } else { + false + } + } + + /// Expects a specific token, or returns an error. + fn expect_token(&mut self, expected: Token) -> Result { + if let Some(token) = self.peek_token() { + if token.token == expected { + let r = token.clone(); + self.advance(); + Ok(r) + } else { + bail!("Expected token {:?}, found {:?}", expected, token.token) + } + } else { + bail!("Expected token {:?}, found end of input", expected) + } + } + + /// Matches a token and returns it if it matches the expected token. + fn match_token_with_value(&mut self, expected: Token) -> Option { + if let Some(token) = self.peek_token() { + if token.token == expected { + let r = token.clone(); + self.advance(); + Some(r) + } else { + None + } + } else { + None + } + } + + /// Consumes any newlines. + fn consume_newlines(&mut self) { + while let Some(token) = self.peek_token() { + if token.token == Token::Newline { + self.advance(); + } else { + break; + } + } + } + + /// Checks if the parser has reached the end of the tokens. + fn is_at_end(&self) -> bool { + self.pos >= self.tokens.len() + } + + /// Peeks at the next token without advancing. + fn peek_token(&self) -> Option<&Lexeme> { + self.tokens.get(self.pos) + } + + /// Advances to the next token. 
+    fn advance(&mut self) {
+        if !self.is_at_end() {
+            self.pos += 1;
+        }
+    }
+}
+
+pub fn parse_lark(input: &str) -> Result<Vec<Item>> {
+    let tokens = lex_lark(input)?;
+    let mut parser = Parser::new(tokens);
+    parser.parse_start().map_err(|e| {
+        if let Some(tok) = parser.peek_token() {
+            anyhow!(
+                "{}({}): {} (at {:?} ({:?}))",
+                tok.line,
+                tok.column,
+                e,
+                tok.value,
+                tok.token
+            )
+        } else {
+            anyhow!("at EOF: {}", e)
+        }
+    })
+}

From 7b279f31f9a3f4b7a19d6880373e3526b7248996 Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Fri, 1 Nov 2024 16:48:58 -0700
Subject: [PATCH 06/11] start on compiler

---
 parser/src/grammar_builder.rs | 14 +++++++-
 parser/src/lark/compiler.rs   | 65 +++++++++++++++++++++++++++++++++++
 parser/src/lark/mod.rs        |  1 +
 3 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 parser/src/lark/compiler.rs

diff --git a/parser/src/grammar_builder.rs b/parser/src/grammar_builder.rs
index 8a43a457..55e214a2 100644
--- a/parser/src/grammar_builder.rs
+++ b/parser/src/grammar_builder.rs
@@ -2,7 +2,9 @@ use std::{collections::HashMap, sync::atomic::AtomicU32};
 
 use anyhow::{ensure, Result};
 
-use crate::api::{GrammarWithLexer, Node, NodeId, NodeProps, RegexSpec, TopLevelGrammar};
+use crate::api::{
+    GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
+};
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub struct NodeRef {
@@ -16,6 +18,7 @@ pub struct GrammarBuilder {
     strings: HashMap<String, NodeRef>,
     curr_grammar_id: u32,
     nodes: Vec<Node>,
+    rx_nodes: Vec<RegexNode>,
 }
 
 impl GrammarBuilder {
@@ -49,9 +52,16 @@ impl GrammarBuilder {
             strings: HashMap::new(),
             curr_grammar_id: 0,
             nodes: vec![],
+            rx_nodes: vec![],
        }
     }
 
+    pub fn add_regex_node(&mut self, node: RegexNode) -> RegexId {
+        let id = RegexId(self.rx_nodes.len());
+        self.rx_nodes.push(node);
+        id
+    }
+
     fn shift_nodes(&mut self) {
         if self.top_grammar.grammars.len() == 0 {
             assert!(self.nodes.is_empty(), "nodes added before add_grammar()");
@@ -62,6 +72,8 @@ impl GrammarBuilder {
             "no nodes added before add_grammar() or finalize()"
         );
         self.top_grammar.grammars.last_mut().unwrap().nodes = nodes;
+        self.top_grammar.grammars.last_mut().unwrap().rx_nodes =
+            std::mem::take(&mut self.rx_nodes);
     }
 }

diff --git a/parser/src/lark/compiler.rs b/parser/src/lark/compiler.rs
new file mode 100644
index 00000000..6262be8f
--- /dev/null
+++ b/parser/src/lark/compiler.rs
@@ -0,0 +1,65 @@
+use std::collections::HashMap;
+
+use anyhow::Result;
+
+use crate::{
+    api::{RegexSpec, TopLevelGrammar},
+    GrammarBuilder, NodeRef,
+};
+
+use super::ast::*;
+
+struct Compiler {
+    builder: GrammarBuilder,
+    items: Vec<Item>,
+    nodes: HashMap<String, NodeInfo>,
+}
+
+struct NodeInfo {
+    id: NodeRef,
+    is_terminal: bool,
+    regex: Option<RegexSpec>,
+}
+
+pub fn lark_to_llguidance(items: Vec<Item>) -> Result<TopLevelGrammar> {
+    let mut c = Compiler {
+        builder: GrammarBuilder::new(),
+        items,
+        nodes: HashMap::new(),
+    };
+    c.execute()?;
+    c.builder.finalize()
+}
+
+impl Compiler {
+    fn execute(&mut self) -> Result<()> {
+        for item in self.items.iter() {
+            match item {
+                Item::Rule(rule) => {
+                    let id = self.builder.placeholder();
+                    self.nodes.insert(
+                        rule.name.clone(),
+                        NodeInfo {
+                            id,
+                            is_terminal: false,
+                            regex: None,
+                        },
+                    );
+                }
+                Item::Token(token_def) => {
+                    let id = self.builder.placeholder();
+                    self.nodes.insert(
+                        token_def.name.clone(),
+                        NodeInfo {
+                            id,
+                            is_terminal: true,
+                            regex: None,
+                        },
+                    );
+                }
+                Item::Statement(statement) => todo!(),
+            }
+        }
+        Ok(())
+    }
+}
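The compiler skeleton above only runs the first of two passes: it reserves a placeholder NodeRef per rule/token name, so definitions can refer to each other (and themselves) before their bodies exist. A sketch of how a body would later be patched in, using only GrammarBuilder methods that appear in this series (the rule shape itself is invented for illustration):

    let mut b = GrammarBuilder::new();
    let foo = b.placeholder();       // pass 1: reserve a node for rule `foo`
    let empty = b.empty();
    let rec = b.join(&[foo, empty]); // pass 2: the body may refer back to `foo`
    let body = b.select(&[empty, rec]);
    b.set_placeholder(foo, body);    // patch the real definition in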
From 6fecfdb4f9f8e3b08d734eb4ea00326efbf07b09 Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Mon, 4 Nov 2024 13:53:25 -0800
Subject: [PATCH 07/11] fix zero_or_more() add RegexBuilder to GrammarBuilder

---
 parser/src/grammar_builder.rs | 98 +++++++++++++++++++++++++++++++----
 1 file changed, 87 insertions(+), 11 deletions(-)

diff --git a/parser/src/grammar_builder.rs b/parser/src/grammar_builder.rs
index 55e214a2..1a15515a 100644
--- a/parser/src/grammar_builder.rs
+++ b/parser/src/grammar_builder.rs
@@ -18,7 +18,81 @@ pub struct GrammarBuilder {
     strings: HashMap<String, NodeRef>,
     curr_grammar_id: u32,
     nodes: Vec<Node>,
-    rx_nodes: Vec<RegexNode>,
+    pub regex: RegexBuilder,
+}
+
+pub struct RegexBuilder {
+    node_ids: HashMap<RegexNode, RegexId>,
+    nodes: Vec<RegexNode>,
+}
+
+impl RegexBuilder {
+    pub fn new() -> Self {
+        Self {
+            nodes: vec![],
+            node_ids: HashMap::new(),
+        }
+    }
+
+    pub fn add_node(&mut self, node: RegexNode) -> RegexId {
+        if let Some(id) = self.node_ids.get(&node) {
+            return *id;
+        }
+        let id = RegexId(self.nodes.len());
+        self.nodes.push(node.clone());
+        self.node_ids.insert(node, id);
+        id
+    }
+
+    pub fn regex(&mut self, rx: String) -> RegexId {
+        self.add_node(RegexNode::Regex(rx))
+    }
+
+    pub fn literal(&mut self, s: String) -> RegexId {
+        self.add_node(RegexNode::Literal(s))
+    }
+
+    pub fn concat(&mut self, nodes: Vec<RegexId>) -> RegexId {
+        if nodes.len() == 1 {
+            return nodes[0];
+        }
+        if nodes.len() == 0 {
+            return self.add_node(RegexNode::NoMatch);
+        }
+        self.add_node(RegexNode::Concat(nodes))
+    }
+
+    pub fn select(&mut self, nodes: Vec<RegexId>) -> RegexId {
+        if nodes.len() == 1 {
+            return nodes[0];
+        }
+        if nodes.len() == 0 {
+            return self.add_node(RegexNode::NoMatch);
+        }
+        self.add_node(RegexNode::Or(nodes))
+    }
+
+    pub fn zero_or_more(&mut self, node: RegexId) -> RegexId {
+        self.repeat(node, 0, None)
+    }
+
+    pub fn one_or_more(&mut self, node: RegexId) -> RegexId {
+        self.repeat(node, 1, None)
+    }
+
+    pub fn optional(&mut self, node: RegexId) -> RegexId {
+        self.repeat(node, 0, Some(1))
+    }
+
+    pub fn repeat(&mut self, node: RegexId, min: u32, max: Option<u32>) -> RegexId {
+        self.add_node(RegexNode::Repeat(node, min, max))
+    }
+
+    fn finalize(&mut self) -> Vec<RegexNode> {
+        let r = std::mem::take(&mut self.nodes);
+        *self = Self::new();
+        r
+    }
 }
 
 impl GrammarBuilder {
@@ -52,16 +126,10 @@ impl GrammarBuilder {
             strings: HashMap::new(),
             curr_grammar_id: 0,
             nodes: vec![],
-            rx_nodes: vec![],
+            regex: RegexBuilder::new(),
         }
     }
 
-    pub fn add_regex_node(&mut self, node: RegexNode) -> RegexId {
-        let id = RegexId(self.rx_nodes.len());
-        self.rx_nodes.push(node);
-        id
-    }
-
     fn shift_nodes(&mut self) {
         if self.top_grammar.grammars.len() == 0 {
             assert!(self.nodes.is_empty(), "nodes added before add_grammar()");
@@ -72,8 +140,7 @@ impl GrammarBuilder {
                 "no nodes added before add_grammar() or finalize()"
            );
             self.top_grammar.grammars.last_mut().unwrap().nodes = nodes;
-            self.top_grammar.grammars.last_mut().unwrap().rx_nodes =
-                std::mem::take(&mut self.rx_nodes);
+            self.top_grammar.grammars.last_mut().unwrap().rx_nodes = self.regex.finalize();
         }
     }
 
@@ -170,10 +237,19 @@ impl GrammarBuilder {
         self.select(&[value, empty])
     }
 
+    pub fn one_or_more(&mut self, elt: NodeRef) -> NodeRef {
+        let p = self.placeholder();
+        let p_elt = self.join(&[p, elt]);
+        let inner = self.select(&[elt, p_elt]);
+        self.set_placeholder(p, inner);
+        p
+    }
+
     pub fn zero_or_more(&mut self, elt: NodeRef) -> NodeRef {
         let p = self.placeholder();
         let empty = self.empty();
-        let inner = self.select(&[empty, elt]);
+        let p_elt = self.join(&[p, elt]);
+        let inner = self.select(&[empty, p_elt]);
         self.set_placeholder(p, inner);
         p
     }
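
The zero_or_more() fix is the substance of this patch: the old body expanded the placeholder to `empty | elt`, which matches at most one `elt`. The new body threads the placeholder back into itself, and the new one_or_more() is the same construction seeded with `elt` instead of `empty`:

    p ::= ε | p elt      (zero_or_more)
    q ::= elt | q elt    (one_or_more)

Left recursion is fine here, since the Earley parser consumes these productions directly.
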
From 6a39e606bed063a8959b2e75872bf1ca7f26dd13 Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Mon, 4 Nov 2024 13:55:21 -0800
Subject: [PATCH 08/11] basic lark translation working

---
 parser/Cargo.toml              |   1 +
 parser/src/api.rs              |   3 +-
 parser/src/lark/ast.rs         |  11 +-
 parser/src/lark/compiler.rs    | 315 +++++++++++++++++++++++++++++----
 parser/src/lark/lexer.rs       |  22 ++-
 parser/src/lark/mod.rs         |   1 +
 parser/src/lark/parser.rs      |  67 ++++++-
 sample_parser/data/lark.lark   |   6 +-
 sample_parser/src/lark_test.rs |  16 +-
 9 files changed, 392 insertions(+), 50 deletions(-)

diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index 582c8ad0..7e3e5b9d 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -14,6 +14,7 @@ instant = "0.1.13"
 jsonschema = { version = "0.24.0", default-features = false }
 url = "2.5.2"
 lazy_static = "1.5.0"
+regex-syntax = "0.8.5"
 
 [features]
 default = []
diff --git a/parser/src/api.rs b/parser/src/api.rs
index 34450ffe..41b75ac7 100644
--- a/parser/src/api.rs
+++ b/parser/src/api.rs
@@ -21,6 +21,7 @@ pub struct GrammarWithLexer {
     /// The start symbol is at nodes[0]
     pub nodes: Vec<Node>,
 
+    /// This is no longer used.
     /// When enabled, the grammar can use `Lexeme` but not `Gen`.
     /// When disabled, the grammar can use `Gen` but not `Lexeme`.
     /// `String` is allowed in either case as a shorthand for either `Lexeme` or `Gen`.
@@ -196,7 +197,7 @@ pub struct GenGrammarOptions {
     pub max_tokens_grm: usize,
 }
 
-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, Clone, Debug, Hash, PartialEq, Eq)]
 pub enum RegexNode {
     /// Intersection of the regexes
     And(Vec<RegexId>),
diff --git a/parser/src/lark/ast.rs b/parser/src/lark/ast.rs
index 729612ef..e4936add 100644
--- a/parser/src/lark/ast.rs
+++ b/parser/src/lark/ast.rs
@@ -1,15 +1,19 @@
+use super::lexer::Location;
+
 /// Represents an item in the grammar (rule, token, or statement).
 #[derive(Debug, Clone)]
 pub enum Item {
     Rule(Rule),
     Token(TokenDef),
-    Statement(Statement),
+    Statement(Location, Statement),
 }
 
 /// Represents a grammar rule.
 #[derive(Debug, Clone)]
 pub struct Rule {
     pub name: String,
+    pub cond_inline: bool,
+    pub pin_terminals: bool,
     pub params: Option<RuleParams>,
     pub priority: Option<i32>,
     pub expansions: Expansions,
@@ -54,7 +58,7 @@ pub struct TokenParams(pub Vec<String>);
 
 /// Represents a list of expansions.
 #[derive(Debug, Clone)]
-pub struct Expansions(pub Vec<Alias>);
+pub struct Expansions(pub Location, pub Vec<Alias>);
 
 /// Represents an alias in the grammar.
 #[derive(Debug, Clone)]
@@ -88,7 +92,8 @@ pub enum Atom {
 pub enum Value {
     LiteralRange(String, String),
     Name(String),
-    Literal(String),
+    LiteralString(String, String),
+    LiteralRegex(String, String),
     TemplateUsage { name: String, values: Vec<Value> },
 }
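
The AST changes above — a `Location` on `Statement` and `Expansions`, and `Literal` split into `LiteralString`/`LiteralRegex`, each carrying its flags — are easiest to see by dumping the parse result. A sketch, assuming the `lark` module is exported from the crate as `llguidance_parser::lark` (not shown in these patches):

    use llguidance_parser::lark::parse_lark;

    fn main() -> anyhow::Result<()> {
        // "a"i is a case-insensitive string literal; /b+/ is a regex literal
        let items = parse_lark("start: \"a\"i | /b+/\n")?;
        for it in &items {
            println!("{:?}", it); // Rule { cond_inline, pin_terminals, .. } etc.
        }
        Ok(())
    }
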
diff --git a/parser/src/lark/compiler.rs b/parser/src/lark/compiler.rs
index 6262be8f..c259d44e 100644
--- a/parser/src/lark/compiler.rs
+++ b/parser/src/lark/compiler.rs
@@ -1,65 +1,318 @@
-use std::collections::HashMap;
+use std::{
+    collections::{HashMap, HashSet},
+    sync::Arc,
+};
 
-use anyhow::Result;
+use anyhow::{anyhow, bail, ensure, Result};
 
 use crate::{
-    api::{RegexSpec, TopLevelGrammar},
+    api::{GrammarWithLexer, RegexId, RegexSpec, TopLevelGrammar},
     GrammarBuilder, NodeRef,
 };
 
 use super::ast::*;
 
 struct Compiler {
+    test_rx: derivre::RegexBuilder,
     builder: GrammarBuilder,
     items: Vec<Item>,
-    nodes: HashMap<String, NodeInfo>,
-}
-
-struct NodeInfo {
-    id: NodeRef,
-    is_terminal: bool,
-    regex: Option<RegexSpec>,
+    ignore: Vec<Expansions>,
+    rules: Arc<HashMap<String, Rule>>,
+    tokens: Arc<HashMap<String, TokenDef>>,
+    node_ids: HashMap<String, NodeRef>,
+    regex_ids: HashMap<String, RegexId>,
+    in_progress: HashSet<String>,
 }
 
 pub fn lark_to_llguidance(items: Vec<Item>) -> Result<TopLevelGrammar> {
     let mut c = Compiler {
         builder: GrammarBuilder::new(),
+        test_rx: derivre::RegexBuilder::new(),
         items,
-        nodes: HashMap::new(),
+        ignore: Vec::new(),
+        rules: Arc::new(HashMap::new()),
+        tokens: Arc::new(HashMap::new()),
+        node_ids: HashMap::new(),
+        regex_ids: HashMap::new(),
+        in_progress: HashSet::new(),
     };
     c.execute()?;
     c.builder.finalize()
 }
 
 impl Compiler {
+    fn do_token(&mut self, name: &str) -> Result<RegexId> {
+        if let Some(id) = self.regex_ids.get(name) {
+            return Ok(*id);
+        }
+        if self.in_progress.contains(name) {
+            bail!("circular reference in token {:?} definition", name);
+        }
+        self.in_progress.insert(name.to_string());
+        let tokens = Arc::clone(&self.tokens);
+        let token = tokens
+            .get(name)
+            .ok_or_else(|| anyhow!("token {:?} not found", name))?;
+        let id = self.do_token_expansions(&token.expansions)?;
+        self.regex_ids.insert(name.to_string(), id);
+        self.in_progress.remove(name);
+        Ok(id)
+    }
+
+    fn mk_regex(&mut self, info: &str, rx: String) -> Result<RegexId> {
+        self.test_rx
+            .mk_regex(&rx)
+            .map_err(|e| anyhow!("invalid regex {rx:?} (in {info}): {e}"))?;
+        Ok(self.builder.regex.regex(rx))
+    }
+
+    fn do_token_atom(&mut self, atom: &Atom) -> Result<RegexId> {
+        match atom {
+            Atom::Group(expansions) => self.do_token_expansions(expansions),
+            Atom::Maybe(expansions) => {
+                let id = self.do_token_expansions(expansions)?;
+                Ok(self.builder.regex.optional(id))
+            }
+            Atom::Value(value) => match value {
+                Value::LiteralRange(a, b) => {
+                    ensure!(
+                        a.chars().count() == 1,
+                        "range start must be a single character"
+                    );
+                    ensure!(
+                        b.chars().count() == 1,
+                        "range end must be a single character"
+                    );
+                    let a = a.chars().next().unwrap();
+                    let b = b.chars().next().unwrap();
+                    if a <= b {
+                        self.mk_regex(
+                            "range",
+                            format!(
+                                "[{}-{}]",
+                                regex_syntax::escape(&a.to_string()),
+                                regex_syntax::escape(&b.to_string())
+                            ),
+                        )
+                    } else {
+                        bail!("invalid range order: {:?}..{:?}", a, b);
+                    }
+                }
+                Value::Name(n) => self.do_token(n),
+                Value::LiteralString(val, flags) => {
+                    if flags.contains("i") {
+                        self.mk_regex(
+                            "string with i-flag",
+                            format!("(?i){}", regex_syntax::escape(val)),
+                        )
+                    } else {
+                        Ok(self.builder.regex.literal(val.clone()))
+                    }
+                }
+                Value::LiteralRegex(val, flags) => {
+                    ensure!(!flags.contains("l"), "l-flag is not supported in regexes");
+                    let rx = if flags.is_empty() {
+                        val.clone()
+                    } else {
+                        format!("(?{}){}", flags, val)
+                    };
+                    self.mk_regex("regex", rx)
+                }
+                Value::TemplateUsage { .. } => bail!("template usage not supported yet"),
+            },
+        }
+    }
+
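+    /// Lower one token-level expression: compile the atom, then apply an
+    /// explicit ~min..max repetition range or a trailing *, + or ? operator.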
+    fn do_token_expr(&mut self, expr: &Expr) -> Result<RegexId> {
+        let atom = self.do_token_atom(&expr.atom)?;
+        if let Some(range) = &expr.range {
+            ensure!(expr.op.is_none(), "ranges not supported with operators");
+            ensure!(range.0 >= 0, "range start must be >= 0");
+            ensure!(range.1 >= range.0, "range end must be >= start");
+            Ok(self
+                .builder
+                .regex
+                .repeat(atom, range.0 as u32, Some(range.1 as u32)))
+        } else {
+            match &expr.op {
+                Some(op) => match op.0.as_str() {
+                    "*" => Ok(self.builder.regex.zero_or_more(atom)),
+                    "+" => Ok(self.builder.regex.one_or_more(atom)),
+                    "?" => Ok(self.builder.regex.optional(atom)),
+                    _ => {
+                        bail!("unsupported operator: {:?}", op.0);
+                    }
+                },
+                None => Ok(atom),
+            }
+        }
+    }
+
+    fn do_token_expansions(&mut self, expansions: &Expansions) -> Result<RegexId> {
+        let options = expansions
+            .1
+            .iter()
+            .map(|alias| {
+                let args = alias
+                    .expansion
+                    .0
+                    .iter()
+                    .map(|e| self.do_token_expr(e))
+                    .collect::<Result<Vec<_>>>()?;
+                Ok(self.builder.regex.concat(args))
+            })
+            .collect::<Result<Vec<_>>>()
+            .map_err(|e| expansions.0.augment(e))?;
+        Ok(self.builder.regex.select(options))
+    }
+
+    fn lift_regex(&mut self, rx_id: RegexId) -> Result<NodeRef> {
+        Ok(self.builder.lexeme(RegexSpec::RegexId(rx_id), false))
+    }
+
+    fn do_atom(&mut self, expr: &Atom) -> Result<NodeRef> {
+        match expr {
+            Atom::Group(expansions) => self.do_expansions(expansions),
+            Atom::Maybe(expansions) => {
+                let id = self.do_expansions(expansions)?;
+                Ok(self.builder.optional(id))
+            }
+            Atom::Value(value) => {
+                match value {
+                    Value::Name(n) => {
+                        if self.rules.contains_key(n) {
+                            return self.do_rule(n);
+                        } else if self.tokens.contains_key(n) {
+                            // OK -> treat as token
+                        } else {
+                            bail!("unknown name: {:?}", n);
+                        }
+                    }
+                    Value::LiteralRange(_, _)
+                    | Value::LiteralString(_, _)
+                    | Value::LiteralRegex(_, _) => {
+                        // treat as token
+                    }
+                    Value::TemplateUsage { .. } => {
+                        bail!("template usage not supported yet");
+                    }
+                };
+                let rx = self.do_token_atom(expr)?;
+                Ok(self.lift_regex(rx)?)
+            }
+        }
+    }
+
+    fn do_expr(&mut self, expr: &Expr) -> Result<NodeRef> {
+        ensure!(expr.range.is_none(), "ranges (~1..100) not supported yet");
+        let atom = self.do_atom(&expr.atom)?;
+        match &expr.op {
+            Some(op) => match op.0.as_str() {
+                "*" => Ok(self.builder.zero_or_more(atom)),
+                "+" => Ok(self.builder.one_or_more(atom)),
+                "?" => Ok(self.builder.optional(atom)),
+                _ => {
+                    bail!("unsupported operator: {}", op.0);
+                }
+            },
+            None => Ok(atom),
+        }
+    }
+
+    fn do_expansions(&mut self, expansions: &Expansions) -> Result<NodeRef> {
+        let options = expansions
+            .1
+            .iter()
+            .map(|alias| {
+                let args = alias
+                    .expansion
+                    .0
+                    .iter()
+                    .map(|e| self.do_expr(e))
+                    .collect::<Result<Vec<_>>>()?;
+                Ok(self.builder.join(&args))
+            })
+            .collect::<Result<Vec<_>>>()
+            .map_err(|e| expansions.0.augment(e))?;
+        Ok(self.builder.select(&options))
+    }
+
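+    /// Compile a rule by name, memoized via node_ids. If the rule is already
+    /// in progress (it references itself, directly or mutually), hand out a
+    /// placeholder NodeRef now and patch it via set_placeholder() once the
+    /// rule body has been compiled.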
+    fn do_rule(&mut self, name: &str) -> Result<NodeRef> {
+        if let Some(id) = self.node_ids.get(name) {
+            return Ok(*id);
+        }
+        if self.in_progress.contains(name) {
+            let id = self.builder.placeholder();
+            self.node_ids.insert(name.to_string(), id);
+            return Ok(id);
+        }
+        self.in_progress.insert(name.to_string());
+        let rules = Arc::clone(&self.rules);
+        let rule = rules
+            .get(name)
+            .ok_or_else(|| anyhow!("rule {:?} not found", name))?;
+        let id = self.do_expansions(&rule.expansions)?;
+        if let Some(placeholder) = self.node_ids.get(name) {
+            self.builder.set_placeholder(*placeholder, id);
+        }
+        self.node_ids.insert(name.to_string(), id);
+        self.in_progress.remove(name);
+        Ok(id)
+    }
+
+    fn do_statement(&mut self, statement: Statement) -> Result<()> {
+        match statement {
+            Statement::Ignore(exp) => {
+                self.ignore.push(exp);
+            }
+            Statement::Import { .. } | Statement::MultiImport { .. } => {
+                bail!("import statement not supported yet");
+            }
+            Statement::OverrideRule(_) => {
+                bail!("override statement not supported yet");
+            }
+            Statement::Declare(_) => {
+                // ignore
+            }
+        }
+        Ok(())
+    }
+
     fn execute(&mut self) -> Result<()> {
-        for item in self.items.iter() {
+        let mut rules = HashMap::new();
+        let mut tokens = HashMap::new();
+        for item in std::mem::take(&mut self.items) {
             match item {
                 Item::Rule(rule) => {
-                    let id = self.builder.placeholder();
-                    self.nodes.insert(
-                        rule.name.clone(),
-                        NodeInfo {
-                            id,
-                            is_terminal: false,
-                            regex: None,
-                        },
-                    );
+                    ensure!(rule.params.is_none(), "params not supported yet");
+                    ensure!(rule.priority.is_none(), "priority not supported yet");
+                    rules.insert(rule.name.clone(), rule);
                 }
                 Item::Token(token_def) => {
-                    let id = self.builder.placeholder();
-                    self.nodes.insert(
-                        token_def.name.clone(),
-                        NodeInfo {
-                            id,
-                            is_terminal: true,
-                            regex: None,
-                        },
-                    );
+                    ensure!(token_def.params.is_none(), "params not supported yet");
+                    ensure!(token_def.priority.is_none(), "priority not supported yet");
+                    tokens.insert(token_def.name.clone(), token_def);
+                }
+                Item::Statement(loc, statement) => {
+                    self.do_statement(statement).map_err(|e| loc.augment(e))?;
                 }
-                Item::Statement(statement) => todo!(),
             }
         }
+        ensure!(rules.contains_key("start"), "no start rule found");
+        self.rules = Arc::new(rules);
+        self.tokens = Arc::new(tokens);
+        self.builder.add_grammar(GrammarWithLexer::default());
+        let ignore = std::mem::take(&mut self.ignore)
+            .iter()
+            .map(|exp| self.do_token_expansions(exp))
+            .collect::<Result<Vec<_>>>()?;
+        let start = self.do_rule("start")?;
+        self.builder.set_start_node(start);
+        if ignore.len() > 0 {
+            let ignore_rx = self.builder.regex.select(ignore);
+            self.builder.top_grammar.grammars[0].greedy_skip_rx =
+                Some(RegexSpec::RegexId(ignore_rx));
+        }
         Ok(())
     }
 }
diff --git a/parser/src/lark/lexer.rs b/parser/src/lark/lexer.rs
index f36f61b5..ee82f93d 100644
--- a/parser/src/lark/lexer.rs
+++ b/parser/src/lark/lexer.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, fmt::Display};
 
 use anyhow::{bail, Result};
 use derivre::{RegexAst, RegexBuilder};
@@ -27,7 +27,7 @@ pub enum Token {
     RBracket,
     Tilde,
     // regexps
-    Op,
+    Op, // + * ?
     String,
     Regexp,
     Rule,
@@ -49,6 +49,24 @@ pub struct Lexeme {
     pub column: usize,
 }
 
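+/// A line/column position in the grammar source; augment() prefixes an error
+/// with this location unless the message already starts with "at ".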
+#[derive(Debug, Clone)]
+pub struct Location {
+    pub line: usize,
+    pub column: usize,
+}
+
+impl Location {
+    pub fn augment(&self, err: impl Display) -> anyhow::Error {
+        let err = err.to_string();
+        if err.starts_with("at ") {
+            // don't add more location info
+            anyhow::anyhow!("{err}")
+        } else {
+            anyhow::anyhow!("at {}({}): {}", self.line, self.column, err)
+        }
+    }
+}
+
 impl Token {
     const LITERAL_TOKENS: &'static [(Token, &'static str)] = &[
         (Token::Arrow, "->"),
diff --git a/parser/src/lark/mod.rs b/parser/src/lark/mod.rs
index a1e9a1a5..9833a52c 100644
--- a/parser/src/lark/mod.rs
+++ b/parser/src/lark/mod.rs
@@ -4,3 +4,4 @@ mod lexer;
 mod parser;
 
 pub use parser::parse_lark;
+pub use compiler::lark_to_llguidance;
\ No newline at end of file
diff --git a/parser/src/lark/parser.rs b/parser/src/lark/parser.rs
index e97a598b..ceb26f04 100644
--- a/parser/src/lark/parser.rs
+++ b/parser/src/lark/parser.rs
@@ -1,8 +1,8 @@
 use super::{
     ast::*,
-    lexer::{lex_lark, Lexeme, Token},
+    lexer::{lex_lark, Lexeme, Location, Token},
 };
-use anyhow::{anyhow, bail, Result};
+use anyhow::{anyhow, bail, ensure, Result};
 
 /// The parser struct that holds the tokens and current position.
 pub struct Parser {
@@ -37,7 +37,19 @@ impl Parser {
         } else if self.has_token(Token::Token) {
             Ok(Item::Token(self.parse_token_def()?))
         } else {
-            Ok(Item::Statement(self.parse_statement()?))
+            let loc = self.location();
+            Ok(Item::Statement(loc, self.parse_statement()?))
+        }
+    }
+
+    fn location(&self) -> Location {
+        if let Some(t) = self.peek_token() {
+            Location {
+                line: t.line,
+                column: t.column,
+            }
+        } else {
+            Location { line: 0, column: 0 }
         }
     }
 
@@ -56,8 +68,20 @@ impl Parser {
         };
         self.expect_token(Token::Colon)?;
         let expansions = self.parse_expansions()?;
+        let (name, pin_terminals) = if name.starts_with("!") {
+            (name[1..].to_string(), true)
+        } else {
+            (name, false)
+        };
+        let (name, cond_inline) = if name.starts_with("?") {
+            (name[1..].to_string(), true)
+        } else {
+            (name, false)
+        };
         Ok(Rule {
             name,
+            pin_terminals,
+            cond_inline,
             params,
             priority,
             expansions,
@@ -166,12 +190,13 @@ impl Parser {
 
     /// Parses expansions.
     fn parse_expansions(&mut self) -> Result<Expansions> {
+        let loc = self.location();
         let mut aliases = Vec::new();
         aliases.push(self.parse_alias()?);
         while self.match_vbar() {
             aliases.push(self.parse_alias()?);
         }
-        Ok(Expansions(aliases))
+        Ok(Expansions(loc, aliases))
     }
 
     fn match_vbar(&mut self) -> bool {
@@ -251,17 +276,43 @@ impl Parser {
         }
     }
 
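+    /// Decode a STRING lexeme using JSON string syntax, splitting off the
+    /// optional trailing `i` (case-insensitive) flag; returns (contents, flags).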
+    fn parse_string(&self, string1: &Lexeme) -> Result<(String, String)> {
+        let inner = string1.value.clone();
+        let (inner, flags) = if inner.ends_with('i') {
+            (inner[..inner.len() - 1].to_string(), "i".to_string())
+        } else {
+            (inner, "".to_string())
+        };
+        let inner =
+            serde_json::from_str(&inner).map_err(|e| anyhow!("error parsing string: {e}"))?;
+        Ok((inner, flags))
+    }
+
+    fn parse_simple_string(&self, string1: &Lexeme) -> Result<String> {
+        let (inner, flags) = self.parse_string(string1)?;
+        ensure!(flags.is_empty(), "flags not allowed in this context");
+        Ok(inner)
+    }
+
     /// Parses a value.
     fn parse_value(&mut self) -> Result<Value> {
         if let Some(string1) = self.match_token_with_value(Token::String) {
             if self.match_token(Token::DotDot) {
-                let string2 = self.expect_token(Token::String)?.value;
-                Ok(Value::LiteralRange(string1.value.clone(), string2))
+                let string2 = self.expect_token(Token::String)?;
+                Ok(Value::LiteralRange(
+                    self.parse_simple_string(&string1)?,
+                    self.parse_simple_string(&string2)?,
+                ))
             } else {
-                Ok(Value::Literal(string1.value.clone()))
+                let (inner, flags) = self.parse_string(&string1)?;
+                Ok(Value::LiteralString(inner, flags))
             }
         } else if let Some(regexp_token) = self.match_token_with_value(Token::Regexp) {
-            Ok(Value::Literal(regexp_token.value.clone()))
+            let inner = regexp_token.value;
+            let last_slash_idx = inner.rfind('/').unwrap();
+            let flags = inner[last_slash_idx + 1..].to_string();
+            let regex = inner[1..last_slash_idx].to_string();
+            Ok(Value::LiteralRegex(regex, flags))
         } else if let Some(name_token) = self
             .match_token_with_value(Token::Rule)
             .or_else(|| self.match_token_with_value(Token::Token))
diff --git a/sample_parser/data/lark.lark b/sample_parser/data/lark.lark
index fc8c81cc..34eaa323 100644
--- a/sample_parser/data/lark.lark
+++ b/sample_parser/data/lark.lark
@@ -46,15 +46,15 @@ name: RULE | TOKEN
 
 _VBAR: _NL? "|"
 
-OP: /qqq[+*]|[?](?![a-z])/
+OP: /[+*]|[?]/
 RULE: /!?[_?]?[a-z][_a-z0-9]*/
 TOKEN: /_?[A-Z][_A-Z0-9]*/
 STRING: _STRING "i"?
-REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/
+REGEXP: /\/(\\\/|\\\\|[^\/])+?\/[imslux]*/
 _NL: /(\r?\n)+\s*/
 
 _STRING_INNER: /.*?/
 _STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
-_STRING: "\"" _STRING_ESC_INNER "\""
+_STRING: /"(\\([\"\\\/bfnrt]|u[a-fA-F0-9]{4})|[^\"\\\x00-\x1F\x7F])*"/
diff --git a/sample_parser/src/lark_test.rs b/sample_parser/src/lark_test.rs
--- a/sample_parser/src/lark_test.rs
+++ b/sample_parser/src/lark_test.rs
@@ -1,4 +1,8 @@
+use std::fs::File;
+use std::io::Write;
+
+use llguidance_parser::lark::{lark_to_llguidance, parse_lark};
 use std::env;
 
 fn main() {
     let args: Vec<String> = env::args().collect();
@@ -14,6 +18,14 @@ fn main() {
     for it in r.iter() {
         println!("{:?}", it);
     }
+
+    let llguidance = lark_to_llguidance(r).unwrap();
+    let json = serde_json::to_string_pretty(&llguidance).unwrap();
+    // write json to file
+    let mut file = File::create("tmp/llguidance.json").expect("Unable to create file");
+    file.write_all(json.as_bytes())
+        .expect("Unable to write data to file");
+    println!("tmp/llguidance.json created");
 }
 
 fn read_file_to_string(filename: &str) -> String {
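
At this point the whole chain works end to end: lex_lark → parse_lark → lark_to_llguidance, with %ignore patterns compiled into the grammar's greedy_skip_rx. A minimal driver in the spirit of lark_test.rs above (same assumption about the llguidance_parser::lark exports):

    use llguidance_parser::lark::{lark_to_llguidance, parse_lark};

    fn main() -> anyhow::Result<()> {
        // upper-case names are tokens (regexes), lower-case names are rules
        let grammar = r#"start: NUMBER ("," NUMBER)*
    NUMBER: /[0-9]+/
    %ignore /[ \t]+/
    "#;
        let top = lark_to_llguidance(parse_lark(grammar)?)?;
        println!("{}", serde_json::to_string_pretty(&top)?);
        Ok(())
    }
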
From 9c9ab2416c985a9424742795fa891780448523a6 Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Mon, 4 Nov 2024 14:12:43 -0800
Subject: [PATCH 09/11] add llg_new_constraint_lark

---
 parser/llguidance.h |  7 +++++++
 parser/src/ffi.rs   | 20 +++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/parser/llguidance.h b/parser/llguidance.h
index e20576d2..bce6a44c 100644
--- a/parser/llguidance.h
+++ b/parser/llguidance.h
@@ -203,6 +203,13 @@ struct LlgConstraint *llg_new_constraint_regex(const struct LlgConstraintInit *i
 struct LlgConstraint *llg_new_constraint_json(const struct LlgConstraintInit *init,
                                               const char *json_schema);
 
+/**
+ * Create a new constraint from a given lark grammar
+ * Always returns a non-null value. Call llg_get_error() on the result to check for errors.
+ */
+struct LlgConstraint *llg_new_constraint_lark(const struct LlgConstraintInit *init,
+                                              const char *lark);
+
 /**
  * Get the error message from the constraint or null if there is no error.
  * After it returns a non-null value, it will always return it until the constraint is freed
diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs
index fd2f428d..3164b96c 100644
--- a/parser/src/ffi.rs
+++ b/parser/src/ffi.rs
@@ -8,6 +8,7 @@ use toktrie::{InferenceCapabilities, TokEnv, TokRxInfo, TokTrie, TokenizerEnv};
 
 use crate::{
     api::{ParserLimits, RegexNode, TopLevelGrammar},
+    lark::{lark_to_llguidance, parse_lark},
     CommitResult, Constraint, JsonCompileOptions, Logger, TokenParser,
 };
 
@@ -273,6 +274,14 @@ fn new_constraint_regex(init: &LlgConstraintInit, regex: *const c_char) -> Resul
     new_constraint_core(init, grammar)
 }
 
+fn new_constraint_lark(init: &LlgConstraintInit, lark: *const c_char) -> Result<LlgConstraint> {
+    let lark = unsafe { CStr::from_ptr(lark) }
+        .to_str()
+        .map_err(|_| anyhow::anyhow!("Invalid UTF-8 in lark"))?;
+    let grammar = lark_to_llguidance(parse_lark(lark)?)?;
+    new_constraint_core(init, grammar)
+}
+
 fn new_constraint_json(init: &LlgConstraintInit, json_schema: *const c_char) -> Result<LlgConstraint> {
     let json_schema = unsafe { CStr::from_ptr(json_schema) }
         .to_str()
@@ -400,6 +409,16 @@ pub extern "C" fn llg_new_constraint_json(
     return_constraint(new_constraint_json(init, json_schema))
 }
 
+/// Create a new constraint from a given lark grammar
+/// Always returns a non-null value. Call llg_get_error() on the result to check for errors.
+#[no_mangle]
+pub extern "C" fn llg_new_constraint_lark(
+    init: &LlgConstraintInit,
+    lark: *const c_char,
+) -> *mut LlgConstraint {
+    return_constraint(new_constraint_lark(init, lark))
+}
+
 /// Get the error message from the constraint or null if there is no error.
 /// After it returns a non-null value, it will always return it until the constraint is freed
 /// using llg_free_constraint() (at which point the pointer will be invalid).
@@ -532,7 +551,6 @@ pub extern "C" fn llg_stringify_tokens(
     s.len() + 1
 }
 
-
 /// Free the tokenizer. Should *NOT* be called while there are still constraints using it.
 #[no_mangle]
 pub extern "C" fn llg_free_tokenizer(tok: *mut LlgTokenizer) {
From 075714bbd21f010c759bcea2443e166c175e744c Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Mon, 4 Nov 2024 14:12:50 -0800
Subject: [PATCH 10/11] add support for json_schema and lark_grammar in GrammarWithLexer

---
 parser/src/api.rs                  | 13 ++++++++++++
 parser/src/earley/from_guidance.rs | 34 ++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/parser/src/api.rs b/parser/src/api.rs
index 41b75ac7..3f129feb 100644
--- a/parser/src/api.rs
+++ b/parser/src/api.rs
@@ -1,6 +1,7 @@
 use std::fmt::Debug;
 
 use serde::{Deserialize, Serialize};
+use serde_json::Value;
 
 /// This represents a collection of grammars, with a designated
 /// "start" grammar at first position.
@@ -19,8 +20,18 @@ pub const DEFAULT_CONTEXTUAL: bool = true;
 #[derive(Serialize, Deserialize, Clone, Default)]
 pub struct GrammarWithLexer {
     /// The start symbol is at nodes[0]
+    /// When nodes is empty, then one of json_schema or lark_grammar must be set.
+    #[serde(default)]
     pub nodes: Vec<Node>,
 
+    /// The JSON schema that the grammar should generate.
+    /// When this is set, nodes and rx_nodes must be empty.
+    pub json_schema: Option<Value>,
+
+    /// The Lark grammar that the grammar should generate.
+    /// When this is set, nodes and rx_nodes must be empty.
+    pub lark_grammar: Option<String>,
+
     /// This is no longer used.
     /// When enabled, the grammar can use `Lexeme` but not `Gen`.
     /// When disabled, the grammar can use `Gen` but not `Lexeme`.
@@ -374,6 +385,8 @@ impl TopLevelGrammar {
                 json_allowed_escapes: None,
                 json_raw: None,
             }],
+            json_schema: None,
+            lark_grammar: None,
             greedy_lexer: true,
             greedy_skip_rx: None,
             contextual: None,
diff --git a/parser/src/earley/from_guidance.rs b/parser/src/earley/from_guidance.rs
index 3b646e66..f6819630 100644
--- a/parser/src/earley/from_guidance.rs
+++ b/parser/src/earley/from_guidance.rs
@@ -6,7 +6,8 @@ use crate::api::{
     GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
     DEFAULT_CONTEXTUAL,
 };
-use crate::{loginfo, Logger};
+use crate::lark::{lark_to_llguidance, parse_lark};
+use crate::{loginfo, JsonCompileOptions, Logger};
 use anyhow::{bail, ensure, Result};
 use derivre::{ExprRef, JsonQuoteOptions, RegexAst, RegexBuilder};
 use instant::Instant;
@@ -84,8 +85,37 @@ fn map_rx_nodes(
 fn grammar_from_json(
     tok_env: &TokEnv,
     limits: &mut ParserLimits,
-    input: GrammarWithLexer,
+    mut input: GrammarWithLexer,
 ) -> Result<(LexerSpec, Grammar)> {
+    if input.json_schema.is_some() || input.lark_grammar.is_some() {
+        ensure!(
+            input.nodes.is_empty() && input.rx_nodes.is_empty(),
+            "cannot have both json_schema/lark_grammar and nodes/rx_nodes"
+        );
+
+        let mut new_grm = if let Some(json_schema) = input.json_schema.as_ref() {
+            ensure!(
+                input.lark_grammar.is_none(),
+                "cannot have both json_schema and lark_grammar"
+            );
+            let opts = JsonCompileOptions { compact: false };
+            opts.json_to_llg_no_validate(json_schema)?
+        } else {
+            let items = parse_lark(input.lark_grammar.as_ref().unwrap())?;
+            lark_to_llguidance(items)?
+        };
+
+        let g = new_grm.grammars.pop().unwrap();
+
+        input.greedy_skip_rx = g.greedy_skip_rx;
+        input.nodes = g.nodes;
+        input.rx_nodes = g.rx_nodes;
+        input.contextual = g.contextual;
+
+        input.lark_grammar = None;
+        input.json_schema = None;
+    }
+
     ensure!(input.nodes.len() > 0, "empty grammar");
     let (builder, rx_nodes) = map_rx_nodes(limits, input.rx_nodes, input.allow_invalid_utf8)?;
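
With these two fields, a client no longer has to pre-compile its input: grammar_from_json() expands a lark grammar or JSON schema in place. A sketch of the corresponding payload shape (shape only; assumes the serde defaults of TopLevelGrammar for everything omitted):

    let top_level = serde_json::json!({
        "grammars": [{
            "lark_grammar": "start: \"yes\" | \"no\"\n"
        }]
    });
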
From bbd8e3c5d182800fd5199814de94f86262fba68e Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Mon, 4 Nov 2024 14:17:34 -0800
Subject: [PATCH 11/11] add llg_new_constraint_any

---
 parser/llguidance.h |  9 +++++++++
 parser/src/ffi.rs   | 29 +++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/parser/llguidance.h b/parser/llguidance.h
index bce6a44c..d6c51ec0 100644
--- a/parser/llguidance.h
+++ b/parser/llguidance.h
@@ -210,6 +210,15 @@ struct LlgConstraint *llg_new_constraint_json(const struct LlgConstraintInit *in
 struct LlgConstraint *llg_new_constraint_lark(const struct LlgConstraintInit *init,
                                               const char *lark);
 
+/**
+ * Create a new constraint with specified type
+ * Type can be one of "regex", "json_schema" (or "json"), "lark", "llguidance" (or "guidance")
+ * Always returns a non-null value. Call llg_get_error() on the result to check for errors.
+ */
+struct LlgConstraint *llg_new_constraint_any(const struct LlgConstraintInit *init,
+                                             const char *constraint_type,
+                                             const char *data);
+
 /**
  * Get the error message from the constraint or null if there is no error.
  * After it returns a non-null value, it will always return it until the constraint is freed
diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs
index 3164b96c..e29ec5dc 100644
--- a/parser/src/ffi.rs
+++ b/parser/src/ffi.rs
@@ -304,6 +304,23 @@ fn new_constraint(init: &LlgConstraintInit, grammar_json: *const c_char) -> Resu
     new_constraint_core(init, grammar)
 }
 
+fn new_constraint_any(
+    init: &LlgConstraintInit,
+    constraint_type: *const c_char,
+    data: *const c_char,
+) -> Result<LlgConstraint> {
+    let tp = unsafe { CStr::from_ptr(constraint_type) }
+        .to_str()
+        .map_err(|_| anyhow::anyhow!("Invalid UTF-8 in constraint_type"))?;
+    match tp {
+        "regex" => new_constraint_regex(init, data),
+        "json" | "json_schema" => new_constraint_json(init, data),
+        "lark" => new_constraint_lark(init, data),
+        "llguidance" | "guidance" => new_constraint(init, data),
+        _ => bail!("unknown constraint type: {tp}"),
+    }
+}
+
 fn new_constraint_core(init: &LlgConstraintInit, grammar: TopLevelGrammar) -> Result<LlgConstraint> {
     if init.tokenizer.is_null() {
         bail!("Tokenizer is null");
@@ -419,6 +436,18 @@ pub extern "C" fn llg_new_constraint_lark(
     return_constraint(new_constraint_lark(init, lark))
 }
 
+/// Create a new constraint with specified type
+/// Type can be one of "regex", "json_schema" (or "json"), "lark", "llguidance" (or "guidance")
+/// Always returns a non-null value. Call llg_get_error() on the result to check for errors.
+#[no_mangle]
+pub extern "C" fn llg_new_constraint_any(
+    init: &LlgConstraintInit,
+    constraint_type: *const c_char,
+    data: *const c_char,
+) -> *mut LlgConstraint {
+    return_constraint(new_constraint_any(init, constraint_type, data))
+}
+
 /// Get the error message from the constraint or null if there is no error.
 /// After it returns a non-null value, it will always return it until the constraint is freed
 /// using llg_free_constraint() (at which point the pointer will be invalid).