Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lark syntax -> llguidance translator #37

Merged
merged 12 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions parser/src/earley/lexerspec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@ impl LexerSpec {
SimpleVob::alloc(self.lexemes.len())
}

/// Returns a lexeme set with a bit set for every entry of `self.lexemes`
/// except the trailing `self.num_extra_lexemes` ones.
pub fn all_lexemes(&self) -> SimpleVob {
    let mut set = self.alloc_lexeme_set();
    // Only the leading (non-extra) lexemes are marked; the slice bounds the
    // index range exactly as the lexeme list is laid out.
    let regular = &self.lexemes[..self.lexemes.len() - self.num_extra_lexemes];
    for idx in 0..regular.len() {
        set.set(idx, true);
    }
    set
}

pub fn lazy_lexemes(&self) -> SimpleVob {
let mut v = self.alloc_lexeme_set();
for (idx, lex) in self.lexemes.iter().enumerate() {
Expand Down
2 changes: 1 addition & 1 deletion parser/src/earley/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
mod from_guidance;
mod grammar;
mod lexer;
pub(crate) mod lexer;
mod parser;

pub mod lexerspec;
Expand Down
92 changes: 90 additions & 2 deletions parser/src/grammar_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ use std::{collections::HashMap, sync::atomic::AtomicU32};

use anyhow::{ensure, Result};

use crate::api::{GrammarWithLexer, Node, NodeId, NodeProps, RegexSpec, TopLevelGrammar};
use crate::api::{
GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
};

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct NodeRef {
Expand All @@ -16,6 +18,81 @@ pub struct GrammarBuilder {
strings: HashMap<String, NodeRef>,
curr_grammar_id: u32,
nodes: Vec<Node>,
pub regex: RegexBuilder,
}

/// Accumulates regex AST nodes, handing out a `RegexId` for each one.
///
/// Structurally identical nodes are stored once: `add_node` looks the node
/// up in `node_ids` before appending it to `nodes`.
pub struct RegexBuilder {
/// Maps every node added so far to the id it was assigned.
node_ids: HashMap<RegexNode, RegexId>,
/// Nodes in insertion order; a node's id wraps its index in this vector.
nodes: Vec<RegexNode>,
}

impl RegexBuilder {
    /// Creates an empty builder with no nodes.
    pub fn new() -> Self {
        Self {
            nodes: vec![],
            node_ids: HashMap::new(),
        }
    }

    /// Interns `node` and returns its id.
    ///
    /// If an equal node was added before, its existing id is returned and
    /// nothing is stored; otherwise the node is appended and its index in
    /// the node list becomes the new id.
    pub fn add_node(&mut self, node: RegexNode) -> RegexId {
        if let Some(&id) = self.node_ids.get(&node) {
            return id;
        }
        let id = RegexId(self.nodes.len());
        // The node is needed both in the ordered list and as the map key.
        self.nodes.push(node.clone());
        self.node_ids.insert(node, id);
        id
    }

    /// Adds a node for the regular expression source `rx`.
    pub fn regex(&mut self, rx: String) -> RegexId {
        self.add_node(RegexNode::Regex(rx))
    }

    /// Adds a node matching the literal string `s`.
    pub fn literal(&mut self, s: String) -> RegexId {
        self.add_node(RegexNode::Literal(s))
    }

    /// Returns a node matching the given nodes one after another.
    ///
    /// A single-element list returns that element unchanged. An empty list
    /// returns a node matching the empty string: the empty string is the
    /// identity of concatenation, so `concat([])` followed by `x` must match
    /// exactly what `x` matches. (Returning `NoMatch` here, as `select` does
    /// for the empty alternation, would make any enclosing sequence
    /// unmatchable.)
    pub fn concat(&mut self, nodes: Vec<RegexId>) -> RegexId {
        match nodes.len() {
            0 => self.add_node(RegexNode::Literal(String::new())),
            1 => nodes[0],
            _ => self.add_node(RegexNode::Concat(nodes)),
        }
    }

    /// Returns a node matching any one of the given nodes.
    ///
    /// A single-element list returns that element unchanged. An empty list
    /// returns `NoMatch`, since an alternation with no alternatives can
    /// match nothing.
    pub fn select(&mut self, nodes: Vec<RegexId>) -> RegexId {
        match nodes.len() {
            0 => self.add_node(RegexNode::NoMatch),
            1 => nodes[0],
            _ => self.add_node(RegexNode::Or(nodes)),
        }
    }

    /// Returns a node repeating `node` zero or more times (no upper bound).
    pub fn zero_or_more(&mut self, node: RegexId) -> RegexId {
        self.repeat(node, 0, None)
    }

    /// Returns a node repeating `node` one or more times (no upper bound).
    pub fn one_or_more(&mut self, node: RegexId) -> RegexId {
        self.repeat(node, 1, None)
    }

    /// Returns a node repeating `node` zero or one time.
    pub fn optional(&mut self, node: RegexId) -> RegexId {
        self.repeat(node, 0, Some(1))
    }

    /// Returns a node repeating `node` at least `min` times and, when `max`
    /// is `Some`, at most that many times.
    pub fn repeat(&mut self, node: RegexId, min: u32, max: Option<u32>) -> RegexId {
        self.add_node(RegexNode::Repeat(node, min, max))
    }

    /// Takes the accumulated nodes out of the builder and resets it to the
    /// empty state, so ids handed out afterwards start again from zero.
    fn finalize(&mut self) -> Vec<RegexNode> {
        let r = std::mem::take(&mut self.nodes);
        *self = Self::new();
        r
    }
}

impl GrammarBuilder {
Expand Down Expand Up @@ -49,6 +126,7 @@ impl GrammarBuilder {
strings: HashMap::new(),
curr_grammar_id: 0,
nodes: vec![],
regex: RegexBuilder::new(),
}
}

Expand All @@ -62,6 +140,7 @@ impl GrammarBuilder {
"no nodes added before add_grammar() or finalize()"
);
self.top_grammar.grammars.last_mut().unwrap().nodes = nodes;
self.top_grammar.grammars.last_mut().unwrap().rx_nodes = self.regex.finalize();
}
}

Expand Down Expand Up @@ -158,10 +237,19 @@ impl GrammarBuilder {
self.select(&[value, empty])
}

/// Returns a node matching one or more repetitions of `elt`.
///
/// The result is a placeholder `rec` that is defined recursively as
/// `elt | rec elt`.
pub fn one_or_more(&mut self, elt: NodeRef) -> NodeRef {
    let rec = self.placeholder();
    // Recursive step: everything matched so far, followed by one more `elt`.
    let rec_then_elt = self.join(&[rec, elt]);
    // Base case (`elt` alone) or the recursive step.
    let body = self.select(&[elt, rec_then_elt]);
    self.set_placeholder(rec, body);
    rec
}

pub fn zero_or_more(&mut self, elt: NodeRef) -> NodeRef {
let p = self.placeholder();
let empty = self.empty();
let inner = self.select(&[empty, elt]);
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hudson-ai be aware that this was broken; I'll be merging this initial lark stuff soon, so the fix will come in (zero_or_more was really zero_or_one)

let p_elt = self.join(&[p, elt]);
let inner = self.select(&[empty, p_elt]);
self.set_placeholder(p, inner);
p
}
Expand Down
97 changes: 97 additions & 0 deletions parser/src/lark/ast.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/// Represents an item in the grammar (rule, token, or statement).
///
/// Each variant wraps the AST node type of the same kind.
#[derive(Debug, Clone)]
pub enum Item {
/// A rule definition.
Rule(Rule),
/// A token definition.
Token(TokenDef),
/// A statement; see [`Statement`] for the kinds available.
Statement(Statement),
}

/// Represents a grammar rule.
#[derive(Debug, Clone)]
pub struct Rule {
/// Name of the rule.
pub name: String,
/// Parameter names of the rule, if any (presumably Lark template
/// parameters — confirm against the parser).
pub params: Option<RuleParams>,
/// Priority attached to the rule, if any.
pub priority: Option<i32>,
/// Body of the rule.
pub expansions: Expansions,
}

/// Represents a token definition.
///
/// Has the same shape as [`Rule`], with [`TokenParams`] in place of
/// [`RuleParams`].
#[derive(Debug, Clone)]
pub struct TokenDef {
/// Name of the token.
pub name: String,
/// Parameter names of the token, if any.
pub params: Option<TokenParams>,
/// Priority attached to the token, if any.
pub priority: Option<i32>,
/// Body of the token definition.
pub expansions: Expansions,
}

/// Represents different types of statements.
///
/// NOTE(review): the variants presumably correspond to Lark's `%ignore`,
/// `%import`, `%override` and `%declare` directives — confirm against the
/// parser that produces them.
#[derive(Debug, Clone)]
pub enum Statement {
/// Expansions to be ignored.
Ignore(Expansions),
/// Import of a single path, optionally under an alias.
Import {
/// Path being imported.
path: ImportPath,
/// Alternative name for the imported item, if given.
alias: Option<String>,
},
/// Import of several names relative to one path.
MultiImport {
/// Path the names are imported from.
path: ImportPath,
/// Names being imported.
names: Vec<String>,
},
/// A rule given as an override (boxed to keep the enum small).
OverrideRule(Box<Rule>),
/// Names that are declared without a definition.
Declare(Vec<String>),
}

/// Represents an import path.
///
/// Stored as a list of strings (presumably the path's individual
/// components, in order — confirm against the parser).
#[derive(Debug, Clone)]
pub struct ImportPath(pub Vec<String>);

/// Represents parameters for a rule.
///
/// Wraps the list of parameter names.
#[derive(Debug, Clone)]
pub struct RuleParams(pub Vec<String>);

/// Represents parameters for a token.
///
/// Wraps the list of parameter names.
#[derive(Debug, Clone)]
pub struct TokenParams(pub Vec<String>);

/// Represents a list of expansions.
///
/// Each element is an [`Alias`], i.e. one expansion plus its optional alias
/// (presumably the `|`-separated alternatives of a definition — confirm).
#[derive(Debug, Clone)]
pub struct Expansions(pub Vec<Alias>);

/// Represents an alias in the grammar.
///
/// Pairs one expansion with the optional name given to it.
#[derive(Debug, Clone)]
pub struct Alias {
/// The expansion being (optionally) named.
pub expansion: Expansion,
/// Name attached to the expansion; `None` when no alias was given.
pub alias: Option<String>,
}

/// Represents an expansion consisting of expressions.
///
/// Wraps the expressions in order (presumably matched in sequence — confirm
/// against the compiler).
#[derive(Debug, Clone)]
pub struct Expansion(pub Vec<Expr>);

/// Represents an expression.
///
/// An atom with an optional operator and an optional numeric range.
/// NOTE(review): presumably at most one of `op` and `range` is set (Lark's
/// `atom OP` versus `atom ~ n..m`) — confirm against the parser.
#[derive(Debug, Clone)]
pub struct Expr {
/// The operand of the expression.
pub atom: Atom,
/// Operator applied to the atom, if any.
pub op: Option<Op>,
/// Pair of bounds attached to the atom, if any (presumably minimum and
/// maximum repetition counts — confirm).
pub range: Option<(i32, i32)>,
}

/// Represents an atom in the grammar.
#[derive(Debug, Clone)]
pub enum Atom {
/// A grouped sub-expression (presumably written `( ... )` — confirm).
Group(Expansions),
/// An optional sub-expression (presumably written `[ ... ]` — confirm).
Maybe(Expansions),
/// A single value; see [`Value`].
Value(Value),
}

/// Represents different values in the grammar.
#[derive(Debug, Clone)]
pub enum Value {
/// A range between two literals (presumably start and end, as in
/// `"a".."z"` — confirm).
LiteralRange(String, String),
/// A reference to something by name.
Name(String),
/// A literal (NOTE(review): unclear from here whether the stored string
/// still includes its quotes or regex delimiters — confirm).
Literal(String),
/// A use of the template `name` with the given argument values.
TemplateUsage { name: String, values: Vec<Value> },
}

/// Represents an operator.
///
/// Stores the operator's text (presumably one of `?`, `*`, `+` — confirm
/// against the parser).
#[derive(Debug, Clone)]
pub struct Op(pub String);
65 changes: 65 additions & 0 deletions parser/src/lark/compiler.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use std::collections::HashMap;

use anyhow::Result;

use crate::{
api::{RegexSpec, TopLevelGrammar},
GrammarBuilder, NodeRef,
};

use super::ast::*;

/// Working state for translating parsed grammar items into a
/// `TopLevelGrammar`.
struct Compiler {
/// Builder that accumulates the output grammar; finalized by
/// `lark_to_llguidance` once `execute` succeeds.
builder: GrammarBuilder,
/// The parsed items being compiled.
items: Vec<Item>,
/// Information about each rule and token seen so far, keyed by name.
nodes: HashMap<String, NodeInfo>,
}

/// What the compiler records for one named rule or token.
struct NodeInfo {
/// Grammar node standing for this name (created as a placeholder).
id: NodeRef,
/// `true` for token definitions, `false` for rules.
is_terminal: bool,
/// Regex associated with the name, if any. NOTE(review): always `None`
/// in the code visible here — presumably filled in by later work.
regex: Option<RegexSpec>,
}

/// Translates parsed grammar `items` into a `TopLevelGrammar`.
///
/// Runs the compiler over the items with a fresh `GrammarBuilder` and
/// returns the finalized builder output.
///
/// # Errors
///
/// Propagates any error from compiling the items or from finalizing the
/// builder.
pub fn lark_to_llguidance(items: Vec<Item>) -> Result<TopLevelGrammar> {
    let builder = GrammarBuilder::new();
    let nodes = HashMap::new();
    let mut compiler = Compiler {
        builder,
        items,
        nodes,
    };
    compiler.execute()?;
    compiler.builder.finalize()
}

impl Compiler {
fn execute(&mut self) -> Result<()> {
for item in self.items.iter() {
match item {
Item::Rule(rule) => {
let id = self.builder.placeholder();
self.nodes.insert(
rule.name.clone(),
NodeInfo {
id,
is_terminal: false,
regex: None,
},
);
}
Item::Token(token_def) => {
let id = self.builder.placeholder();
self.nodes.insert(
token_def.name.clone(),
NodeInfo {
id,
is_terminal: true,
regex: None,
},
);
}
Item::Statement(statement) => todo!(),
}
}
Ok(())
}
}
Loading
Loading