allow intersection/negation in lexemes (proper relevance checks)
mmoskal committed Oct 4, 2024
1 parent d4e35a8 commit 48bcba4
Showing 7 changed files with 66 additions and 17 deletions.
parser/Cargo.toml (4 changes: 2 additions & 2 deletions)

@@ -4,8 +4,8 @@ version = "0.2.0"
 edition = "2021"

 [dependencies]
-toktrie = { git = "https://github.com/microsoft/toktrie", rev = "8828701d3b1c743472fe61bdf6dab12cdd726ab4" }
-derivre = { git = "https://github.com/microsoft/derivre", rev = "424ec3bd1f711add6aeab1711108b63abe856d0c" }
+toktrie = { git = "https://github.com/microsoft/toktrie", rev = "5e7013ad05081e918809d4ecebb33db7c4aabc69" }
+derivre = { git = "https://github.com/microsoft/derivre", rev = "02ee497e6e404a0b402b4f68a9abf599d22ed2ed" }
 serde = { version = "1.0.192", features = ["derive"] }
 serde_json = { version = "1.0.108", features = ["preserve_order"] }
 anyhow = "1.0.75"
parser/src/earley/lexer.rs (6 changes: 4 additions & 2 deletions)

@@ -2,6 +2,8 @@ use anyhow::Result;
 use std::fmt::Debug;
 use toktrie::SimpleVob;

+use crate::api::ParserLimits;
+
 use super::{
     lexerspec::{LexemeIdx, LexerSpec},
     regexvec::{NextByte, RegexVec, StateDesc},
@@ -41,8 +43,8 @@ pub enum LexerResult {
 }

 impl Lexer {
-    pub fn from(spec: &LexerSpec) -> Result<Self> {
-        let dfa = spec.to_regex_vec();
+    pub fn from(spec: &LexerSpec, limits: &mut ParserLimits) -> Result<Self> {
+        let dfa = spec.to_regex_vec(limits)?;

         debug!("lexer: {:?}\n ==> dfa: {:?}", spec, dfa);
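Since building the lexer now runs relevance checks up front, Lexer::from takes a mutable ParserLimits and can fail. A minimal call-site sketch (hypothetical, not part of this commit; it assumes ParserLimits implements Default, which the diff does not show):

use anyhow::Result;

use crate::api::ParserLimits;
use crate::earley::lexer::Lexer;
use crate::earley::lexerspec::LexerSpec;

fn make_lexer(spec: &LexerSpec) -> Result<Lexer> {
    // Construction deducts the cost of per-lexeme relevance checks from
    // limits.initial_lexer_fuel and errors out if the budget is exhausted.
    let mut limits = ParserLimits::default();
    Lexer::from(spec, &mut limits)
}

The parser.rs change below shows the real call site, where the limits are threaded in from ParserState::new.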
parser/src/earley/lexerspec.rs (5 changes: 4 additions & 1 deletion)

@@ -3,6 +3,8 @@ use derivre::{ExprRef, JsonQuoteOptions, RegexAst, RegexBuilder};
 use std::{fmt::Debug, hash::Hash};
 use toktrie::{bytes::limit_str, SimpleVob};

+use crate::api::ParserLimits;
+
 use super::regexvec::RegexVec;

 #[derive(Clone)]
@@ -115,7 +117,7 @@ impl LexerSpec {
             .is_nullable(self.lexemes[idx.0].compiled_rx)
     }

-    pub fn to_regex_vec(&self) -> RegexVec {
+    pub fn to_regex_vec(&self, limits: &mut ParserLimits) -> Result<RegexVec> {
         // TODO
         // Find all non-contextual lexemes that are literals (we call them 'keywords')
         // This assumes that this is the only possible conflict in the lexer that we want to catch.
@@ -127,6 +129,7 @@
             self.regex_builder.exprset(),
             &rx_list,
             Some(self.lazy_lexemes()),
+            limits,
         )
     }
parser/src/earley/parser.rs (4 changes: 2 additions & 2 deletions)

@@ -378,10 +378,10 @@ impl ParserState {
     fn new(
         grammar: Arc<CGrammar>,
         options: GenGrammarOptions,
-        limits: ParserLimits,
+        mut limits: ParserLimits,
     ) -> Result<(Self, Lexer)> {
         let start = grammar.start();
-        let mut lexer = Lexer::from(grammar.lexer_spec())?;
+        let mut lexer = Lexer::from(grammar.lexer_spec(), &mut limits)?;
         let scratch = Scratch::new(Arc::clone(&grammar));
         let lexer_state = lexer.a_dead_state(); // placeholder
         let mut r = ParserState {
parser/src/earley/regexvec.rs (58 changes: 51 additions & 7 deletions)

@@ -1,10 +1,12 @@
-use anyhow::Result;
+use anyhow::{bail, Result};
 use derivre::raw::{DerivCache, ExprSet, NextByteCache, RelevanceCache, VecHashCons};
 use std::{fmt::Debug, u64};
 use toktrie::SimpleVob;

 pub use derivre::{AlphabetInfo, ExprRef, NextByte, StateID};

+use crate::api::ParserLimits;
+
 #[derive(Clone)]
 pub struct RegexVec {
     exprs: ExprSet,
@@ -59,7 +61,10 @@ impl RegexVec {
     pub fn initial_state(&mut self, selected: &SimpleVob) -> StateID {
         let mut vec_desc = vec![];
         for idx in selected.iter() {
-            Self::push_rx(&mut vec_desc, idx as usize, self.rx_list[idx as usize]);
+            let rx = self.rx_list[idx as usize];
+            if rx != ExprRef::NO_MATCH {
+                Self::push_rx(&mut vec_desc, idx as usize, rx);
+            }
         }
         self.insert_state(vec_desc)
     }
Expand Down Expand Up @@ -333,16 +338,41 @@ impl RegexVec {
exprset: &ExprSet,
rx_list: &[ExprRef],
lazy: Option<SimpleVob>,
) -> Self {
let (alpha, exprset, rx_list) = AlphabetInfo::from_exprset(exprset, rx_list);
limits: &mut ParserLimits,
) -> Result<Self> {
let (alpha, mut exprset, mut rx_list) = AlphabetInfo::from_exprset(exprset, rx_list);
let num_ast_nodes = exprset.len();

let rx_sets = StateID::new_hash_cons();
let fuel0 = limits.initial_lexer_fuel;
let mut relevance = RelevanceCache::new();
for idx in 0..rx_list.len() {
let c0 = exprset.cost();
match relevance.is_non_empty_limited(
&mut exprset,
rx_list[idx],
limits.initial_lexer_fuel,
) {
Ok(true) => {}
Ok(false) => {
rx_list[idx] = ExprRef::NO_MATCH;
}
Err(_) => {
bail!(
"fuel exhausted when checking relevance of lexemes ({})",
fuel0
);
}
}
limits.initial_lexer_fuel = limits
.initial_lexer_fuel
.saturating_sub(exprset.cost() - c0);
}

let rx_sets = StateID::new_hash_cons();
let mut r = RegexVec {
deriv: DerivCache::new(),
next_byte: NextByteCache::new(),
relevance: RelevanceCache::new(),
relevance,
lazy: lazy.unwrap_or_else(|| SimpleVob::alloc(rx_list.len())),
exprs: exprset,
alpha,
Expand All @@ -364,7 +394,7 @@ impl RegexVec {
// in fact, transition from MISSING and DEAD should both lead to DEAD
r.state_table.fill(StateID::DEAD);
assert!(r.alpha.len() > 0);
r
Ok(r)
}

fn append_state(&mut self, state_desc: StateDesc) {
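Distilled from the constructor hunk above: each lexeme's compiled regex is now checked for relevance (non-emptiness) once, up front. This is what makes intersection and negation safe in lexemes, since such expressions can be syntactically non-trivial yet match nothing; provably-empty regexes are replaced by ExprRef::NO_MATCH rather than rejected. The check is fuel-limited, with fuel charged as the growth in exprset.cost(). A standalone sketch of the pattern, not part of the commit (it assumes the derivre raw API imported at the top of this file, and that fuel and costs are u64):

use anyhow::{bail, Result};
use derivre::raw::{ExprSet, RelevanceCache};
use derivre::ExprRef;

fn prune_empty_lexemes(
    exprset: &mut ExprSet,
    relevance: &mut RelevanceCache,
    rx_list: &mut [ExprRef],
    fuel: &mut u64,
) -> Result<()> {
    let fuel0 = *fuel;
    for rx in rx_list.iter_mut() {
        // Charge the cost increase of the emptiness query against the budget.
        let c0 = exprset.cost();
        match relevance.is_non_empty_limited(exprset, *rx, *fuel) {
            Ok(true) => {}                        // matches something: keep
            Ok(false) => *rx = ExprRef::NO_MATCH, // provably empty: disable
            Err(_) => bail!(
                "fuel exhausted when checking relevance of lexemes ({})",
                fuel0
            ),
        }
        *fuel = fuel.saturating_sub(exprset.cost() - c0);
    }
    Ok(())
}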
Expand Down Expand Up @@ -439,6 +469,20 @@ impl RegexVec {

for (idx, e) in iter_state(&self.rx_sets, state) {
let d = self.deriv.derivative(&mut self.exprs, e, b);

let fuel = self.fuel.saturating_sub(self.exprs.cost() - c0);
let d = match self
.relevance
.is_non_empty_limited(&mut self.exprs, d, fuel)
{
Ok(true) => d,
Ok(false) => ExprRef::NO_MATCH,
Err(_) => {
self.fuel = 0; // just in case
break;
}
};

state_size += 1;
if d != ExprRef::NO_MATCH {
Self::push_rx(&mut vec_desc, idx, d);
Expand Down
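The transition function applies the same test to each derivative: after deriving a live lexeme regex by the next byte, a derivative that is provably empty (within the remaining fuel) is pruned before it enters the new DFA state, and running out of fuel abandons the state rather than erroring. A standalone sketch of that step, under the same derivre-raw-API and u64-fuel assumptions as above (the derivative-by-byte call shape is taken from the hunk itself):

use anyhow::{bail, Result};
use derivre::raw::{DerivCache, ExprSet, RelevanceCache};
use derivre::ExprRef;

// Derivative of e by byte b, pruned to NO_MATCH when it provably matches
// nothing; Err means the fuel budget could not decide, so the caller
// should give up on building this state.
fn pruned_derivative(
    exprs: &mut ExprSet,
    deriv: &mut DerivCache,
    relevance: &mut RelevanceCache,
    e: ExprRef,
    b: u8,
    fuel: u64,
) -> Result<ExprRef> {
    let d = deriv.derivative(exprs, e, b);
    match relevance.is_non_empty_limited(exprs, d, fuel) {
        Ok(true) => Ok(d),                  // still relevant: keep it
        Ok(false) => Ok(ExprRef::NO_MATCH), // provably empty: prune
        Err(_) => bail!("fuel exhausted while checking relevance"),
    }
}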
rust/Cargo.lock (4 changes: 2 additions & 2 deletions)

Generated file; diff not rendered.

sample_parser/Cargo.toml (2 changes: 1 addition & 1 deletion)

@@ -6,7 +6,7 @@ default-run = "sample_parser"

 [dependencies]
 llguidance_parser = { path = "../parser" }
-toktrie_hf_tokenizers = { git = "https://github.com/microsoft/toktrie", rev = "8828701d3b1c743472fe61bdf6dab12cdd726ab4" }
+toktrie_hf_tokenizers = { git = "https://github.com/microsoft/toktrie", rev = "5e7013ad05081e918809d4ecebb33db7c4aabc69" }
 serde_json = "1.0.128"
 anyhow = "1.0.87"
