Skip to content

Commit

Permalink
modify unescape_byte
Browse files Browse the repository at this point in the history
  • Loading branch information
jasonnnli committed Feb 21, 2024
1 parent d500eb3 commit 006283d
Showing 1 changed file with 34 additions and 14 deletions.
48 changes: 34 additions & 14 deletions datafusion/sql/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,11 +273,11 @@ pub(crate) fn unescape(s: &str) -> Option<String> {
'n' => '\n',
'r' => '\r',
't' => '\t',
'\'' | '\"' | '\\' | '/' => c,
'u' => return_if_none!(unescape_unicode_16(&mut queue)),
'U' => return_if_none!(unescape_unicode_32(&mut queue)),
'x' => return_if_none!(unescape_byte(&mut queue)),
maybe_octal => return_if_none!(unescape_octal(maybe_octal, &mut queue)),
c if c.is_digit(8) => return_if_none!(unescape_octal(c, &mut queue)),
c => c,
},
None => return None,
};
Expand All @@ -289,20 +289,37 @@ pub(crate) fn unescape(s: &str) -> Option<String> {

// Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F, a–f)
//
// Called after the leading `\x` has been consumed. Pops at most two hex
// digits from the front of `queue` and returns the character whose code
// point is the value they encode (always in the range 0x00..=0xFF).
//
// If the next character is not a hex digit (or the queue is empty), nothing
// is consumed and a literal 'x' is returned, so `\x` and `\xP` unescape to
// "x" and "xP" respectively instead of being rejected.
fn unescape_byte(queue: &mut VecDeque<char>) -> Option<char> {
    let mut value: u32 = 0;
    let mut digits = 0;

    while digits < 2 {
        // Peek first: a non-hex character must stay in the queue so the
        // caller can process it as ordinary input.
        match queue.front().and_then(|c| c.to_digit(16)) {
            Some(d) => {
                queue.pop_front();
                value = value * 16 + d;
                digits += 1;
            }
            None => break,
        }
    }

    if digits == 0 {
        // No digits followed `\x`: treat the escape as a plain 'x'.
        return Some('x');
    }

    // At most two hex digits were read, so `value` <= 0xFF and is always a
    // valid Unicode scalar value.
    char::from_u32(value)
}

// 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
fn unescape_unicode_16(queue: &mut VecDeque<char>) -> Option<char> {
unescape_hexadecimal::<4>(queue)
unescape_unicode::<4>(queue)
}

// 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
fn unescape_unicode_32(queue: &mut VecDeque<char>) -> Option<char> {
unescape_hexadecimal::<8>(queue)
unescape_unicode::<8>(queue)
}

fn unescape_hexadecimal<const NUM: usize>(queue: &mut VecDeque<char>) -> Option<char> {
fn unescape_unicode<const NUM: usize>(queue: &mut VecDeque<char>) -> Option<char> {
let mut s = String::new();
for _ in 0..NUM {
s.push(return_if_none!(queue.pop_front()));
Expand All @@ -312,12 +329,8 @@ fn unescape_hexadecimal<const NUM: usize>(queue: &mut VecDeque<char>) -> Option<

// Octal byte value. \o, \oo, \ooo (o = 0–7)
fn unescape_octal(c: char, queue: &mut VecDeque<char>) -> Option<char> {
if !c.is_digit(8) {
return None;
}

let mut s = String::new();
let mut try_get_next = || match queue.get(0) {
let mut try_get_next = || match queue.front() {
Some(c) if c.is_digit(8) => queue.pop_front(),
_ => None,
};
Expand Down Expand Up @@ -345,7 +358,7 @@ fn unescape_octal(c: char, queue: &mut VecDeque<char>) -> Option<char> {

// Parses `s` as an unsigned integer written in base `RADIX` and converts the
// result to the character with that code point.
//
// Returns `None` when `s` is not a valid number in that radix (including the
// empty string or a value that overflows `u32`), or when the number is not a
// valid Unicode scalar value (a surrogate, or anything above U+10FFFF).
#[inline]
fn to_char<const RADIX: u32>(s: &str) -> Option<char> {
    u32::from_str_radix(s, RADIX)
        .ok()
        .and_then(char::from_u32)
}
Expand All @@ -368,18 +381,23 @@ mod tests {
check_unescape(r"\/", Some("/"));
check_unescape(r"/", Some("/"));
check_unescape(r"\\", Some("\\"));
check_unescape(r"\x", None);

// 16 and 32-bit hexadecimal Unicode character value
check_unescape(r"\u4c91", Some("\u{4c91}"));
check_unescape(r"\u4c916", Some("\u{4c91}6"));
check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
check_unescape(r"\u", None);
check_unescape(r"\U", None);
check_unescape(r"\U1010FFFF", None);

// hexadecimal byte value
check_unescape(r"\xCAD", Some("\u{00ca}D"));
check_unescape(r"\xA9", Some("\u{00a9}"));
check_unescape(r"\x4B", Some("\u{004b}"));
check_unescape(r"\x4", Some("\u{0004}"));
check_unescape(r"\x4L", Some("\u{0004}L"));
check_unescape(r"\x", Some("x"));
check_unescape(r"\xP", Some("xP"));

// octal byte value
check_unescape(r"\1", Some("\u{0001}"));
Expand All @@ -389,6 +407,8 @@ mod tests {
check_unescape(r"\4", Some("\u{0004}"));
check_unescape(r"\45", Some("\u{0025}"));
check_unescape(r"\450", Some("\u{0025}0"));
check_unescape(r"\9", None);

// others
check_unescape(r"\9", Some("9"));
}
}

0 comments on commit 006283d

Please sign in to comment.