-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix(capture): strip invalid utf16 surrogate pairs from input (#25507)
- Loading branch information
1 parent
8fd3940
commit 7a9a699
Showing
5 changed files
with
345 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,312 @@ | ||
use std::str::Chars; | ||
|
||
#[derive(Debug, PartialEq, Clone, Copy)] | ||
enum LastSeen { | ||
Char, // Any regular character | ||
Escape, // We've seen a backslash, not preceded by another backslash | ||
} | ||
|
||
// Unicode unknown character replacement - �, but as a hex escape sequence | ||
const REPLACEMENT: &str = "uFFFD"; | ||
const HIGH_SURROGATE_RANGE: std::ops::Range<u16> = 0xD800..0xDBFF; | ||
const LOW_SURROGATE_RANGE: std::ops::Range<u16> = 0xDC00..0xDFFF; | ||
const HEX_ESCAPE_LENGTH: usize = 4; | ||
|
||
pub struct InvalidSurrogatesPass<'a> { | ||
input: Chars<'a>, | ||
last_seen: LastSeen, | ||
pending_output: Vec<char>, | ||
pending_ptr: usize, | ||
escape_seq_buf: String, | ||
} | ||
|
||
impl<'a> Iterator for InvalidSurrogatesPass<'a> { | ||
type Item = char; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
self.step() | ||
} | ||
} | ||
|
||
impl<'a> InvalidSurrogatesPass<'a> { | ||
pub fn new(input: Chars<'a>) -> Self { | ||
Self { | ||
input, | ||
last_seen: LastSeen::Char, | ||
pending_output: Vec::with_capacity(32), | ||
pending_ptr: 0, | ||
escape_seq_buf: String::with_capacity(32), | ||
} | ||
} | ||
|
||
fn queue(&mut self, c: char) { | ||
if self.last_seen == LastSeen::Escape { | ||
// When we enter an escape sequence, we swallow the backslash, | ||
// to avoid having to backtrack if we drop an invalid escape sequence. | ||
// So we have to emit it here. | ||
self.pending_output.push('\\'); | ||
self.pending_output.push(c); | ||
self.last_seen = LastSeen::Char; | ||
} else if c == '\\' { | ||
// If we're not already in an escape sequence, enter one, dropping the char to | ||
// avoid needing to backtrack | ||
self.last_seen = LastSeen::Escape; | ||
} else { | ||
// If we're not in an escape sequence, and not entering one, just push | ||
self.last_seen = LastSeen::Char; | ||
self.pending_output.push(c); | ||
} | ||
} | ||
|
||
fn queue_str(&mut self, s: &str) { | ||
for c in s.chars() { | ||
self.queue(c); | ||
} | ||
} | ||
|
||
fn pop(&mut self) -> Option<char> { | ||
// We push chars into the buffer reading left-to-right, and need to emit them | ||
// in the same order, so we have to track our stack index, and reset it when we | ||
// run out of chars to pop. | ||
if self.pending_ptr < self.pending_output.len() { | ||
let c = self.pending_output[self.pending_ptr]; | ||
self.pending_ptr += 1; | ||
Some(c) | ||
} else { | ||
self.pending_output.clear(); | ||
self.pending_ptr = 0; | ||
None | ||
} | ||
} | ||
|
||
fn step(&mut self) -> Option<char> { | ||
if let Some(c) = self.pop() { | ||
return Some(c); | ||
} | ||
|
||
// We're out of input, and we've go no pending output, so we're done | ||
let Some(c) = self.input.next() else { | ||
// If we're all out of input and the last thing we saw was an escape, | ||
// we have to emit that escape character. We just do that directly here, | ||
// knowing the next call around we'll return None. | ||
// Note that since we're parsing strings to get turned into json values, | ||
// we technically know this will be immediately discarded, but there's | ||
// no harm making it "correct" first. | ||
if self.last_seen == LastSeen::Escape { | ||
self.last_seen = LastSeen::Char; | ||
return Some('\\'); | ||
}; | ||
return None; | ||
}; | ||
|
||
match (self.last_seen, c) { | ||
(LastSeen::Escape, 'u') => { | ||
let first_code_point = | ||
match collect_escape_sequence(&mut self.escape_seq_buf, &mut self.input) { | ||
Ok(code_point) => code_point, | ||
Err(None) => { | ||
// We ran out of chars. Push a replacement, and return. | ||
// We drop the collected chars here because, if we'd encountered a syntactically | ||
// important one, it would have been caught as non-hex earlier and returned in | ||
// the branch below. | ||
self.queue_str(REPLACEMENT); | ||
return self.pop(); | ||
} | ||
Err(Some(c)) => { | ||
// We encountered an invalid char. Push the replacement, push the invalid char, and return | ||
self.queue_str(REPLACEMENT); | ||
self.queue(c); | ||
return self.pop(); | ||
} | ||
}; | ||
|
||
// Now, we try to get the second member of the surrogate pair, since we require surrogates to be paired | ||
match self.input.next() { | ||
Some('\\') => { | ||
// We don't push a backslash here because we're already in an escape sequence, | ||
// and it would cause us to exit it - but the specific characters we're going | ||
// to emit isn't known yet, so we can't push those and then a backslash either | ||
} | ||
Some(c) => { | ||
self.queue_str(REPLACEMENT); | ||
self.queue(c); | ||
return self.pop(); | ||
} | ||
None => { | ||
// We didn't get a second escape sequence, so we just drop the first one | ||
self.queue_str(REPLACEMENT); | ||
return self.pop(); | ||
} | ||
} | ||
match self.input.next() { | ||
Some('u') => {} | ||
Some(c) => { | ||
self.queue_str(REPLACEMENT); | ||
self.queue('\\'); // We have to handle that we've already consumed a backslash | ||
self.queue(c); | ||
return self.pop(); | ||
} | ||
None => { | ||
self.queue_str(REPLACEMENT); | ||
self.queue('\\'); // As above | ||
return self.pop(); | ||
} | ||
} | ||
|
||
let second_code_point = | ||
match collect_escape_sequence(&mut self.escape_seq_buf, &mut self.input) { | ||
Ok(code_point) => code_point, | ||
Err(None) => { | ||
self.queue_str(REPLACEMENT); | ||
self.queue('\\'); | ||
self.queue_str(REPLACEMENT); | ||
return self.pop(); | ||
} | ||
Err(Some(c)) => { | ||
self.queue_str(REPLACEMENT); | ||
self.queue('\\'); | ||
self.queue_str(REPLACEMENT); | ||
self.queue(c); | ||
return self.pop(); | ||
} | ||
}; | ||
if HIGH_SURROGATE_RANGE.contains(&first_code_point) | ||
&& LOW_SURROGATE_RANGE.contains(&second_code_point) | ||
{ | ||
// We have a valid pair of hex escapes, so we should push them. | ||
// TODO - there's way to do this that doesn't require the | ||
// allocation format! implies, but I'm not gonna work it out | ||
// right now - we expect this to be /extremely/ rare | ||
self.queue_str(&format!( | ||
"u{:04X}\\u{:04X}", // First backslash is already in the buffer due to last_seen | ||
first_code_point, second_code_point | ||
)); | ||
} else { | ||
// We didn't get a valid pair, so we just drop the pair entirely | ||
self.queue_str(REPLACEMENT); | ||
self.queue('\\'); | ||
self.queue_str(REPLACEMENT); | ||
} | ||
} | ||
(LastSeen::Char | LastSeen::Escape, c) => { | ||
// emit handles the transition between escape and char for us, | ||
// so we just unconditionally emit here if the last thing we saw | ||
// was a char, or the last thing we saw was an escape, AND the | ||
// current char is not a 'u' (the case above) | ||
self.queue(c); | ||
} | ||
} | ||
|
||
// Because we swallow escape chars to avoid backtracking, we have to recurse | ||
// here to handle the case where we just entered an escape squeuence | ||
self.next() | ||
} | ||
} | ||
|
||
// Collects 4 chars into a hex escape sequence, returning the first char that couldn't be part of | ||
// one, if one was found. If we run out of input, we return Result::Err(None) | ||
fn collect_escape_sequence( | ||
buf: &mut String, | ||
iter: &mut dyn Iterator<Item = char>, | ||
) -> Result<u16, Option<char>> { | ||
buf.clear(); | ||
for _ in 0..HEX_ESCAPE_LENGTH { | ||
let Some(c) = iter.next() else { | ||
return Err(None); | ||
}; | ||
// If this character couldn't be part of a hex escape sequence, we return it | ||
if !c.is_ascii_hexdigit() { | ||
return Err(Some(c)); | ||
} | ||
buf.push(c); | ||
} | ||
// Unwrap safe due to the checking above | ||
Ok(u16::from_str_radix(buf, 16).unwrap()) | ||
} | ||
|
||
#[cfg(test)] | ||
mod test { | ||
use crate::v0_request::RawEvent; | ||
|
||
const RAW_DATA: &str = include_str!("../../tests/invalid_surrogate.json"); | ||
|
||
#[test] | ||
fn test() { | ||
let pass = super::InvalidSurrogatesPass::new(RAW_DATA.chars()); | ||
let data = pass.collect::<String>(); | ||
let res = serde_json::from_str::<RawEvent>(&data); | ||
assert!(res.is_ok()) | ||
} | ||
|
||
#[test] | ||
fn test_unpaired_high_surrogate() { | ||
let raw_data = r#"{"event":"\uD800"}"#; | ||
let pass = super::InvalidSurrogatesPass::new(raw_data.chars()); | ||
let data = pass.collect::<String>(); | ||
assert_eq!(data, r#"{"event":"\uFFFD"}"#); | ||
} | ||
|
||
#[test] | ||
fn test_unpaired_low_surrogate() { | ||
let raw_data = r#"{"event":"\uDC00"}"#; | ||
let pass = super::InvalidSurrogatesPass::new(raw_data.chars()); | ||
let data = pass.collect::<String>(); | ||
assert_eq!(data, r#"{"event":"\uFFFD"}"#); | ||
} | ||
|
||
#[test] | ||
fn test_wrong_order_surrogate_pair() { | ||
let raw_data = r#"{"event":"\uDC00\uD800"}"#; | ||
let pass = super::InvalidSurrogatesPass::new(raw_data.chars()); | ||
let data = pass.collect::<String>(); | ||
assert_eq!(data, r#"{"event":"\uFFFD\uFFFD"}"#); | ||
} | ||
|
||
#[test] | ||
fn test_trailing_escape() { | ||
let raw_data = r#"{"event":"\u"}"#; | ||
let pass = super::InvalidSurrogatesPass::new(raw_data.chars()); | ||
let data = pass.collect::<String>(); | ||
assert_eq!(data, r#"{"event":"\uFFFD"}"#); | ||
} | ||
|
||
#[test] | ||
fn test_trailing_escape_pair() { | ||
let raw_data = r#"{"event":"\u\u"}"#; | ||
let pass = super::InvalidSurrogatesPass::new(raw_data.chars()); | ||
let data = pass.collect::<String>(); | ||
assert_eq!(data, r#"{"event":"\uFFFD\uFFFD"}"#); | ||
} | ||
|
||
#[test] | ||
fn test_trailing_escape_pair_high_surrogate() { | ||
let raw_data = r#"{"event":"\uD800\u"}"#; | ||
let pass = super::InvalidSurrogatesPass::new(raw_data.chars()); | ||
let data = pass.collect::<String>(); | ||
assert_eq!(data, r#"{"event":"\uFFFD\uFFFD"}"#); | ||
} | ||
|
||
#[test] | ||
fn test_trailing_escape_pair_low_surrogate() { | ||
let raw_data = r#"{"event":"\uDC00\u"}"#; | ||
let pass = super::InvalidSurrogatesPass::new(raw_data.chars()); | ||
let data = pass.collect::<String>(); | ||
assert_eq!(data, r#"{"event":"\uFFFD\uFFFD"}"#); | ||
} | ||
|
||
#[test] | ||
fn test_trailing_escape_char() { | ||
let raw_data = r#"{"event":"\uD800\"#; | ||
let pass = super::InvalidSurrogatesPass::new(raw_data.chars()); | ||
let data = pass.collect::<String>(); | ||
assert_eq!(data, r#"{"event":"\uFFFD\"#); | ||
} | ||
|
||
#[test] | ||
fn test_valid_pair_trailing_slash() { | ||
let raw_data = r#"{"event":"\uD800\uDC00\"#; | ||
let pass = super::InvalidSurrogatesPass::new(raw_data.chars()); | ||
let data = pass.collect::<String>(); | ||
assert_eq!(data, r#"{"event":"\uD800\uDC00\"#); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pub mod invalid_surrogates; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
{ | ||
"event": "$snapshot", | ||
"properties": { | ||
"$snapshot_data": [ | ||
{ | ||
"windowId": "01924ccf-34f9-764e-b7f9-73c74eb7ed55", | ||
"data": { | ||
"payload": { | ||
"level": "warn", | ||
"payload": ["\\\\\\\",\\\\\\\"emoji_flag\\\\\\\":\\\\\\\"\ud83c...[truncated]", "\"test\""], | ||
"trace": [ | ||
"q/< (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:19808)", | ||
"q (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:20042)", | ||
"z (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:21009)", | ||
"a (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:34064)", | ||
"e/this.emit (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:35299)", | ||
"e/this.processMutations (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:33691)" | ||
] | ||
}, | ||
"plugin": "rrweb/console@1" | ||
}, | ||
"timestamp": 1727865503680, | ||
"type": 6, | ||
"seen": 1185537021728171 | ||
} | ||
] | ||
} | ||
} |