Skip to content

Commit

Permalink
fix(ingest/bigquery): changes helper function to decode unicode escape…
Browse files Browse the repository at this point in the history
… sequences (datahub-project#10845)
  • Loading branch information
PatrickfBraz authored Jul 16, 2024
1 parent 1565fb0 commit 4b83adf
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,27 @@ def unquote_and_decode_unicode_escape_seq(
"""
If string starts and ends with a quote, unquote it and decode Unicode escape sequences
"""
unicode_seq_pattern = re.compile(r"\\(u|U)[0-9a-fA-F]{4}")
trailing_quote = trailing_quote if trailing_quote else leading_quote

if string.startswith(leading_quote) and string.endswith(trailing_quote):
string = string[1:-1]

cleaned_string = string.encode().decode("unicode-escape")

return cleaned_string
# Decode Unicode escape sequences one at a time. This avoids issues with encoding
# This process only handles 4-hex-digit escapes; it does not handle code points from "\U00010000" to "\U0010FFFF"
while unicode_seq_pattern.search(string):
# Get the first Unicode escape sequence.
# mypy: unicode_seq_pattern.search(string) is not None because of the while loop
unicode_seq = unicode_seq_pattern.search(string).group(0) # type: ignore
# Replace the Unicode escape sequence with the decoded character
try:
string = string.replace(
unicode_seq, unicode_seq.encode("utf-8").decode("unicode-escape")
)
except UnicodeDecodeError:
# Stop decoding if it is not possible to decode the Unicode escape sequence
break # avoid infinite loop
return string


def parse_labels(labels_str: str) -> Dict[str, str]:
Expand Down
26 changes: 26 additions & 0 deletions metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,29 @@ def test_unquote_and_decode_unicode_escape_seq():
expected_output = "No escape sequences here"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output

# Test with invalid Unicode escape sequences
input_string = '"No escape \\u123 sequences here"'
expected_output = "No escape \\u123 sequences here"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output

# Test with a string that has multiple Unicode escape sequences
input_string = '"Hello \\u003cWorld\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e"'
expected_output = "Hello <World> <Again> <Again> <Again>"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output

# Test with a string that has a "\u" prefix not followed by hex digits (left unchanged)
input_string = '"Hello \\utest"'
expected_output = "Hello \\utest"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output

# Test with special characters
input_string = (
'"Hello \\u003cWorld\\u003e \\u003cçãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?\\u003e"'
)
expected_output = "Hello <World> <çãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?>"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output

0 comments on commit 4b83adf

Please sign in to comment.