From 4b83adfa9f33d50c92376fda12f47fb1574ba80f Mon Sep 17 00:00:00 2001 From: Patrick Franco Braz Date: Tue, 16 Jul 2024 19:50:54 -0300 Subject: [PATCH] fix(ingest/bigquery): changes helper function to decode unicode escape sequences (#10845) --- .../source/bigquery_v2/bigquery_helper.py | 19 +++++++++++--- .../unit/test_bigqueryv2_usage_source.py | 26 +++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py index bbdf32da13621..507e1d917d206 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py @@ -10,14 +10,27 @@ def unquote_and_decode_unicode_escape_seq( """ If string starts and ends with a quote, unquote it and decode Unicode escape sequences """ + unicode_seq_pattern = re.compile(r"\\(u|U)[0-9a-fA-F]{4}") trailing_quote = trailing_quote if trailing_quote else leading_quote if string.startswith(leading_quote) and string.endswith(trailing_quote): string = string[1:-1] - cleaned_string = string.encode().decode("unicode-escape") - - return cleaned_string + # Decode Unicode escape sequences. This avoids issues with encoding + # This process does not handle unicode from "\U00010000" to "\U0010FFFF" + while unicode_seq_pattern.search(string): + # Get the first Unicode escape sequence. 
+ # mypy: unicode_seq_pattern.search(string) is not None because of the while loop + unicode_seq = unicode_seq_pattern.search(string).group(0) # type: ignore + # Replace the Unicode escape sequence with the decoded character + try: + string = string.replace( + unicode_seq, unicode_seq.encode("utf-8").decode("unicode-escape") + ) + except UnicodeDecodeError: + # Skip decoding if it is not possible to decode the Unicode escape sequence + break # avoid infinite loop + return string def parse_labels(labels_str: str) -> Dict[str, str]: diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 8a3fa5ca46ea4..21787af1b0cb9 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -212,3 +212,29 @@ def test_unquote_and_decode_unicode_escape_seq(): expected_output = "No escape sequences here" result = unquote_and_decode_unicode_escape_seq(input_string) assert result == expected_output + + # Test with invalid Unicode escape sequences + input_string = '"No escape \\u123 sequences here"' + expected_output = "No escape \\u123 sequences here" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a string that has multiple Unicode escape sequences + input_string = '"Hello \\u003cWorld\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e"' + expected_output = "Hello <World> <Again> <Again> <Again>" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a string that has a Unicode escape sequence at the beginning + input_string = '"Hello \\utest"' + expected_output = "Hello \\utest" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with special characters + input_string = ( + '"Hello \\u003cWorld\\u003e \\u003cçãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?\\u003e"' + ) 
+ expected_output = "Hello <çãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?>" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output