From 4b83adfa9f33d50c92376fda12f47fb1574ba80f Mon Sep 17 00:00:00 2001
From: Patrick Franco Braz <patrickfbraz@poli.ufrj.br>
Date: Tue, 16 Jul 2024 19:50:54 -0300
Subject: [PATCH] fix(ingest/bigquery): changes helper function to decode
 unicode escape sequences (#10845)

---
 .../source/bigquery_v2/bigquery_helper.py     | 19 +++++++++++---
 .../unit/test_bigqueryv2_usage_source.py      | 26 +++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py
index bbdf32da13621..507e1d917d206 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py
@@ -10,14 +10,27 @@ def unquote_and_decode_unicode_escape_seq(
     """
     If string starts and ends with a quote, unquote it and decode Unicode escape sequences
     """
+    unicode_seq_pattern = re.compile(r"\\(u|U)[0-9a-fA-F]{4}")
     trailing_quote = trailing_quote if trailing_quote else leading_quote
 
     if string.startswith(leading_quote) and string.endswith(trailing_quote):
         string = string[1:-1]
 
-    cleaned_string = string.encode().decode("unicode-escape")
-
-    return cleaned_string
+    # Decode Unicode escape sequences. This avoids issues with encoding.
+    # This process does not handle code points from "\U00010000" to "\U0010FFFF" (8-digit escapes)
+    while unicode_seq_pattern.search(string):
+        # Get the first Unicode escape sequence.
+        # mypy: unicode_seq_pattern.search(string) is not None because of the while loop
+        unicode_seq = unicode_seq_pattern.search(string).group(0)  # type: ignore
+        # Replace the Unicode escape sequence with the decoded character
+        try:
+            string = string.replace(
+                unicode_seq, unicode_seq.encode("utf-8").decode("unicode-escape")
+            )
+        except UnicodeDecodeError:
+            # Skip decoding if it is not possible to decode the Unicode escape sequence
+            break  # avoid infinite loop
+    return string
 
 
 def parse_labels(labels_str: str) -> Dict[str, str]:
diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
index 8a3fa5ca46ea4..21787af1b0cb9 100644
--- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
+++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
@@ -212,3 +212,29 @@ def test_unquote_and_decode_unicode_escape_seq():
     expected_output = "No escape sequences here"
     result = unquote_and_decode_unicode_escape_seq(input_string)
     assert result == expected_output
+
+    # Test with invalid Unicode escape sequences
+    input_string = '"No escape \\u123 sequences here"'
+    expected_output = "No escape \\u123 sequences here"
+    result = unquote_and_decode_unicode_escape_seq(input_string)
+    assert result == expected_output
+
+    # Test with a string that has multiple Unicode escape sequences
+    input_string = '"Hello \\u003cWorld\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e"'
+    expected_output = "Hello <World> <Again> <Again> <Again>"
+    result = unquote_and_decode_unicode_escape_seq(input_string)
+    assert result == expected_output
+
+    # Test with a "\u" prefix that is not followed by hex digits (must be left unchanged)
+    input_string = '"Hello \\utest"'
+    expected_output = "Hello \\utest"
+    result = unquote_and_decode_unicode_escape_seq(input_string)
+    assert result == expected_output
+
+    # Test with special characters
+    input_string = (
+        '"Hello \\u003cWorld\\u003e \\u003cçãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?\\u003e"'
+    )
+    expected_output = "Hello <World> <çãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?>"
+    result = unquote_and_decode_unicode_escape_seq(input_string)
+    assert result == expected_output