From 7e2076e852109e8538eddb37a7c89541886aa8bf Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 11 Mar 2024 15:34:44 -0700 Subject: [PATCH] feat(ingest): add query formatting to sql aggregator (#10025) --- .../sql_parsing/sql_parsing_aggregator.py | 33 +++++++++--- .../datahub/sql_parsing/sqlglot_lineage.py | 2 +- .../src/datahub/sql_parsing/sqlglot_utils.py | 50 +++++++++++++++--- .../src/datahub/utilities/sql_formatter.py | 3 ++ .../snowflake/snowflake_golden.json | 52 +++++++++---------- .../snowflake_privatelink_golden.json | 4 +- .../test_add_known_query_lineage.json | 2 +- .../test_basic_lineage.json | 2 +- .../test_column_lineage_deduplication.json | 4 +- .../test_overlapping_inserts.json | 4 +- .../aggregator_goldens/test_table_rename.json | 4 +- .../aggregator_goldens/test_temp_table.json | 6 +-- .../aggregator_goldens/test_view_lineage.json | 2 +- .../unit/sql_parsing/test_sqlglot_utils.py | 8 +-- 14 files changed, 116 insertions(+), 60 deletions(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index d553fa8c07c327..8edb131c232971 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -35,7 +35,11 @@ infer_output_schema, sqlglot_lineage, ) -from datahub.sql_parsing.sqlglot_utils import generate_hash, get_query_fingerprint +from datahub.sql_parsing.sqlglot_utils import ( + generate_hash, + get_query_fingerprint, + try_format_query, +) from datahub.utilities.cooperative_timeout import CooperativeTimeoutError from datahub.utilities.file_backed_collections import ( ConnectionWrapper, @@ -180,6 +184,7 @@ def __init__( generate_operations: bool = False, usage_config: Optional[BaseUsageConfig] = None, is_temp_table: Optional[Callable[[UrnStr], bool]] = None, + format_queries: bool = True, query_log: QueryLogSetting = QueryLogSetting.DISABLED, ) -> None: self.platform = DataPlatformUrn(platform) @@ -202,6 +207,7 @@ def __init__( # can be used by BQ where we have a "temp_table_dataset_prefix" self.is_temp_table = is_temp_table + self.format_queries = format_queries self.query_log = query_log # Set up the schema resolver. @@ -328,6 +334,11 @@ def _initialize_schema_resolver_from_graph(self, graph: DataHubGraph) -> None: env=self.env, ) + def _maybe_format_query(self, query: str) -> str: + if self.format_queries: + return try_format_query(query, self.platform.platform_name) + return query + def add_known_query_lineage( self, known_query_lineage: KnownQueryLineageInfo, merge_lineage: bool = False ) -> None: @@ -342,21 +353,23 @@ def add_known_query_lineage( Args: known_query_lineage: The known query lineage information. + merge_lineage: Whether to merge the lineage with any existing lineage + for the query ID. """ self.report.num_known_query_lineage += 1 # Generate a fingerprint for the query. query_fingerprint = get_query_fingerprint( - known_query_lineage.query_text, self.platform.platform_name + known_query_lineage.query_text, platform=self.platform.platform_name ) - # TODO format the query text? + formatted_query = self._maybe_format_query(known_query_lineage.query_text) # Register the query. self._add_to_query_map( QueryMetadata( query_id=query_fingerprint, - formatted_query_string=known_query_lineage.query_text, + formatted_query_string=formatted_query, session_id=known_query_lineage.session_id or _MISSING_SESSION_ID, query_type=known_query_lineage.query_type, lineage_type=models.DatasetLineageTypeClass.TRANSFORMED, @@ -499,6 +512,9 @@ def add_observed_query( elif parsed.debug_info.column_error: self.report.num_observed_queries_column_failed += 1 + # Format the query. + formatted_query = self._maybe_format_query(query) + # Register the query's usage. if not self._usage_aggregator: pass # usage is not enabled @@ -518,7 +534,7 @@ def add_observed_query( self._usage_aggregator.aggregate_event( resource=upstream_urn, start_time=query_timestamp, - query=query, + query=formatted_query, user=user.urn() if user else None, fields=sorted(upstream_fields.get(upstream_urn, [])), count=usage_multiplier, @@ -540,7 +556,7 @@ def add_observed_query( self._add_to_query_map( QueryMetadata( query_id=query_fingerprint, - formatted_query_string=query, # TODO replace with formatted query string + formatted_query_string=formatted_query, session_id=session_id, query_type=parsed.query_type, lineage_type=models.DatasetLineageTypeClass.TRANSFORMED, @@ -655,12 +671,15 @@ def _process_view_definition( self.report.num_views_column_failed += 1 query_fingerprint = self._view_query_id(view_urn) + formatted_view_definition = self._maybe_format_query( + view_definition.view_definition + ) # Register the query. self._add_to_query_map( QueryMetadata( query_id=query_fingerprint, - formatted_query_string=view_definition.view_definition, + formatted_query_string=formatted_view_definition, session_id=_MISSING_SESSION_ID, query_type=QueryType.CREATE_VIEW, lineage_type=models.DatasetLineageTypeClass.VIEW, diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 9150f2d93b7390..91b9c1d4f17fc7 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -928,7 +928,7 @@ def _sqlglot_lineage_inner( original_statement, dialect=dialect ) query_fingerprint, debug_info.generalized_statement = get_query_fingerprint_debug( - original_statement, dialect=dialect + original_statement, platform=dialect ) return SqlParsingResult( query_type=query_type, diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py index 0dffc4291132b4..2fba7185ca4cab 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py @@ -58,6 +58,14 @@ def parse_statement( return statement +def _expression_to_string( + expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr +) -> str: + if isinstance(expression, str): + return expression + return expression.sql(dialect=get_dialect(platform)) + + def generalize_query(expression: sqlglot.exp.ExpOrStr, dialect: DialectOrStr) -> str: """ Generalize/normalize a SQL query. @@ -121,24 +129,28 @@ def generate_hash(text: str) -> str: def get_query_fingerprint_debug( - expression: sqlglot.exp.ExpOrStr, dialect: DialectOrStr -) -> Tuple[str, str]: + expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr +) -> Tuple[str, Optional[str]]: try: - dialect = get_dialect(dialect) + dialect = get_dialect(platform) expression_sql = generalize_query(expression, dialect=dialect) except (ValueError, sqlglot.errors.SqlglotError) as e: if not isinstance(expression, str): raise logger.debug("Failed to generalize query for fingerprinting: %s", e) - expression_sql = expression + expression_sql = None - fingerprint = generate_hash(expression_sql) + fingerprint = generate_hash( + expression_sql + if expression_sql is not None + else _expression_to_string(expression, platform=platform) + ) return fingerprint, expression_sql def get_query_fingerprint( - expression: sqlglot.exp.ExpOrStr, dialect: DialectOrStr + expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr ) -> str: """Get a fingerprint for a SQL query. @@ -154,13 +166,35 @@ def get_query_fingerprint( Args: expression: The SQL query to fingerprint. - dialect: The SQL dialect to use. + platform: The SQL dialect to use. Returns: The fingerprint for the SQL query. """ - return get_query_fingerprint_debug(expression, dialect)[0] + return get_query_fingerprint_debug(expression, platform)[0] + + +def try_format_query(expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr) -> str: + """Format a SQL query. + + If the query cannot be formatted, the original query is returned unchanged. + + Args: + expression: The SQL query to format. + platform: The SQL dialect to use. + + Returns: + The formatted SQL query. + """ + + try: + dialect = get_dialect(platform) + expression = parse_statement(expression, dialect=dialect) + return expression.sql(dialect=dialect, pretty=True) + except Exception as e: + logger.debug("Failed to format query: %s", e) + return _expression_to_string(expression, platform=platform) def detach_ctes( diff --git a/metadata-ingestion/src/datahub/utilities/sql_formatter.py b/metadata-ingestion/src/datahub/utilities/sql_formatter.py index 96cb71749b0b96..5b62c10378e995 100644 --- a/metadata-ingestion/src/datahub/utilities/sql_formatter.py +++ b/metadata-ingestion/src/datahub/utilities/sql_formatter.py @@ -5,6 +5,9 @@ logger = logging.getLogger(__name__) +# TODO: The sql query formatting functionality is duplicated by the try_format_query method, +# which is powered by sqlglot instead of sqlparse. + def format_sql_query(query: str, **options: Any) -> str: try: diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json index a064e4d1d58dc3..e650d9194cdc7b 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json @@ -4317,7 +4317,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_1 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_1\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -4333,7 +4333,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -4857,7 +4857,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_10 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_10\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -4873,7 +4873,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -4885,7 +4885,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_2 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_2\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -4901,7 +4901,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -5415,7 +5415,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_4 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_4\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -5431,7 +5431,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -5618,7 +5618,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_5 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_5\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -5634,7 +5634,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -5973,7 +5973,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_3 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_3\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -5989,7 +5989,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -6473,7 +6473,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_6 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_6\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -6489,7 +6489,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -6582,7 +6582,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_8 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_8\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -6598,7 +6598,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -6639,7 +6639,7 @@ "aspect": { "json": { "statement": { - "value": "create view view_1 as select * from table_1", + "value": "CREATE VIEW view_1 AS\nSELECT\n *\nFROM table_1", "language": "SQL" }, "source": "SYSTEM", @@ -6648,14 +6648,14 @@ "actor": "urn:li:corpuser:_ingestion" }, "lastModified": { - "time": 1709290787484, + "time": 1710193011317, "actor": "urn:li:corpuser:_ingestion" } } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -6713,7 +6713,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_9 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_9\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -6729,7 +6729,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -6822,7 +6822,7 @@ "aspect": { "json": { "statement": { - "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_7 SELECT * FROM TEST_DB.TEST_SCHEMA.TABLE_2", + "value": "INSERT INTO TEST_DB.TEST_SCHEMA.TABLE_7\nSELECT\n *\nFROM TEST_DB.TEST_SCHEMA.TABLE_2", "language": "SQL" }, "source": "SYSTEM", @@ -6838,7 +6838,7 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, @@ -6879,7 +6879,7 @@ "aspect": { "json": { "statement": { - "value": "create view view_2 as select * from table_2", + "value": "CREATE VIEW view_2 AS\nSELECT\n *\nFROM table_2", "language": "SQL" }, "source": "SYSTEM", @@ -6888,14 +6888,14 @@ "actor": "urn:li:corpuser:_ingestion" }, "lastModified": { - "time": 1709290787501, + "time": 1710193011336, "actor": "urn:li:corpuser:_ingestion" } } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_03_11-14_36_03", "lastRunId": "no-run-id-provided" } }, diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json index 7c39a53e243c10..7b266698f291a6 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json @@ -3864,7 +3864,7 @@ "aspect": { "json": { "statement": { - "value": "create view view_1 as select * from table_1", + "value": "CREATE VIEW view_1 AS\nSELECT\n *\nFROM table_1", "language": "SQL" }, "source": "SYSTEM", @@ -4067,7 +4067,7 @@ "aspect": { "json": { "statement": { - "value": "create view view_2 as select * from table_2", + "value": "CREATE VIEW view_2 AS\nSELECT\n *\nFROM table_2", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json index 49015efc24a623..2b4cf324f57fcf 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json @@ -70,7 +70,7 @@ "aspect": { "json": { "statement": { - "value": "insert into foo (a, b, c) select a, b, c from bar", + "value": "INSERT INTO foo (\n a,\n b,\n c\n)\nSELECT\n a,\n b,\n c\nFROM bar", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json index 036e5e5fa4ff28..135f6be02a434f 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json @@ -58,7 +58,7 @@ "aspect": { "json": { "statement": { - "value": "create table foo as select a, b from bar", + "value": "CREATE TABLE foo AS\nSELECT\n a,\n b\nFROM bar", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json index 183c4c8c929eff..8ff0b720deff5d 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json @@ -70,7 +70,7 @@ "aspect": { "json": { "statement": { - "value": "/* query 2 */ insert into foo (a, b) select a, b from bar", + "value": "/* query 2 */\nINSERT INTO foo (\n a,\n b\n)\nSELECT\n a,\n b\nFROM bar", "language": "SQL" }, "source": "SYSTEM", @@ -111,7 +111,7 @@ "aspect": { "json": { "statement": { - "value": "/* query 1 */ insert into foo (a, b, c) select a, b, c from bar", + "value": "/* query 1 */\nINSERT INTO foo (\n a,\n b,\n c\n)\nSELECT\n a,\n b,\n c\nFROM bar", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json index 7759d71fe4a773..874d0d7fef7500 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json @@ -95,7 +95,7 @@ "aspect": { "json": { "statement": { - "value": "insert into downstream (a, c) select a, c from upstream2", + "value": "INSERT INTO downstream (\n a,\n c\n)\nSELECT\n a,\n c\nFROM upstream2", "language": "SQL" }, "source": "SYSTEM", @@ -136,7 +136,7 @@ "aspect": { "json": { "statement": { - "value": "insert into downstream (a, b) select a, b from upstream1", + "value": "INSERT INTO downstream (\n a,\n b\n)\nSELECT\n a,\n b\nFROM upstream1", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json index 70eb9cc2b14d26..e572f5d27ef496 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json @@ -58,7 +58,7 @@ "aspect": { "json": { "statement": { - "value": "create table bar as select a, b from baz", + "value": "CREATE TABLE bar AS\nSELECT\n a,\n b\nFROM baz", "language": "SQL" }, "source": "SYSTEM", @@ -150,7 +150,7 @@ "aspect": { "json": { "statement": { - "value": "create table foo_staging as select a, b from foo_dep", + "value": "CREATE TABLE foo_staging AS\nSELECT\n a,\n b\nFROM foo_dep", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json index 5e61fb2b6a20f2..dc1320c2c4d579 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json @@ -58,7 +58,7 @@ "aspect": { "json": { "statement": { - "value": "create table foo as select a, 2*b as b from bar", + "value": "CREATE TABLE foo AS\nSELECT\n a,\n 2 * b AS b\nFROM bar", "language": "SQL" }, "source": "SYSTEM", @@ -151,7 +151,7 @@ "aspect": { "json": { "statement": { - "value": "create temp table foo as select a, b+c as c from bar;\n\ncreate table foo_session2 as select * from foo", + "value": "CREATE TEMPORARY TABLE foo AS\nSELECT\n a,\n b + c AS c\nFROM bar;\n\nCREATE TABLE foo_session2 AS\nSELECT\n *\nFROM foo", "language": "SQL" }, "source": "SYSTEM", @@ -217,7 +217,7 @@ "aspect": { "json": { "statement": { - "value": "create table foo_session3 as select * from foo", + "value": "CREATE TABLE foo_session3 AS\nSELECT\n *\nFROM foo", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json index 3f8fa7e5a1e282..8cc8b877b3be8b 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json @@ -58,7 +58,7 @@ "aspect": { "json": { "statement": { - "value": "create view foo as select a, b from bar", + "value": "CREATE VIEW foo AS\nSELECT\n a,\n b\nFROM bar", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py index 7c4d431520a7eb..61b5a4dc2ffb1d 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py @@ -108,9 +108,9 @@ def test_query_generalization(): def test_query_fingerprint(): assert get_query_fingerprint( - "select * /* everything */ from foo where ts = 34", dialect="redshift" - ) == get_query_fingerprint("SELECT * FROM foo where ts = 38", dialect="redshift") + "select * /* everything */ from foo where ts = 34", platform="redshift" + ) == get_query_fingerprint("SELECT * FROM foo where ts = 38", platform="redshift") assert get_query_fingerprint( - "select 1 + 1", dialect="postgres" - ) != get_query_fingerprint("select 2", dialect="postgres") + "select 1 + 1", platform="postgres" + ) != get_query_fingerprint("select 2", platform="postgres")