From 03590a194885b2fbbb5249aef909d761c3ffc12c Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Mon, 18 Dec 2023 19:54:31 +0100 Subject: [PATCH 1/4] fix(ingest/snowflake) - Fixing snowflake url with default region (#9443) --- metadata-ingestion/setup.py | 8 +- .../source/snowflake/snowflake_utils.py | 28 ++++- .../snowflake/snowflake_golden.json | 116 +++++++++--------- .../integration/sql_server/test_sql_server.py | 5 + .../tests/unit/test_snowflake_source.py | 27 ++++ 5 files changed, 120 insertions(+), 64 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 1bc1bc5100b08d..cb13a40125c0da 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -354,7 +354,11 @@ "mlflow": {"mlflow-skinny>=2.3.0"}, "mode": {"requests", "tenacity>=8.0.1"} | sqllineage_lib, "mongodb": {"pymongo[srv]>=3.11", "packaging"}, - "mssql": sql_common | {"sqlalchemy-pytds>=0.3", "pyOpenSSL"}, + "mssql": sql_common + | { + "sqlalchemy-pytds>=0.3", + "pyOpenSSL", + }, "mssql-odbc": sql_common | {"pyodbc"}, "mysql": mysql, # mariadb should have same dependency as mysql @@ -559,7 +563,7 @@ "kafka-connect", "ldap", "mongodb", - "mssql", + "mssql" if sys.version_info >= (3, 8) else None, "mysql", "mariadb", "redash", diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 5a451bf197d347..af8d8824a4b172 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -9,8 +9,8 @@ from datahub.configuration.pattern_utils import is_schema_allowed from datahub.ingestion.source.snowflake.constants import ( GENERIC_PERMISSION_ERROR_KEY, - SNOWFLAKE_DEFAULT_CLOUD, SNOWFLAKE_REGION_CLOUD_REGION_MAPPING, + SnowflakeCloudProvider, SnowflakeObjectDomain, ) from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config @@ -72,6 +72,15 @@ def report_error(self, key: str, reason: str) -> None: class SnowflakeCommonMixin: platform = "snowflake" + CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX = [ + "us-west-2", + "us-east-1", + "eu-west-1", + "eu-central-1", + "ap-southeast-1", + "ap-southeast-2", + ] + @staticmethod def create_snowsight_base_url( account_locator: str, @@ -79,12 +88,23 @@ def create_snowsight_base_url( cloud: str, privatelink: bool = False, ) -> Optional[str]: + if cloud: + url_cloud_provider_suffix = f".{cloud}" + + if cloud == SnowflakeCloudProvider.AWS: + # Some AWS regions do not have cloud suffix. See below the list: + # https://docs.snowflake.com/en/user-guide/admin-account-identifier#non-vps-account-locator-formats-by-cloud-platform-and-region + if ( + cloud_region_id + in SnowflakeCommonMixin.CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX + ): + url_cloud_provider_suffix = "" + else: + url_cloud_provider_suffix = f".{cloud}" if privatelink: url = f"https://app.{account_locator}.{cloud_region_id}.privatelink.snowflakecomputing.com/" - elif cloud == SNOWFLAKE_DEFAULT_CLOUD: - url = f"https://app.snowflake.com/{cloud_region_id}/{account_locator}/" else: - url = f"https://app.snowflake.com/{cloud_region_id}.{cloud}/{account_locator}/" + url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/" return url @staticmethod diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json index c7273fee5a2e58..ece54f00eeaa04 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json @@ -11,20 +11,20 @@ "env": "PROD", "database": "test_db" }, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/", "name": "TEST_DB", "description": "Comment for TEST_DB", "created": { - "time": 1623110400000 + "time": 1623103200000 }, "lastModified": { - "time": 1623110400000 + "time": 1623103200000 } } }, "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00", + "lastObserved": 1615443388097, + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -144,20 +144,20 @@ "database": "test_db", "schema": "test_schema" }, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/", "name": "TEST_SCHEMA", "description": "comment for TEST_DB.TEST_SCHEMA", "created": { - "time": 1623110400000 + "time": 1623103200000 }, "lastModified": { - "time": 1623110400000 + "time": 1623103200000 } } }, "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00", + "lastObserved": 1615443388097, + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -489,22 +489,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/", "name": "TABLE_1", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_1", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -788,22 +788,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/", "name": "TABLE_2", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_2", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -1087,22 +1087,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/", "name": "TABLE_3", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_3", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -1386,22 +1386,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/", "name": "TABLE_4", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_4", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -1685,22 +1685,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_5/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_5/", "name": "TABLE_5", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_5", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -1984,22 +1984,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/", "name": "TABLE_6", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_6", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -2283,22 +2283,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/", "name": "TABLE_7", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_7", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -2582,22 +2582,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/", "name": "TABLE_8", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_8", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -2881,22 +2881,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/", "name": "TABLE_9", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_9", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -3180,22 +3180,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/", "name": "TABLE_10", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_10", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -3470,22 +3470,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/", "name": "VIEW_1", "qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_1", "description": "Comment for View", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -3805,22 +3805,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_2/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_2/", "name": "VIEW_2", "qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_2", "description": "Comment for View", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index f439a322c26771..5ed672d527264a 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -1,5 +1,6 @@ import os import subprocess +import sys import time import pytest @@ -8,6 +9,10 @@ from tests.test_helpers.click_helpers import run_datahub_cmd from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port +pytestmark = pytest.mark.skipif( + sys.version_info < (3, 8), reason="requires python 3.8 or higher" +) + @pytest.fixture(scope="module") def mssql_runner(docker_compose_runner, pytestconfig): diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index 536c91ace4f5ed..69a7510692df1d 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -24,6 +24,7 @@ from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeObjectAccessEntry, ) +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source.snowflake.snowflake_v2 import SnowflakeV2Source from tests.test_helpers import test_connection_helpers @@ -584,3 +585,29 @@ def test_email_filter_query_generation_with_case_insensitive_filter(): filter_query == "AND (rlike(user_name, '.*@example.com','c')) AND NOT (rlike(user_name, '.*@example2.com','c'))" ) + + +def test_create_snowsight_base_url_us_west(): + ( + cloud, + cloud_region_id, + ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id("aws_us_west_2") + + result = SnowflakeCommonMixin.create_snowsight_base_url( + "account_locator", cloud_region_id, cloud, False + ) + assert result == "https://app.snowflake.com/us-west-2/account_locator/" + + +def test_create_snowsight_base_url_ap_northeast_1(): + ( + cloud, + cloud_region_id, + ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id( + "aws_ap_northeast_1" + ) + + result = SnowflakeCommonMixin.create_snowsight_base_url( + "account_locator", cloud_region_id, cloud, False + ) + assert result == "https://app.snowflake.com/ap-northeast-1.aws/account_locator/" From 193d1464a628fc800e926f04fcd4bd1d6774d858 Mon Sep 17 00:00:00 2001 From: noggi Date: Mon, 18 Dec 2023 14:06:17 -0800 Subject: [PATCH 2/4] Fix downstream CI issue (#9479) --- docker/datahub-ingestion-base/Dockerfile | 2 +- docker/datahub-ingestion/Dockerfile | 2 +- docker/datahub-ingestion/build.gradle | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index e0f9fdc997071c..81fec61ea50733 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -4,7 +4,7 @@ ARG BASE_IMAGE=base # Defining custom repo urls for use in enterprise environments. Re-used between stages below. ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com -ARG DEBIAN_REPO_URL=http://deb.debian.org/debian +ARG DEBIAN_REPO_URL=https://deb.debian.org/debian ARG PIP_MIRROR_URL=null FROM golang:1-alpine3.18 AS dockerize-binary diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 9516c31a19e21b..2898a363a0a185 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -3,7 +3,7 @@ ARG APP_ENV=full ARG BASE_IMAGE=acryldata/datahub-ingestion-base ARG DOCKER_VERSION=head ARG PIP_MIRROR_URL=null -ARG DEBIAN_REPO_URL=http://deb.debian.org/debian +ARG DEBIAN_REPO_URL=https://deb.debian.org/debian FROM $BASE_IMAGE:$DOCKER_VERSION as base USER 0 diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index 36444210f1938b..0b08f189e6b45a 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -33,7 +33,7 @@ docker { i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } - def dockerBuildArgs = [DOCKER_VERSION: version, RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')] + def dockerBuildArgs = [DOCKER_VERSION: version, RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", ''), BASE_IMAGE: "${docker_registry}/datahub-ingestion-base"] // Add build args if they are defined (needed for some CI or enterprise environments) if (project.hasProperty('pipMirrorUrl')) { From ecda3e618704c5eb335ad1a21c30f0c935581f64 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 18 Dec 2023 18:26:33 -0500 Subject: [PATCH 3/4] feat(ingest): pydantic v2 compatibility (#9434) --- .github/workflows/airflow-plugin.yml | 7 ++-- .../airflow-plugin/tox.ini | 9 +++++ metadata-ingestion/setup.py | 39 ++++++++++++++++--- .../api/entities/datacontract/assertion.py | 4 +- .../datacontract/assertion_operator.py | 16 ++++---- .../datacontract/data_quality_assertion.py | 11 +++--- .../api/entities/datacontract/datacontract.py | 23 +++++------ .../datacontract/freshness_assertion.py | 15 ++++--- .../entities/datacontract/schema_assertion.py | 14 ++++--- .../src/datahub/cli/check_cli.py | 13 ++++++- .../src/datahub/configuration/common.py | 16 +++++++- .../src/datahub/configuration/datetimes.py | 4 +- .../pydantic_migration_helpers.py | 29 ++++++++++++++ .../configuration/time_window_config.py | 16 ++++++-- .../configuration/validate_field_rename.py | 4 +- .../ingestion/glossary/datahub_classifier.py | 11 +++++- .../source/bigquery_v2/bigquery_config.py | 2 +- .../ingestion/source/delta_lake/config.py | 4 +- .../source/snowflake/snowflake_config.py | 2 +- .../ingestion/source_config/sql/snowflake.py | 2 +- .../src/datahub/utilities/urns/urn_iter.py | 2 +- .../integration/snowflake/test_snowflake.py | 16 ++++---- .../unit/{ => config}/test_allow_deny.py | 0 .../unit/{ => config}/test_config_clean.py | 0 .../tests/unit/config/test_config_model.py | 18 +++++++-- .../{ => config}/test_pydantic_validators.py | 13 +++++-- .../{ => config}/test_time_window_config.py | 0 27 files changed, 209 insertions(+), 81 deletions(-) rename metadata-ingestion/tests/unit/{ => config}/test_allow_deny.py (100%) rename metadata-ingestion/tests/unit/{ => config}/test_config_clean.py (100%) rename metadata-ingestion/tests/unit/{ => config}/test_pydantic_validators.py (92%) rename metadata-ingestion/tests/unit/{ => config}/test_time_window_config.py (100%) diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index cd1e159b7d53cc..70816e5f093d13 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -32,6 +32,7 @@ jobs: strategy: matrix: include: + # Note: this should be kept in sync with tox.ini. - python-version: "3.8" extra_pip_requirements: "apache-airflow~=2.1.4" extra_pip_extras: plugin-v1 @@ -39,13 +40,13 @@ jobs: extra_pip_requirements: "apache-airflow~=2.2.4" extra_pip_extras: plugin-v1 - python-version: "3.10" - extra_pip_requirements: "apache-airflow~=2.4.0" + extra_pip_requirements: 'apache-airflow~=2.4.0 pluggy==1.0.0 "pendulum<3.0"' extra_pip_extras: plugin-v2 - python-version: "3.10" - extra_pip_requirements: "apache-airflow~=2.6.0" + extra_pip_requirements: 'apache-airflow~=2.6.0 "pendulum<3.0"' extra_pip_extras: plugin-v2 - python-version: "3.10" - extra_pip_requirements: "apache-airflow>=2.7.0" + extra_pip_requirements: "apache-airflow>=2.7.0 pydantic==2.4.2" extra_pip_extras: plugin-v2 fail-fast: false steps: diff --git a/metadata-ingestion-modules/airflow-plugin/tox.ini b/metadata-ingestion-modules/airflow-plugin/tox.ini index 1010bd2933e452..27ae2ce65ba658 100644 --- a/metadata-ingestion-modules/airflow-plugin/tox.ini +++ b/metadata-ingestion-modules/airflow-plugin/tox.ini @@ -10,6 +10,7 @@ envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py31 use_develop = true extras = dev,integration-tests,plugin-v1 deps = + # This should be kept in sync with the Github Actions matrix. -e ../../metadata-ingestion/ # Airflow version airflow21: apache-airflow~=2.1.0 @@ -20,7 +21,15 @@ deps = # See https://github.com/datahub-project/datahub/pull/9365 airflow24: apache-airflow~=2.4.0,pluggy==1.0.0 airflow26: apache-airflow~=2.6.0 + # Respect the constraints file on pendulum. + # See https://github.com/apache/airflow/issues/36274 + airflow24,airflow26: pendulum>=2.0,<3.0 + # The Airflow 2.7 constraints file points at pydantic v2, so we match that here. + # https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt + # Note that Airflow is actually compatible with both pydantic v1 and v2, and the + # constraints file is overly restrictive. airflow27: apache-airflow~=2.7.0 + airflow27: pydantic==2.4.2 commands = pytest --cov-append {posargs} diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index cb13a40125c0da..13c9d3c99aaca1 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -14,9 +14,10 @@ "mypy_extensions>=0.4.3", # Actual dependencies. "typing-inspect", + # pydantic 1.8.2 is incompatible with mypy 0.910. + # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. # pydantic 1.10.3 is incompatible with typing-extensions 4.1.1 - https://github.com/pydantic/pydantic/issues/4885 - # pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887 - "pydantic>=1.5.1,!=1.10.3,<2", + "pydantic>=1.10.0,!=1.10.3", "mixpanel>=4.9.0", "sentry-sdk", } @@ -53,6 +54,18 @@ "ruamel.yaml", } +pydantic_no_v2 = { + # pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887 + # Tags sources that require the pydantic v2 API. + "pydantic<2", +} + +plugin_common = { + # While pydantic v2 support is experimental, require that all plugins + # continue to use v1. This will ensure that no ingestion recipes break. + *pydantic_no_v2, +} + rest_common = {"requests", "requests_file"} kafka_common = { @@ -118,6 +131,7 @@ "sqlalchemy>=1.4.39, <2", # Required for SQL profiling. "great-expectations>=0.15.12, <=0.15.50", + *pydantic_no_v2, # because of great-expectations # scipy version restricted to reduce backtracking, used by great-expectations, "scipy>=1.7.2", # GE added handling for higher version of jinja2 @@ -229,6 +243,7 @@ iceberg_common = { # Iceberg Python SDK "pyiceberg", + *pydantic_no_v2, # because of pyiceberg "pyarrow>=9.0.0, <13.0.0", } @@ -477,9 +492,6 @@ "flake8-bugbear==23.3.12", "isort>=5.7.0", "mypy==1.0.0", - # pydantic 1.8.2 is incompatible with mypy 0.910. - # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. - "pydantic>=1.10.0", *test_api_requirements, pytest_dep, "pytest-asyncio>=0.16.0", @@ -740,7 +752,22 @@ extras_require={ "base": list(framework_common), **{ - plugin: list(framework_common | dependencies) + plugin: list( + framework_common + | ( + plugin_common + if plugin + not in { + "airflow", + "datahub-rest", + "datahub-kafka", + "sync-file-emitter", + "sql-parser", + } + else set() + ) + | dependencies + ) for (plugin, dependencies) in plugins.items() }, "all": list( diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py index c45d4ddc924580..89ac528efe81a1 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py @@ -1,7 +1,7 @@ from typing import Optional -from datahub.configuration import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel -class BaseAssertion(ConfigModel): +class BaseAssertion(v1_ConfigModel): description: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py index a41b0f7aafd9f2..dc0c97d1c74e56 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py @@ -2,7 +2,7 @@ from typing_extensions import Literal, Protocol -from datahub.configuration import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel from datahub.metadata.schema_classes import ( AssertionStdOperatorClass, AssertionStdParameterClass, @@ -58,7 +58,7 @@ def _generate_assertion_std_parameters( ) -class EqualToOperator(ConfigModel): +class EqualToOperator(v1_ConfigModel): type: Literal["equal_to"] value: Union[str, int, float] @@ -71,7 +71,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class BetweenOperator(ConfigModel): +class BetweenOperator(v1_ConfigModel): type: Literal["between"] min: Union[int, float] max: Union[int, float] @@ -87,7 +87,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: ) -class LessThanOperator(ConfigModel): +class LessThanOperator(v1_ConfigModel): type: Literal["less_than"] value: Union[int, float] @@ -100,7 +100,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class GreaterThanOperator(ConfigModel): +class GreaterThanOperator(v1_ConfigModel): type: Literal["greater_than"] value: Union[int, float] @@ -113,7 +113,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class LessThanOrEqualToOperator(ConfigModel): +class LessThanOrEqualToOperator(v1_ConfigModel): type: Literal["less_than_or_equal_to"] value: Union[int, float] @@ -126,7 +126,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class GreaterThanOrEqualToOperator(ConfigModel): +class GreaterThanOrEqualToOperator(v1_ConfigModel): type: Literal["greater_than_or_equal_to"] value: Union[int, float] @@ -139,7 +139,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class NotNullOperator(ConfigModel): +class NotNullOperator(v1_ConfigModel): type: Literal["not_null"] operator: str = AssertionStdOperatorClass.NOT_NULL diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py index 6a3944ba36baf0..975aa359bd2031 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py @@ -1,12 +1,11 @@ from typing import List, Optional, Union -import pydantic from typing_extensions import Literal import datahub.emitter.mce_builder as builder from datahub.api.entities.datacontract.assertion import BaseAssertion from datahub.api.entities.datacontract.assertion_operator import Operators -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( AssertionInfoClass, @@ -25,7 +24,7 @@ class IdConfigMixin(BaseAssertion): - id_raw: Optional[str] = pydantic.Field( + id_raw: Optional[str] = v1_Field( default=None, alias="id", description="The id of the assertion. If not provided, one will be generated using the type.", @@ -38,7 +37,7 @@ def generate_default_id(self) -> str: class CustomSQLAssertion(IdConfigMixin, BaseAssertion): type: Literal["custom_sql"] sql: str - operator: Operators = pydantic.Field(discriminator="type") + operator: Operators = v1_Field(discriminator="type") def generate_default_id(self) -> str: return f"{self.type}-{self.sql}-{self.operator.id()}" @@ -89,11 +88,11 @@ def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass: ) -class DataQualityAssertion(ConfigModel): +class DataQualityAssertion(v1_ConfigModel): __root__: Union[ CustomSQLAssertion, ColumnUniqueAssertion, - ] = pydantic.Field(discriminator="type") + ] = v1_Field(discriminator="type") @property def id(self) -> str: diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py index f3c6be55e5fea9..e0ef85d5fd66c0 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py @@ -1,7 +1,6 @@ import collections from typing import Iterable, List, Optional, Tuple -import pydantic from ruamel.yaml import YAML from typing_extensions import Literal @@ -11,7 +10,11 @@ ) from datahub.api.entities.datacontract.freshness_assertion import FreshnessAssertion from datahub.api.entities.datacontract.schema_assertion import SchemaAssertion -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import ( + v1_ConfigModel, + v1_Field, + v1_validator, +) from datahub.emitter.mce_builder import datahub_guid, make_assertion_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -26,7 +29,7 @@ from datahub.utilities.urns.urn import guess_entity_type -class DataContract(ConfigModel): +class DataContract(v1_ConfigModel): """A yml representation of a Data Contract. This model is used as a simpler, Python-native representation of a DataHub data contract. @@ -36,29 +39,27 @@ class DataContract(ConfigModel): version: Literal[1] - id: Optional[str] = pydantic.Field( + id: Optional[str] = v1_Field( default=None, alias="urn", description="The data contract urn. If not provided, one will be generated.", ) - entity: str = pydantic.Field( + entity: str = v1_Field( description="The entity urn that the Data Contract is associated with" ) # TODO: add support for properties # properties: Optional[Dict[str, str]] = None - schema_field: Optional[SchemaAssertion] = pydantic.Field( - default=None, alias="schema" - ) + schema_field: Optional[SchemaAssertion] = v1_Field(default=None, alias="schema") - freshness: Optional[FreshnessAssertion] = pydantic.Field(default=None) + freshness: Optional[FreshnessAssertion] = v1_Field(default=None) # TODO: Add a validator to ensure that ids are unique - data_quality: Optional[List[DataQualityAssertion]] = pydantic.Field(default=None) + data_quality: Optional[List[DataQualityAssertion]] = v1_Field(default=None) _original_yaml_dict: Optional[dict] = None - @pydantic.validator("data_quality") + @v1_validator("data_quality") # type: ignore def validate_data_quality( cls, data_quality: Optional[List[DataQualityAssertion]] ) -> Optional[List[DataQualityAssertion]]: diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py index 71741d76b22fc4..86942766889676 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py @@ -3,11 +3,10 @@ from datetime import timedelta from typing import List, Union -import pydantic from typing_extensions import Literal from datahub.api.entities.datacontract.assertion import BaseAssertion -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( AssertionInfoClass, @@ -25,10 +24,10 @@ class CronFreshnessAssertion(BaseAssertion): type: Literal["cron"] - cron: str = pydantic.Field( + cron: str = v1_Field( description="The cron expression to use. See https://crontab.guru/ for help." ) - timezone: str = pydantic.Field( + timezone: str = v1_Field( "UTC", description="The timezone to use for the cron schedule. Defaults to UTC.", ) @@ -58,10 +57,10 @@ def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleCla ) -class FreshnessAssertion(ConfigModel): - __root__: Union[ - CronFreshnessAssertion, FixedIntervalFreshnessAssertion - ] = pydantic.Field(discriminator="type") +class FreshnessAssertion(v1_ConfigModel): + __root__: Union[CronFreshnessAssertion, FixedIntervalFreshnessAssertion] = v1_Field( + discriminator="type" + ) @property def id(self): diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py index b62f94e0592fce..39297d1a98d026 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py @@ -3,11 +3,10 @@ import json from typing import List, Union -import pydantic from typing_extensions import Literal from datahub.api.entities.datacontract.assertion import BaseAssertion -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.extractor.json_schema_util import get_schema_metadata from datahub.metadata.schema_classes import ( @@ -23,7 +22,7 @@ class JsonSchemaContract(BaseAssertion): type: Literal["json-schema"] - json_schema: dict = pydantic.Field(alias="json-schema") + json_schema: dict = v1_Field(alias="json-schema") _schema_metadata: SchemaMetadataClass @@ -37,7 +36,10 @@ def _init_private_attributes(self) -> None: ) -class FieldListSchemaContract(BaseAssertion, arbitrary_types_allowed=True): +class FieldListSchemaContract(BaseAssertion): + class Config: + arbitrary_types_allowed = True + type: Literal["field-list"] fields: List[SchemaFieldClass] @@ -56,8 +58,8 @@ def _init_private_attributes(self) -> None: ) -class SchemaAssertion(ConfigModel): - __root__: Union[JsonSchemaContract, FieldListSchemaContract] = pydantic.Field( +class SchemaAssertion(v1_ConfigModel): + __root__: Union[JsonSchemaContract, FieldListSchemaContract] = v1_Field( discriminator="type" ) diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index f7996900f7a7ad..2732a72aea5399 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -126,10 +126,21 @@ def metadata_diff( default=False, help="Include extra information for each plugin.", ) +@click.option( + "--source", + type=str, + default=None, +) @telemetry.with_telemetry() -def plugins(verbose: bool) -> None: +def plugins(source: Optional[str], verbose: bool) -> None: """List the enabled ingestion plugins.""" + if source: + # Quick helper for one-off checks with full stack traces. + source_registry.get(source) + click.echo(f"Source {source} is enabled.") + return + click.secho("Sources:", bold=True) click.echo(source_registry.summary(verbose=verbose, col_width=25)) click.echo() diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index f225856ca43ce4..0030332bcfd541 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -99,8 +99,20 @@ def _schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None: @classmethod def parse_obj_allow_extras(cls: Type[_ConfigSelf], obj: Any) -> _ConfigSelf: - with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow): - return cls.parse_obj(obj) + if PYDANTIC_VERSION_2: + try: + with unittest.mock.patch.dict( + cls.model_config, # type: ignore + {"extra": "allow"}, + clear=False, + ): + cls.model_rebuild(force=True) # type: ignore + return cls.parse_obj(obj) + finally: + cls.model_rebuild(force=True) # type: ignore + else: + with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow): + return cls.parse_obj(obj) class PermissiveConfigModel(ConfigModel): diff --git a/metadata-ingestion/src/datahub/configuration/datetimes.py b/metadata-ingestion/src/datahub/configuration/datetimes.py index 41af7565593d9b..1520462fa9bf8c 100644 --- a/metadata-ingestion/src/datahub/configuration/datetimes.py +++ b/metadata-ingestion/src/datahub/configuration/datetimes.py @@ -65,6 +65,8 @@ def parse_absolute_time(input: str) -> datetime: def parse_relative_timespan(input: str) -> timedelta: + raw_input = input + neg = False input = input.strip() @@ -79,7 +81,7 @@ def parse_relative_timespan(input: str) -> timedelta: if neg: delta = -delta - logger.debug(f'Parsed "{input}" as {delta}.') + logger.debug(f'Parsed "{raw_input}" as {delta}.') return delta diff --git a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py index f1876b500598ba..bd931abe2e84d1 100644 --- a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py +++ b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py @@ -19,12 +19,41 @@ class PydanticDeprecatedSince20(Warning): # type: ignore if PYDANTIC_VERSION_2: from pydantic import BaseModel as GenericModel + from pydantic.v1 import ( # type: ignore + BaseModel as v1_BaseModel, + Extra as v1_Extra, + Field as v1_Field, + root_validator as v1_root_validator, + validator as v1_validator, + ) else: + from pydantic import ( # type: ignore + BaseModel as v1_BaseModel, + Extra as v1_Extra, + Field as v1_Field, + root_validator as v1_root_validator, + validator as v1_validator, + ) from pydantic.generics import GenericModel # type: ignore +class v1_ConfigModel(v1_BaseModel): + """A simplified variant of our main ConfigModel class. + + This one only uses pydantic v1 features. + """ + + class Config: + extra = v1_Extra.forbid + underscore_attrs_are_private = True + + __all__ = [ "PYDANTIC_VERSION_2", "PydanticDeprecatedSince20", "GenericModel", + "v1_ConfigModel", + "v1_Field", + "v1_root_validator", + "v1_validator", ] diff --git a/metadata-ingestion/src/datahub/configuration/time_window_config.py b/metadata-ingestion/src/datahub/configuration/time_window_config.py index 15de7470e4d823..f20ab85be05855 100644 --- a/metadata-ingestion/src/datahub/configuration/time_window_config.py +++ b/metadata-ingestion/src/datahub/configuration/time_window_config.py @@ -68,6 +68,12 @@ def default_start_time( assert abs(delta) >= get_bucket_duration_delta( values["bucket_duration"] ), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'." + + # The end_time's default value is not yet populated, in which case + # we can just manually generate it here. + if "end_time" not in values: + values["end_time"] = datetime.now(tz=timezone.utc) + return get_time_bucket( values["end_time"] + delta, values["bucket_duration"] ) @@ -80,9 +86,13 @@ def default_start_time( @pydantic.validator("start_time", "end_time") def ensure_timestamps_in_utc(cls, v: datetime) -> datetime: - assert ( - v.tzinfo == timezone.utc - ), 'timezone is not UTC; try adding a "Z" to the value e.g. "2021-07-20T00:00:00Z"' + if v.tzinfo is None: + raise ValueError( + "Timestamps must be in UTC. Try adding a 'Z' to the value e.g. '2021-07-20T00:00:00Z'" + ) + + # If the timestamp is timezone-aware but not in UTC, convert it to UTC. + v = v.astimezone(timezone.utc) return v diff --git a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py index bb01f2b787123a..de2a16e9bf247d 100644 --- a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py +++ b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py @@ -49,4 +49,6 @@ def _validate_field_rename(cls: Type, values: dict) -> dict: # validator with pre=True gets all the values that were passed in. # Given that a renamed field doesn't show up in the fields list, we can't use # the field-level validator, even with a different field name. - return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename) + return pydantic.root_validator(pre=True, skip_on_failure=True, allow_reuse=True)( + _validate_field_rename + ) diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py index 1f2b7f5689ea3c..42eb930c80f9d4 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py @@ -8,6 +8,7 @@ from pydantic.fields import Field from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.ingestion.glossary.classifier import Classifier @@ -50,7 +51,10 @@ class ValuesFactorConfig(ConfigModel): class PredictionFactorsAndWeights(ConfigModel): class Config: - allow_population_by_field_name = True + if PYDANTIC_VERSION_2: + populate_by_name = True + else: + allow_population_by_field_name = True Name: float = Field(alias="name") Description: float = Field(alias="description") @@ -60,7 +64,10 @@ class Config: class InfoTypeConfig(ConfigModel): class Config: - allow_population_by_field_name = True + if PYDANTIC_VERSION_2: + populate_by_name = True + else: + allow_population_by_field_name = True Prediction_Factors_and_Weights: PredictionFactorsAndWeights = Field( description="Factors and their weights to consider when predicting info types", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index cbe68a454ea436..c13b08a6d9656b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -284,7 +284,7 @@ def validate_bigquery_audit_metadata_datasets( return v - @root_validator(pre=False) + @root_validator(pre=False, skip_on_failure=True) def backward_compatibility_configs_set(cls, values: Dict) -> Dict: project_id = values.get("project_id") project_id_pattern = values.get("project_id_pattern") diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py index f3616ca648a3e6..81a54d1327d05a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py @@ -4,6 +4,7 @@ import pydantic from cached_property import cached_property from pydantic import Field +from typing_extensions import Literal from datahub.configuration.common import AllowDenyPattern from datahub.configuration.source_common import ( @@ -46,10 +47,9 @@ class DeltaLakeSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin): "'/' and URNs will be created using " "relative_path only.", ) - platform: str = Field( + platform: Literal["delta-lake"] = Field( default="delta-lake", description="The platform that this source connects to", - const=True, ) platform_instance: Optional[str] = Field( default=None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 032bdef178fdf6..b896df1fa340e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -176,7 +176,7 @@ def validate_include_column_lineage(cls, v, values): ) return v - @root_validator(pre=False) + @root_validator(pre=False, skip_on_failure=True) def validate_unsupported_configs(cls, values: Dict) -> Dict: value = values.get("include_read_operational_stats") if value is not None and value: diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 46bd24c7e1f4c3..e9db82ce75cd99 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -107,7 +107,7 @@ def validate_account_id(cls, account_id: str) -> str: return account_id @pydantic.validator("authentication_type", always=True) - def authenticator_type_is_valid(cls, v, values, field): + def authenticator_type_is_valid(cls, v, values): if v not in VALID_AUTH_TYPES.keys(): raise ValueError( f"unsupported authenticator type '{v}' was provided," diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py index 4f228494f416b8..3389a6fb05ee89 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py @@ -150,7 +150,7 @@ def modify_urn(urn: str) -> str: if guess_entity_type(urn) == "dataset": return _lowercase_dataset_urn(urn) elif guess_entity_type(urn) == "schemaField": - cur_urn = Urn.create_from_string(urn) + cur_urn = Urn.from_string(urn) cur_urn._entity_ids[0] = _lowercase_dataset_urn(cur_urn._entity_ids[0]) return str(cur_urn) return urn diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 1b58696e4014c9..39a62056a7e4ad 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -87,18 +87,18 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): confidence_level_threshold=0.58, info_types_config={ "Age": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, Values=1, Description=0, Datatype=0 + prediction_factors_and_weights=PredictionFactorsAndWeights( + name=0, values=1, description=0, datatype=0 ) ), "CloudRegion": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, - Description=0, - Datatype=0, - Values=1, + prediction_factors_and_weights=PredictionFactorsAndWeights( + name=0, + description=0, + datatype=0, + values=1, ), - Values=ValuesFactorConfig( + values=ValuesFactorConfig( prediction_type="regex", regex=[ r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+" diff --git a/metadata-ingestion/tests/unit/test_allow_deny.py b/metadata-ingestion/tests/unit/config/test_allow_deny.py similarity index 100% rename from metadata-ingestion/tests/unit/test_allow_deny.py rename to metadata-ingestion/tests/unit/config/test_allow_deny.py diff --git a/metadata-ingestion/tests/unit/test_config_clean.py b/metadata-ingestion/tests/unit/config/test_config_clean.py similarity index 100% rename from metadata-ingestion/tests/unit/test_config_clean.py rename to metadata-ingestion/tests/unit/config/test_config_clean.py diff --git a/metadata-ingestion/tests/unit/config/test_config_model.py b/metadata-ingestion/tests/unit/config/test_config_model.py index ffac5c465f5541..f53390a3deb18c 100644 --- a/metadata-ingestion/tests/unit/config/test_config_model.py +++ b/metadata-ingestion/tests/unit/config/test_config_model.py @@ -3,8 +3,11 @@ import pydantic import pytest -from datahub.configuration.common import ConfigModel, redact_raw_config -from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig +from datahub.configuration.common import ( + AllowDenyPattern, + ConfigModel, + redact_raw_config, +) def test_extras_not_allowed(): @@ -76,8 +79,15 @@ def test_config_redaction(): def test_shared_defaults(): - c1 = UnityCatalogSourceConfig(token="s", workspace_url="https://workspace_url") - c2 = UnityCatalogSourceConfig(token="s", workspace_url="https://workspace_url") + class SourceConfig(ConfigModel): + token: str + workspace_url: str + catalog_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern.allow_all(), + ) + + c1 = SourceConfig(token="s", workspace_url="https://workspace_url") + c2 = SourceConfig(token="s", workspace_url="https://workspace_url") assert c2.catalog_pattern.allow == [".*"] c1.catalog_pattern.allow += ["foo"] diff --git a/metadata-ingestion/tests/unit/test_pydantic_validators.py b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py similarity index 92% rename from metadata-ingestion/tests/unit/test_pydantic_validators.py rename to metadata-ingestion/tests/unit/config/test_pydantic_validators.py index 3e9ec6cbaf3579..399245736805cc 100644 --- a/metadata-ingestion/tests/unit/test_pydantic_validators.py +++ b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py @@ -7,7 +7,10 @@ from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field -from datahub.utilities.global_warning_util import get_global_warnings +from datahub.utilities.global_warning_util import ( + clear_global_warnings, + get_global_warnings, +) def test_field_rename(): @@ -76,9 +79,11 @@ class TestModel(ConfigModel): def test_field_deprecated(): + clear_global_warnings() + class TestModel(ConfigModel): - d1: Optional[str] - d2: Optional[str] + d1: Optional[str] = None + d2: Optional[str] = None b: str _validate_deprecated_d1 = pydantic_field_deprecated("d1") @@ -93,3 +98,5 @@ class TestModel(ConfigModel): assert v.d2 == "deprecated" assert any(["d1 is deprecated" in warning for warning in get_global_warnings()]) assert any(["d2 is deprecated" in warning for warning in get_global_warnings()]) + + clear_global_warnings() diff --git a/metadata-ingestion/tests/unit/test_time_window_config.py b/metadata-ingestion/tests/unit/config/test_time_window_config.py similarity index 100% rename from metadata-ingestion/tests/unit/test_time_window_config.py rename to metadata-ingestion/tests/unit/config/test_time_window_config.py From 7b067822bd8602c00fe5a0efdd15a6bb7a33bad6 Mon Sep 17 00:00:00 2001 From: John Joyce Date: Mon, 18 Dec 2023 18:35:02 -0800 Subject: [PATCH 4/4] feat(gms): Add support for platform-based browse (#9376) Co-authored-by: John Joyce --- .../graphql/featureflags/FeatureFlags.java | 1 + .../resolvers/chart/BrowseV2Resolver.java | 20 +++- .../resolvers/config/AppConfigResolver.java | 1 + .../graphql/resolvers/search/SearchUtils.java | 14 +++ .../src/main/resources/app.graphql | 5 + .../src/main/resources/search.graphql | 9 +- .../browse/BrowseV2ResolverTest.java | 2 +- datahub-web-react/src/appConfigContext.tsx | 1 + datahub-web-react/src/graphql/app.graphql | 1 + .../metadata/client/JavaEntityClient.java | 24 +++++ .../elasticsearch/ElasticSearchService.java | 12 +++ .../elasticsearch/query/ESBrowseDAO.java | 91 +++++++++++++++++++ .../src/main/resources/application.yml | 1 + .../linkedin/entity/client/EntityClient.java | 22 +++++ .../entity/client/RestliEntityClient.java | 14 +++ .../metadata/search/EntitySearchService.java | 19 ++++ 16 files changed, 231 insertions(+), 6 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java index 07bd1fba5d8a86..e74ed09849763c 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java @@ -12,6 +12,7 @@ public class FeatureFlags { private boolean readOnlyModeEnabled = false; private boolean showSearchFiltersV2 = false; private boolean showBrowseV2 = false; + private boolean platformBrowseV2 = false; private PreProcessHooks preProcessHooks; private boolean showAcrylInfo = false; private boolean showAccessManagement = false; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java index 292d6108b7a044..da4a3a76dd7e0e 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java @@ -2,14 +2,16 @@ import static com.linkedin.datahub.graphql.Constants.BROWSE_PATH_V2_DELIMITER; import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.bindArgument; -import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.resolveView; +import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.*; +import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.BrowseResultGroupV2; import com.linkedin.datahub.graphql.generated.BrowseResultMetadata; import com.linkedin.datahub.graphql.generated.BrowseResultsV2; import com.linkedin.datahub.graphql.generated.BrowseV2Input; +import com.linkedin.datahub.graphql.generated.EntityType; import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper; import com.linkedin.datahub.graphql.resolvers.ResolverUtils; import com.linkedin.datahub.graphql.resolvers.search.SearchUtils; @@ -43,8 +45,8 @@ public class BrowseV2Resolver implements DataFetcher get(DataFetchingEnvironment environment) { final QueryContext context = environment.getContext(); final BrowseV2Input input = bindArgument(environment.getArgument("input"), BrowseV2Input.class); - final String entityName = EntityTypeMapper.getName(input.getType()); + final List entityNames = getEntityNames(input); final int start = input.getStart() != null ? input.getStart() : DEFAULT_START; final int count = input.getCount() != null ? input.getCount() : DEFAULT_COUNT; final String query = input.getQuery() != null ? input.getQuery() : "*"; @@ -70,7 +72,7 @@ public CompletableFuture get(DataFetchingEnvironment environmen BrowseResultV2 browseResults = _entityClient.browseV2( - entityName, + entityNames, pathStr, maybeResolvedView != null ? SearchUtils.combineFilters( @@ -87,6 +89,18 @@ public CompletableFuture get(DataFetchingEnvironment environmen }); } + public static List getEntityNames(BrowseV2Input input) { + List entityTypes; + if (input.getTypes() != null && input.getTypes().size() > 0) { + entityTypes = input.getTypes(); + } else if (input.getType() != null) { + entityTypes = ImmutableList.of(input.getType()); + } else { + entityTypes = BROWSE_ENTITY_TYPES; + } + return entityTypes.stream().map(EntityTypeMapper::getName).collect(Collectors.toList()); + } + private BrowseResultsV2 mapBrowseResults(BrowseResultV2 browseResults) { BrowseResultsV2 results = new BrowseResultsV2(); results.setTotal(browseResults.getNumGroups()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java index 34f7f133f6fb94..81b52991cde90c 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java @@ -175,6 +175,7 @@ public CompletableFuture get(final DataFetchingEnvironment environmen .setShowAcrylInfo(_featureFlags.isShowAcrylInfo()) .setShowAccessManagement(_featureFlags.isShowAccessManagement()) .setNestedDomainsEnabled(_featureFlags.isNestedDomainsEnabled()) + .setPlatformBrowseV2(_featureFlags.isPlatformBrowseV2()) .build(); appConfig.setFeatureFlags(featureFlagsConfig); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java index d04cb57e1a860e..444ab4bcc3c3c9 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java @@ -92,6 +92,20 @@ private SearchUtils() {} EntityType.NOTEBOOK, EntityType.DATA_PRODUCT); + /** Entities that are part of browse by default */ + public static final List BROWSE_ENTITY_TYPES = + ImmutableList.of( + EntityType.DATASET, + EntityType.DASHBOARD, + EntityType.CHART, + EntityType.CONTAINER, + EntityType.MLMODEL, + EntityType.MLMODEL_GROUP, + EntityType.MLFEATURE_TABLE, + EntityType.DATA_FLOW, + EntityType.DATA_JOB, + EntityType.NOTEBOOK); + /** A prioritized list of source filter types used to generate quick filters */ public static final List PRIORITIZED_SOURCE_ENTITY_TYPES = Stream.of( diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql index 075a3b0fac43bc..52451e195ee841 100644 --- a/datahub-graphql-core/src/main/resources/app.graphql +++ b/datahub-graphql-core/src/main/resources/app.graphql @@ -437,6 +437,11 @@ type FeatureFlagsConfig { """ showBrowseV2: Boolean! + """ + Whether browse v2 is platform mode, which means that platforms are displayed instead of entity types at the root. + """ + platformBrowseV2: Boolean! + """ Whether we should show CTAs in the UI related to moving to Managed DataHub by Acryl. """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index e0cde5a2db9f99..8f2377edb546e0 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -1176,9 +1176,14 @@ Input required for browse queries """ input BrowseV2Input { """ - The browse entity type + The browse entity type - deprecated use types instead """ - type: EntityType! + type: EntityType + + """ + The browse entity type - deprecated use types instead. If not provided, all types will be used. + """ + types: [EntityType!] """ The browse path V2 - a list with each entry being part of the browse path V2 diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java index bffc2b31af2b9a..433772d7e2cfe1 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java @@ -249,7 +249,7 @@ private static EntityClient initMockEntityClient( EntityClient client = Mockito.mock(EntityClient.class); Mockito.when( client.browseV2( - Mockito.eq(entityName), + Mockito.eq(ImmutableList.of(entityName)), Mockito.eq(path), Mockito.eq(filter), Mockito.eq(query), diff --git a/datahub-web-react/src/appConfigContext.tsx b/datahub-web-react/src/appConfigContext.tsx index 4087ad453687c8..8c1089b868e5ab 100644 --- a/datahub-web-react/src/appConfigContext.tsx +++ b/datahub-web-react/src/appConfigContext.tsx @@ -50,6 +50,7 @@ export const DEFAULT_APP_CONFIG = { showAcrylInfo: false, showAccessManagement: false, nestedDomainsEnabled: true, + platformBrowseV2: false, }, }; diff --git a/datahub-web-react/src/graphql/app.graphql b/datahub-web-react/src/graphql/app.graphql index 4e9bbb11d8c5aa..fe283403491479 100644 --- a/datahub-web-react/src/graphql/app.graphql +++ b/datahub-web-react/src/graphql/app.graphql @@ -65,6 +65,7 @@ query appConfig { showAcrylInfo showAccessManagement nestedDomainsEnabled + platformBrowseV2 } } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index 53b974b560e2a6..e7ec4d313b5f58 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -235,6 +235,30 @@ public BrowseResultV2 browseV2( return _entitySearchService.browseV2(entityName, path, filter, input, start, count); } + /** + * Gets browse V2 snapshot of a given path + * + * @param entityNames entities being browsed + * @param path path being browsed + * @param filter browse filter + * @param input search query + * @param start start offset of first group + * @param count max number of results requested + * @throws RemoteInvocationException + */ + @Nonnull + public BrowseResultV2 browseV2( + @Nonnull List entityNames, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input, + int start, + int count, + @Nonnull Authentication authentication) { + // TODO: cache browseV2 results + return _entitySearchService.browseV2(entityNames, path, filter, input, start, count); + } + @SneakyThrows @Deprecated public void update(@Nonnull final Entity entity, @Nonnull final Authentication authentication) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java index f40da59a149faa..fd7491fe32ea34 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java @@ -210,6 +210,18 @@ public BrowseResultV2 browseV2( return esBrowseDAO.browseV2(entityName, path, filter, input, start, count); } + @Nonnull + @Override + public BrowseResultV2 browseV2( + @Nonnull List entityNames, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input, + int start, + int count) { + return esBrowseDAO.browseV2(entityNames, path, filter, input, start, count); + } + @Nonnull @Override public List getBrowsePaths(@Nonnull String entityName, @Nonnull Urn urn) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java index 5ea60b24a577a0..3c71a2dfd91809 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java @@ -427,6 +427,44 @@ public BrowseResultV2 browseV2( } } + public BrowseResultV2 browseV2( + @Nonnull List entities, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input, + int start, + int count) { + try { + final SearchResponse groupsResponse; + + try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esGroupSearch").time()) { + final String finalInput = input.isEmpty() ? "*" : input; + groupsResponse = + client.search( + constructGroupsSearchRequestBrowseAcrossEntities( + entities, path, filter, finalInput), + RequestOptions.DEFAULT); + } + + final BrowseGroupsResultV2 browseGroupsResult = + extractGroupsResponseV2(groupsResponse, path, start, count); + final int numGroups = browseGroupsResult.getTotalGroups(); + + return new BrowseResultV2() + .setMetadata( + new BrowseResultMetadata() + .setTotalNumEntities(browseGroupsResult.getTotalNumEntities()) + .setPath(path)) + .setGroups(new BrowseResultGroupV2Array(browseGroupsResult.getGroups())) + .setNumGroups(numGroups) + .setFrom(start) + .setPageSize(count); + } catch (Exception e) { + log.error("Browse Across Entities query failed: " + e.getMessage()); + throw new ESQueryException("Browse Across Entities query failed: ", e); + } + } + @Nonnull private SearchRequest constructGroupsSearchRequestV2( @Nonnull String entityName, @@ -448,6 +486,33 @@ private SearchRequest constructGroupsSearchRequestV2( return searchRequest; } + @Nonnull + private SearchRequest constructGroupsSearchRequestBrowseAcrossEntities( + @Nonnull List entities, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input) { + + List entitySpecs = + entities.stream().map(entityRegistry::getEntitySpec).collect(Collectors.toList()); + + String[] indexArray = + entities.stream().map(indexConvention::getEntityIndexName).toArray(String[]::new); + + final SearchRequest searchRequest = new SearchRequest(indexArray); + final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); + searchSourceBuilder.size(0); + searchSourceBuilder.query( + buildQueryStringBrowseAcrossEntities( + entitySpecs, + path, + SearchUtil.transformFilterForEntities(filter, indexConvention), + input)); + searchSourceBuilder.aggregation(buildAggregationsV2(path)); + searchRequest.source(searchSourceBuilder); + return searchRequest; + } + /** * Extracts the name of group from path. * @@ -494,6 +559,32 @@ private QueryBuilder buildQueryStringV2( return queryBuilder; } + @Nonnull + private QueryBuilder buildQueryStringBrowseAcrossEntities( + @Nonnull List entitySpecs, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input) { + final int browseDepthVal = getPathDepthV2(path); + + final BoolQueryBuilder queryBuilder = QueryBuilders.boolQuery(); + + QueryBuilder query = + SearchRequestHandler.getBuilder(entitySpecs, searchConfiguration, customSearchConfiguration) + .getQuery(input, false); + queryBuilder.must(query); + + if (!path.isEmpty()) { + queryBuilder.filter(QueryBuilders.matchQuery(BROWSE_PATH_V2, path)); + } + + queryBuilder.filter(QueryBuilders.rangeQuery(BROWSE_PATH_V2_DEPTH).gt(browseDepthVal)); + + queryBuilder.filter(SearchRequestHandler.getFilterQuery(filter)); + + return queryBuilder; + } + @Nonnull private AggregationBuilder buildAggregationsV2(@Nonnull String path) { final String currentLevel = ESUtils.escapeReservedCharacters(path) + "␟.*"; diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index a52b705cb8da63..0ea6b8712953e4 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -317,6 +317,7 @@ featureFlags: showAccessManagement: ${SHOW_ACCESS_MANAGEMENT:false} #Whether we should show AccessManagement tab in the datahub UI. showSearchFiltersV2: ${SHOW_SEARCH_FILTERS_V2:true} # Enables showing the search filters V2 experience. showBrowseV2: ${SHOW_BROWSE_V2:true} # Enables showing the browse v2 sidebar experience. + platformBrowseV2: ${PLATFORM_BROWSE_V2:false} # Enables the platform browse experience, instead of the entity-oriented browse default. preProcessHooks: uiEnabled: ${PRE_PROCESS_HOOKS_UI_ENABLED:true} # Circumvents Kafka for processing index updates for UI changes sourced from GraphQL to avoid processing delays showAcrylInfo: ${SHOW_ACRYL_INFO:false} # Show different CTAs within DataHub around moving to Managed DataHub. Set to true for the demo site. diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java index 7bc50a8f3dc7e6..598c252b4f7664 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -153,6 +153,28 @@ public BrowseResultV2 browseV2( @Nonnull Authentication authentication) throws RemoteInvocationException; + /** + * Gets browse snapshot of a given path + * + * @param entityNames entities being browsed + * @param path path being browsed + * @param filter browse filter + * @param input search query + * @param start start offset of first group + * @param count max number of results requested + * @throws RemoteInvocationException + */ + @Nonnull + public BrowseResultV2 browseV2( + @Nonnull List entityNames, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input, + int start, + int count, + @Nonnull Authentication authentication) + throws RemoteInvocationException; + @Deprecated public void update(@Nonnull final Entity entity, @Nonnull final Authentication authentication) throws RemoteInvocationException; diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index c854cb9dd279ec..d68c472ea91709 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -381,6 +381,20 @@ public BrowseResultV2 browseV2( throw new NotImplementedException("BrowseV2 is not implemented in Restli yet"); } + @Nonnull + @Override + public BrowseResultV2 browseV2( + @Nonnull List entityNames, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input, + int start, + int count, + @Nonnull Authentication authentication) + throws RemoteInvocationException { + throw new NotImplementedException("BrowseV2 is not implemented in Restli yet"); + } + public void update(@Nonnull final Entity entity, @Nonnull final Authentication authentication) throws RemoteInvocationException { EntitiesDoIngestRequestBuilder requestBuilder = diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java index 09a63e769f0253..189ae09e1b9382 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java @@ -207,6 +207,25 @@ public BrowseResultV2 browseV2( int start, int count); + /** + * Gets browse snapshot of a given path + * + * @param entityNames set of entities being browsed + * @param path path being browsed + * @param filter browse filter + * @param input search query + * @param start start offset of first group + * @param count max number of results requested + */ + @Nonnull + public BrowseResultV2 browseV2( + @Nonnull List entityNames, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input, + int start, + int count); + /** * Gets a list of paths for a given urn. *