From ecda3e618704c5eb335ad1a21c30f0c935581f64 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 18 Dec 2023 18:26:33 -0500 Subject: [PATCH] feat(ingest): pydantic v2 compatibility (#9434) --- .github/workflows/airflow-plugin.yml | 7 ++-- .../airflow-plugin/tox.ini | 9 +++++ metadata-ingestion/setup.py | 39 ++++++++++++++++--- .../api/entities/datacontract/assertion.py | 4 +- .../datacontract/assertion_operator.py | 16 ++++---- .../datacontract/data_quality_assertion.py | 11 +++--- .../api/entities/datacontract/datacontract.py | 23 +++++------ .../datacontract/freshness_assertion.py | 15 ++++--- .../entities/datacontract/schema_assertion.py | 14 ++++--- .../src/datahub/cli/check_cli.py | 13 ++++++- .../src/datahub/configuration/common.py | 16 +++++++- .../src/datahub/configuration/datetimes.py | 4 +- .../pydantic_migration_helpers.py | 29 ++++++++++++++ .../configuration/time_window_config.py | 16 ++++++-- .../configuration/validate_field_rename.py | 4 +- .../ingestion/glossary/datahub_classifier.py | 11 +++++- .../source/bigquery_v2/bigquery_config.py | 2 +- .../ingestion/source/delta_lake/config.py | 4 +- .../source/snowflake/snowflake_config.py | 2 +- .../ingestion/source_config/sql/snowflake.py | 2 +- .../src/datahub/utilities/urns/urn_iter.py | 2 +- .../integration/snowflake/test_snowflake.py | 16 ++++---- .../unit/{ => config}/test_allow_deny.py | 0 .../unit/{ => config}/test_config_clean.py | 0 .../tests/unit/config/test_config_model.py | 18 +++++++-- .../{ => config}/test_pydantic_validators.py | 13 +++++-- .../{ => config}/test_time_window_config.py | 0 27 files changed, 209 insertions(+), 81 deletions(-) rename metadata-ingestion/tests/unit/{ => config}/test_allow_deny.py (100%) rename metadata-ingestion/tests/unit/{ => config}/test_config_clean.py (100%) rename metadata-ingestion/tests/unit/{ => config}/test_pydantic_validators.py (92%) rename metadata-ingestion/tests/unit/{ => config}/test_time_window_config.py (100%) diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index cd1e159b7d53cc..70816e5f093d13 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -32,6 +32,7 @@ jobs: strategy: matrix: include: + # Note: this should be kept in sync with tox.ini. - python-version: "3.8" extra_pip_requirements: "apache-airflow~=2.1.4" extra_pip_extras: plugin-v1 @@ -39,13 +40,13 @@ jobs: extra_pip_requirements: "apache-airflow~=2.2.4" extra_pip_extras: plugin-v1 - python-version: "3.10" - extra_pip_requirements: "apache-airflow~=2.4.0" + extra_pip_requirements: 'apache-airflow~=2.4.0 pluggy==1.0.0 "pendulum<3.0"' extra_pip_extras: plugin-v2 - python-version: "3.10" - extra_pip_requirements: "apache-airflow~=2.6.0" + extra_pip_requirements: 'apache-airflow~=2.6.0 "pendulum<3.0"' extra_pip_extras: plugin-v2 - python-version: "3.10" - extra_pip_requirements: "apache-airflow>=2.7.0" + extra_pip_requirements: "apache-airflow>=2.7.0 pydantic==2.4.2" extra_pip_extras: plugin-v2 fail-fast: false steps: diff --git a/metadata-ingestion-modules/airflow-plugin/tox.ini b/metadata-ingestion-modules/airflow-plugin/tox.ini index 1010bd2933e452..27ae2ce65ba658 100644 --- a/metadata-ingestion-modules/airflow-plugin/tox.ini +++ b/metadata-ingestion-modules/airflow-plugin/tox.ini @@ -10,6 +10,7 @@ envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py31 use_develop = true extras = dev,integration-tests,plugin-v1 deps = + # This should be kept in sync with the Github Actions matrix. 
-e ../../metadata-ingestion/ # Airflow version airflow21: apache-airflow~=2.1.0 airflow22: apache-airflow~=2.2.0 @@ -20,7 +21,15 @@ deps = # See https://github.com/datahub-project/datahub/pull/9365 airflow24: apache-airflow~=2.4.0,pluggy==1.0.0 airflow26: apache-airflow~=2.6.0 + # Respect the constraints file on pendulum. + # See https://github.com/apache/airflow/issues/36274 + airflow24,airflow26: pendulum>=2.0,<3.0 + # The Airflow 2.7 constraints file points at pydantic v2, so we match that here. + # https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt + # Note that Airflow is actually compatible with both pydantic v1 and v2, and the + # constraints file is overly restrictive. airflow27: apache-airflow~=2.7.0 + airflow27: pydantic==2.4.2 commands = pytest --cov-append {posargs} diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index cb13a40125c0da..13c9d3c99aaca1 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -14,9 +14,10 @@ "mypy_extensions>=0.4.3", # Actual dependencies. "typing-inspect", + # pydantic 1.8.2 is incompatible with mypy 0.910. + # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. # pydantic 1.10.3 is incompatible with typing-extensions 4.1.1 - https://github.com/pydantic/pydantic/issues/4885 - # pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887 - "pydantic>=1.5.1,!=1.10.3,<2", + "pydantic>=1.10.0,!=1.10.3", "mixpanel>=4.9.0", "sentry-sdk", } @@ -53,6 +54,18 @@ "ruamel.yaml", } +pydantic_no_v2 = { + # pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887 + # Tags sources that are not yet compatible with the pydantic v2 API. + "pydantic<2", +} + +plugin_common = { + # While pydantic v2 support is experimental, require that all plugins + # continue to use v1. This will ensure that no ingestion recipes break. + *pydantic_no_v2, +} + rest_common = {"requests", "requests_file"} kafka_common = { @@ -118,6 +131,7 @@ "sqlalchemy>=1.4.39, <2", # Required for SQL profiling. "great-expectations>=0.15.12, <=0.15.50", + *pydantic_no_v2, # because of great-expectations # scipy version restricted to reduce backtracking, used by great-expectations, "scipy>=1.7.2", # GE added handling for higher version of jinja2 @@ -229,6 +243,7 @@ iceberg_common = { # Iceberg Python SDK "pyiceberg", + *pydantic_no_v2, # because of pyiceberg "pyarrow>=9.0.0, <13.0.0", } @@ -477,9 +492,6 @@ "flake8-bugbear==23.3.12", "isort>=5.7.0", "mypy==1.0.0", - # pydantic 1.8.2 is incompatible with mypy 0.910. - # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
- "pydantic>=1.10.0", *test_api_requirements, pytest_dep, "pytest-asyncio>=0.16.0", @@ -740,7 +752,22 @@ extras_require={ "base": list(framework_common), **{ - plugin: list(framework_common | dependencies) + plugin: list( + framework_common + | ( + plugin_common + if plugin + not in { + "airflow", + "datahub-rest", + "datahub-kafka", + "sync-file-emitter", + "sql-parser", + } + else set() + ) + | dependencies + ) for (plugin, dependencies) in plugins.items() }, "all": list( diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py index c45d4ddc924580..89ac528efe81a1 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py @@ -1,7 +1,7 @@ from typing import Optional -from datahub.configuration import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel -class BaseAssertion(ConfigModel): +class BaseAssertion(v1_ConfigModel): description: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py index a41b0f7aafd9f2..dc0c97d1c74e56 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py @@ -2,7 +2,7 @@ from typing_extensions import Literal, Protocol -from datahub.configuration import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel from datahub.metadata.schema_classes import ( AssertionStdOperatorClass, AssertionStdParameterClass, @@ -58,7 +58,7 @@ def _generate_assertion_std_parameters( ) -class EqualToOperator(ConfigModel): +class EqualToOperator(v1_ConfigModel): type: Literal["equal_to"] value: Union[str, int, float] @@ -71,7 +71,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class BetweenOperator(ConfigModel): +class BetweenOperator(v1_ConfigModel): type: Literal["between"] min: Union[int, float] max: Union[int, float] @@ -87,7 +87,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: ) -class LessThanOperator(ConfigModel): +class LessThanOperator(v1_ConfigModel): type: Literal["less_than"] value: Union[int, float] @@ -100,7 +100,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class GreaterThanOperator(ConfigModel): +class GreaterThanOperator(v1_ConfigModel): type: Literal["greater_than"] value: Union[int, float] @@ -113,7 +113,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class LessThanOrEqualToOperator(ConfigModel): +class LessThanOrEqualToOperator(v1_ConfigModel): type: Literal["less_than_or_equal_to"] value: Union[int, float] @@ -126,7 +126,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class GreaterThanOrEqualToOperator(ConfigModel): +class GreaterThanOrEqualToOperator(v1_ConfigModel): type: Literal["greater_than_or_equal_to"] value: Union[int, float] @@ -139,7 +139,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class NotNullOperator(ConfigModel): 
+class NotNullOperator(v1_ConfigModel): type: Literal["not_null"] operator: str = AssertionStdOperatorClass.NOT_NULL diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py index 6a3944ba36baf0..975aa359bd2031 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py @@ -1,12 +1,11 @@ from typing import List, Optional, Union -import pydantic from typing_extensions import Literal import datahub.emitter.mce_builder as builder from datahub.api.entities.datacontract.assertion import BaseAssertion from datahub.api.entities.datacontract.assertion_operator import Operators -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( AssertionInfoClass, @@ -25,7 +24,7 @@ class IdConfigMixin(BaseAssertion): - id_raw: Optional[str] = pydantic.Field( + id_raw: Optional[str] = v1_Field( default=None, alias="id", description="The id of the assertion. If not provided, one will be generated using the type.", @@ -38,7 +37,7 @@ def generate_default_id(self) -> str: class CustomSQLAssertion(IdConfigMixin, BaseAssertion): type: Literal["custom_sql"] sql: str - operator: Operators = pydantic.Field(discriminator="type") + operator: Operators = v1_Field(discriminator="type") def generate_default_id(self) -> str: return f"{self.type}-{self.sql}-{self.operator.id()}" @@ -89,11 +88,11 @@ def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass: ) -class DataQualityAssertion(ConfigModel): +class DataQualityAssertion(v1_ConfigModel): __root__: Union[ CustomSQLAssertion, ColumnUniqueAssertion, - ] = pydantic.Field(discriminator="type") + ] = v1_Field(discriminator="type") @property def id(self) -> str: diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py index f3c6be55e5fea9..e0ef85d5fd66c0 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py @@ -1,7 +1,6 @@ import collections from typing import Iterable, List, Optional, Tuple -import pydantic from ruamel.yaml import YAML from typing_extensions import Literal @@ -11,7 +10,11 @@ ) from datahub.api.entities.datacontract.freshness_assertion import FreshnessAssertion from datahub.api.entities.datacontract.schema_assertion import SchemaAssertion -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import ( + v1_ConfigModel, + v1_Field, + v1_validator, +) from datahub.emitter.mce_builder import datahub_guid, make_assertion_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -26,7 +29,7 @@ from datahub.utilities.urns.urn import guess_entity_type -class DataContract(ConfigModel): +class DataContract(v1_ConfigModel): """A yml representation of a Data Contract. This model is used as a simpler, Python-native representation of a DataHub data contract. 
@@ -36,29 +39,27 @@ class DataContract(ConfigModel): version: Literal[1] - id: Optional[str] = pydantic.Field( + id: Optional[str] = v1_Field( default=None, alias="urn", description="The data contract urn. If not provided, one will be generated.", ) - entity: str = pydantic.Field( + entity: str = v1_Field( description="The entity urn that the Data Contract is associated with" ) # TODO: add support for properties # properties: Optional[Dict[str, str]] = None - schema_field: Optional[SchemaAssertion] = pydantic.Field( - default=None, alias="schema" - ) + schema_field: Optional[SchemaAssertion] = v1_Field(default=None, alias="schema") - freshness: Optional[FreshnessAssertion] = pydantic.Field(default=None) + freshness: Optional[FreshnessAssertion] = v1_Field(default=None) # TODO: Add a validator to ensure that ids are unique - data_quality: Optional[List[DataQualityAssertion]] = pydantic.Field(default=None) + data_quality: Optional[List[DataQualityAssertion]] = v1_Field(default=None) _original_yaml_dict: Optional[dict] = None - @pydantic.validator("data_quality") + @v1_validator("data_quality") # type: ignore def validate_data_quality( cls, data_quality: Optional[List[DataQualityAssertion]] ) -> Optional[List[DataQualityAssertion]]: diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py index 71741d76b22fc4..86942766889676 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py @@ -3,11 +3,10 @@ from datetime import timedelta from typing import List, Union -import pydantic from typing_extensions import Literal from datahub.api.entities.datacontract.assertion import BaseAssertion -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( AssertionInfoClass, @@ -25,10 +24,10 @@ class CronFreshnessAssertion(BaseAssertion): type: Literal["cron"] - cron: str = pydantic.Field( + cron: str = v1_Field( description="The cron expression to use. See https://crontab.guru/ for help." ) - timezone: str = pydantic.Field( + timezone: str = v1_Field( "UTC", description="The timezone to use for the cron schedule. 
Defaults to UTC.", ) @@ -58,10 +57,10 @@ def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleCla ) -class FreshnessAssertion(ConfigModel): - __root__: Union[ - CronFreshnessAssertion, FixedIntervalFreshnessAssertion - ] = pydantic.Field(discriminator="type") +class FreshnessAssertion(v1_ConfigModel): + __root__: Union[CronFreshnessAssertion, FixedIntervalFreshnessAssertion] = v1_Field( + discriminator="type" + ) @property def id(self): diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py index b62f94e0592fce..39297d1a98d026 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py @@ -3,11 +3,10 @@ import json from typing import List, Union -import pydantic from typing_extensions import Literal from datahub.api.entities.datacontract.assertion import BaseAssertion -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.extractor.json_schema_util import get_schema_metadata from datahub.metadata.schema_classes import ( @@ -23,7 +22,7 @@ class JsonSchemaContract(BaseAssertion): type: Literal["json-schema"] - json_schema: dict = pydantic.Field(alias="json-schema") + json_schema: dict = v1_Field(alias="json-schema") _schema_metadata: SchemaMetadataClass @@ -37,7 +36,10 @@ def _init_private_attributes(self) -> None: ) -class FieldListSchemaContract(BaseAssertion, arbitrary_types_allowed=True): +class FieldListSchemaContract(BaseAssertion): + class Config: + arbitrary_types_allowed = True + type: Literal["field-list"] fields: List[SchemaFieldClass] @@ -56,8 +58,8 @@ def _init_private_attributes(self) -> None: ) -class SchemaAssertion(ConfigModel): - __root__: Union[JsonSchemaContract, FieldListSchemaContract] = pydantic.Field( +class SchemaAssertion(v1_ConfigModel): + __root__: Union[JsonSchemaContract, FieldListSchemaContract] = v1_Field( discriminator="type" ) diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index f7996900f7a7ad..2732a72aea5399 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -126,10 +126,21 @@ def metadata_diff( default=False, help="Include extra information for each plugin.", ) +@click.option( + "--source", + type=str, + default=None, +) @telemetry.with_telemetry() -def plugins(verbose: bool) -> None: +def plugins(source: Optional[str], verbose: bool) -> None: """List the enabled ingestion plugins.""" + if source: + # Quick helper for one-off checks with full stack traces. 
+ source_registry.get(source) + click.echo(f"Source {source} is enabled.") + return + click.secho("Sources:", bold=True) click.echo(source_registry.summary(verbose=verbose, col_width=25)) click.echo() diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index f225856ca43ce4..0030332bcfd541 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -99,8 +99,20 @@ def _schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None: @classmethod def parse_obj_allow_extras(cls: Type[_ConfigSelf], obj: Any) -> _ConfigSelf: - with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow): - return cls.parse_obj(obj) + if PYDANTIC_VERSION_2: + try: + with unittest.mock.patch.dict( + cls.model_config, # type: ignore + {"extra": "allow"}, + clear=False, + ): + cls.model_rebuild(force=True) # type: ignore + return cls.parse_obj(obj) + finally: + cls.model_rebuild(force=True) # type: ignore + else: + with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow): + return cls.parse_obj(obj) class PermissiveConfigModel(ConfigModel): diff --git a/metadata-ingestion/src/datahub/configuration/datetimes.py b/metadata-ingestion/src/datahub/configuration/datetimes.py index 41af7565593d9b..1520462fa9bf8c 100644 --- a/metadata-ingestion/src/datahub/configuration/datetimes.py +++ b/metadata-ingestion/src/datahub/configuration/datetimes.py @@ -65,6 +65,8 @@ def parse_absolute_time(input: str) -> datetime: def parse_relative_timespan(input: str) -> timedelta: + raw_input = input + neg = False input = input.strip() @@ -79,7 +81,7 @@ def parse_relative_timespan(input: str) -> timedelta: if neg: delta = -delta - logger.debug(f'Parsed "{input}" as {delta}.') + logger.debug(f'Parsed "{raw_input}" as {delta}.') return delta diff --git a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py index f1876b500598ba..bd931abe2e84d1 100644 --- a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py +++ b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py @@ -19,12 +19,41 @@ class PydanticDeprecatedSince20(Warning): # type: ignore if PYDANTIC_VERSION_2: from pydantic import BaseModel as GenericModel + from pydantic.v1 import ( # type: ignore + BaseModel as v1_BaseModel, + Extra as v1_Extra, + Field as v1_Field, + root_validator as v1_root_validator, + validator as v1_validator, + ) else: + from pydantic import ( # type: ignore + BaseModel as v1_BaseModel, + Extra as v1_Extra, + Field as v1_Field, + root_validator as v1_root_validator, + validator as v1_validator, + ) from pydantic.generics import GenericModel # type: ignore +class v1_ConfigModel(v1_BaseModel): + """A simplified variant of our main ConfigModel class. + + This one only uses pydantic v1 features. 
+ """ + + class Config: + extra = v1_Extra.forbid + underscore_attrs_are_private = True + + __all__ = [ "PYDANTIC_VERSION_2", "PydanticDeprecatedSince20", "GenericModel", + "v1_ConfigModel", + "v1_Field", + "v1_root_validator", + "v1_validator", ] diff --git a/metadata-ingestion/src/datahub/configuration/time_window_config.py b/metadata-ingestion/src/datahub/configuration/time_window_config.py index 15de7470e4d823..f20ab85be05855 100644 --- a/metadata-ingestion/src/datahub/configuration/time_window_config.py +++ b/metadata-ingestion/src/datahub/configuration/time_window_config.py @@ -68,6 +68,12 @@ def default_start_time( assert abs(delta) >= get_bucket_duration_delta( values["bucket_duration"] ), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'." + + # The end_time's default value is not yet populated, in which case + # we can just manually generate it here. + if "end_time" not in values: + values["end_time"] = datetime.now(tz=timezone.utc) + return get_time_bucket( values["end_time"] + delta, values["bucket_duration"] ) @@ -80,9 +86,13 @@ def default_start_time( @pydantic.validator("start_time", "end_time") def ensure_timestamps_in_utc(cls, v: datetime) -> datetime: - assert ( - v.tzinfo == timezone.utc - ), 'timezone is not UTC; try adding a "Z" to the value e.g. "2021-07-20T00:00:00Z"' + if v.tzinfo is None: + raise ValueError( + "Timestamps must be in UTC. Try adding a 'Z' to the value e.g. '2021-07-20T00:00:00Z'" + ) + + # If the timestamp is timezone-aware but not in UTC, convert it to UTC. + v = v.astimezone(timezone.utc) return v diff --git a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py index bb01f2b787123a..de2a16e9bf247d 100644 --- a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py +++ b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py @@ -49,4 +49,6 @@ def _validate_field_rename(cls: Type, values: dict) -> dict: # validator with pre=True gets all the values that were passed in. # Given that a renamed field doesn't show up in the fields list, we can't use # the field-level validator, even with a different field name. 
- return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename) + return pydantic.root_validator(pre=True, skip_on_failure=True, allow_reuse=True)( + _validate_field_rename + ) diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py index 1f2b7f5689ea3c..42eb930c80f9d4 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py @@ -8,6 +8,7 @@ from pydantic.fields import Field from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.ingestion.glossary.classifier import Classifier @@ -50,7 +51,10 @@ class ValuesFactorConfig(ConfigModel): class PredictionFactorsAndWeights(ConfigModel): class Config: - allow_population_by_field_name = True + if PYDANTIC_VERSION_2: + populate_by_name = True + else: + allow_population_by_field_name = True Name: float = Field(alias="name") Description: float = Field(alias="description") @@ -60,7 +64,10 @@ class Config: class InfoTypeConfig(ConfigModel): class Config: - allow_population_by_field_name = True + if PYDANTIC_VERSION_2: + populate_by_name = True + else: + allow_population_by_field_name = True Prediction_Factors_and_Weights: PredictionFactorsAndWeights = Field( description="Factors and their weights to consider when predicting info types", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index cbe68a454ea436..c13b08a6d9656b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -284,7 +284,7 @@ def validate_bigquery_audit_metadata_datasets( return v - @root_validator(pre=False) + @root_validator(pre=False, skip_on_failure=True) def backward_compatibility_configs_set(cls, values: Dict) -> Dict: project_id = values.get("project_id") project_id_pattern = values.get("project_id_pattern") diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py index f3616ca648a3e6..81a54d1327d05a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py @@ -4,6 +4,7 @@ import pydantic from cached_property import cached_property from pydantic import Field +from typing_extensions import Literal from datahub.configuration.common import AllowDenyPattern from datahub.configuration.source_common import ( @@ -46,10 +47,9 @@ class DeltaLakeSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin): "'/' and URNs will be created using " "relative_path only.", ) - platform: str = Field( + platform: Literal["delta-lake"] = Field( default="delta-lake", description="The platform that this source connects to", - const=True, ) platform_instance: Optional[str] = Field( default=None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 032bdef178fdf6..b896df1fa340e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -176,7 +176,7 @@ def validate_include_column_lineage(cls, v, values): ) return v - @root_validator(pre=False) + @root_validator(pre=False, skip_on_failure=True) def validate_unsupported_configs(cls, values: Dict) -> Dict: value = values.get("include_read_operational_stats") if value is not None and value: diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 46bd24c7e1f4c3..e9db82ce75cd99 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -107,7 +107,7 @@ def validate_account_id(cls, account_id: str) -> str: return account_id @pydantic.validator("authentication_type", always=True) - def authenticator_type_is_valid(cls, v, values, field): + def authenticator_type_is_valid(cls, v, values): if v not in VALID_AUTH_TYPES.keys(): raise ValueError( f"unsupported authenticator type '{v}' was provided," diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py index 4f228494f416b8..3389a6fb05ee89 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py @@ -150,7 +150,7 @@ def modify_urn(urn: str) -> str: if guess_entity_type(urn) == "dataset": return _lowercase_dataset_urn(urn) elif guess_entity_type(urn) == "schemaField": - cur_urn = Urn.create_from_string(urn) + cur_urn = Urn.from_string(urn) cur_urn._entity_ids[0] = _lowercase_dataset_urn(cur_urn._entity_ids[0]) return str(cur_urn) return urn diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 1b58696e4014c9..39a62056a7e4ad 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -87,18 +87,18 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): confidence_level_threshold=0.58, info_types_config={ "Age": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, Values=1, Description=0, Datatype=0 + prediction_factors_and_weights=PredictionFactorsAndWeights( + name=0, values=1, description=0, datatype=0 ) ), "CloudRegion": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, - Description=0, - Datatype=0, - Values=1, + prediction_factors_and_weights=PredictionFactorsAndWeights( + name=0, + description=0, + datatype=0, + values=1, ), - Values=ValuesFactorConfig( + values=ValuesFactorConfig( prediction_type="regex", regex=[ r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+" diff --git a/metadata-ingestion/tests/unit/test_allow_deny.py b/metadata-ingestion/tests/unit/config/test_allow_deny.py similarity index 100% rename from metadata-ingestion/tests/unit/test_allow_deny.py rename to metadata-ingestion/tests/unit/config/test_allow_deny.py diff --git a/metadata-ingestion/tests/unit/test_config_clean.py b/metadata-ingestion/tests/unit/config/test_config_clean.py similarity index 100% rename from metadata-ingestion/tests/unit/test_config_clean.py rename to metadata-ingestion/tests/unit/config/test_config_clean.py diff --git 
a/metadata-ingestion/tests/unit/config/test_config_model.py b/metadata-ingestion/tests/unit/config/test_config_model.py index ffac5c465f5541..f53390a3deb18c 100644 --- a/metadata-ingestion/tests/unit/config/test_config_model.py +++ b/metadata-ingestion/tests/unit/config/test_config_model.py @@ -3,8 +3,11 @@ import pydantic import pytest -from datahub.configuration.common import ConfigModel, redact_raw_config -from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig +from datahub.configuration.common import ( + AllowDenyPattern, + ConfigModel, + redact_raw_config, +) def test_extras_not_allowed(): @@ -76,8 +79,15 @@ def test_config_redaction(): def test_shared_defaults(): - c1 = UnityCatalogSourceConfig(token="s", workspace_url="https://workspace_url") - c2 = UnityCatalogSourceConfig(token="s", workspace_url="https://workspace_url") + class SourceConfig(ConfigModel): + token: str + workspace_url: str + catalog_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern.allow_all(), + ) + + c1 = SourceConfig(token="s", workspace_url="https://workspace_url") + c2 = SourceConfig(token="s", workspace_url="https://workspace_url") assert c2.catalog_pattern.allow == [".*"] c1.catalog_pattern.allow += ["foo"] diff --git a/metadata-ingestion/tests/unit/test_pydantic_validators.py b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py similarity index 92% rename from metadata-ingestion/tests/unit/test_pydantic_validators.py rename to metadata-ingestion/tests/unit/config/test_pydantic_validators.py index 3e9ec6cbaf3579..399245736805cc 100644 --- a/metadata-ingestion/tests/unit/test_pydantic_validators.py +++ b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py @@ -7,7 +7,10 @@ from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field -from datahub.utilities.global_warning_util import get_global_warnings +from datahub.utilities.global_warning_util import ( + clear_global_warnings, + get_global_warnings, +) def test_field_rename(): @@ -76,9 +79,11 @@ class TestModel(ConfigModel): def test_field_deprecated(): + clear_global_warnings() + class TestModel(ConfigModel): - d1: Optional[str] - d2: Optional[str] + d1: Optional[str] = None + d2: Optional[str] = None b: str _validate_deprecated_d1 = pydantic_field_deprecated("d1") @@ -93,3 +98,5 @@ class TestModel(ConfigModel): assert v.d2 == "deprecated" assert any(["d1 is deprecated" in warning for warning in get_global_warnings()]) assert any(["d2 is deprecated" in warning for warning in get_global_warnings()]) + + clear_global_warnings() diff --git a/metadata-ingestion/tests/unit/test_time_window_config.py b/metadata-ingestion/tests/unit/config/test_time_window_config.py similarity index 100% rename from metadata-ingestion/tests/unit/test_time_window_config.py rename to metadata-ingestion/tests/unit/config/test_time_window_config.py
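
A few reviewer-oriented sketches follow, since this diff bundles several distinct pydantic migration patterns. First, the pydantic_migration_helpers shim: under pydantic 2.x the entire v1 API ships as pydantic.v1, so the module simply re-exports one spelling or the other. A minimal sketch of the pattern — the VERSION-based detection below is an assumption, since the hunk only shows the import branches:

from pydantic.version import VERSION as PYDANTIC_VERSION

# Assumed detection logic; the diff above only shows how the flag is consumed.
PYDANTIC_VERSION_2 = int(PYDANTIC_VERSION.split(".")[0]) >= 2

if PYDANTIC_VERSION_2:
    # pydantic 2.x bundles the complete v1 API under pydantic.v1.
    from pydantic.v1 import BaseModel as v1_BaseModel, Extra as v1_Extra
else:
    from pydantic import BaseModel as v1_BaseModel, Extra as v1_Extra

class v1_ConfigModel(v1_BaseModel):
    # Same shape as the class added in pydantic_migration_helpers.py.
    class Config:
        extra = v1_Extra.forbid
        underscore_attrs_are_private = True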
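The data contract models are pinned to the v1 API (v1_ConfigModel, v1_Field) largely because they depend on __root__ models with discriminated unions, which pydantic v2 removed in favor of RootModel. A v1-only sketch of the pattern behind FreshnessAssertion and SchemaAssertion, with simplified stand-in classes:

from typing import Union

import pydantic  # assumes pydantic v1 semantics (or pydantic.v1 under v2)
from typing_extensions import Literal

class CronFreshness(pydantic.BaseModel):
    type: Literal["cron"]
    cron: str

class IntervalFreshness(pydantic.BaseModel):
    type: Literal["interval"]
    interval: str

class Freshness(pydantic.BaseModel):
    # The "type" literal selects a branch directly instead of try-parsing
    # every union member in declaration order.
    __root__: Union[CronFreshness, IntervalFreshness] = pydantic.Field(
        discriminator="type"
    )

parsed = Freshness.parse_obj({"type": "cron", "cron": "0 * * * *"})
assert isinstance(parsed.__root__, CronFreshness)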
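In datahub_classifier.py the only incompatibility is a renamed Config key: v1's allow_population_by_field_name became populate_by_name in v2. Because a Config body is ordinary Python, the right spelling can be chosen at class-definition time, exactly as the hunk does. A simplified sketch (InfoTypeExample is a hypothetical stand-in):

import pydantic

from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2

class InfoTypeExample(pydantic.BaseModel):  # simplified stand-in for InfoTypeConfig
    class Config:
        if PYDANTIC_VERSION_2:
            populate_by_name = True
        else:
            allow_population_by_field_name = True

    Name: float = pydantic.Field(alias="name")

InfoTypeExample(name=0.5)  # populated via the alias
InfoTypeExample(Name=0.5)  # populated via the field name, enabled above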
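The delta-lake change swaps Field(const=True), which pydantic v2 dropped, for a Literal annotation; the Literal enforces the same "must equal the default" constraint under both majors. A trimmed sketch:

import pydantic
from typing_extensions import Literal

class DeltaLakeExample(pydantic.BaseModel):  # trimmed stand-in for the real config
    platform: Literal["delta-lake"] = pydantic.Field(
        default="delta-lake",
        description="The platform that this source connects to",
    )

DeltaLakeExample()                       # ok: default applies
DeltaLakeExample(platform="delta-lake")  # ok: matches the literal
# DeltaLakeExample(platform="oracle")    # ValidationError under v1 and v2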
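The skip_on_failure=True additions on the BigQuery and Snowflake root validators matter for both majors: pydantic 2's compatibility layer rejects a post (pre=False) @root_validator without it, and under v1 it keeps the hook from running against a values dict that is missing fields that failed validation. An illustrative sketch with a hypothetical config model:

from typing import Dict

import pydantic

class ExampleConfig(pydantic.BaseModel):  # hypothetical, for illustration only
    project_id: str = ""
    project_id_pattern: str = ".*"

    # A post (pre=False) validator sees already-validated values. With
    # skip_on_failure=True, pydantic skips this hook entirely when field
    # validation failed, rather than passing a partially-populated dict.
    @pydantic.root_validator(pre=False, skip_on_failure=True)
    def backward_compat(cls, values: Dict) -> Dict:
        if values.get("project_id") and values.get("project_id_pattern") == ".*":
            values["project_id_pattern"] = values["project_id"]
        return values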
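Finally, the time_window_config change relaxes the old hard assert on UTC: naive timestamps are still rejected, but timezone-aware non-UTC ones are now normalized to UTC instead of refused. A self-contained sketch of the new behavior (TimeWindowExample is a simplified stand-in):

from datetime import datetime, timezone

import pydantic

class TimeWindowExample(pydantic.BaseModel):  # simplified stand-in
    end_time: datetime

    @pydantic.validator("end_time")
    def ensure_utc(cls, v: datetime) -> datetime:
        if v.tzinfo is None:
            raise ValueError(
                "Timestamps must be timezone-aware; try adding a 'Z', e.g. '2021-07-20T00:00:00Z'"
            )
        # Aware but non-UTC values are normalized instead of rejected.
        return v.astimezone(timezone.utc)

w = TimeWindowExample(end_time="2021-07-20T05:30:00+05:30")
assert w.end_time == datetime(2021, 7, 20, tzinfo=timezone.utc)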