diff --git a/metadata-ingestion-modules/airflow-plugin/build.gradle b/metadata-ingestion-modules/airflow-plugin/build.gradle index 1bcb58e6b7c543..95b4ee3118f03e 100644 --- a/metadata-ingestion-modules/airflow-plugin/build.gradle +++ b/metadata-ingestion-modules/airflow-plugin/build.gradle @@ -73,16 +73,15 @@ task lint(type: Exec, dependsOn: installDev) { commandLine 'bash', '-c', "find ${venv_name}/lib -path *airflow/_vendor/connexion/spec.py -exec sed -i.bak -e '169,169s/ # type: List\\[str\\]//g' {} \\; && " + "source ${venv_name}/bin/activate && set -x && " + - "black --check --diff src/ tests/ && " + "ruff check src/ tests/ && " + + "ruff format --check src/ tests/ && " + "mypy --show-traceback --show-error-codes src/ tests/" } task lintFix(type: Exec, dependsOn: installDev) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "black src/ tests/ && " + - "ruff check --fix src/ tests/" - "mypy src/ tests/ " + "ruff check --fix src/ tests/ && " + + "ruff format src/ tests/ " } // HACK: Some of the Airflow constraint files conflict with packages that we install (e.g. black). @@ -119,5 +118,8 @@ clean { delete venv_name delete 'build' delete 'dist' + delete '.ruff_cache' + delete '.mypy_cache' + delete '.pytest_cache' } clean.dependsOn cleanPythonCache diff --git a/metadata-ingestion-modules/airflow-plugin/pyproject.toml b/metadata-ingestion-modules/airflow-plugin/pyproject.toml index 7d03c2a14bf078..d1e1d0ad479442 100644 --- a/metadata-ingestion-modules/airflow-plugin/pyproject.toml +++ b/metadata-ingestion-modules/airflow-plugin/pyproject.toml @@ -2,13 +2,21 @@ build-backend = "setuptools.build_meta" requires = ["setuptools>=54.0.0", "wheel", "pip>=21.0.0"] -[tool.black] -extend-exclude = ''' -# A regex preceded with ^/ will apply only to files and directories -# in the root of the project. 
-^/tmp -''' -include = '\.pyi?$' +[tool.ruff] +line-length = 88 +target-version = "py38" +exclude = [ + ".git", + "venv", + ".tox", + "__pycache__", +] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" [tool.ruff.lint.isort] combine-as-imports = true @@ -28,31 +36,23 @@ required-imports = [] classes = ["typing"] [tool.ruff.lint] -select = [ - "B", - "C90", - "E", - "F", - "I", # For isort - "TID", +extend-select = [ + "B", # flake8-bugbear + "C90", # mccabe complexity + "E", # pycodestyle errors + "F", # pyflakes + "G010", # logging.warn -> logging.warning + "I", # isort + "TID", # flake8-tidy-imports ] ignore = [ - # Ignore line length violations (handled by Black) - "E501", - # Ignore whitespace before ':' (matches Black) - "E203", - "E203", - # Allow usages of functools.lru_cache - "B019", - # Allow function call in argument defaults - "B008", + "E501", # Line length violations (handled by formatter) ] [tool.ruff.lint.mccabe] max-complexity = 15 [tool.ruff.lint.flake8-tidy-imports] -# Disallow all relative imports. 
ban-relative-imports = "all" [tool.ruff.lint.per-file-ignores] diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 79c18a5188dd84..58c04158957ccd 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -71,7 +71,6 @@ def get_long_description(): dev_requirements = { *base_requirements, *mypy_stubs, - "black==22.12.0", "coverage>=5.1", "ruff==0.9.2", "mypy==1.10.1", diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py index fd01ac10f98de9..5904ce1e9e978c 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py @@ -63,9 +63,9 @@ def __init__(self): self.task_to_extractor.extractors["AthenaOperator"] = AthenaOperatorExtractor - self.task_to_extractor.extractors[ - "BigQueryInsertJobOperator" - ] = BigQueryInsertJobOperatorExtractor + self.task_to_extractor.extractors["BigQueryInsertJobOperator"] = ( + BigQueryInsertJobOperatorExtractor + ) self._graph: Optional["DataHubGraph"] = None diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py index 9de44811f60a48..b2ca61e3de3bf5 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py @@ -286,9 +286,9 @@ def _extract_lineage( if sql_parsing_result: if error := sql_parsing_result.debug_info.error: logger.info(f"SQL parsing error: {error}", exc_info=error) - datajob.properties[ - "datahub_sql_parser_error" - ] = f"{type(error).__name__}: {error}" + 
datajob.properties["datahub_sql_parser_error"] = ( + f"{type(error).__name__}: {error}" + ) if not sql_parsing_result.debug_info.table_error: input_urns.extend(sql_parsing_result.in_tables) output_urns.extend(sql_parsing_result.out_tables) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py index 4bf050d41473e4..99b0a40fd3c13e 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py @@ -44,11 +44,9 @@ def get_task_inlets_advanced(task: BaseOperator, context: Any) -> Iterable[Any]: if task_inlets and isinstance(task_inlets, list): inlets = [] - task_ids = ( - {o for o in task_inlets if isinstance(o, str)} - .union(op.task_id for op in task_inlets if isinstance(op, BaseOperator)) - .intersection(task.get_flat_relative_ids(upstream=True)) - ) + task_ids = {o for o in task_inlets if isinstance(o, str)}.union( + op.task_id for op in task_inlets if isinstance(op, BaseOperator) + ).intersection(task.get_flat_relative_ids(upstream=True)) from airflow.lineage import AUTO from cattr import structure diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py index 4351f40fe7e3ad..24e89211dd3c5b 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py @@ -2,6 +2,7 @@ This example demonstrates how to emit lineage to DataHub within an Airflow DAG. 
""" + from datetime import timedelta from airflow import DAG diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py index d2c9821295419c..2744c26021cde3 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py @@ -273,13 +273,21 @@ def _run_airflow( subprocess.check_call( [ # fmt: off - "airflow", "users", "create", - "--username", "airflow", - "--password", "airflow", - "--firstname", "admin", - "--lastname", "admin", - "--role", "Admin", - "--email", "airflow@example.com", + "airflow", + "users", + "create", + "--username", + "airflow", + "--password", + "airflow", + "--firstname", + "admin", + "--lastname", + "admin", + "--role", + "Admin", + "--email", + "airflow@example.com", # fmt: on ], env=environment, diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py index 1dc8e14a425dfc..4219c5fb9cefb3 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py @@ -242,9 +242,7 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): }, ), mock.patch("airflow.models.BaseOperator.xcom_pull"), mock.patch( "airflow.models.BaseOperator.xcom_push" - ), patch_airflow_connection( - datahub_rest_connection_config - ): + ), patch_airflow_connection(datahub_rest_connection_config): func = mock.Mock() func.__name__ = "foo" @@ -275,7 +273,10 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): if AIRFLOW_VERSION < packaging.version.parse("2.2.0"): ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE) # Ignoring type here because DagRun state is just a sring at Airflow 1 - dag_run = DagRun(state="success", 
run_id=f"scheduled_{DEFAULT_DATE.isoformat()}") # type: ignore + dag_run = DagRun( + state="success", # type: ignore[arg-type] + run_id=f"scheduled_{DEFAULT_DATE.isoformat()}", + ) else: from airflow.utils.state import DagRunState diff --git a/metadata-ingestion-modules/dagster-plugin/build.gradle b/metadata-ingestion-modules/dagster-plugin/build.gradle index 503b3556a41bfe..7dd7036e276151 100644 --- a/metadata-ingestion-modules/dagster-plugin/build.gradle +++ b/metadata-ingestion-modules/dagster-plugin/build.gradle @@ -54,16 +54,15 @@ task installDev(type: Exec, dependsOn: [install]) { task lint(type: Exec, dependsOn: installDev) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "black --check --diff src/ tests/ examples/ && " + - "ruff check src/ tests/ && " + + "ruff check src/ tests/ examples/ && " + + "ruff format --check src/ tests/ examples/ && " + "mypy --show-traceback --show-error-codes src/ tests/ examples/" } task lintFix(type: Exec, dependsOn: installDev) { commandLine 'bash', '-x', '-c', "source ${venv_name}/bin/activate && " + - "black src/ tests/ examples/ && " + - "ruff check --fix src/ tests/" - "mypy src/ tests/ examples/" + "ruff check --fix src/ tests/ examples/ && " + + "ruff format src/ tests/ examples/ " } task installDevTest(type: Exec, dependsOn: [installDev]) { @@ -105,5 +104,8 @@ clean { delete venv_name delete 'build' delete 'dist' + delete '.ruff_cache' + delete '.mypy_cache' + delete '.pytest_cache' } clean.dependsOn cleanPythonCache diff --git a/metadata-ingestion-modules/dagster-plugin/examples/advanced_ops_jobs.py b/metadata-ingestion-modules/dagster-plugin/examples/advanced_ops_jobs.py index 7b7616b1ec11de..75cab237b05a3e 100644 --- a/metadata-ingestion-modules/dagster-plugin/examples/advanced_ops_jobs.py +++ b/metadata-ingestion-modules/dagster-plugin/examples/advanced_ops_jobs.py @@ -9,9 +9,9 @@ job, op, ) + from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from 
datahub.utilities.urns.dataset_urn import DatasetUrn - from datahub_dagster_plugin.client.dagster_generator import ( DagsterGenerator, DatasetLineage, diff --git a/metadata-ingestion-modules/dagster-plugin/examples/assets_job.py b/metadata-ingestion-modules/dagster-plugin/examples/assets_job.py index 1ed3f2f915061b..9b26b502d770f2 100644 --- a/metadata-ingestion-modules/dagster-plugin/examples/assets_job.py +++ b/metadata-ingestion-modules/dagster-plugin/examples/assets_job.py @@ -7,9 +7,9 @@ define_asset_job, multi_asset, ) + from datahub.ingestion.graph.config import DatahubClientConfig from datahub.utilities.urns.dataset_urn import DatasetUrn - from datahub_dagster_plugin.sensors.datahub_sensors import ( DatahubDagsterSourceConfig, make_datahub_sensor, diff --git a/metadata-ingestion-modules/dagster-plugin/examples/basic_setup.py b/metadata-ingestion-modules/dagster-plugin/examples/basic_setup.py index 300cf9df022c66..2eeff225697261 100644 --- a/metadata-ingestion-modules/dagster-plugin/examples/basic_setup.py +++ b/metadata-ingestion-modules/dagster-plugin/examples/basic_setup.py @@ -1,6 +1,6 @@ from dagster import Definitions -from datahub.ingestion.graph.client import DatahubClientConfig +from datahub.ingestion.graph.client import DatahubClientConfig from datahub_dagster_plugin.sensors.datahub_sensors import ( DatahubDagsterSourceConfig, make_datahub_sensor, diff --git a/metadata-ingestion-modules/dagster-plugin/examples/ops_job.py b/metadata-ingestion-modules/dagster-plugin/examples/ops_job.py index a17fc89e6922df..aa2902ee5c708b 100644 --- a/metadata-ingestion-modules/dagster-plugin/examples/ops_job.py +++ b/metadata-ingestion-modules/dagster-plugin/examples/ops_job.py @@ -1,7 +1,7 @@ from dagster import Definitions, In, Out, PythonObjectDagsterType, job, op + from datahub.ingestion.graph.config import DatahubClientConfig from datahub.utilities.urns.dataset_urn import DatasetUrn - from datahub_dagster_plugin.sensors.datahub_sensors import ( 
DatahubDagsterSourceConfig, make_datahub_sensor, diff --git a/metadata-ingestion-modules/dagster-plugin/pyproject.toml b/metadata-ingestion-modules/dagster-plugin/pyproject.toml index 7d03c2a14bf078..d1e1d0ad479442 100644 --- a/metadata-ingestion-modules/dagster-plugin/pyproject.toml +++ b/metadata-ingestion-modules/dagster-plugin/pyproject.toml @@ -2,13 +2,21 @@ build-backend = "setuptools.build_meta" requires = ["setuptools>=54.0.0", "wheel", "pip>=21.0.0"] -[tool.black] -extend-exclude = ''' -# A regex preceded with ^/ will apply only to files and directories -# in the root of the project. -^/tmp -''' -include = '\.pyi?$' +[tool.ruff] +line-length = 88 +target-version = "py38" +exclude = [ + ".git", + "venv", + ".tox", + "__pycache__", +] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" [tool.ruff.lint.isort] combine-as-imports = true @@ -28,31 +36,23 @@ required-imports = [] classes = ["typing"] [tool.ruff.lint] -select = [ - "B", - "C90", - "E", - "F", - "I", # For isort - "TID", +extend-select = [ + "B", # flake8-bugbear + "C90", # mccabe complexity + "E", # pycodestyle errors + "F", # pyflakes + "G010", # logging.warn -> logging.warning + "I", # isort + "TID", # flake8-tidy-imports ] ignore = [ - # Ignore line length violations (handled by Black) - "E501", - # Ignore whitespace before ':' (matches Black) - "E203", - "E203", - # Allow usages of functools.lru_cache - "B019", - # Allow function call in argument defaults - "B008", + "E501", # Line length violations (handled by formatter) ] [tool.ruff.lint.mccabe] max-complexity = 15 [tool.ruff.lint.flake8-tidy-imports] -# Disallow all relative imports. 
ban-relative-imports = "all" [tool.ruff.lint.per-file-ignores] diff --git a/metadata-ingestion-modules/dagster-plugin/setup.py b/metadata-ingestion-modules/dagster-plugin/setup.py index b15f3716b28d0a..09859b6c4344e3 100644 --- a/metadata-ingestion-modules/dagster-plugin/setup.py +++ b/metadata-ingestion-modules/dagster-plugin/setup.py @@ -51,7 +51,6 @@ def get_long_description(): "dagster-aws >= 0.11.0", "dagster-snowflake >= 0.11.0", "dagster-snowflake-pandas >= 0.11.0", - "black==22.12.0", "coverage>=5.1", "ruff==0.9.2", "mypy>=1.4.0", diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py index 9a0a9a1b3a75ed..033d3967145017 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py @@ -507,7 +507,7 @@ def generate_datajob( job_property_bag: Dict[str, str] = {} if input_datasets: self.logger.info( - f"Input datasets for {op_def_snap.name} are { list(input_datasets.get(op_def_snap.name, []))}" + f"Input datasets for {op_def_snap.name} are {list(input_datasets.get(op_def_snap.name, []))}" ) inlets.update(input_datasets.get(op_def_snap.name, [])) @@ -515,7 +515,7 @@ def generate_datajob( if output_datasets: self.logger.info( - f"Output datasets for {op_def_snap.name} are { list(output_datasets.get(op_def_snap.name, []))}" + f"Output datasets for {op_def_snap.name} are {list(output_datasets.get(op_def_snap.name, []))}" ) datajob.outlets = list(output_datasets.get(op_def_snap.name, [])) @@ -606,7 +606,7 @@ def emit_job_run( if run.status not in status_result_map: raise Exception( f"Job run status should be either complete, failed or cancelled and it was " - f"{run.status }" + f"{run.status}" ) if run_stats.start_time is not None: @@ -673,7 +673,7 @@ def 
emit_op_run( if run_step_stats.status not in status_result_map: raise Exception( f"Step run status should be either complete, failed or cancelled and it was " - f"{run_step_stats.status }" + f"{run_step_stats.status}" ) if run_step_stats.start_time is not None: diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py index b91a9cfa56d398..5f049d55c16a12 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py @@ -262,7 +262,6 @@ def get_dagster_environment( and context.dagster_run.job_code_origin.repository_origin and context.dagster_run.job_code_origin.repository_origin.code_pointer ): - code_pointer = ( context.dagster_run.job_code_origin.repository_origin.code_pointer ) diff --git a/metadata-ingestion-modules/gx-plugin/build.gradle b/metadata-ingestion-modules/gx-plugin/build.gradle index a0604215426bf7..57a1ed0b2169d3 100644 --- a/metadata-ingestion-modules/gx-plugin/build.gradle +++ b/metadata-ingestion-modules/gx-plugin/build.gradle @@ -25,7 +25,7 @@ task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { outputs.file(sentinel_file) commandLine 'bash', '-c', "${python_executable} -m venv ${venv_name} && " + - "${venv_name}/bin/python -m pip install --upgrade uv && " + + "${venv_name}/bin/pip install --upgrade uv && " + "touch ${sentinel_file}" } @@ -54,16 +54,15 @@ task installDev(type: Exec, dependsOn: [install]) { task lint(type: Exec, dependsOn: installDev) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "black --check --diff src/ tests/ && " + "ruff check src/ tests/ && " + + "ruff format --check src/ tests/ && " + "mypy --show-traceback --show-error-codes src/ tests/" } task lintFix(type: Exec, dependsOn: 
installDev) { commandLine 'bash', '-x', '-c', "source ${venv_name}/bin/activate && " + - "black src/ tests/ && " + - "ruff check --fix src/ tests/" - "mypy src/ tests/" + "ruff check --fix src/ tests/ && " + + "ruff format src/ tests/ " } task installDevTest(type: Exec, dependsOn: [installDev]) { @@ -105,5 +104,8 @@ clean { delete venv_name delete 'build' delete 'dist' + delete '.ruff_cache' + delete '.mypy_cache' + delete '.pytest_cache' } clean.dependsOn cleanPythonCache diff --git a/metadata-ingestion-modules/gx-plugin/pyproject.toml b/metadata-ingestion-modules/gx-plugin/pyproject.toml index 7d03c2a14bf078..d1e1d0ad479442 100644 --- a/metadata-ingestion-modules/gx-plugin/pyproject.toml +++ b/metadata-ingestion-modules/gx-plugin/pyproject.toml @@ -2,13 +2,21 @@ build-backend = "setuptools.build_meta" requires = ["setuptools>=54.0.0", "wheel", "pip>=21.0.0"] -[tool.black] -extend-exclude = ''' -# A regex preceded with ^/ will apply only to files and directories -# in the root of the project. 
-^/tmp -''' -include = '\.pyi?$' +[tool.ruff] +line-length = 88 +target-version = "py38" +exclude = [ + ".git", + "venv", + ".tox", + "__pycache__", +] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" [tool.ruff.lint.isort] combine-as-imports = true @@ -28,31 +36,23 @@ required-imports = [] classes = ["typing"] [tool.ruff.lint] -select = [ - "B", - "C90", - "E", - "F", - "I", # For isort - "TID", +extend-select = [ + "B", # flake8-bugbear + "C90", # mccabe complexity + "E", # pycodestyle errors + "F", # pyflakes + "G010", # logging.warn -> logging.warning + "I", # isort + "TID", # flake8-tidy-imports ] ignore = [ - # Ignore line length violations (handled by Black) - "E501", - # Ignore whitespace before ':' (matches Black) - "E203", - "E203", - # Allow usages of functools.lru_cache - "B019", - # Allow function call in argument defaults - "B008", + "E501", # Line length violations (handled by formatter) ] [tool.ruff.lint.mccabe] max-complexity = 15 [tool.ruff.lint.flake8-tidy-imports] -# Disallow all relative imports. ban-relative-imports = "all" [tool.ruff.lint.per-file-ignores] diff --git a/metadata-ingestion-modules/gx-plugin/setup.py b/metadata-ingestion-modules/gx-plugin/setup.py index d114a4130ca4f2..fbc4097388993f 100644 --- a/metadata-ingestion-modules/gx-plugin/setup.py +++ b/metadata-ingestion-modules/gx-plugin/setup.py @@ -58,9 +58,8 @@ def get_long_description(): base_dev_requirements = { *base_requirements, *mypy_stubs, - "black==22.12.0", "coverage>=5.1", "ruff==0.9.2", "mypy>=1.4.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. 
diff --git a/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py index 4f2aee52c3319f..1070d4d3d5d66d 100644 --- a/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py +++ b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py @@ -108,7 +108,6 @@ def __init__( convert_urns_to_lowercase: bool = False, name: str = "DataHubValidationAction", ): - if has_name_positional_arg: if len(args) >= 1 and isinstance(args[0], str): name = args[0] @@ -164,9 +163,7 @@ def _run( if isinstance( validation_result_suite_identifier, ValidationResultIdentifier ): - expectation_suite_name = ( - validation_result_suite_identifier.expectation_suite_identifier.expectation_suite_name - ) + expectation_suite_name = validation_result_suite_identifier.expectation_suite_identifier.expectation_suite_name run_id = validation_result_suite_identifier.run_id batch_identifier = validation_result_suite_identifier.batch_identifier diff --git a/metadata-ingestion-modules/prefect-plugin/build.gradle b/metadata-ingestion-modules/prefect-plugin/build.gradle index d16201834a0ff0..d13c9fe3c9abe7 100644 --- a/metadata-ingestion-modules/prefect-plugin/build.gradle +++ b/metadata-ingestion-modules/prefect-plugin/build.gradle @@ -54,16 +54,15 @@ task installDev(type: Exec, dependsOn: [install]) { task lint(type: Exec, dependsOn: installDev) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "black --check --diff src/ tests/ && " + "ruff check src/ tests/ && " + + "ruff format --check src/ tests/ && " + "mypy --show-traceback --show-error-codes src/ tests/" } task lintFix(type: Exec, dependsOn: installDev) { commandLine 'bash', '-x', '-c', "source ${venv_name}/bin/activate && " + - "black src/ tests/ && " + - "ruff check --fix src/ tests/" - "mypy src/ tests/ " + "ruff check --fix src/ tests/ && " + + "ruff format src/ tests/ " } task installDevTest(type: Exec, 
dependsOn: [installDev]) { @@ -111,5 +110,8 @@ clean { delete venv_name delete 'build' delete 'dist' + delete '.ruff_cache' + delete '.mypy_cache' + delete '.pytest_cache' } clean.dependsOn cleanPythonCache diff --git a/metadata-ingestion-modules/prefect-plugin/pyproject.toml b/metadata-ingestion-modules/prefect-plugin/pyproject.toml index 7d03c2a14bf078..d1e1d0ad479442 100644 --- a/metadata-ingestion-modules/prefect-plugin/pyproject.toml +++ b/metadata-ingestion-modules/prefect-plugin/pyproject.toml @@ -2,13 +2,21 @@ build-backend = "setuptools.build_meta" requires = ["setuptools>=54.0.0", "wheel", "pip>=21.0.0"] -[tool.black] -extend-exclude = ''' -# A regex preceded with ^/ will apply only to files and directories -# in the root of the project. -^/tmp -''' -include = '\.pyi?$' +[tool.ruff] +line-length = 88 +target-version = "py38" +exclude = [ + ".git", + "venv", + ".tox", + "__pycache__", +] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" [tool.ruff.lint.isort] combine-as-imports = true @@ -28,31 +36,23 @@ required-imports = [] classes = ["typing"] [tool.ruff.lint] -select = [ - "B", - "C90", - "E", - "F", - "I", # For isort - "TID", +extend-select = [ + "B", # flake8-bugbear + "C90", # mccabe complexity + "E", # pycodestyle errors + "F", # pyflakes + "G010", # logging.warn -> logging.warning + "I", # isort + "TID", # flake8-tidy-imports ] ignore = [ - # Ignore line length violations (handled by Black) - "E501", - # Ignore whitespace before ':' (matches Black) - "E203", - "E203", - # Allow usages of functools.lru_cache - "B019", - # Allow function call in argument defaults - "B008", + "E501", # Line length violations (handled by formatter) ] [tool.ruff.lint.mccabe] max-complexity = 15 [tool.ruff.lint.flake8-tidy-imports] -# Disallow all relative imports. 
ban-relative-imports = "all" [tool.ruff.lint.per-file-ignores] diff --git a/metadata-ingestion-modules/prefect-plugin/setup.py b/metadata-ingestion-modules/prefect-plugin/setup.py index 9587f0ed73780b..1d56cae8d938a2 100644 --- a/metadata-ingestion-modules/prefect-plugin/setup.py +++ b/metadata-ingestion-modules/prefect-plugin/setup.py @@ -57,9 +57,8 @@ def get_long_description(): dev_requirements = { *base_requirements, *mypy_stubs, - "black==22.12.0", "coverage>=5.1", - "ruff==0.9.1", + "ruff==0.9.2", "mypy>=1.4.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. diff --git a/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/datahub_emitter.py b/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/datahub_emitter.py index fcab6b6fd91430..190a249a912d1a 100644 --- a/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/datahub_emitter.py +++ b/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/datahub_emitter.py @@ -351,9 +351,9 @@ def _emit_tasks( for prefect_future in flow_run_ctx.task_run_futures: if prefect_future.task_run is not None: - task_run_key_map[ - str(prefect_future.task_run.id) - ] = prefect_future.task_run.task_key + task_run_key_map[str(prefect_future.task_run.id)] = ( + prefect_future.task_run.task_key + ) for node in graph_json: datajob_urn = DataJobUrn.create_from_ids( diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 16a6704949c875..be9d69a2f0e4b6 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -110,16 +110,16 @@ task modelDocUpload(type: Exec, dependsOn: [modelDocGen]) { task lint(type: Exec, dependsOn: installDev) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "black --check --diff src/ tests/ examples/ && " + "ruff check src/ tests/ examples/ && " + + "ruff format --check src/ tests/ examples/ && " + "mypy 
--show-traceback --show-error-codes src/ tests/ examples/" } task lintFix(type: Exec, dependsOn: installDev) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "black src/ tests/ examples/ && " + - "ruff check --fix src/ tests/ examples/" + "ruff check --fix src/ tests/ examples/ && " + + "ruff format src/ tests/ examples/ " } def pytest_default_env = "PYTHONDEVMODE=1" @@ -216,6 +216,7 @@ clean { delete 'src/datahub/metadata' delete '../docs/generated' delete 'generated' + delete '.ruff_cache' delete '.mypy_cache' delete '.pytest_cache' delete '.preflight_sentinel' diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md index ebe1cd3df81990..005b0427a5e6ad 100644 --- a/metadata-ingestion/developing.md +++ b/metadata-ingestion/developing.md @@ -177,11 +177,11 @@ The architecture of this metadata ingestion framework is heavily inspired by [Ap ## Code style -We use black, ruff, and mypy to ensure consistent code style and quality. +We use ruff and mypy to ensure consistent code style and quality. 
```shell # Assumes: pip install -e '.[dev]' and venv is activated -black src/ tests/ ruff check src/ tests/ +ruff format src/ tests/ mypy src/ tests/ ``` diff --git a/metadata-ingestion/examples/library/run_assertion.py b/metadata-ingestion/examples/library/run_assertion.py index 414e5f46cc7f91..e7c717837eed3c 100644 --- a/metadata-ingestion/examples/library/run_assertion.py +++ b/metadata-ingestion/examples/library/run_assertion.py @@ -16,5 +16,5 @@ assertion_result = graph.run_assertion(urn=assertion_urn, save_result=True) log.info( - f'Assertion result (SUCCESS / FAILURE / ERROR): {assertion_result.get("type")}' + f"Assertion result (SUCCESS / FAILURE / ERROR): {assertion_result.get('type')}" ) diff --git a/metadata-ingestion/pyproject.toml b/metadata-ingestion/pyproject.toml index 07f2010fde25f0..1d434eb8c3a94f 100644 --- a/metadata-ingestion/pyproject.toml +++ b/metadata-ingestion/pyproject.toml @@ -2,15 +2,6 @@ build-backend = "setuptools.build_meta" requires = ["setuptools>=63.0.0", "wheel"] -[tool.black] -extend-exclude = ''' -# A regex preceded with ^/ will apply only to files and directories -# in the root of the project. 
-^/tmp -''' -include = '\.pyi?$' -target-version = ['py38', 'py39', 'py310', 'py311'] - [tool.ruff.lint.isort] section-order = ["future", "patch", "standard-library", "third-party", "first-party", "local-folder"] sections = { "patch" = ["datahub.utilities._markupsafe_compat", "datahub.sql_parsing._sqlglot_patch"] } @@ -31,23 +22,22 @@ exclude = [ [tool.ruff.lint] extend-select = [ - "B", # Bugbear - "C90", - "E", - "F", - "G010", # logging.warn -> logging.warning - "I", # Import sorting - "TID", # Tidy imports + "B", # flake8-bugbear + "C90", # mccabe complexity + "E", # pycodestyle errors + "F", # pyflakes + "G010", # logging.warn -> logging.warning + "I", # isort + "TID", # flake8-tidy-imports ] extend-ignore = [ - # Ignore line length violations (handled by Black) - "E501", - # Ignore whitespace before ':' (matches Black) - "E203", - # Allow usages of functools.lru_cache - "B019", - # Allow function call in argument defaults - "B008", + "E501", # Handled by formatter + "E111", # Handled by formatter + "E114", # Handled by formatter + "E117", # Handled by formatter + "E203", # Ignore whitespace before ':' (matches Black) + "B019", # Allow usages of functools.lru_cache + "B008", # Allow function call in argument defaults # TODO: Enable these later "B006", # Mutable args "B017", # Do not assert blind exception @@ -61,4 +51,4 @@ max-complexity = 20 ban-relative-imports = "all" [tool.ruff.lint.per-file-ignores] -"__init__.py" = ["F401"] \ No newline at end of file +"__init__.py" = ["F401"] diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index ea1b0ad1582576..2cfdf9837f45ad 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -592,7 +592,6 @@ lint_requirements = { # This is pinned only to avoid spurious errors in CI. # We should make an effort to keep it up to date. 
- "black==23.3.0", "ruff==0.9.2", "mypy==1.10.1", } diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py index 8704ed13cb6c30..a05386798495de 100644 --- a/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py +++ b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py @@ -20,15 +20,13 @@ class Operator(Protocol): operator: str - def id(self) -> str: - ... + def id(self) -> str: ... - def generate_parameters(self) -> AssertionStdParametersClass: - ... + def generate_parameters(self) -> AssertionStdParametersClass: ... def _generate_assertion_std_parameter( - value: Union[str, int, float, list] + value: Union[str, int, float, list], ) -> AssertionStdParameterClass: if isinstance(value, str): return AssertionStdParameterClass( diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py index dc0c97d1c74e56..145a6097d7336c 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py @@ -19,15 +19,13 @@ class Operator(Protocol): operator: str - def id(self) -> str: - ... + def id(self) -> str: ... - def generate_parameters(self) -> AssertionStdParametersClass: - ... + def generate_parameters(self) -> AssertionStdParametersClass: ... 
def _generate_assertion_std_parameter( - value: Union[str, int, float] + value: Union[str, int, float], ) -> AssertionStdParameterClass: if isinstance(value, str): return AssertionStdParameterClass( diff --git a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py index 39de4d7f80558e..d2035d560716ae 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py +++ b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py @@ -321,9 +321,9 @@ def from_yaml( @classmethod def from_datahub(cls, graph: DataHubGraph, id: str) -> DataProduct: - data_product_properties: Optional[ - DataProductPropertiesClass - ] = graph.get_aspect(id, DataProductPropertiesClass) + data_product_properties: Optional[DataProductPropertiesClass] = ( + graph.get_aspect(id, DataProductPropertiesClass) + ) domains: Optional[DomainsClass] = graph.get_aspect(id, DomainsClass) assert domains, "Data Product must have an associated domain. Found none." 
owners: Optional[OwnershipClass] = graph.get_aspect(id, OwnershipClass) diff --git a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py index 315f2249d2e5cd..bf824a11a77b5d 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py +++ b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py @@ -266,7 +266,8 @@ def generate_mcp( if self.schema_metadata.fields: for field in self.schema_metadata.fields: field_urn = field.urn or make_schema_field_urn( - self.urn, field.id # type: ignore[arg-type] + self.urn, # type: ignore[arg-type] + field.id, # type: ignore[arg-type] ) assert field_urn.startswith("urn:li:schemaField:") diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index 179dbdb231c912..b0b434751ad2cc 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -118,9 +118,9 @@ def fqn(self) -> str: id = StructuredPropertyUrn.from_string(self.urn).id if self.qualified_name is not None: # ensure that qualified name and ID match - assert ( - self.qualified_name == id - ), "ID in the urn and the qualified_name must match" + assert self.qualified_name == id, ( + "ID in the urn and the qualified_name must match" + ) return id @validator("urn", pre=True, always=True) @@ -184,9 +184,9 @@ def create(file: str, graph: DataHubGraph) -> None: @classmethod def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": - structured_property: Optional[ - StructuredPropertyDefinitionClass - ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass) + structured_property: Optional[StructuredPropertyDefinitionClass] = ( + graph.get_aspect(urn, StructuredPropertyDefinitionClass) + ) if 
structured_property is None: raise Exception( "StructuredPropertyDefinition aspect is None. Unable to create structured property." diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index f6b5ba6176c59d..1f13391644c6c8 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -412,7 +412,7 @@ def generate_access_token( def ensure_has_system_metadata( event: Union[ MetadataChangeProposal, MetadataChangeProposalWrapper, MetadataChangeEvent - ] + ], ) -> None: if event.systemMetadata is None: event.systemMetadata = SystemMetadataClass() diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py index 86bcd7eff1cbfc..b744ac573aed6e 100644 --- a/metadata-ingestion/src/datahub/cli/docker_cli.py +++ b/metadata-ingestion/src/datahub/cli/docker_cli.py @@ -296,9 +296,9 @@ def _restore( restore_indices: Optional[bool], primary_restore_file: Optional[str], ) -> int: - assert ( - restore_primary or restore_indices - ), "Either restore_primary or restore_indices must be set" + assert restore_primary or restore_indices, ( + "Either restore_primary or restore_indices must be set" + ) msg = "datahub> " if restore_primary: msg += f"Will restore primary database from {primary_restore_file}. 
" @@ -314,9 +314,9 @@ def _restore( assert primary_restore_file resolved_restore_file = os.path.expanduser(primary_restore_file) logger.info(f"Restoring primary db from backup at {resolved_restore_file}") - assert os.path.exists( - resolved_restore_file - ), f"File {resolved_restore_file} does not exist" + assert os.path.exists(resolved_restore_file), ( + f"File {resolved_restore_file} does not exist" + ) with open(resolved_restore_file) as fp: result = subprocess.run( [ diff --git a/metadata-ingestion/src/datahub/cli/lite_cli.py b/metadata-ingestion/src/datahub/cli/lite_cli.py index 957ee16245dd81..90bbb353deab18 100644 --- a/metadata-ingestion/src/datahub/cli/lite_cli.py +++ b/metadata-ingestion/src/datahub/cli/lite_cli.py @@ -176,7 +176,7 @@ def get( ) ) end_time = time.time() - logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis") + logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis") @lite.command() @@ -228,7 +228,7 @@ def ls(path: Optional[str]) -> None: try: browseables = lite.ls(path) end_time = time.time() - logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis") + logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis") auto_complete: List[AutoComplete] = [ b.auto_complete for b in browseables if b.auto_complete is not None ] diff --git a/metadata-ingestion/src/datahub/cli/migrate.py b/metadata-ingestion/src/datahub/cli/migrate.py index 1bf1211674f596..3bd1b6fc4dc124 100644 --- a/metadata-ingestion/src/datahub/cli/migrate.py +++ b/metadata-ingestion/src/datahub/cli/migrate.py @@ -426,9 +426,9 @@ def batch_get_ids( entities_yielded += 1 log.debug(f"yielding {x}") yield x - assert ( - entities_yielded == num_entities - ), "Did not delete all entities, try running this command again!" + assert entities_yielded == num_entities, ( + "Did not delete all entities, try running this command again!" 
+ ) else: log.error(f"Failed to execute batch get with {str(response.content)}") response.raise_for_status() diff --git a/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py b/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py index dad724bfe11157..c0d93af90ada00 100644 --- a/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py +++ b/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py @@ -136,9 +136,9 @@ def extras_list_to_dict(extras: List[str]) -> Dict[str, str]: extra_properties: Dict[str, str] = dict() for x in extras: parts = x.split("=") - assert ( - len(parts) == 2 - ), f"Invalid value for extras {x}, should be in format key=value" + assert len(parts) == 2, ( + f"Invalid value for extras {x}, should be in format key=value" + ) extra_properties[parts[0]] = parts[1] return extra_properties diff --git a/metadata-ingestion/src/datahub/cli/timeline_cli.py b/metadata-ingestion/src/datahub/cli/timeline_cli.py index 37089e6f051f0d..174ce63e84ef4c 100644 --- a/metadata-ingestion/src/datahub/cli/timeline_cli.py +++ b/metadata-ingestion/src/datahub/cli/timeline_cli.py @@ -50,7 +50,7 @@ def pretty_id(id: Optional[str]) -> str: if id.startswith("urn:li:dataset"): dataset_key = dataset_urn_to_key(id) if dataset_key: - return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:'):], fg='white')}:{click.style(dataset_key.name, fg='white')}" + return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:') :], fg='white')}:{click.style(dataset_key.name, fg='white')}" # failed to prettify, return original return id diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 08817d9d5fdb93..8052de1b0669c4 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -200,8 +200,7 @@ class 
IgnorableError(MetaError): @runtime_checkable class ExceptionWithProps(Protocol): - def get_telemetry_props(self) -> Dict[str, Any]: - ... + def get_telemetry_props(self) -> Dict[str, Any]: ... def should_show_stack_trace(exc: Exception) -> bool: diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py index e7e9bfd43adca5..7e68e9f80da4ff 100644 --- a/metadata-ingestion/src/datahub/configuration/git.py +++ b/metadata-ingestion/src/datahub/configuration/git.py @@ -121,9 +121,9 @@ def infer_repo_ssh_locator( repo: str = values["repo"] if repo.startswith(_GITHUB_PREFIX): - return f"git@github.com:{repo[len(_GITHUB_PREFIX):]}.git" + return f"git@github.com:{repo[len(_GITHUB_PREFIX) :]}.git" elif repo.startswith(_GITLAB_PREFIX): - return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX):]}.git" + return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX) :]}.git" else: raise ValueError( "Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually." diff --git a/metadata-ingestion/src/datahub/configuration/time_window_config.py b/metadata-ingestion/src/datahub/configuration/time_window_config.py index b3cc0316091173..5fabcf904d3219 100644 --- a/metadata-ingestion/src/datahub/configuration/time_window_config.py +++ b/metadata-ingestion/src/datahub/configuration/time_window_config.py @@ -47,7 +47,10 @@ class BaseTimeWindowConfig(ConfigModel): default_factory=lambda: datetime.now(tz=timezone.utc), description="Latest date of lineage/usage to consider. Default: Current time in UTC", ) - start_time: datetime = Field(default=None, description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.") # type: ignore + start_time: datetime = Field( + default=None, + description="Earliest date of lineage/usage to consider. 
Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.", + ) # type: ignore @pydantic.validator("start_time", pre=True, always=True) def default_start_time( @@ -63,12 +66,14 @@ def default_start_time( # This is where start_time str is resolved to datetime try: delta = parse_relative_timespan(v) - assert delta < timedelta( - 0 - ), "Relative start time should start with minus sign (-) e.g. '-2 days'." + assert delta < timedelta(0), ( + "Relative start time should start with minus sign (-) e.g. '-2 days'." + ) assert abs(delta) >= get_bucket_duration_delta( values["bucket_duration"] - ), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'." + ), ( + "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'." + ) # The end_time's default value is not yet populated, in which case # we can just manually generate it here. diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index f095fffbaea6b4..f5da90a86c9ef6 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -88,13 +88,11 @@ def get_sys_time() -> int: @overload -def make_ts_millis(ts: None) -> None: - ... +def make_ts_millis(ts: None) -> None: ... @overload -def make_ts_millis(ts: datetime) -> int: - ... +def make_ts_millis(ts: datetime) -> int: ... def make_ts_millis(ts: Optional[datetime]) -> Optional[int]: @@ -105,13 +103,11 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]: @overload -def parse_ts_millis(ts: float) -> datetime: - ... +def parse_ts_millis(ts: float) -> datetime: ... @overload -def parse_ts_millis(ts: None) -> None: - ... +def parse_ts_millis(ts: None) -> None: ... 
def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]: diff --git a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py index 17026a4114c128..e51c37d96e90f0 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py @@ -33,8 +33,7 @@ @runtime_checkable class SupportsToObj(Protocol): - def to_obj(self) -> Any: - ... + def to_obj(self) -> Any: ... def _recursive_to_obj(obj: Any) -> Any: diff --git a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py index 78a091f1ffe689..92ee158661d3d4 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py @@ -55,15 +55,9 @@ def convert_chart_info_to_patch( aspect.externalUrl ).set_type(aspect.type).set_title(aspect.title).set_access( aspect.access - ).set_last_modified( - aspect.lastModified - ).set_last_refreshed( + ).set_last_modified(aspect.lastModified).set_last_refreshed( aspect.lastRefreshed - ).set_description( - aspect.description - ).add_inputs( - aspect.inputs - ) + ).set_description(aspect.description).add_inputs(aspect.inputs) values = patch_builder.build() if values: diff --git a/metadata-ingestion/src/datahub/ingestion/api/report.py b/metadata-ingestion/src/datahub/ingestion/api/report.py index 32810189acd00b..8cfca5782bee40 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/report.py +++ b/metadata-ingestion/src/datahub/ingestion/api/report.py @@ -21,8 +21,7 @@ @runtime_checkable class SupportsAsObj(Protocol): - def as_obj(self) -> dict: - ... + def as_obj(self) -> dict: ... 
@dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index f3e5b1db6a1c85..08af39cd24982a 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -48,7 +48,7 @@ def auto_workunit( - stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] + stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]], ) -> Iterable[MetadataWorkUnit]: """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s.""" diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index 88d1fcc52e2196..1c440642e06d8b 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -131,9 +131,9 @@ def get_recursive(self, schema: Dict) -> Optional[str]: for i, schema_type in enumerate(p.schema_types): if schema_type == schema_str: # return the corresponding type for the schema that's a match - assert ( - len(p.type) > i - ), f"p.type({len(p.type)})) and p.schema_types({len(p.schema_types)}) should have the same length" + assert len(p.type) > i, ( + f"p.type({len(p.type)})) and p.schema_types({len(p.schema_types)}) should have the same length" + ) return p.type[i] return None diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py index d5af4f7a2389c0..dbb851c74e7e34 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py @@ -263,15 +263,13 @@ def _get_type_annotation(schema: SchemaOrField) -> str: @overload def _get_underlying_type_if_option_as_union( schema: 
SchemaOrField, default: SchemaOrField - ) -> SchemaOrField: - ... + ) -> SchemaOrField: ... @staticmethod @overload def _get_underlying_type_if_option_as_union( schema: SchemaOrField, default: Optional[SchemaOrField] = None - ) -> Optional[SchemaOrField]: - ... + ) -> Optional[SchemaOrField]: ... @staticmethod def _get_underlying_type_if_option_as_union( @@ -386,7 +384,7 @@ def emit(self) -> Iterable[SchemaField]: if "deprecated" in merged_props: description = ( - f"DEPRECATED: {merged_props['deprecated']}\n" + f'DEPRECATED: {merged_props["deprecated"]}\n' + description if description else "" diff --git a/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py b/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py index 9c34c4f83b0a93..beec42724529e6 100644 --- a/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py +++ b/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py @@ -17,9 +17,9 @@ def parse_s3_path(path: str) -> "S3Path": def assert_ok_status(s3_response): is_ok = s3_response["ResponseMetadata"]["HTTPStatusCode"] == 200 - assert ( - is_ok - ), f"Failed to fetch S3 object, error message: {s3_response['Error']['Message']}" + assert is_ok, ( + f"Failed to fetch S3 object, error message: {s3_response['Error']['Message']}" + ) @dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py index 50268768d0ce9f..ba03083854e785 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py @@ -148,9 +148,9 @@ def input_config_selectively_overrides_default_config(cls, info_types_config): weight, ) in custom_infotype_config.Prediction_Factors_and_Weights.dict().items(): if weight > 0: - assert ( - getattr(custom_infotype_config, factor) is not None - ), f"Missing Configuration for Prediction Factor {factor} for Custom Info Type {custom_infotype}" + assert 
getattr(custom_infotype_config, factor) is not None, ( + f"Missing Configuration for Prediction Factor {factor} for Custom Info Type {custom_infotype}" + ) # Custom infotype supports only regex based prediction for column values if custom_infotype_config.Prediction_Factors_and_Weights.Values > 0: @@ -158,7 +158,9 @@ def input_config_selectively_overrides_default_config(cls, info_types_config): assert ( custom_infotype_config.Values.prediction_type == ValuePredictionType.REGEX - ), f"Invalid Prediction Type for Values for Custom Info Type {custom_infotype}. Only `regex` is supported." + ), ( + f"Invalid Prediction Type for Values for Custom Info Type {custom_infotype}. Only `regex` is supported." + ) return info_types_config diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 8c5f894a072d93..48a008536ed1ed 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -519,9 +519,9 @@ def get_aspects_for_entity( :return: Optionally, a map of aspect_name to aspect_value as a dictionary if present, aspect_value will be set to None if that aspect was not found. Returns None on HTTP status 404. :raises HttpError: if the HTTP response is not a 200 """ - assert len(aspects) == len( - aspect_types - ), f"number of aspects requested ({len(aspects)}) should be the same as number of aspect types provided ({len(aspect_types)})" + assert len(aspects) == len(aspect_types), ( + f"number of aspects requested ({len(aspects)}) should be the same as number of aspect types provided ({len(aspect_types)})" + ) # TODO: generate aspects list from type classes response_json = self.get_entity_raw(entity_urn, aspects) @@ -1576,9 +1576,7 @@ def run_assertion( ... 
assertionResult } } - """ % ( - self._assertion_result_shared() - ) + """ % (self._assertion_result_shared()) variables = { "assertionUrn": urn, diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index ef59ba7a3b58b4..25cbd340c9674b 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -109,9 +109,9 @@ def on_failure( mcp.systemMetadata.properties = {} if "workunit_id" not in mcp.systemMetadata.properties: # update the workunit id - mcp.systemMetadata.properties[ - "workunit_id" - ] = record_envelope.metadata["workunit_id"] + mcp.systemMetadata.properties["workunit_id"] = ( + record_envelope.metadata["workunit_id"] + ) record_envelope.record = mcp self.file_sink.write_record_async(record_envelope, self.logging_callback) @@ -701,7 +701,7 @@ def pretty_print_summary( num_failures_sink = len(self.sink.get_report().failures) click.secho( message_template.format( - status=f"with at least {num_failures_source+num_failures_sink} failures" + status=f"with at least {num_failures_source + num_failures_sink} failures" ), fg=self._get_text_color( running=currently_running, failures=True, warnings=False @@ -719,7 +719,7 @@ def pretty_print_summary( num_warn_global = len(global_warnings) click.secho( message_template.format( - status=f"with at least {num_warn_source+num_warn_sink+num_warn_global} warnings" + status=f"with at least {num_warn_source + num_warn_sink + num_warn_global} warnings" ), fg=self._get_text_color( running=currently_running, failures=False, warnings=True diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py index 7a4e7ec52a8e96..53e31aa2ea96e1 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py @@ -92,9 +92,9 @@ class 
PipelineConfig(ConfigModel): pipeline_name: Optional[str] = None failure_log: FailureLoggingConfig = FailureLoggingConfig() - _raw_dict: Optional[ - dict - ] = None # the raw dict that was parsed to construct this config + _raw_dict: Optional[dict] = ( + None # the raw dict that was parsed to construct this config + ) @validator("run_id", pre=True, always=True) def run_id_should_be_semantic( diff --git a/metadata-ingestion/src/datahub/ingestion/source/abs/datalake_profiler_config.py b/metadata-ingestion/src/datahub/ingestion/source/abs/datalake_profiler_config.py index 9f6d13a08b182e..d12ff7415faefc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/abs/datalake_profiler_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/abs/datalake_profiler_config.py @@ -85,8 +85,8 @@ def ensure_field_level_settings_are_normalized( if field_level_metric.startswith("include_field_"): values.setdefault(field_level_metric, False) - assert ( - max_num_fields_to_profile is None - ), f"{max_num_fields_to_profile_key} should be set to None" + assert max_num_fields_to_profile is None, ( + f"{max_num_fields_to_profile_key} should be set to None" + ) return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py index e4f9cd0ee7e018..586e7a3af3bcd1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py @@ -508,7 +508,12 @@ def abs_browser( ): abs_path = self.create_abs_path(obj.name) logger.debug(f"Sampling file: {abs_path}") - yield abs_path, obj.name, obj.last_modified, obj.size, + yield ( + abs_path, + obj.name, + obj.last_modified, + obj.size, + ) except Exception as e: # This odd check if being done because boto does not have a proper exception to catch # The exception that appears in stacktrace cannot actually be caught without a lot more work @@ -552,9 +557,12 @@ def local_browser( if 
os.path.isfile(prefix): logger.debug(f"Scanning single local file: {prefix}") file_name = prefix - yield prefix, file_name, datetime.utcfromtimestamp( - os.path.getmtime(prefix) - ), os.path.getsize(prefix) + yield ( + prefix, + file_name, + datetime.utcfromtimestamp(os.path.getmtime(prefix)), + os.path.getsize(prefix), + ) else: logger.debug(f"Scanning files under local folder: {prefix}") for root, dirs, files in os.walk(prefix): @@ -565,9 +573,12 @@ def local_browser( full_path = PurePath( os.path.normpath(os.path.join(root, file)) ).as_posix() - yield full_path, file, datetime.utcfromtimestamp( - os.path.getmtime(full_path) - ), os.path.getsize(full_path) + yield ( + full_path, + file, + datetime.utcfromtimestamp(os.path.getmtime(full_path)), + os.path.getsize(full_path), + ) def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.container_WU_creator = ContainerWUCreator( diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 30e81643837375..2509927854d4a0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -521,7 +521,7 @@ def process_dataflow_node( # otherwise, a node represents a transformation else: node_urn = mce_builder.make_data_job_urn_with_flow( - flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}' + flow_urn, job_id=f"{node['NodeType']}-{node['Id']}" ) return { @@ -679,7 +679,7 @@ def get_datajob_wu(self, node: Dict[str, Any], job_name: str) -> MetadataWorkUni ) ) - return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce) + return MetadataWorkUnit(id=f"{job_name}-{node['Id']}", mce=mce) def get_all_databases(self) -> Iterable[Mapping[str, Any]]: logger.debug("Getting all databases") @@ -750,13 +750,13 @@ def get_lineage_if_enabled( ) -> Optional[MetadataWorkUnit]: if self.source_config.emit_s3_lineage: # extract dataset properties aspect - dataset_properties: 
Optional[ - DatasetPropertiesClass - ] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass) + dataset_properties: Optional[DatasetPropertiesClass] = ( + mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass) + ) # extract dataset schema aspect - schema_metadata: Optional[ - SchemaMetadataClass - ] = mce_builder.get_aspect_if_available(mce, SchemaMetadataClass) + schema_metadata: Optional[SchemaMetadataClass] = ( + mce_builder.get_aspect_if_available(mce, SchemaMetadataClass) + ) if dataset_properties and "Location" in dataset_properties.customProperties: location = dataset_properties.customProperties["Location"] @@ -765,9 +765,9 @@ def get_lineage_if_enabled( location, self.source_config.env ) assert self.ctx.graph - schema_metadata_for_s3: Optional[ - SchemaMetadataClass - ] = self.ctx.graph.get_schema_metadata(s3_dataset_urn) + schema_metadata_for_s3: Optional[SchemaMetadataClass] = ( + self.ctx.graph.get_schema_metadata(s3_dataset_urn) + ) if self.source_config.glue_s3_lineage_direction == "upstream": fine_grained_lineages = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py index c4561b9d9e676a..d46d1c099383fe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py @@ -257,7 +257,7 @@ def get_feature_wu( mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot) return MetadataWorkUnit( - id=f'{feature_group_details["FeatureGroupName"]}-{feature["FeatureName"]}', + id=f"{feature_group_details['FeatureGroupName']}-{feature['FeatureName']}", mce=mce, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/models.py b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/models.py index 
0f433aaecf2d96..f1374117af775f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/models.py @@ -212,7 +212,7 @@ def get_endpoint_wu( mce = MetadataChangeEvent(proposedSnapshot=endpoint_snapshot) return MetadataWorkUnit( - id=f'{endpoint_details["EndpointName"]}', + id=f"{endpoint_details['EndpointName']}", mce=mce, ) @@ -503,7 +503,7 @@ def get_model_wu( mce = MetadataChangeEvent(proposedSnapshot=model_snapshot) return MetadataWorkUnit( - id=f'{model_details["ModelName"]}', + id=f"{model_details['ModelName']}", mce=mce, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 508b4bbaa277dc..ceb010a7f0675f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -132,9 +132,9 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): self.filters = BigQueryFilter(self.config, self.report) self.identifiers = BigQueryIdentifierBuilder(self.config, self.report) - redundant_lineage_run_skip_handler: Optional[ - RedundantLineageRunSkipHandler - ] = None + redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = ( + None + ) if self.config.enable_stateful_lineage_ingestion: redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( source=self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index 42f82704c81b99..d35c5265878c03 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -37,9 +37,9 @@ class BigqueryTableIdentifier: # Note: this regex may 
get overwritten by the sharded_table_pattern config. # The class-level constant, however, will not be overwritten. - _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[ - str - ] = _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX + _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[str] = ( + _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX + ) _BIGQUERY_WILDCARD_REGEX: ClassVar[str] = "((_(\\d+)?)\\*$)|\\*$" _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd" diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index afbe919df4dcae..57bfa2e3090d31 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -137,9 +137,9 @@ class BigQueryCredential(ConfigModel): @root_validator(skip_on_failure=True) def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: if values.get("client_x509_cert_url") is None: - values[ - "client_x509_cert_url" - ] = f'https://www.googleapis.com/robot/v1/metadata/x509/{values["client_email"]}' + values["client_x509_cert_url"] = ( + f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}" + ) return values def create_credential_temp_file(self) -> str: @@ -611,9 +611,9 @@ def validate_bigquery_audit_metadata_datasets( cls, v: Optional[List[str]], values: Dict ) -> Optional[List[str]]: if values.get("use_exported_bigquery_audit_metadata"): - assert ( - v and len(v) > 0 - ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`." + assert v and len(v) > 0, ( + "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`." 
+ ) return v diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py index 9da2aceb19220a..7dc0e4195d5dc9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py @@ -87,9 +87,9 @@ def get_platform_resource( key=platform_resource_key, graph_client=self.graph ) if platform_resource: - self.platform_resource_cache[ - platform_resource_key.primary_key - ] = platform_resource + self.platform_resource_cache[platform_resource_key.primary_key] = ( + platform_resource + ) return platform_resource return None @@ -115,7 +115,11 @@ def generate_label_platform_resource( and platform_resource.resource_info.value ): try: - existing_info: Optional[BigQueryLabelInfo] = platform_resource.resource_info.value.as_pydantic_object(BigQueryLabelInfo) # type: ignore + existing_info: Optional[BigQueryLabelInfo] = ( + platform_resource.resource_info.value.as_pydantic_object( # type: ignore + BigQueryLabelInfo + ) + ) except ValidationError as e: logger.error( f"Error converting existing value to BigQueryLabelInfo: {e}. Creating new one. Maybe this is because of a non backward compatible schema change." 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 56e930dfb811f1..ebfbbf0639c38c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -311,8 +311,10 @@ def gen_dataset_containers( platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource( label, tag_urn, managed_by_datahub=False ) - label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object( # type: ignore - BigQueryLabelInfo + label_info: BigQueryLabelInfo = ( + platform_resource.resource_info.value.as_pydantic_object( # type: ignore + BigQueryLabelInfo + ) ) tag_urn = TagUrn.from_string(label_info.datahub_urn) @@ -820,8 +822,10 @@ def gen_table_dataset_workunits( platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource( label, tag_urn, managed_by_datahub=False ) - label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object( # type: ignore - BigQueryLabelInfo + label_info: BigQueryLabelInfo = ( + platform_resource.resource_info.value.as_pydantic_object( # type: ignore + BigQueryLabelInfo + ) ) tag_urn = TagUrn.from_string(label_info.datahub_urn) @@ -860,8 +864,10 @@ def gen_view_dataset_workunits( platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource( label, tag_urn, managed_by_datahub=False ) - label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object( # type: ignore - BigQueryLabelInfo + label_info: BigQueryLabelInfo = ( + platform_resource.resource_info.value.as_pydantic_object( # type: ignore + BigQueryLabelInfo + ) ) tag_urn = TagUrn.from_string(label_info.datahub_urn) @@ -1203,9 +1209,9 @@ def get_tables_for_dataset( 
report=self.report, ) - self.report.metadata_extraction_sec[ - f"{project_id}.{dataset.name}" - ] = timer.elapsed_seconds(digits=2) + self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = ( + timer.elapsed_seconds(digits=2) + ) def get_core_table_details( self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 433282a21fdb66..da82c6a06f0395 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -697,7 +697,7 @@ def _create_lineage_map( if parsed_queries[-1]: query = f"""create table `{destination_table.get_sanitized_table_ref().table_identifier.get_table_name()}` AS ( - {parsed_queries[-1].sql(dialect='bigquery')} + {parsed_queries[-1].sql(dialect="bigquery")} )""" else: query = e.query @@ -809,11 +809,11 @@ def get_upstream_tables( upstream_lineage, temp_table_upstream ) - upstreams[ - ref_temp_table_upstream - ] = _merge_lineage_edge_columns( - upstreams.get(ref_temp_table_upstream), - collapsed_lineage, + upstreams[ref_temp_table_upstream] = ( + _merge_lineage_edge_columns( + upstreams.get(ref_temp_table_upstream), + collapsed_lineage, + ) ) else: upstreams[upstream_table_ref] = _merge_lineage_edge_columns( @@ -1004,9 +1004,9 @@ def get_lineage_for_external_table( dataset_urn ) for gcs_dataset_urn in gcs_urns: - schema_metadata_for_gcs: Optional[ - SchemaMetadataClass - ] = graph.get_schema_metadata(gcs_dataset_urn) + schema_metadata_for_gcs: Optional[SchemaMetadataClass] = ( + graph.get_schema_metadata(gcs_dataset_urn) + ) if schema_metadata and schema_metadata_for_gcs: fine_grained_lineage = self.get_fine_grained_lineages_with_gcs( dataset_urn, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py index 2ac40a48de4cc7..8a558d7736a389 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py @@ -387,9 +387,7 @@ class BigqueryQuery: OR protoPayload.metadata.tableDataRead.reason = "JOB" ) -""".strip( - "\t \n" -) +""".strip("\t \n") def bigquery_audit_metadata_query_template_lineage( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index 08c9beaa73c53b..0f9471219c6590 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -271,9 +271,9 @@ def get_workunits_internal( # Preprocessing stage that deduplicates the queries using query hash per usage bucket # Note: FileBackedDict is an ordered dictionary, so the order of execution of # queries is inherently maintained - queries_deduped: FileBackedDict[ - Dict[int, ObservedQuery] - ] = self.deduplicate_queries(queries) + queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] = ( + self.deduplicate_queries(queries) + ) self.report.num_unique_queries = len(queries_deduped) logger.info(f"Found {self.report.num_unique_queries} unique queries") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index f2f6cc731858d1..c2b849e58fc6dc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -763,9 +763,9 @@ def _create_operational_custom_properties( ) if event.query_event.default_dataset: - custom_properties[ - "defaultDatabase" - ] = event.query_event.default_dataset + 
custom_properties["defaultDatabase"] = ( + event.query_event.default_dataset + ) if event.read_event: if event.read_event.readReason: custom_properties["readReason"] = event.read_event.readReason diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py index dcdccc08ce0483..062c64d45767fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py @@ -91,7 +91,6 @@ class KeyspaceKey(ContainerKey): supported=True, ) class CassandraSource(StatefulIngestionSourceBase): - """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py index 75a0ba0c617734..b467ca0aca6be4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py @@ -107,10 +107,10 @@ class CassandraToSchemaFieldConverter: @staticmethod def get_column_type(cassandra_column_type: str) -> SchemaFieldDataType: - type_class: Optional[ - Type - ] = CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get( - cassandra_column_type + type_class: Optional[Type] = ( + CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get( + cassandra_column_type + ) ) if type_class is None: logger.warning( diff --git a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py index 2b75d0dca53cb7..5ba4dd13fb2ac9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py @@ -293,9 +293,9 @@ def _get_schema_and_fields( def 
_load_json_schema_with_resolved_references( self, schema: Schema, name: str, subject: str ) -> dict: - imported_json_schemas: List[ - JsonSchemaWrapper - ] = self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject) + imported_json_schemas: List[JsonSchemaWrapper] = ( + self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject) + ) schema_dict = json.loads(schema.schema_str) reference_map = {} for imported_schema in imported_json_schemas: @@ -332,9 +332,9 @@ def _get_schema_fields( ) elif schema.schema_type == "PROTOBUF": - imported_schemas: List[ - ProtobufSchema - ] = self.get_schemas_from_confluent_ref_protobuf(schema) + imported_schemas: List[ProtobufSchema] = ( + self.get_schemas_from_confluent_ref_protobuf(schema) + ) base_name: str = topic.replace(".", "_") fields = protobuf_util.protobuf_schema_to_mce_fields( ProtobufSchema( diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py index 42e025073b534e..8ebb7b9ef7fbdf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py @@ -371,11 +371,11 @@ def get_resource_workunits( domain: Optional[str], description: Optional[str], ) -> Iterable[MetadataWorkUnit]: - maybe_terms_wu: Optional[ - MetadataWorkUnit - ] = self.get_resource_glossary_terms_work_unit( - entity_urn=entity_urn, - term_associations=term_associations, + maybe_terms_wu: Optional[MetadataWorkUnit] = ( + self.get_resource_glossary_terms_work_unit( + entity_urn=entity_urn, + term_associations=term_associations, + ) ) if maybe_terms_wu: self.report.num_glossary_term_workunits_produced += 1 @@ -389,31 +389,31 @@ def get_resource_workunits( self.report.num_tag_workunits_produced += 1 yield maybe_tags_wu - maybe_owners_wu: Optional[ - MetadataWorkUnit - ] = self.get_resource_owners_work_unit( - entity_urn=entity_urn, - owners=owners, + 
maybe_owners_wu: Optional[MetadataWorkUnit] = ( + self.get_resource_owners_work_unit( + entity_urn=entity_urn, + owners=owners, + ) ) if maybe_owners_wu: self.report.num_owners_workunits_produced += 1 yield maybe_owners_wu - maybe_domain_wu: Optional[ - MetadataWorkUnit - ] = self.get_resource_domain_work_unit( - entity_urn=entity_urn, - domain=domain, + maybe_domain_wu: Optional[MetadataWorkUnit] = ( + self.get_resource_domain_work_unit( + entity_urn=entity_urn, + domain=domain, + ) ) if maybe_domain_wu: self.report.num_domain_workunits_produced += 1 yield maybe_domain_wu - maybe_description_wu: Optional[ - MetadataWorkUnit - ] = self.get_resource_description_work_unit( - entity_urn=entity_urn, - description=description, + maybe_description_wu: Optional[MetadataWorkUnit] = ( + self.get_resource_description_work_unit( + entity_urn=entity_urn, + description=description, + ) ) if maybe_description_wu: self.report.num_description_workunits_produced += 1 @@ -426,9 +426,9 @@ def process_sub_resource_row( needs_write: bool, ) -> Tuple[EditableSchemaMetadataClass, bool]: field_path: str = sub_resource_row.field_path - term_associations: List[ - GlossaryTermAssociationClass - ] = sub_resource_row.term_associations + term_associations: List[GlossaryTermAssociationClass] = ( + sub_resource_row.term_associations + ) tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations description: Optional[str] = sub_resource_row.description has_terms: bool = len(term_associations) > 0 @@ -517,9 +517,9 @@ def get_sub_resource_work_units(self) -> Iterable[MetadataWorkUnit]: # Boolean field to tell whether we need to write an MCPW. 
needs_write = False - current_editable_schema_metadata: Optional[ - EditableSchemaMetadataClass - ] = None + current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = ( + None + ) if self.ctx.graph and not self.should_overwrite: # Fetch the current editable schema metadata current_editable_schema_metadata = self.ctx.graph.get_aspect( @@ -655,9 +655,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: entity_urn = row["resource"] entity_type = Urn.from_string(row["resource"]).get_type() - term_associations: List[ - GlossaryTermAssociationClass - ] = self.maybe_extract_glossary_terms(row) + term_associations: List[GlossaryTermAssociationClass] = ( + self.maybe_extract_glossary_terms(row) + ) tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row) owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py index ee105f4862caba..51a25829d21dba 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py @@ -152,7 +152,9 @@ def execute_server_cursor( ) -> Iterable[Dict[str, Any]]: with self.engine.connect() as conn: if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]: - with conn.begin(): # Transaction required for PostgreSQL server-side cursor + with ( + conn.begin() + ): # Transaction required for PostgreSQL server-side cursor # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects. 
# https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results conn = conn.execution_options( @@ -222,7 +224,7 @@ def _parse_row( ) except Exception as e: logger.warning( - f'Failed to parse metadata for {row["urn"]}: {e}', exc_info=True + f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True ) self.report.num_database_parse_errors += 1 self.report.database_parse_errors.setdefault( diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index 5042f6d69b261a..41b59a9c8b892c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py @@ -194,20 +194,20 @@ def infer_metadata_endpoint(access_url: str) -> Optional[str]: _DBT_FIELDS_BY_TYPE = { "models": f""" - { _DBT_GRAPHQL_COMMON_FIELDS } - { _DBT_GRAPHQL_NODE_COMMON_FIELDS } - { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS } + {_DBT_GRAPHQL_COMMON_FIELDS} + {_DBT_GRAPHQL_NODE_COMMON_FIELDS} + {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS} dependsOn materializedType """, "seeds": f""" - { _DBT_GRAPHQL_COMMON_FIELDS } - { _DBT_GRAPHQL_NODE_COMMON_FIELDS } - { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS } + {_DBT_GRAPHQL_COMMON_FIELDS} + {_DBT_GRAPHQL_NODE_COMMON_FIELDS} + {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS} """, "sources": f""" - { _DBT_GRAPHQL_COMMON_FIELDS } - { _DBT_GRAPHQL_NODE_COMMON_FIELDS } + {_DBT_GRAPHQL_COMMON_FIELDS} + {_DBT_GRAPHQL_NODE_COMMON_FIELDS} identifier sourceName sourceDescription @@ -218,9 +218,9 @@ def infer_metadata_endpoint(access_url: str) -> Optional[str]: loader """, "snapshots": f""" - { _DBT_GRAPHQL_COMMON_FIELDS } - { _DBT_GRAPHQL_NODE_COMMON_FIELDS } - { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS } + {_DBT_GRAPHQL_COMMON_FIELDS} + {_DBT_GRAPHQL_NODE_COMMON_FIELDS} + {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS} parentsSources {{ uniqueId }} @@ -229,7 
+229,7 @@ def infer_metadata_endpoint(access_url: str) -> Optional[str]: }} """, "tests": f""" - { _DBT_GRAPHQL_COMMON_FIELDS } + {_DBT_GRAPHQL_COMMON_FIELDS} state columnName status @@ -315,7 +315,7 @@ def _send_graphql_query( res = response.json() if "errors" in res: raise ValueError( - f'Unable to fetch metadata from dbt Cloud: {res["errors"]}' + f"Unable to fetch metadata from dbt Cloud: {res['errors']}" ) data = res["data"] except JSONDecodeError as e: diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 499e7e1231d050..fa85308b325979 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -506,16 +506,18 @@ class DBTNode: materialization: Optional[str] # table, view, ephemeral, incremental, snapshot # see https://docs.getdbt.com/reference/artifacts/manifest-json catalog_type: Optional[str] - missing_from_catalog: bool # indicates if the node was missing from the catalog.json + missing_from_catalog: ( + bool # indicates if the node was missing from the catalog.json + ) owner: Optional[str] columns: List[DBTColumn] = field(default_factory=list) upstream_nodes: List[str] = field(default_factory=list) # list of upstream dbt_name upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list) - raw_sql_parsing_result: Optional[ - SqlParsingResult - ] = None # only set for nodes that don't depend on ephemeral models + raw_sql_parsing_result: Optional[SqlParsingResult] = ( + None # only set for nodes that don't depend on ephemeral models + ) cll_debug_info: Optional[SqlParsingDebugInfo] = None meta: Dict[str, Any] = field(default_factory=dict) @@ -869,10 +871,10 @@ def create_test_entity_mcps( "platform": DBT_PLATFORM, "name": node.dbt_name, "instance": self.config.platform_instance, + # Ideally we'd include the env unconditionally. 
However, we started out + # not including env in the guid, so we need to maintain backwards compatibility + # with existing PROD assertions. **( - # Ideally we'd include the env unconditionally. However, we started out - # not including env in the guid, so we need to maintain backwards compatibility - # with existing PROD assertions. {"env": self.config.env} if self.config.env != mce_builder.DEFAULT_ENV and self.config.include_env_in_assertion_guid diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py index 072995c10ebcef..cf2d9670400ca5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py @@ -191,9 +191,9 @@ def authenticate(self, connection_args: "DremioSourceConfig") -> None: ) return else: - assert ( - connection_args.username and connection_args.password - ), "Username and password are required for authentication" + assert connection_args.username and connection_args.password, ( + "Username and password are required for authentication" + ) host = connection_args.hostname port = connection_args.port protocol = "https" if connection_args.tls else "http" diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py index e5d6b8e40fb3d8..482647f8d77da1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py @@ -101,9 +101,9 @@ def add_mapping( Add a new source type if not in the map (e.g., Dremio ARP). 
""" dremio_source_type = dremio_source_type.upper() - DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[ - dremio_source_type - ] = datahub_source_type + DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = ( + datahub_source_type + ) if category: if category.lower() == "file_object_storage": diff --git a/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py b/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py index 99aa5f54f6a576..ce1c60dcafdd46 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py +++ b/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py @@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter: @staticmethod def get_column_type(elastic_column_type: str) -> SchemaFieldDataType: - type_class: Optional[ - Type - ] = ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get( - elastic_column_type + type_class: Optional[Type] = ( + ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get( + elastic_column_type + ) ) if type_class is None: logger.warning( diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py index 02b29051dd2ebe..ffcd9218a2103c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py @@ -155,9 +155,9 @@ def _update_report(self, urn: str, entity_type: str) -> None: current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0) self.report.num_hard_deleted_by_type[entity_type] = current_count + 1 if entity_type not in self.report.sample_hard_deleted_aspects_by_type: - self.report.sample_hard_deleted_aspects_by_type[ - entity_type - ] = LossyList() + self.report.sample_hard_deleted_aspects_by_type[entity_type] = ( + LossyList() + ) 
self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn) def delete_entity(self, urn: str) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py b/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py index 18838af9bdf85f..5196c8ec5b998b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py @@ -141,8 +141,9 @@ def s3_source_overrides(self, source: S3Source) -> S3Source: source.source_config.platform = PLATFORM_GCS source.is_s3_platform = lambda: True # type: ignore - source.create_s3_path = lambda bucket_name, key: unquote(f"s3://{bucket_name}/{key}") # type: ignore - + source.create_s3_path = lambda bucket_name, key: unquote( # type: ignore + f"s3://{bucket_name}/{key}" + ) return source def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index aba0deebd356c5..bde26f97bf271f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -327,7 +327,7 @@ def _is_single_row_query_method(query: Any) -> bool: def _run_with_query_combiner( - method: Callable[Concatenate["_SingleDatasetProfiler", P], None] + method: Callable[Concatenate["_SingleDatasetProfiler", P], None], ) -> Callable[Concatenate["_SingleDatasetProfiler", P], None]: @functools.wraps(method) def inner( @@ -1537,9 +1537,7 @@ def create_bigquery_temp_table( query_job: Optional["google.cloud.bigquery.job.query.QueryJob"] = ( # In google-cloud-bigquery 3.15.0, the _query_job attribute was # made public and renamed to query_job. 
- cursor.query_job - if hasattr(cursor, "query_job") - else cursor._query_job # type: ignore[attr-defined] + cursor.query_job if hasattr(cursor, "query_job") else cursor._query_job # type: ignore[attr-defined] ) assert query_job temp_destination_table = query_job.destination diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 42d0def0a46e7d..93142a347ca0e6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -220,9 +220,9 @@ def ensure_field_level_settings_are_normalized( ) values[field_level_metric] = False - assert ( - max_num_fields_to_profile is None - ), f"{max_num_fields_to_profile_key} should be set to None" + assert max_num_fields_to_profile is None, ( + f"{max_num_fields_to_profile_key} should be set to None" + ) # Disable expensive queries. if values.get("turn_off_expensive_profiling_metrics"): diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index 8101f0110509e3..9a62ee2dab52f4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -296,9 +296,9 @@ def _create_iceberg_workunit( custom_properties["snapshot-id"] = str( table.current_snapshot().snapshot_id ) - custom_properties[ - "manifest-list" - ] = table.current_snapshot().manifest_list + custom_properties["manifest-list"] = ( + table.current_snapshot().manifest_list + ) dataset_properties = DatasetPropertiesClass( name=table.name()[-1], description=table.metadata.properties.get("comment", None), diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py index 885b6514779cc4..edb9b7b8bd5264 
100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py @@ -354,9 +354,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield MetadataWorkUnit(id=group_status_wu_id, mcp=group_status_mcp) # Populate GroupMembership Aspects for CorpUsers - datahub_corp_user_urn_to_group_membership: Dict[ - str, GroupMembershipClass - ] = defaultdict(lambda: GroupMembershipClass(groups=[])) + datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = ( + defaultdict(lambda: GroupMembershipClass(groups=[])) + ) if ( self.config.ingest_group_membership and len(self.selected_azure_ad_groups) > 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index dda81b0e34a8d2..5452fbcd3f053b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -344,9 +344,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ).as_workunit() # Step 2: Populate GroupMembership Aspects for CorpUsers - datahub_corp_user_urn_to_group_membership: Dict[ - str, GroupMembershipClass - ] = defaultdict(lambda: GroupMembershipClass(groups=[])) + datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = ( + defaultdict(lambda: GroupMembershipClass(groups=[])) + ) if self.config.ingest_group_membership and okta_groups is not None: # Fetch membership for each group. 
for okta_group in okta_groups: diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py index fa842a15ba7328..9f15eda1501f11 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py @@ -419,10 +419,10 @@ def _extract_record( custom_props = self.build_custom_properties( topic, topic_detail, extra_topic_config ) - schema_name: Optional[ - str - ] = self.schema_registry_client._get_subject_for_topic( - topic, is_key_schema=False + schema_name: Optional[str] = ( + self.schema_registry_client._get_subject_for_topic( + topic, is_key_schema=False + ) ) if schema_name is not None: custom_props["Schema Name"] = schema_name @@ -610,11 +610,13 @@ def fetch_extra_topic_details(self, topics: List[str]) -> Dict[str, dict]: def fetch_topic_configurations(self, topics: List[str]) -> Dict[str, dict]: logger.info("Fetching config details for all topics") - configs: Dict[ - ConfigResource, concurrent.futures.Future - ] = self.admin_client.describe_configs( - resources=[ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics], - request_timeout=self.source_config.connection.client_timeout_seconds, + configs: Dict[ConfigResource, concurrent.futures.Future] = ( + self.admin_client.describe_configs( + resources=[ + ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics + ], + request_timeout=self.source_config.connection.client_timeout_seconds, + ) ) logger.debug("Waiting for config details futures to complete") concurrent.futures.wait(configs.values()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py index 72be864fc30a1c..9edfce5855f430 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py @@ -110,9 +110,8 @@ def get_connectors_manifest(self) -> Iterable[ConnectorManifest]: connector_manifest = self._get_connector_manifest( connector_name, connector_url ) - if ( - connector_manifest is None - or not self.config.connector_patterns.allowed(connector_manifest.name) + if connector_manifest is None or not self.config.connector_patterns.allowed( + connector_manifest.name ): self.report.report_dropped(connector_name) continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py index 2790460c8e6019..10255ed544b812 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py @@ -199,9 +199,9 @@ def get_parser( transforms.append(transform) for key in self.connector_manifest.config.keys(): if key.startswith(f"transforms.{name}."): - transform[ - key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] + transform[key.replace(f"transforms.{name}.", "")] = ( + self.connector_manifest.config[key] + ) if "defaultDataset" in connector_manifest.config: defaultDataset = connector_manifest.config["defaultDataset"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py index 7b3b6e551a0a1f..5e64d4e161e3ea 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py @@ -123,9 +123,9 @@ def get_parser( transforms.append(transform) for key in self.connector_manifest.config.keys(): if key.startswith(f"transforms.{name}."): - transform[ - key.replace(f"transforms.{name}.", "") - ] = 
self.connector_manifest.config[key] + transform[key.replace(f"transforms.{name}.", "")] = ( + self.connector_manifest.config[key] + ) return self.JdbcParser( db_connection_url, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 1183916e9b3fef..abe9b5684f8f1f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -596,9 +596,9 @@ class LookerUtil: @staticmethod def _extract_view_from_field(field: str) -> str: - assert ( - field.count(".") == 1 - ), f"Error: A field must be prefixed by a view name, field is: {field}" + assert field.count(".") == 1, ( + f"Error: A field must be prefixed by a view name, field is: {field}" + ) return field.split(".")[0] @staticmethod @@ -815,9 +815,9 @@ class LookerExplore: project_name: Optional[str] = None label: Optional[str] = None description: Optional[str] = None - upstream_views: Optional[ - List[ProjectInclude] - ] = None # captures the view name(s) this explore is derived from + upstream_views: Optional[List[ProjectInclude]] = ( + None # captures the view name(s) this explore is derived from + ) upstream_views_file_path: Dict[str, Optional[str]] = dataclasses_field( default_factory=dict ) # view_name is key and file_path is value. 
A single file may contains multiple views @@ -889,7 +889,7 @@ def from_dict( upstream_views.extend(parsed_explore.upstream_views or []) else: logger.warning( - f'Could not find extended explore {extended_explore} for explore {dict["name"]} in model {model_name}' + f"Could not find extended explore {extended_explore} for explore {dict['name']} in model {model_name}" ) else: # we only fallback to the view_names list if this is not an extended explore @@ -903,7 +903,7 @@ def from_dict( ) if not info: logger.warning( - f'Could not resolve view {view_name} for explore {dict["name"]} in model {model_name}' + f"Could not resolve view {view_name} for explore {dict['name']} in model {model_name}" ) else: upstream_views.append( @@ -935,9 +935,9 @@ def from_api( # noqa: C901 try: explore = client.lookml_model_explore(model, explore_name) views: Set[str] = set() - lkml_fields: List[ - LookmlModelExploreField - ] = explore_field_set_to_lkml_fields(explore) + lkml_fields: List[LookmlModelExploreField] = ( + explore_field_set_to_lkml_fields(explore) + ) if explore.view_name is not None and explore.view_name != explore.name: # explore is not named after a view and is instead using a from field, which is modeled as view_name. 
@@ -1034,9 +1034,9 @@ def from_api( # noqa: C901 if measure_field.name is None: continue else: - field_name_vs_raw_explore_field[ - measure_field.name - ] = measure_field + field_name_vs_raw_explore_field[measure_field.name] = ( + measure_field + ) view_fields.append( ViewField( @@ -1072,11 +1072,11 @@ def from_api( # noqa: C901 if view_project_map: logger.debug(f"views and their projects: {view_project_map}") - upstream_views_file_path: Dict[ - str, Optional[str] - ] = create_upstream_views_file_path_map( - lkml_fields=lkml_fields, - view_names=views, + upstream_views_file_path: Dict[str, Optional[str]] = ( + create_upstream_views_file_path_map( + lkml_fields=lkml_fields, + view_names=views, + ) ) if upstream_views_file_path: logger.debug(f"views and their file-paths: {upstream_views_file_path}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index 4e9d0f68928a45..3ed3186399588e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -166,9 +166,9 @@ def _get_generic_definition( # e.g. 
spark1 or hive2 or druid_18 platform = re.sub(r"[0-9]+", "", dialect_name.split("_")[0]) - assert ( - platform is not None - ), f"Failed to extract a valid platform from connection {looker_connection}" + assert platform is not None, ( + f"Failed to extract a valid platform from connection {looker_connection}" + ) db = looker_connection.database schema = looker_connection.schema # ok for this to be None return platform, db, schema diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 8487d5113bc1d3..2f1fcd378d40fb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -250,9 +250,9 @@ def _set_test_connection_capability( @staticmethod def _extract_view_from_field(field: str) -> str: - assert ( - field.count(".") == 1 - ), f"Error: A field must be prefixed by a view name, field is: {field}" + assert field.count(".") == 1, ( + f"Error: A field must be prefixed by a view name, field is: {field}" + ) return field.split(".")[0] def _get_views_from_fields(self, fields: List[str]) -> List[str]: @@ -610,12 +610,12 @@ def _get_folder_browse_path_v2_entries( def _create_platform_instance_aspect( self, ) -> DataPlatformInstance: - assert ( - self.source_config.platform_name - ), "Platform name is not set in the configuration." - assert ( - self.source_config.platform_instance - ), "Platform instance is not set in the configuration." + assert self.source_config.platform_name, ( + "Platform name is not set in the configuration." + ) + assert self.source_config.platform_instance, ( + "Platform instance is not set in the configuration." 
+ ) return DataPlatformInstance( platform=builder.make_data_platform_urn(self.source_config.platform_name), @@ -1016,9 +1016,9 @@ def _make_dashboard_and_chart_mces( yield from chart_events # Step 2: Emit metadata events for the Dashboard itself. - chart_urns: Set[ - str - ] = set() # Collect the unique child chart urns for dashboard input lineage. + chart_urns: Set[str] = ( + set() + ) # Collect the unique child chart urns for dashboard input lineage. for chart_event in chart_events: chart_event_urn = self._extract_event_urn(chart_event) if chart_event_urn: @@ -1538,20 +1538,20 @@ def extract_independent_looks(self) -> Iterable[MetadataWorkUnit]: } ) - dashboard_element: Optional[ - LookerDashboardElement - ] = self._get_looker_dashboard_element( - DashboardElement( - id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes), - # we add the "looks_" prefix to look.id. - title=look.title, - subtitle_text=look.description, - look_id=look.id, - dashboard_id=None, # As this is an independent look - look=LookWithQuery( - query=query, folder=look.folder, user_id=look.user_id + dashboard_element: Optional[LookerDashboardElement] = ( + self._get_looker_dashboard_element( + DashboardElement( + id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes), + # we add the "looks_" prefix to look.id. 
+ title=look.title, + subtitle_text=look.description, + look_id=look.id, + dashboard_id=None, # As this is an independent look + look=LookWithQuery( + query=query, folder=look.folder, user_id=look.user_id + ), ), - ), + ) ) if dashboard_element is not None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py index 6d49d57e077435..2bcae4d46b8d52 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py @@ -33,9 +33,9 @@ class SpecialVariable: - SPECIAL_VARIABLE_PATTERN: ClassVar[ - str - ] = r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b" + SPECIAL_VARIABLE_PATTERN: ClassVar[str] = ( + r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b" + ) liquid_variable: dict def __init__(self, liquid_variable): diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py index 098d7d73a3da84..05806840b5c954 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py @@ -257,9 +257,9 @@ def _process_entity_timeseries_rows( for row in rows: logger.debug(row) - entity_stat_aspect[ - self.get_entity_stat_key(row) - ] = self.to_entity_timeseries_stat_aspect(row) + entity_stat_aspect[self.get_entity_stat_key(row)] = ( + self.to_entity_timeseries_stat_aspect(row) + ) return entity_stat_aspect @@ -385,10 +385,8 @@ def generate_usage_stat_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: entity_rows: List[Dict] = self._execute_query( entity_query_with_filters, "entity_query" ) - entity_usage_stat: Dict[ - Tuple[str, str], Any - ] = self._process_entity_timeseries_rows( - entity_rows + entity_usage_stat: Dict[Tuple[str, str], 
Any] = ( + self._process_entity_timeseries_rows(entity_rows) ) # Any type to pass mypy unbound Aspect type error user_wise_query_with_filters: LookerQuery = self._append_filters( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py index 103f4175a9ccff..4e38165bb56286 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py @@ -38,16 +38,16 @@ def merge_parent_and_child_fields( # Create a map field-name vs field child_field_map: dict = {} for field in child_fields: - assert ( - NAME in field - ), "A lookml view must have a name field" # name is required field of lookml field array + assert NAME in field, ( + "A lookml view must have a name field" + ) # name is required field of lookml field array child_field_map[field[NAME]] = field for field in parent_fields: - assert ( - NAME in field - ), "A lookml view must have a name field" # name is required field of lookml field array + assert NAME in field, ( + "A lookml view must have a name field" + ) # name is required field of lookml field array if field[NAME] in child_field_map: # Fields defined in the child view take higher precedence. 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index c7d3724472d3c8..a8575c84b510d5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -482,14 +482,14 @@ def get_project_name(self, model_name: str) -> str: if self.source_config.project_name is not None: return self.source_config.project_name - assert ( - self.looker_client is not None - ), "Failed to find a configured Looker API client" + assert self.looker_client is not None, ( + "Failed to find a configured Looker API client" + ) try: model = self.looker_client.lookml_model(model_name, fields="project_name") - assert ( - model.project_name is not None - ), f"Failed to find a project name for model {model_name}" + assert model.project_name is not None, ( + f"Failed to find a project name for model {model_name}" + ) return model.project_name except SDKError: raise ValueError( @@ -541,9 +541,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.reporter.git_clone_latency = datetime.now() - start_time self.source_config.base_folder = checkout_dir.resolve() - self.base_projects_folder[ - BASE_PROJECT_NAME - ] = self.source_config.base_folder + self.base_projects_folder[BASE_PROJECT_NAME] = ( + self.source_config.base_folder + ) visited_projects: Set[str] = set() @@ -641,9 +641,9 @@ def _recursively_check_manifests( repo_url=remote_project.url, ) - self.base_projects_folder[ - remote_project.name - ] = p_checkout_dir.resolve() + self.base_projects_folder[remote_project.name] = ( + p_checkout_dir.resolve() + ) repo = p_cloner.get_last_repo_cloned() assert repo remote_git_info = GitInfo( @@ -930,9 +930,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 logger.warning( f"view {maybe_looker_view.id.view_name} from model {model_name}, connection 
{model.connection} was previously processed via model {prev_model_name}, connection {prev_model_connection} and will likely lead to incorrect lineage to the underlying tables" ) - if ( - not self.source_config.emit_reachable_views_only - ): + if not self.source_config.emit_reachable_views_only: logger.warning( "Consider enabling the `emit_reachable_views_only` flag to handle this case." ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py index 971181e4300d69..f77eebb3cdd8cb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py @@ -484,11 +484,11 @@ def __init__( ) def __get_upstream_dataset_urn(self) -> List[str]: - current_view_id: Optional[ - LookerViewId - ] = self.looker_view_id_cache.get_looker_view_id( - view_name=self.view_context.name(), - base_folder_path=self.view_context.base_folder_path, + current_view_id: Optional[LookerViewId] = ( + self.looker_view_id_cache.get_looker_view_id( + view_name=self.view_context.name(), + base_folder_path=self.view_context.base_folder_path, + ) ) # Current view will always be present in cache. assert will silence the lint diff --git a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py index b0b04dff20bffc..02125db83d2582 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py @@ -172,10 +172,10 @@ def _get_mlflow_registered_models(self) -> Iterable[RegisteredModel]: """ Get all Registered Models in MLflow Model Registry. 
""" - registered_models: Iterable[ - RegisteredModel - ] = self._traverse_mlflow_search_func( - search_func=self.client.search_registered_models, + registered_models: Iterable[RegisteredModel] = ( + self._traverse_mlflow_search_func( + search_func=self.client.search_registered_models, + ) ) return registered_models diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index bbc4897d227bac..ad8487c1a759ec 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -288,7 +288,9 @@ def __init__(self, ctx: PipelineContext, config: MongoDBConfig): # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes self.mongo_client = MongoClient( - self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options # type: ignore + self.config.connect_uri, + datetime_conversion="DATETIME_AUTO", + **options, # type: ignore ) # This cheaply tests the connection. 
For details, see @@ -470,9 +472,9 @@ def _infer_schema_metadata( ) # Add this information to the custom properties so user can know they are looking at downsampled schema dataset_properties.customProperties["schema.downsampled"] = "True" - dataset_properties.customProperties[ - "schema.totalFields" - ] = f"{collection_schema_size}" + dataset_properties.customProperties["schema.totalFields"] = ( + f"{collection_schema_size}" + ) logger.debug(f"Size of collection fields = {len(collection_fields)}") # append each schema field (sort so output is consistent) diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index 7f446f6d1c2718..52b1386e21d85a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -184,9 +184,9 @@ def validator_site_url_to_site_name(cls, values): @validator("site_url") def validator_site_url(cls, site_url: str) -> str: - assert site_url.startswith( - ("http://", "https://") - ), "site_url must start with http:// or https://" + assert site_url.startswith(("http://", "https://")), ( + "site_url must start with http:// or https://" + ) if not site_url.endswith("/"): site_url = site_url + "/" @@ -487,9 +487,7 @@ def rest_api_base_url(self): def get_report(self) -> SourceReport: return self.report - def update_flow( - self, pg_flow_dto: Dict, recursion_level: int = 0 - ) -> None: # noqa: C901 + def update_flow(self, pg_flow_dto: Dict, recursion_level: int = 0) -> None: # noqa: C901 """ Update self.nifi_flow with contents of the input process group `pg_flow_dto` """ @@ -548,16 +546,16 @@ def update_flow( for inputPort in flow_dto.get("inputPorts", []): component = inputPort.get("component") if inputPort.get("allowRemoteAccess"): - self.nifi_flow.remotely_accessible_ports[ - component.get("id") - ] = NifiComponent( - component.get("id"), - component.get("name"), - component.get("type"), - 
component.get("parentGroupId"), - NifiType.INPUT_PORT, - comments=component.get("comments"), - status=component.get("status", {}).get("runStatus"), + self.nifi_flow.remotely_accessible_ports[component.get("id")] = ( + NifiComponent( + component.get("id"), + component.get("name"), + component.get("type"), + component.get("parentGroupId"), + NifiType.INPUT_PORT, + comments=component.get("comments"), + status=component.get("status", {}).get("runStatus"), + ) ) logger.debug(f"Adding remotely accessible port {component.get('id')}") else: @@ -576,16 +574,16 @@ def update_flow( for outputPort in flow_dto.get("outputPorts", []): component = outputPort.get("component") if outputPort.get("allowRemoteAccess"): - self.nifi_flow.remotely_accessible_ports[ - component.get("id") - ] = NifiComponent( - component.get("id"), - component.get("name"), - component.get("type"), - component.get("parentGroupId"), - NifiType.OUTPUT_PORT, - comments=component.get("comments"), - status=component.get("status", {}).get("runStatus"), + self.nifi_flow.remotely_accessible_ports[component.get("id")] = ( + NifiComponent( + component.get("id"), + component.get("name"), + component.get("type"), + component.get("parentGroupId"), + NifiType.OUTPUT_PORT, + comments=component.get("comments"), + status=component.get("status", {}).get("runStatus"), + ) ) logger.debug(f"Adding remotely accessible port {component.get('id')}") else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py index 8289265483d598..2075e999ea1d0e 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py @@ -101,16 +101,16 @@ def get_swagger(self) -> Dict: # details there once, and then use that session for all requests. 
self.token = f"Bearer {self.bearer_token}" else: - assert ( - "url_complement" in self.get_token.keys() - ), "When 'request_type' is set to 'get', an url_complement is needed for the request." + assert "url_complement" in self.get_token.keys(), ( + "When 'request_type' is set to 'get', an url_complement is needed for the request." + ) if self.get_token["request_type"] == "get": - assert ( - "{username}" in self.get_token["url_complement"] - ), "we expect the keyword {username} to be present in the url" - assert ( - "{password}" in self.get_token["url_complement"] - ), "we expect the keyword {password} to be present in the url" + assert "{username}" in self.get_token["url_complement"], ( + "we expect the keyword {username} to be present in the url" + ) + assert "{password}" in self.get_token["url_complement"], ( + "we expect the keyword {password} to be present in the url" + ) url4req = self.get_token["url_complement"].replace( "{username}", self.username ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index b49d40a0c7eb6a..14beab6bc9391e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -225,9 +225,9 @@ def report_charts_dropped(self, view: str) -> None: def default_for_dataset_type_mapping() -> Dict[str, str]: dict_: dict = {} for item in SupportedDataPlatform: - dict_[ - item.value.powerbi_data_platform_name - ] = item.value.datahub_data_platform_name + dict_[item.value.powerbi_data_platform_name] = ( + item.value.datahub_data_platform_name + ) return dict_ @@ -303,15 +303,15 @@ class PowerBiDashboardSourceConfig( # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI # DataSource needs to be mapped to corresponding DataHub Platform DataSource. 
For example, PowerBI `Snowflake` is # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on. - dataset_type_mapping: Union[ - Dict[str, str], Dict[str, PlatformDetail] - ] = pydantic.Field( - default_factory=default_for_dataset_type_mapping, - description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to " - "DataHub supported datasources." - "You can configured platform instance for dataset lineage. " - "See Quickstart Recipe for mapping", - hidden_from_docs=True, + dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = ( + pydantic.Field( + default_factory=default_for_dataset_type_mapping, + description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to " + "DataHub supported datasources." + "You can configured platform instance for dataset lineage. " + "See Quickstart Recipe for mapping", + hidden_from_docs=True, + ) ) # PowerBI datasource's server to platform instance mapping server_to_platform_instance: Dict[ diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 2a5de7494920b2..759fc6d7dadfba 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -128,17 +128,17 @@ def get_upstream_tables( reporter.m_query_parse_successes += 1 try: - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = resolver.MQueryResolver( - table=table, - parse_tree=parse_tree, - reporter=reporter, - parameters=parameters, - ).resolve_to_lineage( - ctx=ctx, - config=config, - platform_instance_resolver=platform_instance_resolver, + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + resolver.MQueryResolver( + table=table, + parse_tree=parse_tree, + 
reporter=reporter, + parameters=parameters, + ).resolve_to_lineage( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) if lineage: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py index 63520bd731de86..54b810650f5854 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py @@ -170,8 +170,7 @@ def create_reference_table( logger.debug(f"Processing arguments {arguments}") if ( - len(arguments) - >= 4 # [0] is warehouse FQDN. + len(arguments) >= 4 # [0] is warehouse FQDN. # [1] is endpoint, we are not using it. # [2] is "Catalog" key # [3] is catalog's value @@ -215,16 +214,16 @@ def parse_custom_sql( native_sql_parser.remove_special_characters(query) ) - parsed_result: Optional[ - "SqlParsingResult" - ] = native_sql_parser.parse_custom_sql( - ctx=self.ctx, - query=query, - platform=self.get_platform_pair().datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - database=database, - schema=schema, + parsed_result: Optional["SqlParsingResult"] = ( + native_sql_parser.parse_custom_sql( + ctx=self.ctx, + query=query, + platform=self.get_platform_pair().datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + database=database, + schema=schema, + ) ) if parsed_result is None: @@ -410,9 +409,9 @@ def create_lineage( f"Processing Databrick data-access function detail {data_access_func_detail}" ) table_detail: Dict[str, str] = {} - temp_accessor: Optional[ - IdentifierAccessor - ] = data_access_func_detail.identifier_accessor + temp_accessor: Optional[IdentifierAccessor] = ( + data_access_func_detail.identifier_accessor + ) while temp_accessor: # Condition to handle 
databricks M-query pattern where table, schema and database all are present in @@ -647,11 +646,13 @@ def create_lineage( db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore # Second is schema name schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore + IdentifierAccessor, + data_access_func_detail.identifier_accessor.next, # type: ignore ).items["Name"] # Third is table name table_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore + IdentifierAccessor, + data_access_func_detail.identifier_accessor.next.next, # type: ignore ).items["Name"] qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" @@ -768,10 +769,13 @@ def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: ): # database name is explicitly set return database - return get_next_item( # database name is set in Name argument - data_access_tokens, "Name" - ) or get_next_item( # If both above arguments are not available, then try Catalog - data_access_tokens, "Catalog" + return ( + get_next_item( # database name is set in Name argument + data_access_tokens, "Name" + ) + or get_next_item( # If both above arguments are not available, then try Catalog + data_access_tokens, "Catalog" + ) ) def create_lineage( @@ -819,9 +823,7 @@ def create_lineage( values=tree_function.remove_whitespaces_from_list( tree_function.token_values(flat_argument_list[1]) ), - )[ - 0 - ] # Remove any whitespaces and double quotes character + )[0] # Remove any whitespaces and double quotes character server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 2756a113d1ef0c..42963c08d992d1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -188,9 +188,9 @@ def _process_invoke_expression( # - The inner function Table.TransformColumnTypes takes #"Removed Columns1" # (a table reference) as its first argument # - Its result is then passed as the first argument to Table.SplitColumn - second_invoke_expression: Optional[ - Tree - ] = tree_function.first_invoke_expression_func(first_argument) + second_invoke_expression: Optional[Tree] = ( + tree_function.first_invoke_expression_func(first_argument) + ) if second_invoke_expression: # 1. The First argument is function call # 2. That function's first argument references next table variable @@ -304,14 +304,14 @@ def internal( logger.debug(v_statement.pretty()) return None - invoke_expression: Optional[ - Tree - ] = tree_function.first_invoke_expression_func(rh_tree) + invoke_expression: Optional[Tree] = ( + tree_function.first_invoke_expression_func(rh_tree) + ) if invoke_expression is not None: - result: Union[ - DataAccessFunctionDetail, List[str], None - ] = self._process_invoke_expression(invoke_expression) + result: Union[DataAccessFunctionDetail, List[str], None] = ( + self._process_invoke_expression(invoke_expression) + ) if result is None: return None # No need to process some un-expected grammar found while processing invoke_expression if isinstance(result, DataAccessFunctionDetail): @@ -368,9 +368,9 @@ def resolve_to_lineage( return lineage # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail - table_links: List[ - DataAccessFunctionDetail - ] = self.create_data_access_functional_detail(output_variable) + table_links: List[DataAccessFunctionDetail] = ( + self.create_data_access_functional_detail(output_variable) + ) # Each item is data-access function for f_detail in table_links: @@ -390,7 +390,7 @@ def resolve_to_lineage( # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or 
NativeQuery and create instance of it # & also pass additional information that will be need to generate lineage - pattern_handler: (AbstractLineage) = supported_resolver.handler()( + pattern_handler: AbstractLineage = supported_resolver.handler()( ctx=ctx, table=self.table, config=config, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 044946a5d308d1..5e5636f2d50fe3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -945,9 +945,9 @@ def to_datahub_work_units( # Convert tiles to charts ds_mcps, chart_mcps = self.to_datahub_chart(dashboard.tiles, workspace) # Lets convert dashboard to datahub dashboard - dashboard_mcps: List[ - MetadataChangeProposalWrapper - ] = self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps) + dashboard_mcps: List[MetadataChangeProposalWrapper] = ( + self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps) + ) # Now add MCPs in sequence mcps.extend(ds_mcps) @@ -1472,9 +1472,9 @@ def get_workspace_workunit( def _get_dashboard_patch_work_unit( self, work_unit: MetadataWorkUnit ) -> Optional[MetadataWorkUnit]: - dashboard_info_aspect: Optional[ - DashboardInfoClass - ] = work_unit.get_aspect_of_type(DashboardInfoClass) + dashboard_info_aspect: Optional[DashboardInfoClass] = ( + work_unit.get_aspect_of_type(DashboardInfoClass) + ) if dashboard_info_aspect and self.source_config.patch_metadata: return convert_dashboard_info_to_patch( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index 161975fa635fdb..927840c44bf0b0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -425,9 +425,9 @@ def itr_pages( response.raise_for_status() - assert ( - Constant.VALUE in response.json() - ), "'value' key is not present in paginated response" + assert Constant.VALUE in response.json(), ( + "'value' key is not present in paginated response" + ) if not response.json()[Constant.VALUE]: # if it is an empty list then break break @@ -447,13 +447,13 @@ def get_app( if raw_app is None: return None - assert ( - Constant.ID in raw_app - ), f"{Constant.ID} is required field not present in server response" + assert Constant.ID in raw_app, ( + f"{Constant.ID} is required field not present in server response" + ) - assert ( - Constant.NAME in raw_app - ), f"{Constant.NAME} is required field not present in server response" + assert Constant.NAME in raw_app, ( + f"{Constant.NAME} is required field not present in server response" + ) return App( id=raw_app[Constant.ID], diff --git a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_api.py b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_api.py index 31b0731aaa751c..10b062c98c147f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_api.py @@ -156,7 +156,7 @@ def _get_sheet( ) if chart: if not chart.title: - chart.title = f"Object {i+1} of Sheet '{sheet.title}'" + chart.title = f"Object {i + 1} of Sheet '{sheet.title}'" sheet.charts.append(chart) websocket_connection.handle.pop() return sheet diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py index cad48eaf1c2375..932ada0a908b28 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py @@ -178,9 +178,9 @@ class RedshiftConfig( 
@root_validator(pre=True) def check_email_is_set_on_usage(cls, values): if values.get("include_usage_statistics"): - assert ( - "email_domain" in values and values["email_domain"] - ), "email_domain needs to be set if usage is enabled" + assert "email_domain" in values and values["email_domain"], ( + "email_domain needs to be set if usage is enabled" + ) return values @root_validator(skip_on_failure=True) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 9bfca941ce48fb..cce282c71056a2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -305,13 +305,13 @@ def test_connection(config_dict: dict) -> TestConnectionReport: test_report.capability_report = {} try: RedshiftDataDictionary.get_schemas(connection, database=config.database) - test_report.capability_report[ - SourceCapability.SCHEMA_METADATA - ] = CapabilityReport(capable=True) + test_report.capability_report[SourceCapability.SCHEMA_METADATA] = ( + CapabilityReport(capable=True) + ) except Exception as e: - test_report.capability_report[ - SourceCapability.SCHEMA_METADATA - ] = CapabilityReport(capable=False, failure_reason=str(e)) + test_report.capability_report[SourceCapability.SCHEMA_METADATA] = ( + CapabilityReport(capable=False, failure_reason=str(e)) + ) except Exception as e: test_report.basic_connectivity = CapabilityReport( @@ -947,9 +947,9 @@ def cache_tables_and_views(self, connection, database): def get_all_tables( self, ) -> Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]]: - all_tables: Dict[ - str, Dict[str, List[Union[RedshiftView, RedshiftTable]]] - ] = defaultdict(dict) + all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]] = ( + defaultdict(dict) + ) for db in set().union(self.db_tables, self.db_views): tables = self.db_tables.get(db, {}) views 
= self.db_views.get(db, {}) @@ -967,9 +967,9 @@ def extract_usage( all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]], ) -> Iterable[MetadataWorkUnit]: with PerfTimer() as timer: - redundant_usage_run_skip_handler: Optional[ - RedundantUsageRunSkipHandler - ] = None + redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = ( + None + ) if self.config.enable_stateful_usage_ingestion: redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler( source=self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py index d66a1ee18be40f..a5758bdd825702 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py @@ -199,10 +199,10 @@ def _get_workunits_internal( end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT), database=self.config.database, ) - access_events_iterable: Iterable[ - RedshiftAccessEvent - ] = self._gen_access_events_from_history_query( - query, connection=self.connection, all_tables=all_tables + access_events_iterable: Iterable[RedshiftAccessEvent] = ( + self._gen_access_events_from_history_query( + query, connection=self.connection, all_tables=all_tables + ) ) aggregated_events: AggregatedAccessEvents = self._aggregate_access_events( @@ -225,10 +225,10 @@ def _gen_operation_aspect_workunits( start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT), end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT), ) - access_events_iterable: Iterable[ - RedshiftAccessEvent - ] = self._gen_access_events_from_history_query( - query, connection, all_tables=all_tables + access_events_iterable: Iterable[RedshiftAccessEvent] = ( + self._gen_access_events_from_history_query( + query, connection, all_tables=all_tables + ) ) # Generate operation aspect work units from the access events diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/s3/datalake_profiler_config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/datalake_profiler_config.py index 89c092875e4490..58e930eb6e809c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/datalake_profiler_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/datalake_profiler_config.py @@ -85,8 +85,8 @@ def ensure_field_level_settings_are_normalized( if field_level_metric.startswith("include_field_"): values.setdefault(field_level_metric, False) - assert ( - max_num_fields_to_profile is None - ), f"{max_num_fields_to_profile_key} should be set to None" + assert max_num_fields_to_profile is None, ( + f"{max_num_fields_to_profile_key} should be set to None" + ) return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py index 88679efdf5fc31..66e0e6b741d1ff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py +++ b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py @@ -236,12 +236,12 @@ def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None: try: if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN: logger.debug("Access Token Provided in Config") - assert ( - self.config.access_token is not None - ), "Config access_token is required for DIRECT_ACCESS_TOKEN auth" - assert ( - self.config.instance_url is not None - ), "Config instance_url is required for DIRECT_ACCESS_TOKEN auth" + assert self.config.access_token is not None, ( + "Config access_token is required for DIRECT_ACCESS_TOKEN auth" + ) + assert self.config.instance_url is not None, ( + "Config instance_url is required for DIRECT_ACCESS_TOKEN auth" + ) self.sf = Salesforce( instance_url=self.config.instance_url, @@ -250,15 +250,15 @@ def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None: ) elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD: 
logger.debug("Username/Password Provided in Config") - assert ( - self.config.username is not None - ), "Config username is required for USERNAME_PASSWORD auth" - assert ( - self.config.password is not None - ), "Config password is required for USERNAME_PASSWORD auth" - assert ( - self.config.security_token is not None - ), "Config security_token is required for USERNAME_PASSWORD auth" + assert self.config.username is not None, ( + "Config username is required for USERNAME_PASSWORD auth" + ) + assert self.config.password is not None, ( + "Config password is required for USERNAME_PASSWORD auth" + ) + assert self.config.security_token is not None, ( + "Config security_token is required for USERNAME_PASSWORD auth" + ) self.sf = Salesforce( username=self.config.username, @@ -269,15 +269,15 @@ def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None: elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN: logger.debug("Json Web Token provided in the config") - assert ( - self.config.username is not None - ), "Config username is required for JSON_WEB_TOKEN auth" - assert ( - self.config.consumer_key is not None - ), "Config consumer_key is required for JSON_WEB_TOKEN auth" - assert ( - self.config.private_key is not None - ), "Config private_key is required for JSON_WEB_TOKEN auth" + assert self.config.username is not None, ( + "Config username is required for JSON_WEB_TOKEN auth" + ) + assert self.config.consumer_key is not None, ( + "Config consumer_key is required for JSON_WEB_TOKEN auth" + ) + assert self.config.private_key is not None, ( + "Config private_key is required for JSON_WEB_TOKEN auth" + ) self.sf = Salesforce( username=self.config.username, @@ -439,7 +439,8 @@ def get_platform_instance_workunit(self, datasetUrn: str) -> MetadataWorkUnit: dataPlatformInstance = DataPlatformInstanceClass( builder.make_data_platform_urn(self.platform), instance=builder.make_dataplatform_instance_urn( - self.platform, self.config.platform_instance # 
type:ignore + self.platform, + self.config.platform_instance, # type:ignore ), ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma.py b/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma.py index e96eeb58d96efe..0468792f44aabb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma.py @@ -477,9 +477,9 @@ def _gen_elements_workunit( upstream_dataset_urns and dataset_urn not in self.dataset_upstream_urn_mapping ): - self.dataset_upstream_urn_mapping[ - dataset_urn - ] = upstream_dataset_urns + self.dataset_upstream_urn_mapping[dataset_urn] = ( + upstream_dataset_urns + ) element_input_fields = [ InputFieldClass( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma_api.py b/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma_api.py index 3e88f43142ede6..6762302ebe57c7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma_api.py @@ -126,9 +126,9 @@ def fill_workspaces(self) -> None: response.raise_for_status() response_dict = response.json() for workspace_dict in response_dict[Constant.ENTRIES]: - self.workspaces[ - workspace_dict[Constant.WORKSPACEID] - ] = Workspace.parse_obj(workspace_dict) + self.workspaces[workspace_dict[Constant.WORKSPACEID]] = ( + Workspace.parse_obj(workspace_dict) + ) if response_dict[Constant.NEXTPAGE]: url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}" else: @@ -147,9 +147,9 @@ def _get_users(self) -> Dict[str, str]: response.raise_for_status() response_dict = response.json() for user_dict in response_dict[Constant.ENTRIES]: - users[ - user_dict[Constant.MEMBERID] - ] = f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}" + users[user_dict[Constant.MEMBERID]] = ( + f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}" + ) if response_dict[Constant.NEXTPAGE]: url = 
f"{members_url}&page={response_dict[Constant.NEXTPAGE]}" else: @@ -327,10 +327,12 @@ def get_page_elements(self, workbook: Workbook, page: Page) -> List[Element]: response.raise_for_status() for i, element_dict in enumerate(response.json()[Constant.ENTRIES]): if not element_dict.get(Constant.NAME): - element_dict[Constant.NAME] = f"Element {i+1} of Page '{page.name}'" - element_dict[ - Constant.URL - ] = f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true" + element_dict[Constant.NAME] = ( + f"Element {i + 1} of Page '{page.name}'" + ) + element_dict[Constant.URL] = ( + f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true" + ) element = Element.parse_obj(element_dict) if ( self.config.extract_lineage diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index b14e51a982082c..5f732e2621656f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -384,18 +384,20 @@ def validate_shares( assert all( consumer.platform_instance != share_details.platform_instance for consumer in share_details.consumers - ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake." + ), ( + "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake." + ) databases_included_in_share.append(shared_db) databases_created_from_share.extend(share_details.consumers) for db_from_share in databases_created_from_share: - assert ( - db_from_share not in databases_included_in_share - ), "Database included in a share can not be present as consumer in any share." - assert ( - databases_created_from_share.count(db_from_share) == 1 - ), "Same database can not be present as consumer in more than one share." 
+ assert db_from_share not in databases_included_in_share, ( + "Database included in a share can not be present as consumer in any share." + ) + assert databases_created_from_share.count(db_from_share) == 1, ( + "Same database can not be present as consumer in more than one share." + ) return shares diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index 2239338972d9be..2854a99198d62b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -250,9 +250,9 @@ def get_connect_args(self) -> dict: if self.private_key is not None: pkey_bytes = self.private_key.replace("\\n", "\n").encode() else: - assert ( - self.private_key_path - ), "missing required private key path to read key from" + assert self.private_key_path, ( + "missing required private key path to read key from" + ) with open(self.private_key_path, "rb") as key: pkey_bytes = key.read() @@ -284,9 +284,9 @@ def get_options(self) -> dict: return self.options def get_oauth_connection(self) -> NativeSnowflakeConnection: - assert ( - self.oauth_config - ), "oauth_config should be provided if using oauth based authentication" + assert self.oauth_config, ( + "oauth_config should be provided if using oauth based authentication" + ) generator = OAuthTokenGenerator( client_id=self.oauth_config.client_id, authority_url=self.oauth_config.authority_url, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index b82734cbbe84ea..69d0b62a8edfdf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -623,7 +623,7 @@ def 
_build_enriched_query_log_query( query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3) AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3) AND execution_status = 'SUCCESS' - AND {users_filter or 'TRUE'} + AND {users_filter or "TRUE"} ) , deduplicated_queries as ( SELECT @@ -651,7 +651,7 @@ def _build_enriched_query_log_query( WHERE query_start_time >= to_timestamp_ltz({start_time_millis}, 3) AND query_start_time < to_timestamp_ltz({end_time_millis}, 3) - AND {users_filter or 'TRUE'} + AND {users_filter or "TRUE"} AND query_id IN ( SELECT query_id FROM deduplicated_queries ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index d165be3f3cc656..173024aec0cf38 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -142,9 +142,9 @@ def __init__(self) -> None: ) # self._table_tags[][][] = list of tags applied to table - self._table_tags: Dict[ - str, Dict[str, Dict[str, List[SnowflakeTag]]] - ] = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + self._table_tags: Dict[str, Dict[str, Dict[str, List[SnowflakeTag]]]] = ( + defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + ) # self._column_tags[][][][] = list of tags applied to column self._column_tags: Dict[ diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 393e4d3c96d51f..a2d69d9e552916 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -194,9 +194,9 @@ def __init__( config, self.data_dictionary, self.report ) self.profiler: Optional[SnowflakeProfiler] = 
profiler - self.snowsight_url_builder: Optional[ - SnowsightUrlBuilder - ] = snowsight_url_builder + self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = ( + snowsight_url_builder + ) # These are populated as side-effects of get_workunits_internal. self.databases: List[SnowflakeDatabase] = [] @@ -267,9 +267,9 @@ def get_databases(self) -> Optional[List[SnowflakeDatabase]]: ) return None else: - ischema_databases: List[ - SnowflakeDatabase - ] = self.get_databases_from_ischema(databases) + ischema_databases: List[SnowflakeDatabase] = ( + self.get_databases_from_ischema(databases) + ) if len(ischema_databases) == 0: self.structured_reporter.failure( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_tag.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_tag.py index 75567cc3da8830..597e7bee4d4cc0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_tag.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_tag.py @@ -38,9 +38,9 @@ def _get_tags_on_object_without_propagation( table_name: Optional[str], ) -> List[SnowflakeTag]: if db_name not in self.tag_cache: - self.tag_cache[ - db_name - ] = self.data_dictionary.get_tags_for_database_without_propagation(db_name) + self.tag_cache[db_name] = ( + self.data_dictionary.get_tags_for_database_without_propagation(db_name) + ) if domain == SnowflakeObjectDomain.DATABASE: return self.tag_cache[db_name].get_database_tags(db_name) @@ -130,10 +130,10 @@ def get_column_tags_for_table( temp_column_tags: Dict[str, List[SnowflakeTag]] = {} if self.config.extract_tags == TagOption.without_lineage: if db_name not in self.tag_cache: - self.tag_cache[ - db_name - ] = self.data_dictionary.get_tags_for_database_without_propagation( - db_name + self.tag_cache[db_name] = ( + self.data_dictionary.get_tags_for_database_without_propagation( + db_name + ) ) temp_column_tags = self.tag_cache[db_name].get_column_tags_for_table( table_name, 
schema_name, db_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index 85e4071aec07df..edd13ee48326bb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -549,9 +549,9 @@ def parse_event_objects(self, event_dict: Dict) -> None: ): # NOTE: Generated emails may be incorrect, as email may be different than # username@email_domain - event_dict[ - "EMAIL" - ] = f'{event_dict["USER_NAME"]}@{self.config.email_domain}'.lower() + event_dict["EMAIL"] = ( + f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower() + ) if not event_dict["EMAIL"]: self.report.rows_missing_email += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 885bee1ccdb908..030edfde4ca1da 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -21,8 +21,7 @@ class SnowflakeStructuredReportMixin(abc.ABC): @property @abc.abstractmethod - def structured_reporter(self) -> SourceReport: - ... + def structured_reporter(self) -> SourceReport: ... 
class SnowsightUrlBuilder: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index b8afd145727400..b4ef2180d71d45 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -211,9 +211,9 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.usage_extractor: Optional[SnowflakeUsageExtractor] = None if self.config.include_usage_stats or self.config.include_operational_stats: - redundant_usage_run_skip_handler: Optional[ - RedundantUsageRunSkipHandler - ] = None + redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = ( + None + ) if self.config.enable_stateful_usage_ingestion: redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler( source=self, @@ -296,7 +296,16 @@ class SnowflakePrivilege: _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict() privileges: List[SnowflakePrivilege] = [] - capabilities: List[SourceCapability] = [c.capability for c in SnowflakeV2Source.get_capabilities() if c.capability not in (SourceCapability.PLATFORM_INSTANCE, SourceCapability.DOMAINS, SourceCapability.DELETION_DETECTION)] # type: ignore + capabilities: List[SourceCapability] = [ + c.capability + for c in SnowflakeV2Source.get_capabilities() # type: ignore + if c.capability + not in ( + SourceCapability.PLATFORM_INSTANCE, + SourceCapability.DOMAINS, + SourceCapability.DELETION_DETECTION, + ) + ] cur = conn.query("select current_role()") current_role = [row["CURRENT_ROLE()"] for row in cur][0] diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index 6f7decc79b1df2..cfc43454b51fad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -104,9 +104,7 @@ def get_view_definition(self, connection, view_name, schema=None, **kw): return "\n".join([r for r in res]) @typing.no_type_check - def _get_column_type( - self, type_: Union[str, Dict[str, Any]] - ) -> TypeEngine: # noqa: C901 + def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine: # noqa: C901 """Derives the data type of the Athena column. This method is overwritten to extend the behavior of PyAthena. diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index 2899bcc2de37b0..a8208ca807ed02 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -218,9 +218,7 @@ def _get_all_table_comments_and_properties(self, connection, **kw): , comment , {properties_clause} AS properties FROM system.tables - WHERE name NOT LIKE '.inner%'""".format( - properties_clause=properties_clause - ) + WHERE name NOT LIKE '.inner%'""".format(properties_clause=properties_clause) ) all_table_comments: Dict[Tuple[str, str], Dict[str, Any]] = {} @@ -301,9 +299,7 @@ def _get_schema_column_info(self, connection, schema=None, **kw): , comment FROM system.columns WHERE {schema_clause} - ORDER BY database, table, position""".format( - schema_clause=schema_clause - ) + ORDER BY database, table, position""".format(schema_clause=schema_clause) ) ) ) @@ -474,7 +470,7 @@ def _get_all_tables(self) -> Set[str]: logger.debug(f"sql_alchemy_url={url}") engine = create_engine(url, **self.config.options) for db_row in engine.execute(text(all_tables_query)): - all_tables_set.add(f'{db_row["database"]}.{db_row["table_name"]}') + all_tables_set.add(f"{db_row['database']}.{db_row['table_name']}") return all_tables_set @@ -503,7 +499,7 @@ def _populate_lineage_map( try: for db_row in engine.execute(text(query)): - 
dataset_name = f'{db_row["target_schema"]}.{db_row["target_table"]}' + dataset_name = f"{db_row['target_schema']}.{db_row['target_table']}" if not self.config.database_pattern.allowed( db_row["target_schema"] ) or not self.config.table_pattern.allowed(dataset_name): @@ -512,7 +508,7 @@ def _populate_lineage_map( # Target target_path = ( - f'{self.config.platform_instance+"." if self.config.platform_instance else ""}' + f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}" f"{dataset_name}" ) target = LineageItem( @@ -525,7 +521,7 @@ def _populate_lineage_map( # Source platform = LineageDatasetPlatform.CLICKHOUSE - path = f'{db_row["source_schema"]}.{db_row["source_table"]}' + path = f"{db_row['source_schema']}.{db_row['source_table']}" sources = [ LineageDataset( @@ -552,9 +548,7 @@ def _populate_lineage_map( target.dataset.path ].upstreams = self._lineage_map[ target.dataset.path - ].upstreams.union( - target.upstreams - ) + ].upstreams.union(target.upstreams) else: self._lineage_map[target.dataset.path] = target diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index 52db3cd11a759d..ac568c58af6c68 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -234,9 +234,7 @@ def get_columns( WHERE col.table_name = id.table_name AND col.column_name = id.column_name AND col.owner = id.owner - ) AS identity_options""".format( - dblink=dblink - ) + ) AS identity_options""".format(dblink=dblink) else: identity_cols = "NULL as default_on_null, NULL as identity_options" diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index c91be9b494c006..664735053f1852 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -278,8 +278,7 @@ def is_dataset_eligible_for_profiling( if self.config.profiling.profile_table_size_limit is not None and ( size_in_bytes is not None - and size_in_bytes / (2**30) - > self.config.profiling.profile_table_size_limit + and size_in_bytes / (2**30) > self.config.profiling.profile_table_size_limit ): self.report.profiling_skipped_size_limit[schema_name] += 1 logger.debug( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py index 5b76fe41d92e97..84b65d6635e9d4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -599,7 +599,12 @@ def __init__(self, config: TeradataConfig, ctx: PipelineContext): setattr( # noqa: B010 TeradataDialect, "get_columns", - lambda self, connection, table_name, schema=None, use_qvci=self.config.use_qvci, **kw: optimized_get_columns( + lambda self, + connection, + table_name, + schema=None, + use_qvci=self.config.use_qvci, + **kw: optimized_get_columns( self, connection, table_name, @@ -613,7 +618,11 @@ def __init__(self, config: TeradataConfig, ctx: PipelineContext): setattr( # noqa: B010 TeradataDialect, "get_pk_constraint", - lambda self, connection, table_name, schema=None, **kw: optimized_get_pk_constraint( + lambda self, + connection, + table_name, + schema=None, + **kw: optimized_get_pk_constraint( self, connection, table_name, schema, **kw ), ) @@ -621,7 +630,11 @@ def __init__(self, config: TeradataConfig, ctx: PipelineContext): setattr( # noqa: B010 TeradataDialect, "get_foreign_keys", - lambda self, connection, table_name, schema=None, **kw: optimized_get_foreign_keys( + lambda self, + connection, + table_name, + schema=None, + **kw: optimized_get_foreign_keys( self, connection, table_name, schema, **kw ), ) diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/state/profiling_state_handler.py b/metadata-ingestion/src/datahub/ingestion/source/state/profiling_state_handler.py index 9883bc2b8e9b0b..6080ddadb65e40 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/profiling_state_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/profiling_state_handler.py @@ -41,9 +41,9 @@ def __init__( run_id: str, ): self.state_provider = source.state_provider - self.stateful_ingestion_config: Optional[ - ProfilingStatefulIngestionConfig - ] = config.stateful_ingestion + self.stateful_ingestion_config: Optional[ProfilingStatefulIngestionConfig] = ( + config.stateful_ingestion + ) self.pipeline_name = pipeline_name self.run_id = run_id self.checkpointing_enabled: bool = ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/redundant_run_skip_handler.py b/metadata-ingestion/src/datahub/ingestion/source/state/redundant_run_skip_handler.py index 8630a959d3f6a3..e4a2646f6ccd3c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/redundant_run_skip_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/redundant_run_skip_handler.py @@ -48,9 +48,9 @@ def __init__( ): self.source = source self.state_provider = source.state_provider - self.stateful_ingestion_config: Optional[ - StatefulIngestionConfig - ] = config.stateful_ingestion + self.stateful_ingestion_config: Optional[StatefulIngestionConfig] = ( + config.stateful_ingestion + ) self.pipeline_name = pipeline_name self.run_id = run_id self._job_id = self._init_job_id() @@ -145,8 +145,7 @@ def should_skip_this_run( ) logger.debug( - f"{self.job_id} : Last run start, end times:" - f"({last_run_time_window})" + f"{self.job_id} : Last run start, end times:({last_run_time_window})" ) # If current run's time window is subset of last run's time window, then skip. 
@@ -212,8 +211,7 @@ def suggest_run_time_window( ) self.log( - "Adjusted start, end times: " - f"({suggested_start_time}, {suggested_end_time})" + f"Adjusted start, end times: ({suggested_start_time}, {suggested_end_time})" ) return (suggested_start_time, suggested_end_time) diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py b/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py index d4fcbf09924e9e..017d78bc1abf8d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py @@ -111,9 +111,9 @@ def __init__( self.state_type_class = state_type_class self.pipeline_name = pipeline_name self.run_id = run_id - self.stateful_ingestion_config: Optional[ - StatefulStaleMetadataRemovalConfig - ] = config.stateful_ingestion + self.stateful_ingestion_config: Optional[StatefulStaleMetadataRemovalConfig] = ( + config.stateful_ingestion + ) self.checkpointing_enabled: bool = ( True if ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py index 8f4a53ffc3ed58..1f5a651fc64a79 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py @@ -70,20 +70,20 @@ def get_latest_checkpoint( self.orchestrator_name, pipeline_name, job_name ) - latest_checkpoint: Optional[ - DatahubIngestionCheckpointClass - ] = self.graph.get_latest_timeseries_value( - entity_urn=data_job_urn, - aspect_type=DatahubIngestionCheckpointClass, - filter_criteria_map={ - "pipelineName": pipeline_name, - }, + latest_checkpoint: 
Optional[DatahubIngestionCheckpointClass] = ( + self.graph.get_latest_timeseries_value( + entity_urn=data_job_urn, + aspect_type=DatahubIngestionCheckpointClass, + filter_criteria_map={ + "pipelineName": pipeline_name, + }, + ) ) if latest_checkpoint: logger.debug( f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}'," f" job_name:'{job_name}' found with start_time:" - f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}" + f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}" ) return latest_checkpoint else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py b/metadata-ingestion/src/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py index a37774773b84d7..55f0903b9c91c7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py @@ -67,7 +67,7 @@ def get_latest_checkpoint( logger.debug( f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}'," f" job_name:'{job_name}' found with start_time:" - f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}" + f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}" ) return latest_checkpoint else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 8187fff559208e..f961bd8ecba604 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -281,9 +281,9 @@ def get_tableau_auth( return authentication def make_tableau_client(self, site: str) -> Server: - authentication: Union[ - TableauAuth, PersonalAccessTokenAuth - ] = self.get_tableau_auth(site) + 
authentication: Union[TableauAuth, PersonalAccessTokenAuth] = ( + self.get_tableau_auth(site) + ) try: server = Server( self.connect_uri, @@ -635,7 +635,7 @@ def projects_backward_compatibility(cls, values: Dict) -> Dict: project_path_pattern = values.get("project_path_pattern") if project_pattern is None and project_path_pattern is None and projects: logger.warning( - "projects is deprecated, please use " "project_path_pattern instead." + "projects is deprecated, please use project_path_pattern instead." ) logger.info("Initializing project_pattern from projects") values["project_pattern"] = AllowDenyPattern( @@ -708,18 +708,18 @@ class DatabaseTable: """ urn: str - id: Optional[ - str - ] = None # is not None only for tables that came from Tableau metadata + id: Optional[str] = ( + None # is not None only for tables that came from Tableau metadata + ) num_cols: Optional[int] = None - paths: Optional[ - Set[str] - ] = None # maintains all browse paths encountered for this table + paths: Optional[Set[str]] = ( + None # maintains all browse paths encountered for this table + ) - parsed_columns: Optional[ - Set[str] - ] = None # maintains all columns encountered for this table during parsing SQL queries + parsed_columns: Optional[Set[str]] = ( + None # maintains all columns encountered for this table during parsing SQL queries + ) def update_table( self, @@ -2310,8 +2310,7 @@ def _get_datasource_project_luid(self, ds: dict) -> Optional[str]: c.EMBEDDED_DATA_SOURCE, ): logger.debug( - f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is " - f"unsupported" + f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is unsupported" ) return None @@ -2493,9 +2492,9 @@ def parse_custom_sql( def _enrich_database_tables_with_parsed_schemas( self, parsing_result: SqlParsingResult ) -> None: - in_tables_schemas: Dict[ - str, Set[str] - ] = transform_parsing_result_to_in_tables_schemas(parsing_result) + in_tables_schemas: Dict[str, Set[str]] = ( + 
transform_parsing_result_to_in_tables_schemas(parsing_result) + ) if not in_tables_schemas: logger.info("Unable to extract table schema from parsing result") @@ -3559,25 +3558,25 @@ def emit_project_in_topological_order( generated_project_keys.add(project_key.guid()) - parent_project_key: Optional[ - Union[ProjectKey, SiteKey] - ] = None # It is going + parent_project_key: Optional[Union[ProjectKey, SiteKey]] = ( + None # It is going + ) # to be used as a parent container key for the current tableau project if project_.parent_id is not None: # Go to the parent project as we need to generate container first for parent parent_project_key = self.gen_project_key(project_.parent_id) - parent_tableau_project: Optional[ - TableauProject - ] = self.tableau_project_registry.get(project_.parent_id) + parent_tableau_project: Optional[TableauProject] = ( + self.tableau_project_registry.get(project_.parent_id) + ) if ( parent_tableau_project is None ): # It is not in project registry because of project_pattern - assert ( - project_.parent_name - ), f"project {project_.name} should not be null" + assert project_.parent_name, ( + f"project {project_.name} should not be null" + ) parent_tableau_project = TableauProject( id=project_.parent_id, name=project_.parent_name, @@ -3669,16 +3668,16 @@ def ingest_tableau_site(self): if self.config.extract_usage_stats: with PerfTimer() as timer: self._populate_usage_stat_registry() - self.report.extract_usage_stats_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + self.report.extract_usage_stats_timer[self.site_content_url] = ( + timer.elapsed_seconds(digits=2) + ) if self.config.permission_ingestion: with PerfTimer() as timer: self._fetch_groups() - self.report.fetch_groups_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + self.report.fetch_groups_timer[self.site_content_url] = ( + timer.elapsed_seconds(digits=2) + ) # Populate the map of database names and database hostnames to be used later to map 
# databases to platform instances. @@ -3691,9 +3690,9 @@ def ingest_tableau_site(self): with PerfTimer() as timer: self._populate_projects_registry() - self.report.populate_projects_registry_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + self.report.populate_projects_registry_timer[self.site_content_url] = ( + timer.elapsed_seconds(digits=2) + ) if self.config.add_site_container: yield from self.emit_site_container() @@ -3701,23 +3700,23 @@ def ingest_tableau_site(self): with PerfTimer() as timer: yield from self.emit_workbooks() - self.report.emit_workbooks_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + self.report.emit_workbooks_timer[self.site_content_url] = ( + timer.elapsed_seconds(digits=2) + ) if self.sheet_ids: with PerfTimer() as timer: yield from self.emit_sheets() - self.report.emit_sheets_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + self.report.emit_sheets_timer[self.site_content_url] = ( + timer.elapsed_seconds(digits=2) + ) if self.dashboard_ids: with PerfTimer() as timer: yield from self.emit_dashboards() - self.report.emit_dashboards_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + self.report.emit_dashboards_timer[self.site_content_url] = ( + timer.elapsed_seconds(digits=2) + ) if self.embedded_datasource_ids_being_used: with PerfTimer() as timer: @@ -3743,6 +3742,6 @@ def ingest_tableau_site(self): if self.database_tables: with PerfTimer() as timer: yield from self.emit_upstream_tables() - self.report.emit_upstream_tables_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + self.report.emit_upstream_tables_timer[self.site_content_url] = ( + timer.elapsed_seconds(digits=2) + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index 1fdce3aa1e2d34..6c3f7a51294797 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -254,7 +254,9 @@ class UnityCatalogSourceConfig( ) # TODO: Remove `type:ignore` by refactoring config - profiling: Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] = Field( # type: ignore + profiling: Union[ + UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig + ] = Field( # type: ignore default=UnityCatalogGEProfilerConfig(), description="Data profiling configuration", discriminator="method", diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index 9b96953794dcd5..fd6fa8a50f707b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -363,7 +363,7 @@ def _escape_sequence(value: str) -> str: @staticmethod def _create_metastore( - obj: Union[GetMetastoreSummaryResponse, MetastoreInfo] + obj: Union[GetMetastoreSummaryResponse, MetastoreInfo], ) -> Optional[Metastore]: if not obj.name: return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 43bd788f809c3e..29562eaf3ce5b1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -205,9 +205,9 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig): self.table_refs: Set[TableReference] = set() self.view_refs: Set[TableReference] = set() self.notebooks: FileBackedDict[Notebook] = FileBackedDict() - self.view_definitions: FileBackedDict[ - Tuple[TableReference, str] - ] = FileBackedDict() + self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = ( + FileBackedDict() + ) # Global map of tables, for profiling self.tables: FileBackedDict[Table] = FileBackedDict() diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index 718818d9b347bf..2e9f7fc00c8784 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -103,7 +103,9 @@ def _get_workunits_internal( query, table_info ) for source_table in table_info.source_tables: - with self.report.usage_perf_report.aggregator_add_event_timer: + with ( + self.report.usage_perf_report.aggregator_add_event_timer + ): self.usage_aggregator.aggregate_event( resource=source_table, start_time=query.start_time, diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py index 4c2e4d42c440e8..2e1e315c4df956 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py @@ -213,15 +213,15 @@ def _get_joined_access_event(self, events): def _aggregate_access_events( self, events: List[ClickHouseJoinedAccessEvent] ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]: - datasets: Dict[ - datetime, Dict[ClickHouseTableRef, AggregatedDataset] - ] = collections.defaultdict(dict) + datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = ( + collections.defaultdict(dict) + ) for event in events: floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration) resource = ( - f'{self.config.platform_instance+"." if self.config.platform_instance else ""}' + f"{self.config.platform_instance + '.' 
if self.config.platform_instance else ''}" f"{event.database}.{event.table}" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py index 6ded11027c83a8..e4138696186416 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py @@ -235,9 +235,9 @@ def _get_joined_access_event(self, events): def _aggregate_access_events( self, events: List[TrinoJoinedAccessEvent] ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]: - datasets: Dict[ - datetime, Dict[TrinoTableRef, AggregatedDataset] - ] = collections.defaultdict(dict) + datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = ( + collections.defaultdict(dict) + ) for event in events: floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py index bb1c297513de10..b4dc8835f9fba9 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py @@ -80,10 +80,10 @@ def handle_end_of_stream( ).add_asset(container_urn) data_products_container[data_product_urn] = container_product else: - data_products_container[ - data_product_urn - ] = data_products_container[data_product_urn].add_asset( - container_urn + data_products_container[data_product_urn] = ( + data_products_container[data_product_urn].add_asset( + container_urn + ) ) mcps: List[ diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_properties.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_properties.py index 668f6ed7abe074..4b9b4c9e6f5da6 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_properties.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_properties.py @@ -61,9 +61,9 @@ def _merge_with_server_properties( ) -> Optional[DatasetPropertiesClass]: assert dataset_properties_aspect - server_dataset_properties_aspect: Optional[ - DatasetPropertiesClass - ] = graph.get_dataset_properties(entity_urn) + server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = ( + graph.get_dataset_properties(entity_urn) + ) # No need to take any action if server properties is None or there is not customProperties in server properties if ( server_dataset_properties_aspect is None diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_tags.py index ba3b6508daaecd..d2687ebc5e76f6 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_tags.py @@ -89,9 +89,9 @@ def transform_aspect( server_field_map: dict = {} if self.config.semantics == TransformerSemantics.PATCH: assert self.ctx.graph - server_schema_metadata_aspect: Optional[ - SchemaMetadataClass - ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn) + server_schema_metadata_aspect: Optional[SchemaMetadataClass] = ( + self.ctx.graph.get_schema_metadata(entity_urn=entity_urn) + ) if server_schema_metadata_aspect is not None: if not schema_metadata_aspect: schema_metadata_aspect = server_schema_metadata_aspect diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_terms.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_terms.py index a7e92d4bd7edbd..d17a39bee6cfbf 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_terms.py +++ 
b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_terms.py @@ -108,9 +108,9 @@ def transform_aspect( ] = {} # Map to cache server field objects, where fieldPath is key if self.config.semantics == TransformerSemantics.PATCH: assert self.ctx.graph - server_schema_metadata_aspect: Optional[ - SchemaMetadataClass - ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn) + server_schema_metadata_aspect: Optional[SchemaMetadataClass] = ( + self.ctx.graph.get_schema_metadata(entity_urn=entity_urn) + ) if server_schema_metadata_aspect is not None: if not schema_metadata_aspect: schema_metadata_aspect = server_schema_metadata_aspect diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain_based_on_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain_based_on_tags.py index 7be8069e1b0852..bb2f318dcac8b8 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain_based_on_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain_based_on_tags.py @@ -60,10 +60,10 @@ def transform_aspect( domain_aspect.domains.extend(mapped_domains.domains) if self.config.semantics == TransformerSemantics.PATCH: # Try merging with server-side domains - patch_domain_aspect: Optional[ - DomainsClass - ] = AddDatasetDomain._merge_with_server_domains( - self.ctx.graph, entity_urn, domain_aspect + patch_domain_aspect: Optional[DomainsClass] = ( + AddDatasetDomain._merge_with_server_domains( + self.ctx.graph, entity_urn, domain_aspect + ) ) return cast(Optional[Aspect], patch_domain_aspect) return cast(Optional[Aspect], domain_aspect) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py index 212e018dd64fb7..32707dcd3a372f 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py +++ 
b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py @@ -141,9 +141,9 @@ def transform_aspect( else: owner_type = get_owner_type(self.config.owner_type) if owner_type == OwnershipTypeClass.CUSTOM: - assert ( - self.config.owner_type_urn is not None - ), "owner_type_urn must be set if owner_type is CUSTOM" + assert self.config.owner_type_urn is not None, ( + "owner_type_urn must be set if owner_type is CUSTOM" + ) owners.append( OwnerClass( diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py index 7e6125079f16e3..65cf2ac3614ae0 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py @@ -92,9 +92,9 @@ def transform_aspect( in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags( entity_urn ) - in_schema_metadata_aspect: Optional[ - SchemaMetadataClass - ] = self.ctx.graph.get_schema_metadata(entity_urn) + in_schema_metadata_aspect: Optional[SchemaMetadataClass] = ( + self.ctx.graph.get_schema_metadata(entity_urn) + ) if in_global_tags_aspect is None and in_schema_metadata_aspect is None: return cast(Aspect, in_glossary_terms) @@ -134,10 +134,10 @@ def transform_aspect( ) if self.config.semantics == TransformerSemantics.PATCH: - patch_glossary_terms: Optional[ - GlossaryTermsClass - ] = TagsToTermMapper._merge_with_server_glossary_terms( - self.ctx.graph, entity_urn, out_glossary_terms + patch_glossary_terms: Optional[GlossaryTermsClass] = ( + TagsToTermMapper._merge_with_server_glossary_terms( + self.ctx.graph, entity_urn, out_glossary_terms + ) ) return cast(Optional[Aspect], patch_glossary_terms) else: diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py index 8d2ae2960ebd05..e32f1ddc3943ae 100644 
--- a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py @@ -61,17 +61,17 @@ def __init__(self, output_dir: str, extras: Dict[str, str]) -> None: def create( cls, output_dir: str, extras: Dict[str, str] ) -> "SnowflakeAssertionCompiler": - assert os.path.exists( - output_dir - ), f"Specified location {output_dir} does not exist." + assert os.path.exists(output_dir), ( + f"Specified location {output_dir} does not exist." + ) - assert os.path.isdir( - output_dir - ), f"Specified location {output_dir} is not a folder." + assert os.path.isdir(output_dir), ( + f"Specified location {output_dir} is not a folder." + ) - assert any( - x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras - ), "Must specify value for DMF schema using -x DMF_SCHEMA=" + assert any(x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras), ( + "Must specify value for DMF schema using -x DMF_SCHEMA=" + ) return SnowflakeAssertionCompiler(output_dir, extras) @@ -232,6 +232,6 @@ def get_dmf_schedule(trigger: AssertionTrigger) -> str: elif isinstance(trigger.trigger, CronTrigger): return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}" elif isinstance(trigger.trigger, IntervalTrigger): - return f"{trigger.trigger.interval.seconds/60} MIN" + return f"{trigger.trigger.interval.seconds / 60} MIN" else: raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}") diff --git a/metadata-ingestion/src/datahub/lite/duckdb_lite.py b/metadata-ingestion/src/datahub/lite/duckdb_lite.py index 89317383520923..fe025842822b13 100644 --- a/metadata-ingestion/src/datahub/lite/duckdb_lite.py +++ b/metadata-ingestion/src/datahub/lite/duckdb_lite.py @@ -163,9 +163,9 @@ def write( if "properties" not in writeable_dict["systemMetadata"]: writeable_dict["systemMetadata"]["properties"] = {} - writeable_dict["systemMetadata"]["properties"][ - "sysVersion" - ] = new_version + 
writeable_dict["systemMetadata"]["properties"]["sysVersion"] = ( + new_version + ) if needs_write: self.duckdb_client.execute( query="INSERT INTO metadata_aspect_v2 VALUES (?, ?, ?, ?, ?, ?)", @@ -208,9 +208,9 @@ def write( "lastObserved": writeable.systemMetadata.lastObserved } else: - system_metadata[ - "lastObserved" - ] = writeable.systemMetadata.lastObserved + system_metadata["lastObserved"] = ( + writeable.systemMetadata.lastObserved + ) self.duckdb_client.execute( query="UPDATE metadata_aspect_v2 SET system_metadata = ? WHERE urn = ? AND aspect_name = ? AND version = 0", parameters=[ @@ -497,9 +497,9 @@ def get_all_entities( aspect_name = r[1] aspect_payload = json.loads(r[2]) if typed: - assert ( - aspect_name in ASPECT_MAP - ), f"Missing aspect name {aspect_name} in the registry" + assert aspect_name in ASPECT_MAP, ( + f"Missing aspect name {aspect_name} in the registry" + ) try: aspect_payload = ASPECT_MAP[aspect_name].from_obj( post_json_transform(aspect_payload) @@ -531,7 +531,9 @@ def get_all_aspects(self) -> Iterable[MetadataChangeProposalWrapper]: for r in results.fetchall(): urn = r[0] aspect_name = r[1] - aspect_metadata = ASPECT_MAP[aspect_name].from_obj(post_json_transform(json.loads(r[2]))) # type: ignore + aspect_metadata = ASPECT_MAP[aspect_name].from_obj( + post_json_transform(json.loads(r[2])) + ) # type: ignore system_metadata = SystemMetadataClass.from_obj(json.loads(r[3])) mcp = MetadataChangeProposalWrapper( entityUrn=urn, diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py index 1fd1585a913581..4b8b4d0bc99bc0 100644 --- a/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py +++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py @@ -9,8 +9,7 @@ class HasCustomPropertiesPatch(MetadataPatchProposal): @classmethod @abstractmethod - def _custom_properties_location(self) -> 
Tuple[str, PatchPath]: - ... + def _custom_properties_location(self) -> Tuple[str, PatchPath]: ... def add_custom_property(self, key: str, value: str) -> Self: """Add a custom property to the entity. diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index 6aa10381a883ef..55b026a144c6d5 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -33,14 +33,11 @@ class GraphQLSchemaMetadata(TypedDict): class SchemaResolverInterface(Protocol): @property - def platform(self) -> str: - ... + def platform(self) -> str: ... - def includes_temp_tables(self) -> bool: - ... + def includes_temp_tables(self) -> bool: ... - def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]: - ... + def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]: ... def __hash__(self) -> int: # Mainly to make lru_cache happy in methods that accept a schema resolver. @@ -232,8 +229,7 @@ def convert_graphql_schema_metadata_to_info( return { get_simple_field_path_from_v2_field_path(field["fieldPath"]): ( # The actual types are more of a "nice to have". - field["nativeDataType"] - or "str" + field["nativeDataType"] or "str" ) for field in schema["fields"] # TODO: We can't generate lineage to columns nested within structs yet. @@ -289,8 +285,7 @@ def _convert_schema_field_list_to_info( return { get_simple_field_path_from_v2_field_path(col.fieldPath): ( # The actual types are more of a "nice to have". - col.nativeDataType - or "str" + col.nativeDataType or "str" ) for col in schema_fields # TODO: We can't generate lineage to columns nested within structs yet. 
diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index e1deeaec5ba826..8637802f6b9fee 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -682,10 +682,10 @@ def add_known_lineage_mapping( query_id = self._known_lineage_query_id() # Generate CLL if schema of downstream is known - column_lineage: List[ - ColumnLineageInfo - ] = self._generate_identity_column_lineage( - upstream_urn=upstream_urn, downstream_urn=downstream_urn + column_lineage: List[ColumnLineageInfo] = ( + self._generate_identity_column_lineage( + upstream_urn=upstream_urn, downstream_urn=downstream_urn + ) ) # Register the query. @@ -1044,9 +1044,9 @@ def _make_schema_resolver_for_session( temp_table_schemas: Dict[str, Optional[List[models.SchemaFieldClass]]] = {} for temp_table_urn, query_ids in self._temp_lineage_map[session_id].items(): for query_id in query_ids: - temp_table_schemas[ - temp_table_urn - ] = self._inferred_temp_schemas.get(query_id) + temp_table_schemas[temp_table_urn] = ( + self._inferred_temp_schemas.get(query_id) + ) if temp_table_schemas: break @@ -1073,9 +1073,9 @@ def _process_view_definition( schema_resolver=self._schema_resolver, ) if parsed.debug_info.error: - self.report.views_parse_failures[ - view_urn - ] = f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}" + self.report.views_parse_failures[view_urn] = ( + f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}" + ) if parsed.debug_info.table_error: self.report.num_views_failed += 1 return # we can't do anything with this query @@ -1583,9 +1583,9 @@ def _recurse_into_query( temp_query_lineage_info ) else: - temp_upstream_queries[ - upstream - ] = temp_query_lineage_info + temp_upstream_queries[upstream] = ( + temp_query_lineage_info + ) # Compute merged 
upstreams. new_upstreams = OrderedSet[UrnStr]() @@ -1665,9 +1665,9 @@ def _recurse_into_query( composed_of_queries_truncated: LossyList[str] = LossyList() for query_id in composed_of_queries: composed_of_queries_truncated.append(query_id) - self.report.queries_with_temp_upstreams[ - composite_query_id - ] = composed_of_queries_truncated + self.report.queries_with_temp_upstreams[composite_query_id] = ( + composed_of_queries_truncated + ) merged_query_text = ";\n\n".join( [q.formatted_query_string for q in ordered_queries] diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index bf28ab0e7b229b..c825deeccd9592 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -442,9 +442,9 @@ def _create_table_ddl_cll( ) -> List[_ColumnLineageInfo]: column_lineage: List[_ColumnLineageInfo] = [] - assert ( - output_table is not None - ), "output_table must be set for create DDL statements" + assert output_table is not None, ( + "output_table must be set for create DDL statements" + ) create_schema: sqlglot.exp.Schema = statement.this sqlglot_columns = create_schema.expressions diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py index 57a5cc3c9a6574..5b12c64a831666 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py @@ -404,7 +404,7 @@ def replace_cte_refs(node: sqlglot.exp.Expression) -> sqlglot.exp.Expression: if new_statement == statement: if iteration > 1: logger.debug( - f"Required {iteration+1} iterations to detach and eliminate all CTEs" + f"Required {iteration + 1} iterations to detach and eliminate all CTEs" ) break statement = new_statement diff --git a/metadata-ingestion/src/datahub/telemetry/stats.py 
b/metadata-ingestion/src/datahub/telemetry/stats.py index bf98bd72b574ce..d6835e49de56aa 100644 --- a/metadata-ingestion/src/datahub/telemetry/stats.py +++ b/metadata-ingestion/src/datahub/telemetry/stats.py @@ -5,8 +5,7 @@ class SupportsLT(Protocol): - def __lt__(self, __other: Any) -> Any: - ... + def __lt__(self, __other: Any) -> Any: ... _SupportsComparisonT = TypeVar("_SupportsComparisonT", bound=SupportsLT) diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index fb028605c35b77..79da90ba20ea9f 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -224,9 +224,9 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]): _use_sqlite_on_conflict: bool = field(repr=False, default=True) def __post_init__(self) -> None: - assert ( - self.cache_eviction_batch_size > 0 - ), "cache_eviction_batch_size must be positive" + assert self.cache_eviction_batch_size > 0, ( + "cache_eviction_batch_size must be positive" + ) for reserved_column in ("key", "value", "rowid"): if reserved_column in self.extra_columns: @@ -261,7 +261,7 @@ def __post_init__(self) -> None: rowid INTEGER PRIMARY KEY AUTOINCREMENT, key TEXT UNIQUE, value BLOB - {''.join(f', {column_name} BLOB' for column_name in self.extra_columns.keys())} + {"".join(f", {column_name} BLOB" for column_name in self.extra_columns.keys())} )""" ) @@ -316,12 +316,12 @@ def _prune_cache(self, num_items_to_prune: int) -> None: f"""INSERT INTO {self.tablename} ( key, value - {''.join(f', {column_name}' for column_name in self.extra_columns.keys())} + {"".join(f", {column_name}" for column_name in self.extra_columns.keys())} ) - VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))}) + VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))}) ON CONFLICT (key) DO UPDATE SET value = excluded.value - 
{''.join(f', {column_name} = excluded.{column_name}' for column_name in self.extra_columns.keys())} + {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns.keys())} """, items_to_write, ) @@ -332,16 +332,16 @@ def _prune_cache(self, num_items_to_prune: int) -> None: f"""INSERT INTO {self.tablename} ( key, value - {''.join(f', {column_name}' for column_name in self.extra_columns.keys())} + {"".join(f", {column_name}" for column_name in self.extra_columns.keys())} ) - VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))})""", + VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""", item, ) except sqlite3.IntegrityError: self._conn.execute( f"""UPDATE {self.tablename} SET value = ? - {''.join(f', {column_name} = ?' for column_name in self.extra_columns.keys())} + {"".join(f", {column_name} = ?" for column_name in self.extra_columns.keys())} WHERE key = ?""", (*item[1:], item[0]), ) diff --git a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py index e98fe42c1d56ce..fccd8dd8a60c35 100644 --- a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py +++ b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py @@ -142,10 +142,10 @@ def _parse_struct_fields_string(s: str, **kwargs: Any) -> Dict[str, object]: fields.append({"name": field_name, "type": field_type}) if kwargs.get("ustruct_seqn") is not None: - struct_name = f'__structn_{kwargs["ustruct_seqn"]}_{str(uuid.uuid4()).replace("-", "")}' + struct_name = f"__structn_{kwargs['ustruct_seqn']}_{str(uuid.uuid4()).replace('-', '')}" else: - struct_name = f'__struct_{str(uuid.uuid4()).replace("-", "")}' + struct_name = f"__struct_{str(uuid.uuid4()).replace('-', '')}" return { "type": "record", "name": struct_name, diff --git a/metadata-ingestion/src/datahub/utilities/logging_manager.py b/metadata-ingestion/src/datahub/utilities/logging_manager.py index 
926b8782fbf119..a5fd20fef307d0 100644 --- a/metadata-ingestion/src/datahub/utilities/logging_manager.py +++ b/metadata-ingestion/src/datahub/utilities/logging_manager.py @@ -130,9 +130,9 @@ def _formatMessageColor(self, record: logging.LogRecord) -> str: # Mimic our default format, but with color. message_fg = self.MESSAGE_COLORS.get(record.levelname) return ( - f'{click.style(f"[{self.formatTime(record, self.datefmt)}]", fg="green", dim=True)} ' + f"{click.style(f'[{self.formatTime(record, self.datefmt)}]', fg='green', dim=True)} " f"{click.style(f'{record.levelname:8}', fg=message_fg)} " - f'{click.style(f"{{{record.name}:{record.lineno}}}", fg="blue", dim=True)} - ' + f"{click.style(f'{{{record.name}:{record.lineno}}}', fg='blue', dim=True)} - " f"{click.style(record.getMessage(), fg=message_fg)}" ) diff --git a/metadata-ingestion/src/datahub/utilities/lossy_collections.py b/metadata-ingestion/src/datahub/utilities/lossy_collections.py index f71aad51ab0b6b..31d6d0eb842d04 100644 --- a/metadata-ingestion/src/datahub/utilities/lossy_collections.py +++ b/metadata-ingestion/src/datahub/utilities/lossy_collections.py @@ -151,9 +151,9 @@ def __str__(self) -> str: def as_obj(self) -> Dict[Union[_KT, str], Union[_VT, str]]: base_dict: Dict[Union[_KT, str], Union[_VT, str]] = super().copy() # type: ignore if self.sampled: - base_dict[ - "sampled" - ] = f"{len(self.keys())} sampled of at most {self.total_key_count()} entries." + base_dict["sampled"] = ( + f"{len(self.keys())} sampled of at most {self.total_key_count()} entries." 
+ ) return base_dict def total_key_count(self) -> int: diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 17023c7b388e76..96870fc6fcd378 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -349,9 +349,9 @@ def convert_to_aspects(self, operation_map: Dict[str, list]) -> Dict[str, Any]: elements=[institutional_memory_element] ) - aspect_map[ - Constants.ADD_DOC_LINK_OPERATION - ] = institutional_memory_aspect + aspect_map[Constants.ADD_DOC_LINK_OPERATION] = ( + institutional_memory_aspect + ) else: raise Exception( f"Expected 1 item of type list for the documentation_link meta_mapping config," diff --git a/metadata-ingestion/src/datahub/utilities/serialized_lru_cache.py b/metadata-ingestion/src/datahub/utilities/serialized_lru_cache.py index b5f490720340ce..bdfe4285065522 100644 --- a/metadata-ingestion/src/datahub/utilities/serialized_lru_cache.py +++ b/metadata-ingestion/src/datahub/utilities/serialized_lru_cache.py @@ -41,7 +41,9 @@ def decorator(func: Callable[_F, _T]) -> Callable[_F, _T]: def wrapper(*args: _F.args, **kwargs: _F.kwargs) -> _T: # We need a type ignore here because there's no way for us to require that # the args and kwargs are hashable while using ParamSpec. 
- key: _Key = cachetools.keys.hashkey(*args, **{k: v for k, v in kwargs.items() if "cache_exclude" not in k}) # type: ignore + key: _Key = cachetools.keys.hashkey( + *args, **{k: v for k, v in kwargs.items() if "cache_exclude" not in k} + ) # type: ignore with cache_lock: if key in cache: diff --git a/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py b/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py index 11c04082ee7ad5..cf92336c68cdf6 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py +++ b/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py @@ -160,12 +160,12 @@ class SQLAlchemyQueryCombiner: _greenlets_by_thread_lock: threading.Lock = dataclasses.field( default_factory=lambda: threading.Lock() ) - _queries_by_thread: Dict[ - greenlet.greenlet, Dict[str, _QueryFuture] - ] = dataclasses.field(default_factory=lambda: collections.defaultdict(dict)) - _greenlets_by_thread: Dict[ - greenlet.greenlet, Set[greenlet.greenlet] - ] = dataclasses.field(default_factory=lambda: collections.defaultdict(set)) + _queries_by_thread: Dict[greenlet.greenlet, Dict[str, _QueryFuture]] = ( + dataclasses.field(default_factory=lambda: collections.defaultdict(dict)) + ) + _greenlets_by_thread: Dict[greenlet.greenlet, Set[greenlet.greenlet]] = ( + dataclasses.field(default_factory=lambda: collections.defaultdict(set)) + ) @staticmethod def _generate_sql_safe_identifier() -> str: diff --git a/metadata-ingestion/src/datahub/utilities/stats_collections.py b/metadata-ingestion/src/datahub/utilities/stats_collections.py index 09a9490abc0fbe..c0bd9d058e5d37 100644 --- a/metadata-ingestion/src/datahub/utilities/stats_collections.py +++ b/metadata-ingestion/src/datahub/utilities/stats_collections.py @@ -48,7 +48,9 @@ def as_obj(self) -> Dict[_KT, _VT]: total_value: Union[_VT, str] = sum(trimmed_dict.values()) # type: ignore except Exception: total_value = "" - trimmed_dict[f"... 
top {self.top_k} of total {len(self)} entries"] = total_value # type: ignore + trimmed_dict[f"... top {self.top_k} of total {len(self)} entries"] = ( # type: ignore + total_value # type: ignore + ) return trimmed_dict diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py index f0e4c6f5ee14a1..d792e0bba649dd 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py @@ -21,7 +21,7 @@ def _add_prefix_to_paths( def list_urns_with_path( - model: Union[DictWrapper, MetadataChangeProposalWrapper] + model: Union[DictWrapper, MetadataChangeProposalWrapper], ) -> List[Tuple[str, _Path]]: """List urns in the given model with their paths. @@ -145,7 +145,7 @@ def lowercase_dataset_urns( MetadataChangeEventClass, MetadataChangeProposalClass, MetadataChangeProposalWrapper, - ] + ], ) -> None: def modify_urn(urn: str) -> str: if guess_entity_type(urn) == "dataset": diff --git a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py index 024bb62bbe9ce9..5bb078a368dd50 100644 --- a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py +++ b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py @@ -98,7 +98,9 @@ def load_test_resources(test_resources_dir): with azure_ad_nested_group_json_file.open() as azure_ad_nested_group_json: reference_nested_group = json.loads(azure_ad_nested_group_json.read()) - with azure_ad_nested_groups_members_json_file.open() as azure_ad_nested_groups_users_json: + with ( + azure_ad_nested_groups_members_json_file.open() + ) as azure_ad_nested_groups_users_json: reference_nested_groups_users = json.loads( azure_ad_nested_groups_users_json.read() ) diff --git a/metadata-ingestion/tests/integration/dremio/test_dremio.py b/metadata-ingestion/tests/integration/dremio/test_dremio.py index 401f487d8a14b8..c286746c68b79d 
100644 --- a/metadata-ingestion/tests/integration/dremio/test_dremio.py +++ b/metadata-ingestion/tests/integration/dremio/test_dremio.py @@ -190,9 +190,9 @@ def create_mysql_source(headers): "type": "MYSQL", } response = requests.post(url, headers=headers, data=json.dumps(payload)) - assert ( - response.status_code == 200 - ), f"Failed to add mysql datasource: {response.text}" + assert response.status_code == 200, ( + f"Failed to add mysql datasource: {response.text}" + ) def upload_dataset(headers): @@ -537,9 +537,9 @@ def test_dremio_platform_instance_urns( # Check dataset URN structure if mce["entityType"] == "dataset" and "entityUrn" in mce: - assert ( - "test-platform.dremio" in mce["entityUrn"] - ), f"Platform instance missing in dataset URN: {mce['entityUrn']}" + assert "test-platform.dremio" in mce["entityUrn"], ( + f"Platform instance missing in dataset URN: {mce['entityUrn']}" + ) # Check aspects for both datasets and containers if "aspectName" in mce: @@ -558,9 +558,9 @@ def test_dremio_platform_instance_urns( instance = aspect_json["instance"] expected_instance = "urn:li:dataPlatformInstance:(urn:li:dataPlatform:dremio,test-platform)" - assert ( - instance == expected_instance - ), f"Invalid platform instance format: {instance}" + assert instance == expected_instance, ( + f"Invalid platform instance format: {instance}" + ) # Verify against golden file mce_helpers.check_golden_file( diff --git a/metadata-ingestion/tests/integration/grafana/test_grafana.py b/metadata-ingestion/tests/integration/grafana/test_grafana.py index 6eb6b0b8509263..cbac965884365d 100644 --- a/metadata-ingestion/tests/integration/grafana/test_grafana.py +++ b/metadata-ingestion/tests/integration/grafana/test_grafana.py @@ -120,7 +120,7 @@ def test_grafana_dashboard(loaded_grafana, pytestconfig, tmp_path, test_resource time.sleep(5) resp = requests.get(url) if resp.status_code == 200: - logging.info(f"Grafana started after waiting {i*5} seconds") + logging.info(f"Grafana started 
after waiting {i * 5} seconds") break else: pytest.fail("Grafana did not start in time") @@ -131,12 +131,12 @@ def test_grafana_dashboard(loaded_grafana, pytestconfig, tmp_path, test_resource assert resp.status_code == 200, "Failed to load default dashboard" dashboard = resp.json() - assert ( - dashboard["dashboard"]["title"] == "Default Dashboard" - ), "Default dashboard title mismatch" - assert any( - panel["type"] == "text" for panel in dashboard["dashboard"]["panels"] - ), "Default dashboard missing text panel" + assert dashboard["dashboard"]["title"] == "Default Dashboard", ( + "Default dashboard title mismatch" + ) + assert any(panel["type"] == "text" for panel in dashboard["dashboard"]["panels"]), ( + "Default dashboard missing text panel" + ) # Verify the output. (You can add further checks here if needed) logging.info("Default dashboard verified successfully") @@ -153,7 +153,7 @@ def test_grafana_ingest( time.sleep(5) resp = requests.get(url) if resp.status_code == 200: - logging.info(f"Grafana started after waiting {i*5} seconds") + logging.info(f"Grafana started after waiting {i * 5} seconds") break else: pytest.fail("Grafana did not start in time") diff --git a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py index d0f4fc35fc03eb..d8c98b12951f5d 100644 --- a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py +++ b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py @@ -482,9 +482,9 @@ def test_kafka_connect_ingest_stateful( "mysql_source1", "mysql_source2", ] - pipeline_run1_config["sink"]["config"][ - "filename" - ] = f"{tmp_path}/{output_file_name}" + pipeline_run1_config["sink"]["config"]["filename"] = ( + f"{tmp_path}/{output_file_name}" + ) pipeline_run1 = Pipeline.create(pipeline_run1_config) pipeline_run1.run() pipeline_run1.raise_from_status() @@ -506,14 +506,16 @@ def test_kafka_connect_ingest_stateful( 
mock_datahub_graph, ) as mock_checkpoint: mock_checkpoint.return_value = mock_datahub_graph - pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(base_pipeline_config) # type: ignore + pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict( + base_pipeline_config # type: ignore + ) # Set the special properties for this run pipeline_run1_config["source"]["config"]["connector_patterns"]["allow"] = [ "mysql_source1", ] - pipeline_run2_config["sink"]["config"][ - "filename" - ] = f"{tmp_path}/{output_file_deleted_name}" + pipeline_run2_config["sink"]["config"]["filename"] = ( + f"{tmp_path}/{output_file_deleted_name}" + ) pipeline_run2 = Pipeline.create(pipeline_run2_config) pipeline_run2.run() pipeline_run2.raise_from_status() diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index c96bcc729a95da..bbcc6332539c02 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -1096,9 +1096,9 @@ def test_file_path_in_view_naming_pattern( ): mocked_client = mock.MagicMock() new_recipe = get_default_recipe(output_file_path=f"{tmp_path}/looker_mces.json") - new_recipe["source"]["config"][ - "view_naming_pattern" - ] = "{project}.{file_path}.view.{name}" + new_recipe["source"]["config"]["view_naming_pattern"] = ( + "{project}.{file_path}.view.{name}" + ) with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 940e7f36675f79..d803b8498104fd 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -101,13 +101,13 @@ def test_lookml_refinement_ingest(pytestconfig, tmp_path, mock_time): ) 
new_recipe["source"]["config"]["process_refinements"] = True - new_recipe["source"]["config"][ - "view_naming_pattern" - ] = "{project}.{file_path}.view.{name}" + new_recipe["source"]["config"]["view_naming_pattern"] = ( + "{project}.{file_path}.view.{name}" + ) - new_recipe["source"]["config"][ - "view_browse_pattern" - ] = "/{env}/{platform}/{project}/{file_path}/views" + new_recipe["source"]["config"]["view_browse_pattern"] = ( + "/{env}/{platform}/{project}/{file_path}/views" + ) pipeline = Pipeline.create(new_recipe) pipeline.run() diff --git a/metadata-ingestion/tests/integration/nifi/test_nifi.py b/metadata-ingestion/tests/integration/nifi/test_nifi.py index b992de058879ef..924e854a47e4eb 100644 --- a/metadata-ingestion/tests/integration/nifi/test_nifi.py +++ b/metadata-ingestion/tests/integration/nifi/test_nifi.py @@ -72,7 +72,7 @@ def test_nifi_ingest_standalone( status = next(s for s in statuses if s["name"] == "FetchS3Object") if status["aggregateSnapshot"]["flowFilesOut"] >= 1: - logging.info(f"Waited for time {i*5} seconds") + logging.info(f"Waited for time {i * 5} seconds") break # Run the metadata ingestion pipeline. @@ -124,7 +124,7 @@ def test_nifi_ingest_cluster(loaded_nifi, pytestconfig, tmp_path, test_resources statuses = [pg["status"] for pg in pgs] status = next(s for s in statuses if s["name"] == "Cluster_Site_S3_to_S3") if status["aggregateSnapshot"]["flowFilesSent"] >= 1: - logging.info(f"Waited for time {i*5} seconds") + logging.info(f"Waited for time {i * 5} seconds") break test_resources_dir = pytestconfig.rootpath / "tests/integration/nifi" # Run the metadata ingestion pipeline. 
diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 6f7a9c7833ba1a..0d85d370265cae 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -765,14 +765,14 @@ def test_sqlglot_parser(): } ) - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = parser.get_upstream_tables( - table, - reporter, - ctx=ctx, - config=config, - platform_instance_resolver=platform_instance_resolver, + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) data_platform_tables: List[DataPlatformTable] = lineage[0].upstreams @@ -814,9 +814,9 @@ def test_sqlglot_parser(): def test_databricks_multi_cloud(): q = M_QUERIES[25] - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + get_data_platform_tables_with_dummy_table(q=q) + ) assert len(lineage) == 1 @@ -833,9 +833,9 @@ def test_databricks_multi_cloud(): def test_databricks_catalog_pattern_1(): q = M_QUERIES[26] - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + get_data_platform_tables_with_dummy_table(q=q) + ) assert len(lineage) == 1 @@ -904,14 +904,14 @@ def test_sqlglot_parser_2(): } ) - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = parser.get_upstream_tables( - table, - reporter, - ctx=ctx, - config=config, - platform_instance_resolver=platform_instance_resolver, + lineage: 
List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) data_platform_tables: List[DataPlatformTable] = lineage[0].upstreams @@ -965,9 +965,9 @@ def test_databricks_regular_case_with_view(): def test_snowflake_double_double_quotes(): q = M_QUERIES[30] - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + get_data_platform_tables_with_dummy_table(q=q) + ) assert len(lineage) == 1 @@ -984,9 +984,9 @@ def test_snowflake_double_double_quotes(): def test_databricks_multicloud(): q = M_QUERIES[31] - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + get_data_platform_tables_with_dummy_table(q=q) + ) assert len(lineage) == 1 @@ -1003,9 +1003,9 @@ def test_databricks_multicloud(): def test_snowflake_multi_function_call(): q = M_QUERIES[32] - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + get_data_platform_tables_with_dummy_table(q=q) + ) assert len(lineage) == 1 @@ -1022,9 +1022,9 @@ def test_snowflake_multi_function_call(): def test_mssql_drop_with_select(): q = M_QUERIES[33] - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + get_data_platform_tables_with_dummy_table(q=q) + ) assert len(lineage) == 1 @@ -1075,18 +1075,18 @@ def test_unsupported_data_platform(): 
is_entry_present = True break - assert ( - is_entry_present - ), 'Info message "Non-Data Platform Expression" should be present in reporter' + assert is_entry_present, ( + 'Info message "Non-Data Platform Expression" should be present in reporter' + ) def test_empty_string_in_m_query(): # TRIM(TRIM(TRIM(AGENT_NAME, '\"\"'), '+'), '\\'') is in Query q = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TRIM(TRIM(TRIM(AGENT_NAME, '\"\"'), '+'), '\\'') AS TRIM_AGENT_NAME,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + get_data_platform_tables_with_dummy_table(q=q) + ) assert len(lineage) == 1 @@ -1108,9 +1108,9 @@ def test_double_quotes_in_alias(): # SELECT CAST(sales_date AS DATE) AS \"\"Date\"\" in query q = 'let \n Source = Sql.Database("abc.com", "DB", [Query="SELECT CAST(sales_date AS DATE) AS ""Date"",#(lf) SUM(cshintrpret) / 60.0 AS ""Total Order All Items"",#(lf)#(tab)#(tab)#(tab) SUM(cshintrpret) / 60.0 - LAG(SUM(cshintrpret) / 60.0, 1) 
OVER (ORDER BY CAST(sales_date AS DATE)) AS ""Total minute difference"",#(lf)#(tab)#(tab)#(tab) SUM(sale_price) / 60.0 - LAG(SUM(sale_price) / 60.0, 1) OVER (ORDER BY CAST(sales_date AS DATE)) AS ""Normal minute difference""#(lf) FROM [DB].[dbo].[sales_t]#(lf) WHERE sales_date >= GETDATE() - 365#(lf) GROUP BY CAST(sales_date AS DATE),#(lf)#(tab)#(tab)CAST(sales_date AS TIME);"]) \n in \n Source' - lineage: List[ - datahub.ingestion.source.powerbi.m_query.data_classes.Lineage - ] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( + get_data_platform_tables_with_dummy_table(q=q) + ) assert len(lineage) == 1 @@ -1168,9 +1168,9 @@ def test_m_query_timeout(mock_get_lark_parser): is_entry_present = True break - assert ( - is_entry_present - ), 'Warning message "M-Query Parsing Timeout" should be present in reporter' + assert is_entry_present, ( + 'Warning message "M-Query Parsing Timeout" should be present in reporter' + ) def test_comments_in_m_query(): diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 911d8a9f35139f..7f62e433bc8014 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -828,9 +828,9 @@ def dataset_type_mapping_set_to_all_platform(pipeline: Pipeline) -> None: # Generate default dataset_type_mapping and compare it with source_config.dataset_type_mapping default_dataset_type_mapping: dict = {} for item in SupportedDataPlatform: - default_dataset_type_mapping[ - item.value.powerbi_data_platform_name - ] = item.value.datahub_data_platform_name + default_dataset_type_mapping[item.value.powerbi_data_platform_name] = ( + item.value.datahub_data_platform_name + ) assert default_dataset_type_mapping == source_config.dataset_type_mapping @@ -1443,9 +1443,9 @@ def 
test_powerbi_cross_workspace_reference_info_message( is_entry_present = True break - assert ( - is_entry_present - ), 'Info message "Missing Lineage For Tile" should be present in reporter' + assert is_entry_present, ( + 'Info message "Missing Lineage For Tile" should be present in reporter' + ) test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -1568,6 +1568,6 @@ def test_powerbi_app_ingest_info_message( is_entry_present = True break - assert ( - is_entry_present - ), "The extract_app flag should be set to false by default. We need to keep this flag as false until all GMS instances are updated to the latest release." + assert is_entry_present, ( + "The extract_app flag should be set to false by default. We need to keep this flag as false until all GMS instances are updated to the latest release." + ) diff --git a/metadata-ingestion/tests/integration/salesforce/test_salesforce.py b/metadata-ingestion/tests/integration/salesforce/test_salesforce.py index 89a37a372df843..9e68ff22a767e2 100644 --- a/metadata-ingestion/tests/integration/salesforce/test_salesforce.py +++ b/metadata-ingestion/tests/integration/salesforce/test_salesforce.py @@ -89,15 +89,15 @@ def test_latest_version(mock_sdk): ) SalesforceSource(config=config, ctx=Mock()) calls = mock_sf._call_salesforce.mock_calls - assert ( - len(calls) == 1 - ), "We didn't specify version but source didn't call SF API to get the latest one" - assert calls[0].ends_with( - "/services/data" - ), "Source didn't call proper SF API endpoint to get all versions" - assert ( - mock_sf.sf_version == "54.0" - ), "API version was not correctly set (see versions_responses.json)" + assert len(calls) == 1, ( + "We didn't specify version but source didn't call SF API to get the latest one" + ) + assert calls[0].ends_with("/services/data"), ( + "Source didn't call proper SF API endpoint to get all versions" + ) + assert mock_sf.sf_version == "54.0", ( + "API version was not correctly set (see 
versions_responses.json)" + ) @mock.patch("datahub.ingestion.source.salesforce.Salesforce") @@ -133,12 +133,12 @@ def test_custom_version(mock_sdk): SalesforceSource(config=config, ctx=Mock()) calls = mock_sf._call_salesforce.mock_calls - assert ( - len(calls) == 0 - ), "Source called API to get all versions even though we specified proper version" - assert ( - mock_sdk.call_args.kwargs["version"] == "46.0" - ), "API client object was not correctly initialized with the custom version" + assert len(calls) == 0, ( + "Source called API to get all versions even though we specified proper version" + ) + assert mock_sdk.call_args.kwargs["version"] == "46.0", ( + "API client object was not correctly initialized with the custom version" + ) @freeze_time(FROZEN_TIME) diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index b969f77b4c3c18..7fab5fc7dae1ba 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -57,7 +57,7 @@ def test_mssql_ingest(mssql_runner, pytestconfig, tmp_path, mock_time, config_fi pytestconfig, output_path=tmp_path / "mssql_mces.json", golden_path=test_resources_dir - / f"golden_files/golden_mces_{config_file.replace('yml','json')}", + / f"golden_files/golden_mces_{config_file.replace('yml', 'json')}", ignore_paths=[ r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['job_id'\]", r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_created'\]", diff --git a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py index b8b0563a1d24e5..9c7b86a275f6d0 100644 --- a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py +++ b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py @@ -205,55 +205,57 @@ def 
register_mock_data(workspace_client): ), ] - workspace_client.tables.get = lambda *args, **kwargs: databricks.sdk.service.catalog.TableInfo.from_dict( - { - "name": "quickstart_table", - "catalog_name": "quickstart_catalog", - "schema_name": "quickstart_schema", - "table_type": "MANAGED", - "data_source_format": "DELTA", - "columns": [ - { - "name": "columnA", - "type_text": "int", - "type_json": '{"name":"columnA","type":"integer","nullable":true,"metadata":{}}', - "type_name": "INT", - "type_precision": 0, - "type_scale": 0, - "position": 0, - "nullable": True, - }, - { - "name": "columnB", - "type_text": "string", - "type_json": '{"name":"columnB","type":"string","nullable":true,"metadata":{}}', - "type_name": "STRING", - "type_precision": 0, - "type_scale": 0, - "position": 1, - "nullable": True, + workspace_client.tables.get = ( + lambda *args, **kwargs: databricks.sdk.service.catalog.TableInfo.from_dict( + { + "name": "quickstart_table", + "catalog_name": "quickstart_catalog", + "schema_name": "quickstart_schema", + "table_type": "MANAGED", + "data_source_format": "DELTA", + "columns": [ + { + "name": "columnA", + "type_text": "int", + "type_json": '{"name":"columnA","type":"integer","nullable":true,"metadata":{}}', + "type_name": "INT", + "type_precision": 0, + "type_scale": 0, + "position": 0, + "nullable": True, + }, + { + "name": "columnB", + "type_text": "string", + "type_json": '{"name":"columnB","type":"string","nullable":true,"metadata":{}}', + "type_name": "STRING", + "type_precision": 0, + "type_scale": 0, + "position": 1, + "nullable": True, + }, + ], + "storage_location": "s3://db-02eec1f70bfe4115445be9fdb1aac6ac-s3-root-bucket/metastore/2c983545-d403-4f87-9063-5b7e3b6d3736/tables/cff27aa1-1c6a-4d78-b713-562c660c2896", + "owner": "account users", + "properties": { + "delta.lastCommitTimestamp": "1666185711000", + "delta.lastUpdateVersion": "1", + "delta.minReaderVersion": "1", + "delta.minWriterVersion": "2", + "spark.sql.statistics.numRows": 
"10", + "spark.sql.statistics.totalSize": "512", }, - ], - "storage_location": "s3://db-02eec1f70bfe4115445be9fdb1aac6ac-s3-root-bucket/metastore/2c983545-d403-4f87-9063-5b7e3b6d3736/tables/cff27aa1-1c6a-4d78-b713-562c660c2896", - "owner": "account users", - "properties": { - "delta.lastCommitTimestamp": "1666185711000", - "delta.lastUpdateVersion": "1", - "delta.minReaderVersion": "1", - "delta.minWriterVersion": "2", - "spark.sql.statistics.numRows": "10", - "spark.sql.statistics.totalSize": "512", - }, - "generation": 2, - "metastore_id": "2c983545-d403-4f87-9063-5b7e3b6d3736", - "full_name": "quickstart_catalog.quickstart_schema.quickstart_table", - "data_access_configuration_id": "00000000-0000-0000-0000-000000000000", - "created_at": 1666185698688, - "created_by": "abc@acryl.io", - "updated_at": 1666186049633, - "updated_by": "abc@acryl.io", - "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", - } + "generation": 2, + "metastore_id": "2c983545-d403-4f87-9063-5b7e3b6d3736", + "full_name": "quickstart_catalog.quickstart_schema.quickstart_table", + "data_access_configuration_id": "00000000-0000-0000-0000-000000000000", + "created_at": 1666185698688, + "created_by": "abc@acryl.io", + "updated_at": 1666186049633, + "updated_by": "abc@acryl.io", + "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", + } + ) ) workspace_client.service_principals.list.return_value = [ @@ -437,9 +439,7 @@ def test_ingestion(pytestconfig, tmp_path, requests_mock): "datahub.ingestion.source.unity.proxy.WorkspaceClient" ) as mock_client, patch.object( HiveMetastoreProxy, "get_inspector" - ) as get_inspector, patch.object( - HiveMetastoreProxy, "_execute_sql" - ) as execute_sql: + ) as get_inspector, patch.object(HiveMetastoreProxy, "_execute_sql") as execute_sql: workspace_client: mock.MagicMock = mock.MagicMock() mock_client.return_value = workspace_client register_mock_data(workspace_client) diff --git a/metadata-ingestion/tests/performance/databricks/generator.py 
b/metadata-ingestion/tests/performance/databricks/generator.py index 29df325d856a1a..b11771e55b2c9e 100644 --- a/metadata-ingestion/tests/performance/databricks/generator.py +++ b/metadata-ingestion/tests/performance/databricks/generator.py @@ -167,7 +167,7 @@ def _generate_insert_lineage(table: Table, upstream: Table) -> str: def _generate_view_definition(view: View) -> str: from_statement = f"FROM {_quote_table(view.upstreams[0])} t0" join_statement = " ".join( - f"JOIN {_quote_table(upstream)} t{i+1} ON t0.id = t{i+1}.id" + f"JOIN {_quote_table(upstream)} t{i + 1} ON t0.id = t{i + 1}.id" for i, upstream in enumerate(view.upstreams[1:]) ) return f"CREATE VIEW {_quote_table(view)} AS SELECT * {from_statement} {join_statement} {view.definition}" diff --git a/metadata-ingestion/tests/test_helpers/mce_helpers.py b/metadata-ingestion/tests/test_helpers/mce_helpers.py index 0105e6d596970b..d70a440dab0657 100644 --- a/metadata-ingestion/tests/test_helpers/mce_helpers.py +++ b/metadata-ingestion/tests/test_helpers/mce_helpers.py @@ -300,9 +300,9 @@ def assert_for_each_entity( for urn, aspect_val in aspect_map.items(): if aspect_val is not None: for f in aspect_field_matcher: - assert aspect_field_matcher[f] == _get_element( - aspect_val, [f] - ), f"urn: {urn} -> Field {f} must match value {aspect_field_matcher[f]}, found {_get_element(aspect_val, [f])}" + assert aspect_field_matcher[f] == _get_element(aspect_val, [f]), ( + f"urn: {urn} -> Field {f} must match value {aspect_field_matcher[f]}, found {_get_element(aspect_val, [f])}" + ) success.append(urn) elif urn not in exception_urns: print(f"Adding {urn} to failures") @@ -361,9 +361,9 @@ def assert_entity_mcp_aspect( assert mcp.aspect aspect_val = mcp.aspect.to_obj() for f in aspect_field_matcher: - assert aspect_field_matcher[f] == _get_element( - aspect_val, [f] - ), f"urn: {mcp.entityUrn} -> Field {f} must match value {aspect_field_matcher[f]}, found {_get_element(aspect_val, [f])}" + assert aspect_field_matcher[f] 
== _get_element(aspect_val, [f]), ( + f"urn: {mcp.entityUrn} -> Field {f} must match value {aspect_field_matcher[f]}, found {_get_element(aspect_val, [f])}" + ) matches = matches + 1 return matches diff --git a/metadata-ingestion/tests/test_helpers/state_helpers.py b/metadata-ingestion/tests/test_helpers/state_helpers.py index f68aef742fc730..c469db6ce8cf80 100644 --- a/metadata-ingestion/tests/test_helpers/state_helpers.py +++ b/metadata-ingestion/tests/test_helpers/state_helpers.py @@ -104,7 +104,7 @@ def monkey_patch_get_latest_timeseries_value( @pytest.fixture def mock_datahub_graph_instance( - mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph] + mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], ) -> DataHubGraph: return mock_datahub_graph(DatahubClientConfig(server="http://fake.domain.local")) diff --git a/metadata-ingestion/tests/unit/api/entities/dataproducts/test_dataproduct.py b/metadata-ingestion/tests/unit/api/entities/dataproducts/test_dataproduct.py index e796f0b3f37219..dad7662d9ad00b 100644 --- a/metadata-ingestion/tests/unit/api/entities/dataproducts/test_dataproduct.py +++ b/metadata-ingestion/tests/unit/api/entities/dataproducts/test_dataproduct.py @@ -26,7 +26,7 @@ def base_entity_metadata(): @pytest.fixture def base_mock_graph( - base_entity_metadata: Dict[str, Dict[str, Any]] + base_entity_metadata: Dict[str, Dict[str, Any]], ) -> MockDataHubGraph: return MockDataHubGraph(entity_graph=base_entity_metadata) diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py b/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py index bdf1e0a2e0e860..8a45efb46893ae 100644 --- a/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py @@ -159,9 +159,9 @@ def test_ensure_size_of_proper_dataset_profile(processor): processor.ensure_dataset_profile_size( "urn:li:dataset:(s3, 
dummy_dataset, DEV)", profile ) - assert orig_repr == json.dumps( - profile.to_obj() - ), "Aspect was modified in case where workunit processor should have been no-op" + assert orig_repr == json.dumps(profile.to_obj()), ( + "Aspect was modified in case where workunit processor should have been no-op" + ) @freeze_time("2023-01-02 00:00:00") @@ -177,9 +177,9 @@ def test_ensure_size_of_too_big_schema_metadata(processor): # +100kb is completely arbitrary, but we are truncating the aspect based on schema fields size only, not total taken # by other parameters of the aspect - it is reasonable approach though - schema fields is the only field in schema # metadata which can be expected to grow out of control - assert ( - len(json.dumps(schema.to_obj())) < INGEST_MAX_PAYLOAD_BYTES + 100000 - ), "Aspect exceeded acceptable size" + assert len(json.dumps(schema.to_obj())) < INGEST_MAX_PAYLOAD_BYTES + 100000, ( + "Aspect exceeded acceptable size" + ) @freeze_time("2023-01-02 00:00:00") @@ -189,9 +189,9 @@ def test_ensure_size_of_proper_schema_metadata(processor): processor.ensure_schema_metadata_size( "urn:li:dataset:(s3, dummy_dataset, DEV)", schema ) - assert orig_repr == json.dumps( - schema.to_obj() - ), "Aspect was modified in case where workunit processor should have been no-op" + assert orig_repr == json.dumps(schema.to_obj()), ( + "Aspect was modified in case where workunit processor should have been no-op" + ) @freeze_time("2023-01-02 00:00:00") @@ -214,9 +214,9 @@ def test_ensure_size_of_too_big_dataset_profile(processor): ) assert expected_profile.fieldProfiles expected_profile.fieldProfiles.insert(4, reduced_field) - assert json.dumps(profile.to_obj()) == json.dumps( - expected_profile.to_obj() - ), "Field 'big' was not properly removed from aspect due to its size" + assert json.dumps(profile.to_obj()) == json.dumps(expected_profile.to_obj()), ( + "Field 'big' was not properly removed from aspect due to its size" + ) @freeze_time("2023-01-02 00:00:00") diff --git 
a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py index a1981ccf767916..f494ed78211dcf 100644 --- a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py @@ -232,9 +232,9 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: else [] for fine_grained_lineage in upstream_lineage.fineGrainedLineages ] - assert all( - urn in expected_schema_field_urns for urn in actual_schema_field_urns - ), "Some expected column URNs are missing from fine grained lineage." + assert all(urn in expected_schema_field_urns for urn in actual_schema_field_urns), ( + "Some expected column URNs are missing from fine grained lineage." + ) def test_lineage_for_external_bq_table_no_column_lineage(mock_datahub_graph_instance): @@ -286,9 +286,9 @@ def fake_schema_metadata(entity_urn: str) -> Optional[models.SchemaMetadataClass assert len(upstream_lineage.upstreams) == 3 # Extracting dataset URNs from upstream_lineage.upstreams actual_dataset_urns = [upstream.dataset for upstream in upstream_lineage.upstreams] - assert all( - urn in actual_dataset_urns for urn in expected_dataset_urns - ), "Some expected dataset URNs are missing from upstream lineage." + assert all(urn in actual_dataset_urns for urn in expected_dataset_urns), ( + "Some expected dataset URNs are missing from upstream lineage." 
+ ) assert upstream_lineage.fineGrainedLineages is None diff --git a/metadata-ingestion/tests/unit/cli/assertion/test_compile.py b/metadata-ingestion/tests/unit/cli/assertion/test_compile.py index 47253b5b0d71ea..0a1870d83212e8 100644 --- a/metadata-ingestion/tests/unit/cli/assertion/test_compile.py +++ b/metadata-ingestion/tests/unit/cli/assertion/test_compile.py @@ -37,6 +37,6 @@ def test_compile_assertion_config_spec_for_snowflake(pytestconfig, tmp_path): for file_name in output_file_names: assert os.path.exists(tmp_path / file_name) - assert filecmp.cmp( - golden_file_path / file_name, tmp_path / file_name - ), f"{file_name} is not as expected" + assert filecmp.cmp(golden_file_path / file_name, tmp_path / file_name), ( + f"{file_name} is not as expected" + ) diff --git a/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py index 941d13be0a6139..27045dfc656cbe 100644 --- a/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py +++ b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py @@ -262,8 +262,7 @@ def test_collapse_temp_lineage(): lineage_item: LineageItem = lineage_extractor._lineage_map[target_urn] assert list(lineage_item.upstreams)[0].urn == ( - "urn:li:dataset:(urn:li:dataPlatform:redshift," - "test.public.player_activity,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:redshift,test.public.player_activity,PROD)" ) assert lineage_item.cll is not None @@ -276,8 +275,7 @@ def test_collapse_temp_lineage(): assert lineage_item.cll[0].downstream.column == "price" assert lineage_item.cll[0].upstreams[0].table == ( - "urn:li:dataset:(urn:li:dataPlatform:redshift," - "test.public.player_activity,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:redshift,test.public.player_activity,PROD)" ) assert lineage_item.cll[0].upstreams[0].column == "price" @@ -441,8 +439,7 @@ def test_collapse_temp_recursive_cll_lineage(): ) assert target_dataset_cll[0].upstreams[0].table == ( - 
"urn:li:dataset:(urn:li:dataPlatform:redshift," - "dev.public.player_activity,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.player_activity,PROD)" ) assert target_dataset_cll[0].upstreams[0].column == "price" @@ -638,8 +635,7 @@ def test_collapse_temp_recursive_with_compex_column_cll_lineage(): ) assert target_dataset_cll[0].upstreams[0].table == ( - "urn:li:dataset:(urn:li:dataPlatform:redshift," - "dev.public.player_activity,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.player_activity,PROD)" ) assert target_dataset_cll[0].upstreams[0].column == "price" assert target_dataset_cll[0].upstreams[1].column == "tax" diff --git a/metadata-ingestion/tests/unit/serde/test_codegen.py b/metadata-ingestion/tests/unit/serde/test_codegen.py index b49f7153129136..13fcf3d919cc03 100644 --- a/metadata-ingestion/tests/unit/serde/test_codegen.py +++ b/metadata-ingestion/tests/unit/serde/test_codegen.py @@ -156,9 +156,9 @@ def _err(msg: str) -> None: f"entity {entity_type}: aspect {aspect_name} is missing from the entity registry" ) - assert ( - not errors - ), f'To fix these errors, run "UPDATE_ENTITY_REGISTRY=true pytest {__file__}"' + assert not errors, ( + f'To fix these errors, run "UPDATE_ENTITY_REGISTRY=true pytest {__file__}"' + ) def test_enum_options(): diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py index 96ab8f7a01a386..e69727f73b6bf4 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py @@ -226,9 +226,9 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time): pipeline_run1_config: Dict[str, Dict[str, Dict[str, Any]]] = dict( # type: ignore base_pipeline_config # type: ignore ) - pipeline_run1_config["sink"]["config"][ - "filename" - ] = 
f"{tmp_path}/{output_file_name}" + pipeline_run1_config["sink"]["config"]["filename"] = ( + f"{tmp_path}/{output_file_name}" + ) pipeline_run1 = Pipeline.create(pipeline_run1_config) pipeline_run1.run() pipeline_run1.raise_from_status() @@ -254,16 +254,18 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time): ) as mock_state: mock_state.return_value = GenericCheckpointState(serde="utf-8") pipeline_run2 = None - pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(base_pipeline_config) # type: ignore + pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict( + base_pipeline_config # type: ignore + ) pipeline_run2_config["source"]["config"]["dataset_patterns"] = { "allow": ["dummy_dataset1", "dummy_dataset2"], } pipeline_run2_config["source"]["config"]["dpi_id_to_ingest"] = "job2" pipeline_run2_config["source"]["config"]["query_id_to_ingest"] = "query2" - pipeline_run2_config["sink"]["config"][ - "filename" - ] = f"{tmp_path}/{output_file_name_after_deleted}" + pipeline_run2_config["sink"]["config"]["filename"] = ( + f"{tmp_path}/{output_file_name_after_deleted}" + ) pipeline_run2 = Pipeline.create(pipeline_run2_config) pipeline_run2.run() pipeline_run2.raise_from_status() @@ -370,9 +372,9 @@ def test_stateful_ingestion_failure(pytestconfig, tmp_path, mock_time): pipeline_run1_config: Dict[str, Dict[str, Dict[str, Any]]] = dict( # type: ignore base_pipeline_config # type: ignore ) - pipeline_run1_config["sink"]["config"][ - "filename" - ] = f"{tmp_path}/{output_file_name}" + pipeline_run1_config["sink"]["config"]["filename"] = ( + f"{tmp_path}/{output_file_name}" + ) pipeline_run1 = Pipeline.create(pipeline_run1_config) pipeline_run1.run() pipeline_run1.raise_from_status() @@ -398,14 +400,16 @@ def test_stateful_ingestion_failure(pytestconfig, tmp_path, mock_time): ) as mock_state: mock_state.return_value = GenericCheckpointState(serde="utf-8") pipeline_run2 = None - pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = 
dict(base_pipeline_config) # type: ignore + pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict( + base_pipeline_config # type: ignore + ) pipeline_run2_config["source"]["config"]["dataset_patterns"] = { "allow": ["dummy_dataset1", "dummy_dataset2"], } pipeline_run2_config["source"]["config"]["report_failure"] = True - pipeline_run2_config["sink"]["config"][ - "filename" - ] = f"{tmp_path}/{output_file_name_after_deleted}" + pipeline_run2_config["sink"]["config"]["filename"] = ( + f"{tmp_path}/{output_file_name_after_deleted}" + ) pipeline_run2 = Pipeline.create(pipeline_run2_config) pipeline_run2.run() pipeline_run2.pretty_print_summary() diff --git a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py index 3500636f00eddf..effa6ba85acaeb 100644 --- a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py +++ b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py @@ -85,16 +85,18 @@ def new_get_latest_version(subject_name: str) -> RegisteredSchema: "get_latest_version", new_get_latest_version, ): - schema_str = confluent_schema_registry.get_schema_str_replace_confluent_ref_avro( - # The external reference would match by name. - schema=Schema( - schema_str=schema_str_orig, - schema_type="AVRO", - references=[ - SchemaReference( - name="TestTopic1", subject="schema_subject_1", version=1 - ) - ], + schema_str = ( + confluent_schema_registry.get_schema_str_replace_confluent_ref_avro( + # The external reference would match by name. 
+ schema=Schema( + schema_str=schema_str_orig, + schema_type="AVRO", + references=[ + SchemaReference( + name="TestTopic1", subject="schema_subject_1", version=1 + ) + ], + ) ) ) assert schema_str == ConfluentSchemaRegistry._compact_schema( @@ -106,16 +108,18 @@ def new_get_latest_version(subject_name: str) -> RegisteredSchema: "get_latest_version", new_get_latest_version, ): - schema_str = confluent_schema_registry.get_schema_str_replace_confluent_ref_avro( - # The external reference would match by subject. - schema=Schema( - schema_str=schema_str_orig, - schema_type="AVRO", - references=[ - SchemaReference( - name="schema_subject_1", subject="TestTopic1", version=1 - ) - ], + schema_str = ( + confluent_schema_registry.get_schema_str_replace_confluent_ref_avro( + # The external reference would match by subject. + schema=Schema( + schema_str=schema_str_orig, + schema_type="AVRO", + references=[ + SchemaReference( + name="schema_subject_1", subject="TestTopic1", version=1 + ) + ], + ) ) ) assert schema_str == ConfluentSchemaRegistry._compact_schema( diff --git a/metadata-ingestion/tests/unit/test_dbt_source.py b/metadata-ingestion/tests/unit/test_dbt_source.py index ff22ffedc6228f..d7899af69f8405 100644 --- a/metadata-ingestion/tests/unit/test_dbt_source.py +++ b/metadata-ingestion/tests/unit/test_dbt_source.py @@ -475,9 +475,9 @@ def test_get_column_type_redshift(): # Test 'super' type which should not show any warnings/errors result_super = get_column_type(report, dataset_name, "super", "redshift") assert isinstance(result_super.type, NullTypeClass) - assert ( - len(report.infos) == 0 - ), "No warnings should be generated for known SUPER type" + assert len(report.infos) == 0, ( + "No warnings should be generated for known SUPER type" + ) # Test unknown type, which generates a warning but resolves to NullTypeClass unknown_type = "unknown_type" diff --git a/metadata-ingestion/tests/unit/test_iceberg.py b/metadata-ingestion/tests/unit/test_iceberg.py index 
3afa26b35dfe9f..48524450caf36e 100644 --- a/metadata-ingestion/tests/unit/test_iceberg.py +++ b/metadata-ingestion/tests/unit/test_iceberg.py @@ -88,15 +88,15 @@ def assert_field( expected_nullable: bool, expected_type: Any, ) -> None: - assert ( - schema_field.description == expected_description - ), f"Field description '{schema_field.description}' is different from expected description '{expected_description}'" - assert ( - schema_field.nullable == expected_nullable - ), f"Field nullable '{schema_field.nullable}' is different from expected nullable '{expected_nullable}'" - assert isinstance( - schema_field.type.type, expected_type - ), f"Field type {schema_field.type.type} is different from expected type {expected_type}" + assert schema_field.description == expected_description, ( + f"Field description '{schema_field.description}' is different from expected description '{expected_description}'" + ) + assert schema_field.nullable == expected_nullable, ( + f"Field nullable '{schema_field.nullable}' is different from expected nullable '{expected_nullable}'" + ) + assert isinstance(schema_field.type.type, expected_type), ( + f"Field type {schema_field.type.type} is different from expected type {expected_type}" + ) def test_config_no_catalog(): @@ -219,9 +219,9 @@ def test_iceberg_primitive_type_to_schema_field( ]: schema = Schema(column) schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema) - assert ( - len(schema_fields) == 1 - ), f"Expected 1 field, but got {len(schema_fields)}" + assert len(schema_fields) == 1, ( + f"Expected 1 field, but got {len(schema_fields)}" + ) assert_field( schema_fields[0], column.doc, @@ -300,19 +300,19 @@ def test_iceberg_list_to_schema_field( iceberg_source_instance = with_iceberg_source() schema = Schema(list_column) schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema) - assert ( - len(schema_fields) == 1 - ), f"Expected 1 field, but got {len(schema_fields)}" + assert len(schema_fields) 
== 1, ( + f"Expected 1 field, but got {len(schema_fields)}" + ) assert_field( schema_fields[0], list_column.doc, list_column.optional, ArrayTypeClass ) - assert isinstance( - schema_fields[0].type.type, ArrayType - ), f"Field type {schema_fields[0].type.type} was expected to be {ArrayType}" + assert isinstance(schema_fields[0].type.type, ArrayType), ( + f"Field type {schema_fields[0].type.type} was expected to be {ArrayType}" + ) arrayType: ArrayType = schema_fields[0].type.type - assert arrayType.nestedType == [ - expected_array_nested_type - ], f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}" + assert arrayType.nestedType == [expected_array_nested_type], ( + f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}" + ) @pytest.mark.parametrize( @@ -387,9 +387,9 @@ def test_iceberg_map_to_schema_field( schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema) # Converting an Iceberg Map type will be done by creating an array of struct(key, value) records. # The first field will be the array. 
- assert ( - len(schema_fields) == 3 - ), f"Expected 3 fields, but got {len(schema_fields)}" + assert len(schema_fields) == 3, ( + f"Expected 3 fields, but got {len(schema_fields)}" + ) assert_field( schema_fields[0], map_column.doc, map_column.optional, ArrayTypeClass ) diff --git a/metadata-ingestion/tests/unit/test_postgres_source.py b/metadata-ingestion/tests/unit/test_postgres_source.py index 91a62b603bb584..25140cf1b997f8 100644 --- a/metadata-ingestion/tests/unit/test_postgres_source.py +++ b/metadata-ingestion/tests/unit/test_postgres_source.py @@ -21,9 +21,7 @@ def test_initial_database(create_engine_mock): @patch("datahub.ingestion.source.sql.postgres.create_engine") def test_get_inspectors_multiple_databases(create_engine_mock): - execute_mock = ( - create_engine_mock.return_value.connect.return_value.__enter__.return_value.execute - ) + execute_mock = create_engine_mock.return_value.connect.return_value.__enter__.return_value.execute execute_mock.return_value = [{"datname": "db1"}, {"datname": "db2"}] config = PostgresConfig.parse_obj({**_base_config(), "initial_database": "db0"}) @@ -37,9 +35,7 @@ def test_get_inspectors_multiple_databases(create_engine_mock): @patch("datahub.ingestion.source.sql.postgres.create_engine") def tests_get_inspectors_with_database_provided(create_engine_mock): - execute_mock = ( - create_engine_mock.return_value.connect.return_value.__enter__.return_value.execute - ) + execute_mock = create_engine_mock.return_value.connect.return_value.__enter__.return_value.execute execute_mock.return_value = [{"datname": "db1"}, {"datname": "db2"}] config = PostgresConfig.parse_obj({**_base_config(), "database": "custom_db"}) @@ -51,9 +47,7 @@ def tests_get_inspectors_with_database_provided(create_engine_mock): @patch("datahub.ingestion.source.sql.postgres.create_engine") def tests_get_inspectors_with_sqlalchemy_uri_provided(create_engine_mock): - execute_mock = ( - 
create_engine_mock.return_value.connect.return_value.__enter__.return_value.execute - ) + execute_mock = create_engine_mock.return_value.connect.return_value.__enter__.return_value.execute execute_mock.return_value = [{"datname": "db1"}, {"datname": "db2"}] config = PostgresConfig.parse_obj( diff --git a/metadata-ingestion/tests/unit/test_rest_sink.py b/metadata-ingestion/tests/unit/test_rest_sink.py index a76f96039c2c71..564cf613c04464 100644 --- a/metadata-ingestion/tests/unit/test_rest_sink.py +++ b/metadata-ingestion/tests/unit/test_rest_sink.py @@ -283,9 +283,9 @@ def test_datahub_rest_emitter(requests_mock, record, path, snapshot): def match_request_text(request: requests.Request) -> bool: requested_snapshot = request.json() - assert ( - requested_snapshot == snapshot - ), f"Expected snapshot to be {json.dumps(snapshot)}, got {json.dumps(requested_snapshot)}" + assert requested_snapshot == snapshot, ( + f"Expected snapshot to be {json.dumps(snapshot)}, got {json.dumps(requested_snapshot)}" + ) return True requests_mock.post( diff --git a/metadata-ingestion/tests/unit/utilities/test_lossy_collections.py b/metadata-ingestion/tests/unit/utilities/test_lossy_collections.py index 43967367dff389..e137d671e95d71 100644 --- a/metadata-ingestion/tests/unit/utilities/test_lossy_collections.py +++ b/metadata-ingestion/tests/unit/utilities/test_lossy_collections.py @@ -34,7 +34,7 @@ def test_lossyset_sampling(length, sampling): assert len(lossy_set) == min(10, length) assert lossy_set.sampled is sampling if sampling: - assert f"... sampled with at most {length-10} elements missing" in str( + assert f"... 
sampled with at most {length - 10} elements missing" in str( lossy_set ) else: @@ -66,7 +66,7 @@ def test_lossydict_sampling(length, sampling, sub_length): element_length_map[i] = len(lossy_dict[i]) current_list = lossy_dict.get(i, LossyList()) - current_list.append(f"{i}:{round(time.time(),2)} Hello World") + current_list.append(f"{i}:{round(time.time(), 2)} Hello World") lossy_dict[i] = current_list element_length_map[i] += 1 diff --git a/metadata-ingestion/tests/unit/utilities/test_partition_executor.py b/metadata-ingestion/tests/unit/utilities/test_partition_executor.py index ce211c2d618062..89e95d185e8028 100644 --- a/metadata-ingestion/tests/unit/utilities/test_partition_executor.py +++ b/metadata-ingestion/tests/unit/utilities/test_partition_executor.py @@ -37,9 +37,9 @@ def task(key: str, id: str) -> None: saw_keys_in_parallel = False while executing_tasks or not done_tasks: keys_executing = [key for key, _ in executing_tasks] - assert list(sorted(keys_executing)) == list( - sorted(set(keys_executing)) - ), "partitioning not working" + assert list(sorted(keys_executing)) == list(sorted(set(keys_executing))), ( + "partitioning not working" + ) if len(keys_executing) == 2: saw_keys_in_parallel = True diff --git a/metadata-ingestion/tests/unit/utilities/test_threaded_iterator_executor.py b/metadata-ingestion/tests/unit/utilities/test_threaded_iterator_executor.py index 35c44c7b4a8479..fb7e2266e1c9d3 100644 --- a/metadata-ingestion/tests/unit/utilities/test_threaded_iterator_executor.py +++ b/metadata-ingestion/tests/unit/utilities/test_threaded_iterator_executor.py @@ -4,7 +4,7 @@ def test_threaded_iterator_executor(): def table_of(i): for j in range(1, 11): - yield f"{i}x{j}={i*j}" + yield f"{i}x{j}={i * j}" assert { res diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index f3dc1de830ccef..c38468ca8cd8b0 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -73,16 +73,16 @@ task installDev(type: Exec) { task pythonLint(type: 
Exec, dependsOn: installDev) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "black --check --diff tests/ && " + "ruff check tests/ && " + + "ruff format --check tests/ && " + "mypy tests/" } task pythonLintFix(type: Exec, dependsOn: installDev) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "black tests/ && " + "ruff check --fix tests/ && " + + "ruff format tests/ && " + "mypy tests/" } @@ -154,3 +154,19 @@ task lint { task lintFix { dependsOn pythonLintFix } + +task cleanPythonCache(type: Exec) { + commandLine 'bash', '-c', + "find . -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete" +} + + +clean { + delete venv_name + delete 'build' + delete 'dist' + delete '.ruff_cache' + delete '.mypy_cache' + delete '.pytest_cache' +} +clean.dependsOn cleanPythonCache \ No newline at end of file diff --git a/smoke-test/pyproject.toml b/smoke-test/pyproject.toml index 55f037db2effea..55e286c73c01b9 100644 --- a/smoke-test/pyproject.toml +++ b/smoke-test/pyproject.toml @@ -7,20 +7,21 @@ name = "smoke-test" version = "0.0.0" description = "" authors = [ - { name="Acryl Data", email="eng@acryl.io" }, + { name="Acryl Data", email="eng@acryl.io" }, ] requires-python = ">=3.9" +[tool.ruff] +# Enable ruff format +target-version = "py310" +line-length = 88 +extend-exclude = ["tmp", "venv"] -[tool.black] -extend-exclude = ''' -# A regex preceded with ^/ will apply only to files and directories -# in the root of the project. 
-tmp -venv -''' -include = '\.pyi?$' -target-version = ['py310'] +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" [tool.ruff.lint.isort] combine-as-imports = true @@ -40,19 +41,19 @@ required-imports = [] classes = ["typing"] [tool.ruff.lint] -select = [ - "B", - "C90", - "E", - "F", - "I", # For isort - "TID", +extend-select = [ + "B", # flake8-bugbear + "C90", # mccabe complexity + "E", # pycodestyle errors + "F", # pyflakes + "G010", # logging.warn -> logging.warning + "I", # isort + "TID", # flake8-tidy-imports ] ignore = [ - 'E501', # Ignore line length, since black handles that. - 'D203', # Ignore 1 blank line required before class docstring. - 'B904', # exception with `raise ... from err` or `raise ... from None` to distinguish - 'TID252', # Prefer absolute imports over relative imports + "E501", # Line length violations (handled by formatter) + "B904", # exception with `raise ... from err` or `raise ... from None` to distinguish + "TID252",# Prefer absolute imports over relative imports ] [tool.ruff.lint.mccabe] @@ -74,5 +75,4 @@ disallow_untyped_decorators = true warn_unused_configs = true # eventually we'd like to enable these disallow_incomplete_defs = false -disallow_untyped_defs = false - +disallow_untyped_defs = false \ No newline at end of file diff --git a/smoke-test/requirements.txt b/smoke-test/requirements.txt index 6779733a850bad..fadc3dbec1f2b5 100644 --- a/smoke-test/requirements.txt +++ b/smoke-test/requirements.txt @@ -9,7 +9,6 @@ joblib pytest-xdist networkx # libaries for linting below this -black==23.7.0 mypy==1.5.1 ruff==0.9.2 # stub version are copied from metadata-ingestion/setup.py and that should be the source of truth diff --git a/smoke-test/tests/data_process_instance/test_data_process_instance.py b/smoke-test/tests/data_process_instance/test_data_process_instance.py index f1c532af515cfa..a68db03cf8cf16 100644 --- 
a/smoke-test/tests/data_process_instance/test_data_process_instance.py +++ b/smoke-test/tests/data_process_instance/test_data_process_instance.py @@ -250,14 +250,14 @@ def test_search_dpi(auth_session, ingest_cleanup_data): assert res_data, "Response should not be empty" assert "data" in res_data, "Response should contain 'data' field" print("RESPONSE DATA:" + str(res_data)) - assert ( - "scrollAcrossEntities" in res_data["data"] - ), "Response should contain 'scrollAcrossEntities' field" + assert "scrollAcrossEntities" in res_data["data"], ( + "Response should contain 'scrollAcrossEntities' field" + ) search_results = res_data["data"]["scrollAcrossEntities"] - assert ( - "searchResults" in search_results - ), "Response should contain 'searchResults' field" + assert "searchResults" in search_results, ( + "Response should contain 'searchResults' field" + ) results = search_results["searchResults"] assert len(results) > 0, "Should find at least one result" diff --git a/smoke-test/tests/dataproduct/test_dataproduct.py b/smoke-test/tests/dataproduct/test_dataproduct.py index 0aa66984b394c8..8d484820d1ed45 100644 --- a/smoke-test/tests/dataproduct/test_dataproduct.py +++ b/smoke-test/tests/dataproduct/test_dataproduct.py @@ -135,9 +135,9 @@ def validate_relationships( urn_match[dataset_urn] = True urns_missing = [k for k in urn_match if urn_match[k] is False] - assert ( - urns_missing == [] - ), "All dataset urns should have a DataProductContains relationship to the data product" + assert urns_missing == [], ( + "All dataset urns should have a DataProductContains relationship to the data product" + ) dataset_urns_matched = set() for e in graph_client.get_related_entities( @@ -147,9 +147,9 @@ def validate_relationships( ): dataset_urns_matched.add(e.urn) - assert ( - set(dataset_urns) == dataset_urns_matched - ), "All dataset urns should be navigable from the data product" + assert set(dataset_urns) == dataset_urns_matched, ( + "All dataset urns should be navigable from 
the data product" + ) @tenacity.retry( @@ -247,6 +247,6 @@ def test_create_data_product(graph_client, ingest_cleanup_data): urn_match[dataset_urn] = True urns_missing = [k for k in urn_match if urn_match[k] is False] - assert set(urns_missing) == set( - dataset_urns - ), f"All dataset urns should no longer have a DataProductContains relationship to the data product {data_product_urn}" + assert set(urns_missing) == set(dataset_urns), ( + f"All dataset urns should no longer have a DataProductContains relationship to the data product {data_product_urn}" + ) diff --git a/smoke-test/tests/lineage/test_lineage.py b/smoke-test/tests/lineage/test_lineage.py index 771085043926dd..dd5309667806c5 100644 --- a/smoke-test/tests/lineage/test_lineage.py +++ b/smoke-test/tests/lineage/test_lineage.py @@ -92,7 +92,7 @@ def _explain_sal_result(result: dict) -> str: explain += "Entities: " try: for e in entities: - explain += f"\t{e.replace('urn:li:','')}\n" + explain += f"\t{e.replace('urn:li:', '')}\n" for entity in entities: paths = [ x["paths"][0]["path"] @@ -349,9 +349,9 @@ def get_expectation_for_query(self, query: ImpactQuery) -> LineageExpectation: lineage_expectation.impacted_entities[impacted_entity] ) else: - entries_to_add[ - impacted_dataset_entity - ] = lineage_expectation.impacted_entities[impacted_entity] + entries_to_add[impacted_dataset_entity] = ( + lineage_expectation.impacted_entities[impacted_entity] + ) entries_to_remove.append(impacted_entity) for impacted_entity in entries_to_remove: del lineage_expectation.impacted_entities[impacted_entity] @@ -756,9 +756,9 @@ def test_expectation(self, graph: DataHubGraph) -> bool: ] ) try: - assert ( - impacted_entities == impacted_entities_expectation - ), f"Expected impacted entities to be {impacted_entities_expectation}, found {impacted_entities}" + assert impacted_entities == impacted_entities_expectation, ( + f"Expected impacted entities to be {impacted_entities_expectation}, found {impacted_entities}" + ) except 
Exception: # breakpoint() raise @@ -783,10 +783,14 @@ def test_expectation(self, graph: DataHubGraph) -> bool: try: assert len(impacted_entity_paths) == len( expectation.impacted_entities[impacted_entity] - ), f"Expected length of impacted entity paths to be {len(expectation.impacted_entities[impacted_entity])}, found {len(impacted_entity_paths)}" + ), ( + f"Expected length of impacted entity paths to be {len(expectation.impacted_entities[impacted_entity])}, found {len(impacted_entity_paths)}" + ) assert set(impacted_entity_paths) == set( expectation.impacted_entities[impacted_entity] - ), f"Expected impacted entity paths to be {expectation.impacted_entities[impacted_entity]}, found {impacted_entity_paths}" + ), ( + f"Expected impacted entity paths to be {expectation.impacted_entities[impacted_entity]}, found {impacted_entity_paths}" + ) except Exception: # breakpoint() raise diff --git a/smoke-test/tests/managed_ingestion/managed_ingestion_test.py b/smoke-test/tests/managed_ingestion/managed_ingestion_test.py index 5d6179de6be644..7fe9421af85b1a 100644 --- a/smoke-test/tests/managed_ingestion/managed_ingestion_test.py +++ b/smoke-test/tests/managed_ingestion/managed_ingestion_test.py @@ -489,9 +489,9 @@ def test_create_list_get_ingestion_execution_request(auth_session): assert res_data assert res_data["data"] - assert ( - res_data["data"]["createIngestionExecutionRequest"] is not None - ), f"res_data was {res_data}" + assert res_data["data"]["createIngestionExecutionRequest"] is not None, ( + f"res_data was {res_data}" + ) assert "errors" not in res_data execution_request_urn = res_data["data"]["createIngestionExecutionRequest"] diff --git a/smoke-test/tests/read_only/test_search.py b/smoke-test/tests/read_only/test_search.py index 36ecf68395f919..66bbeb408d0529 100644 --- a/smoke-test/tests/read_only/test_search.py +++ b/smoke-test/tests/read_only/test_search.py @@ -153,6 +153,6 @@ def test_openapi_v3_entity(auth_session, entity_type): expected_data = {"urn": 
first_urn} - assert ( - actual_data["urn"] == expected_data["urn"] - ), f"Mismatch: expected {expected_data}, got {actual_data}" + assert actual_data["urn"] == expected_data["urn"], ( + f"Mismatch: expected {expected_data}, got {actual_data}" + ) diff --git a/smoke-test/tests/read_only/test_services_up.py b/smoke-test/tests/read_only/test_services_up.py index 12ff04965548f0..79812b46476fa8 100644 --- a/smoke-test/tests/read_only/test_services_up.py +++ b/smoke-test/tests/read_only/test_services_up.py @@ -27,6 +27,6 @@ def test_gms_config_accessible(auth_session) -> None: default_cli_version: str = gms_config["managedIngestion"]["defaultCliVersion"] print(f"Default CLI version: {default_cli_version}") assert not default_cli_version.startswith("@") - assert "." in default_cli_version or looks_like_a_short_sha( - default_cli_version - ), "Default CLI version does not look like a version string" + assert "." in default_cli_version or looks_like_a_short_sha(default_cli_version), ( + "Default CLI version does not look like a version string" + ) diff --git a/smoke-test/tests/utilities/file_emitter.py b/smoke-test/tests/utilities/file_emitter.py index ddbcff8db31d8b..d5539d143af737 100644 --- a/smoke-test/tests/utilities/file_emitter.py +++ b/smoke-test/tests/utilities/file_emitter.py @@ -7,7 +7,7 @@ class FileEmitter: def __init__( - self, filename: str, run_id: str = f"test_{int(time.time()*1000.0)}" + self, filename: str, run_id: str = f"test_{int(time.time() * 1000.0)}" ) -> None: self.sink: FileSink = FileSink( ctx=PipelineContext(run_id=run_id),