diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index b0666f4a42aac8..3eb34eca85a46f 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -91,6 +91,8 @@ jobs: -x :metadata-ingestion-modules:airflow-plugin:check \ -x :metadata-ingestion-modules:dagster-plugin:build \ -x :metadata-ingestion-modules:dagster-plugin:check \ + -x :metadata-ingestion-modules:gx-plugin:build \ + -x :metadata-ingestion-modules:gx-plugin:check \ -x :datahub-frontend:build \ -x :datahub-web-react:build \ --parallel diff --git a/.github/workflows/gx-plugin.yml b/.github/workflows/gx-plugin.yml new file mode 100644 index 00000000000000..84ba2e0559be1b --- /dev/null +++ b/.github/workflows/gx-plugin.yml @@ -0,0 +1,87 @@ +name: GX Plugin +on: + push: + branches: + - master + paths: + - ".github/workflows/gx-plugin.yml" + - "metadata-ingestion-modules/gx-plugin/**" + - "metadata-ingestion/**" + - "metadata-models/**" + pull_request: + branches: + - master + paths: + - ".github/**" + - "metadata-ingestion-modules/gx-plugin/**" + - "metadata-ingestion/**" + - "metadata-models/**" + release: + types: [published] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + gx-plugin: + runs-on: ubuntu-latest + env: + SPARK_VERSION: 3.0.3 + DATAHUB_TELEMETRY_ENABLED: false + strategy: + matrix: + python-version: ["3.8", "3.10"] + include: + - python-version: "3.8" + extraPythonRequirement: "great-expectations~=0.15.12" + - python-version: "3.10" + extraPythonRequirement: "great-expectations~=0.16.0 numpy~=1.26.0" + - python-version: "3.11" + extraPythonRequirement: "great-expectations~=0.17.0" + fail-fast: false + steps: + - name: Set up JDK 17 + uses: actions/setup-java@v3 + with: + distribution: "zulu" + java-version: 17 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + 
cache: "pip" + - name: Install dependencies + run: ./metadata-ingestion/scripts/install_deps.sh + - name: Install GX package and test (extras ${{ matrix.extraPythonRequirement }}) + run: ./gradlew -Pextra_pip_requirements='${{ matrix.extraPythonRequirement }}' :metadata-ingestion-modules:gx-plugin:lint :metadata-ingestion-modules:gx-plugin:testQuick + - name: pip freeze show list installed + if: always() + run: source metadata-ingestion-modules/gx-plugin/venv/bin/activate && pip freeze + - uses: actions/upload-artifact@v3 + if: ${{ always() && matrix.python-version == '3.11' && matrix.extraPythonRequirement == 'great-expectations~=0.17.0' }} + with: + name: Test Results (GX Plugin ${{ matrix.python-version}}) + path: | + **/build/reports/tests/test/** + **/build/test-results/test/** + **/junit.*.xml + - name: Upload coverage to Codecov + if: always() + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + directory: . + fail_ci_if_error: false + flags: gx-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} + name: pytest-gx + verbose: true + + event-file: + runs-on: ubuntu-latest + steps: + - name: Upload + uses: actions/upload-artifact@v3 + with: + name: Event File + path: ${{ github.event_path }} diff --git a/.github/workflows/test-results.yml b/.github/workflows/test-results.yml index a122ef3835f4d7..947fc35f169a04 100644 --- a/.github/workflows/test-results.yml +++ b/.github/workflows/test-results.yml @@ -2,7 +2,7 @@ name: Test Results on: workflow_run: - workflows: ["build & test", "metadata ingestion", "Airflow Plugin", "Dagster Plugin"] + workflows: ["build & test", "metadata ingestion", "Airflow Plugin", "Dagster Plugin", "GX Plugin"] types: - completed diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/FormUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/FormUtils.java index 17718f39c12387..d118c04d19393d 100644 --- 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/FormUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/FormUtils.java @@ -202,7 +202,7 @@ public static FormActorAssignment mapFormActorAssignment( if (input.getGroups() != null) { UrnArray groupUrns = new UrnArray(); input.getGroups().forEach(group -> groupUrns.add(UrnUtils.getUrn(group))); - result.setUsers(groupUrns); + result.setGroups(groupUrns); } return result; diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 304bf3a67a5b27..f64886953fe225 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -55,7 +55,7 @@ dependencies { // mock internal schema registry implementation externalDependency.kafkaAvroSerde implementation externalDependency.kafkaAvroSerializer - implementation "org.apache.kafka:kafka_2.12:3.7.0" + implementation "org.apache.kafka:kafka_2.12:3.7.1" implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index ad1d01c1ce97c0..af32dd5dd4d36f 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -22,7 +22,7 @@ ARG ALPINE_REPO_URL ARG APACHE_DOWNLOAD_URL ARG GITHUB_REPO_URL -ENV KAFKA_VERSION=3.7.0 +ENV KAFKA_VERSION=3.7.1 ENV SCALA_VERSION=2.13 LABEL name="kafka" version=${KAFKA_VERSION} diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 798047a562ffd2..803112bf857166 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -86,6 +86,7 @@ task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, ':metadata-ingestion:buildWheel', ':metadata-ingestion-modules:airflow-plugin:buildWheel', ':metadata-ingestion-modules:dagster-plugin:buildWheel', + ':metadata-ingestion-modules:gx-plugin:buildWheel', ]) { inputs.files(projectMdFiles) outputs.cacheIf { true } diff --git 
a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index 23888d9000161d..ceac79bd5cad37 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -573,6 +573,7 @@ function copy_python_wheels(): void { "../metadata-ingestion/dist", "../metadata-ingestion-modules/airflow-plugin/dist", "../metadata-ingestion-modules/dagster-plugin/dist", + "../metadata-ingestion-modules/gx-plugin/dist", ]; const wheel_output_directory = path.join(STATIC_DIRECTORY, "wheels"); diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 1f9c0a4d79a9d8..835263fb8872fb 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -917,6 +917,7 @@ module.exports = { // "metadata-integration/java/openlineage-converter/README" //"metadata-ingestion-modules/airflow-plugin/README" //"metadata-ingestion-modules/dagster-plugin/README" + //"metadata-ingestion-modules/gx-plugin/README" // "metadata-ingestion/schedule_docs/datahub", // we can delete this // TODO: change the titles of these, removing the "What is..." portion from the sidebar" // "docs/what/entity", diff --git a/docs/quick-ingestion-guides/bigquery/setup.md b/docs/quick-ingestion-guides/bigquery/setup.md index 10351d6572c531..96850f2deb68ed 100644 --- a/docs/quick-ingestion-guides/bigquery/setup.md +++ b/docs/quick-ingestion-guides/bigquery/setup.md @@ -38,7 +38,9 @@ Please refer to the BigQuery [Permissions](https://cloud.google.com/iam/docs/per You can always add/remove roles to Service Accounts later on. Please refer to the BigQuery [Manage access to projects, folders, and organizations](https://cloud.google.com/iam/docs/granting-changing-revoking-access) guide for more details. ::: -3. Create and download a [Service Account Key](https://cloud.google.com/iam/docs/creating-managing-service-account-keys). We will use this to set up authentication within DataHub. +3. 
To filter projects based on the `project_labels` configuration, first visit [cloudresourcemanager.googleapis.com](https://console.developers.google.com/apis/api/cloudresourcemanager.googleapis.com/overview) and enable the `Cloud Resource Manager API` + +4. Create and download a [Service Account Key](https://cloud.google.com/iam/docs/creating-managing-service-account-keys). We will use this to set up authentication within DataHub. The key file looks like this: diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 2401b169cd6607..80d2efd3ed164e 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -16,7 +16,9 @@ def get_long_description(): _version: str = package_metadata["__version__"] _self_pin = ( - f"=={_version}" if not (_version.endswith("dev0") or "docker" in _version) else "" + f"=={_version}" + if not (_version.endswith(("dev0", "dev1")) or "docker" in _version) + else "" ) diff --git a/metadata-ingestion-modules/dagster-plugin/build.gradle b/metadata-ingestion-modules/dagster-plugin/build.gradle index 6cb7b9295549a7..74ca7cedea3a52 100644 --- a/metadata-ingestion-modules/dagster-plugin/build.gradle +++ b/metadata-ingestion-modules/dagster-plugin/build.gradle @@ -33,7 +33,7 @@ task installPackage(type: Exec, dependsOn: [environmentSetup, ':metadata-ingesti outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "uv pip install -e . ${extra_pip_requirements} && " + + "${pip_install_command} -e . 
${extra_pip_requirements} && " + "touch ${sentinel_file}" } @@ -45,15 +45,11 @@ task installDev(type: Exec, dependsOn: [install]) { outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "uv pip install -e .[dev] ${extra_pip_requirements} && " + + "${pip_install_command} -e .[dev] ${extra_pip_requirements} && " + "touch ${sentinel_file}" } task lint(type: Exec, dependsOn: installDev) { - /* - The find/sed combo below is a temporary work-around for the following mypy issue with airflow 2.2.0: - "venv/lib/python3.8/site-packages/airflow/_vendor/connexion/spec.py:169: error: invalid syntax". - */ commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + "black --check --diff src/ tests/ examples/ && " + @@ -77,7 +73,7 @@ task installDevTest(type: Exec, dependsOn: [installDev]) { outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "uv pip install -e .[dev,integration-tests] ${extra_pip_requirements} && " + + "${pip_install_command} -e .[dev,integration-tests] ${extra_pip_requirements} && " + "touch ${sentinel_file}" } @@ -105,10 +101,6 @@ task testQuick(type: Exec, dependsOn: installDevTest) { } -task testFull(type: Exec, dependsOn: [testQuick, installDevTest]) { - commandLine 'bash', '-x', '-c', - "source ${venv_name}/bin/activate && pytest -m 'not slow_integration' -vv --continue-on-collection-errors --junit-xml=junit.full.xml" -} task buildWheel(type: Exec, dependsOn: [environmentSetup]) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " + 'uv pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_INSTALL=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh' diff --git a/metadata-ingestion-modules/dagster-plugin/setup.py b/metadata-ingestion-modules/dagster-plugin/setup.py index 8a2a1d76d345bf..bf9fcf09a66bc1 100644 --- a/metadata-ingestion-modules/dagster-plugin/setup.py +++ 
b/metadata-ingestion-modules/dagster-plugin/setup.py @@ -17,7 +17,9 @@ def get_long_description(): _version: str = package_metadata["__version__"] _self_pin = ( - f"=={_version}" if not (_version.endswith("dev0") or "docker" in _version) else "" + f"=={_version}" + if not (_version.endswith(("dev0", "dev1")) or "docker" in _version) + else "" ) base_requirements = { @@ -25,9 +27,7 @@ def get_long_description(): "dagster >= 1.3.3", "dagit >= 1.3.3", *rest_common, - # Ignoring the dependency below because it causes issues with the vercel built wheel install - # f"acryl-datahub[datahub-rest]{_self_pin}", - "acryl-datahub[datahub-rest]", + f"acryl-datahub[datahub-rest]{_self_pin}", } mypy_stubs = { diff --git a/metadata-ingestion-modules/gx-plugin/.gitignore b/metadata-ingestion-modules/gx-plugin/.gitignore new file mode 100644 index 00000000000000..8c01744589e35e --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/.gitignore @@ -0,0 +1,143 @@ +.envrc +src/datahub_gx_plugin/__init__.py.bak +.vscode/ +output +pvenv36/ +bq_credentials.json +/tmp +*.bak + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Generated classes +src/datahub/metadata/ +wheels/ +junit.quick.xml diff --git a/metadata-ingestion-modules/gx-plugin/README.md b/metadata-ingestion-modules/gx-plugin/README.md new file mode 100644 index 00000000000000..1ffd87a955432d --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/README.md @@ -0,0 +1,4 @@ +# Datahub GX Plugin + +See the DataHub GX docs for details. 
+ diff --git a/metadata-ingestion-modules/gx-plugin/build.gradle b/metadata-ingestion-modules/gx-plugin/build.gradle new file mode 100644 index 00000000000000..f1adbc6676e5bc --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/build.gradle @@ -0,0 +1,123 @@ +plugins { + id 'base' +} + +ext { + python_executable = 'python3' + venv_name = 'venv' +} + +if (!project.hasProperty("extra_pip_requirements")) { + ext.extra_pip_requirements = "" +} + +def pip_install_command = "VIRTUAL_ENV=${venv_name} ${venv_name}/bin/uv pip install -e ../../metadata-ingestion" + +task checkPythonVersion(type: Exec) { + commandLine python_executable, '-c', 'import sys; assert sys.version_info >= (3, 8)' +} + +task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { + def sentinel_file = "${venv_name}/.venv_environment_sentinel" + inputs.file file('setup.py') + outputs.file(sentinel_file) + commandLine 'bash', '-c', + "${python_executable} -m venv ${venv_name} && " + + "${venv_name}/bin/python -m pip install --upgrade pip uv wheel 'setuptools>=63.0.0' && " + + "touch ${sentinel_file}" +} + +task installPackage(type: Exec, dependsOn: [environmentSetup, ':metadata-ingestion:codegen']) { + def sentinel_file = "${venv_name}/.build_install_package_sentinel" + inputs.file file('setup.py') + outputs.file(sentinel_file) + commandLine 'bash', '-c', + "source ${venv_name}/bin/activate && set -x && " + + "${pip_install_command} -e . 
${extra_pip_requirements} && " + + "touch ${sentinel_file}" +} + +task install(dependsOn: [installPackage]) + +task installDev(type: Exec, dependsOn: [install]) { + def sentinel_file = "${venv_name}/.build_install_dev_sentinel" + inputs.file file('setup.py') + outputs.file(sentinel_file) + commandLine 'bash', '-c', + "source ${venv_name}/bin/activate && set -x && " + + "${pip_install_command} -e .[dev] ${extra_pip_requirements} && " + + "touch ${sentinel_file}" +} + +task lint(type: Exec, dependsOn: installDev) { + commandLine 'bash', '-c', + "source ${venv_name}/bin/activate && set -x && " + + "black --check --diff src/ tests/ && " + + "isort --check --diff src/ tests/ && " + + "flake8 --count --statistics src/ tests/ && " + + "mypy --show-traceback --show-error-codes src/ tests/" +} +task lintFix(type: Exec, dependsOn: installDev) { + commandLine 'bash', '-x', '-c', + "source ${venv_name}/bin/activate && " + + "black src/ tests/ && " + + "isort src/ tests/ && " + + "flake8 src/ tests/ && " + + "mypy src/ tests/" +} + +task installDevTest(type: Exec, dependsOn: [installDev]) { + def sentinel_file = "${venv_name}/.build_install_dev_test_sentinel" + inputs.file file('setup.py') + outputs.dir("${venv_name}") + outputs.file(sentinel_file) + commandLine 'bash', '-c', + "source ${venv_name}/bin/activate && set -x && " + + "${pip_install_command} -e .[dev,integration-tests] ${extra_pip_requirements} && " + + "touch ${sentinel_file}" +} + +def testFile = hasProperty('testFile') ? testFile : 'unknown' +task testSingle(dependsOn: [installDevTest]) { + doLast { + if (testFile != 'unknown') { + exec { + commandLine 'bash', '-x', '-c', + "source ${venv_name}/bin/activate && pytest ${testFile}" + } + } else { + throw new GradleException("No file provided. Use -PtestFile=") + } + } +} + +task testQuick(type: Exec, dependsOn: installDevTest) { + // We can't enforce the coverage requirements if we run a subset of the tests. 
+ inputs.files(project.fileTree(dir: "src/", include: "**/*.py")) + inputs.files(project.fileTree(dir: "tests/")) + outputs.dir("${venv_name}") + commandLine 'bash', '-x', '-c', + "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" +} + + +task buildWheel(type: Exec, dependsOn: [environmentSetup]) { + commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " + + 'uv pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_INSTALL=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh' +} + +task cleanPythonCache(type: Exec) { + commandLine 'bash', '-c', + "find src -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete" +} + +build.dependsOn install +check.dependsOn lint +check.dependsOn testQuick + +clean { + delete venv_name + delete 'build' + delete 'dist' +} +clean.dependsOn cleanPythonCache diff --git a/metadata-ingestion-modules/gx-plugin/pyproject.toml b/metadata-ingestion-modules/gx-plugin/pyproject.toml new file mode 100644 index 00000000000000..fba81486b9f677 --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools>=54.0.0", "wheel", "pip>=21.0.0"] + +[tool.black] +extend-exclude = ''' +# A regex preceded with ^/ will apply only to files and directories +# in the root of the project. +^/tmp +''' +include = '\.pyi?$' + +[tool.isort] +indent = ' ' +profile = 'black' +sections = 'FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER' + +[tool.pyright] +extraPaths = ['tests'] \ No newline at end of file diff --git a/metadata-ingestion-modules/gx-plugin/scripts/release.sh b/metadata-ingestion-modules/gx-plugin/scripts/release.sh new file mode 100755 index 00000000000000..058add495821cb --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/scripts/release.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -euxo pipefail + +if [[ ! 
${RELEASE_SKIP_TEST:-} ]] && [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then + ../../gradlew build # also runs tests +elif [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then + ../../gradlew install +fi + +MODULE=datahub_gx_plugin + +# Check packaging constraint. +python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"' +if [[ ${RELEASE_VERSION:-} ]]; then + # Replace version with RELEASE_VERSION env variable + sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/__init__.py +else + vim src/${MODULE}/__init__.py +fi + +rm -rf build dist || true +python -m build +if [[ ! ${RELEASE_SKIP_UPLOAD:-} ]]; then + python -m twine upload 'dist/*' +fi +mv src/${MODULE}/__init__.py.bak src/${MODULE}/__init__.py diff --git a/metadata-ingestion-modules/gx-plugin/setup.cfg b/metadata-ingestion-modules/gx-plugin/setup.cfg new file mode 100644 index 00000000000000..bbdd85f0fdc4ed --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/setup.cfg @@ -0,0 +1,71 @@ +[flake8] +max-complexity = 15 +ignore = + # Ignore: line length issues, since black's formatter will take care of them. + E501, + # Ignore: 1 blank line required before class docstring. + D203, + # See https://stackoverflow.com/a/57074416. + W503, + # See https://github.com/psf/black/issues/315. 
+ E203 +exclude = + .git, + venv, + .tox, + __pycache__ +per-file-ignores = + # imported but unused + __init__.py: F401 +ban-relative-imports = true + +[mypy] +plugins = + pydantic.mypy +exclude = ^(venv|build|dist)/ +ignore_missing_imports = yes +strict_optional = yes +check_untyped_defs = yes +disallow_incomplete_defs = yes +disallow_untyped_decorators = yes +warn_unused_configs = yes +# eventually we'd like to enable these +disallow_untyped_defs = no + +# try to be a bit more strict in certain areas of the codebase +[mypy-datahub.*] +ignore_missing_imports = no +[mypy-tests.*] +ignore_missing_imports = no + +[tool:pytest] +asyncio_mode = auto +addopts = --cov=src --cov-report term-missing --cov-config setup.cfg --strict-markers +markers = + integration: marks all integration tests, across all batches (deselect with '-m "not integration"') +testpaths = + tests/unit + tests/integration + +[coverage:run] +# Because of some quirks in the way setup.cfg, coverage.py, pytest-cov, +# and tox interact, we should not uncomment the following line. +# See https://pytest-cov.readthedocs.io/en/latest/config.html and +# https://coverage.readthedocs.io/en/coverage-5.0/config.html. +# We also have some additional pytest/cov config options in tox.ini. +# source = src + +[coverage:paths] +# This is necessary for tox-based coverage to be counted properly. +source = + src + */site-packages + +[coverage:report] +# The fail_under value ensures that at least some coverage data is collected. +# We override its value in the tox config. 
+show_missing = true +exclude_lines = + pragma: no cover + @abstract + if TYPE_CHECKING: diff --git a/metadata-ingestion-modules/gx-plugin/setup.py b/metadata-ingestion-modules/gx-plugin/setup.py new file mode 100644 index 00000000000000..1584111f820f59 --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/setup.py @@ -0,0 +1,157 @@ +import os +import pathlib + +import setuptools + +package_metadata: dict = {} +with open("./src/datahub_gx_plugin/__init__.py") as fp: + exec(fp.read(), package_metadata) + + +def get_long_description(): + root = os.path.dirname(__file__) + return pathlib.Path(os.path.join(root, "README.md")).read_text() + + +rest_common = {"requests", "requests_file"} + +# TODO: Can we move away from sqllineage and use sqlglot ?? +sqllineage_lib = { + "sqllineage==1.3.8", + # We don't have a direct dependency on sqlparse but it is a dependency of sqllineage. + # There have previously been issues from not pinning sqlparse, so it's best to pin it. + # Related: https://github.com/reata/sqllineage/issues/361 and https://github.com/reata/sqllineage/pull/360 + "sqlparse==0.4.4", +} + +_version: str = package_metadata["__version__"] +_self_pin = ( + f"=={_version}" + if not (_version.endswith(("dev0", "dev1")) or "docker" in _version) + else "" +) + +base_requirements = { + # Actual dependencies. + # This is temporary lower bound that we're open to loosening/tightening as requirements show up + "sqlalchemy>=1.4.39, <2", + # GE added handling for higher version of jinja2 in version 0.15.12 + # https://github.com/great-expectations/great_expectations/pull/5382/files + # TODO: support GX 0.18.0 + "great-expectations>=0.15.12, <0.18.0", + # datahub does not depend on traitlets directly but great expectations does. 
+ # https://github.com/ipython/traitlets/issues/741 + "traitlets<5.2.2", + *rest_common, + *sqllineage_lib, + f"acryl-datahub[datahub-rest]{_self_pin}", +} + +mypy_stubs = { + "types-dataclasses", + "sqlalchemy-stubs", + "types-setuptools", + "types-six", + "types-python-dateutil", + "types-requests", + "types-toml", + "types-PyYAML", + "types-freezegun", + "types-cachetools", + # versions 0.1.13 and 0.1.14 seem to have issues + "types-click==0.1.12", + "types-tabulate", + # avrogen package requires this + "types-pytz", +} + +base_dev_requirements = { + *base_requirements, + *mypy_stubs, + "black==22.12.0", + "coverage>=5.1", + "flake8>=6.0.0", + "flake8-tidy-imports>=4.3.0", + "flake8-bugbear==23.3.12", + "isort>=5.7.0", + "mypy>=1.4.0", + # pydantic 1.8.2 is incompatible with mypy 0.910. + # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. + "pydantic>=1.10.0,!=1.10.3", + "pytest>=6.2.2", + "pytest-asyncio>=0.16.0", + "pytest-cov>=2.8.1", + "tox", + "deepdiff", + "requests-mock", + "freezegun", + "jsonpickle", + "build", + "twine", + "packaging", +} + +dev_requirements = { + *base_dev_requirements, +} + +integration_test_requirements = { + *dev_requirements, + "psycopg2-binary", + "pyspark", + f"acryl-datahub[testing-utils]{_self_pin}", + "pytest-docker>=1.1.0", +} + +entry_points = { + "gx.plugins": "acryl-datahub-gx-plugin = datahub_gx_plugin.action:DataHubValidationAction" +} + + +setuptools.setup( + # Package metadata. 
+ name=package_metadata["__package_name__"], + version=package_metadata["__version__"], + url="https://datahubproject.io/", + project_urls={ + "Documentation": "https://datahubproject.io/docs/", + "Source": "https://github.com/datahub-project/datahub", + "Changelog": "https://github.com/datahub-project/datahub/releases", + }, + license="Apache License 2.0", + description="Datahub GX plugin to capture executions and send to Datahub", + long_description=get_long_description(), + long_description_content_type="text/markdown", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: System Administrators", + "License :: OSI Approved", + "License :: OSI Approved :: Apache Software License", + "Operating System :: Unix", + "Operating System :: POSIX :: Linux", + "Environment :: Console", + "Environment :: MacOS X", + "Topic :: Software Development", + ], + # Package info. + zip_safe=False, + python_requires=">=3.8", + package_dir={"": "src"}, + packages=setuptools.find_namespace_packages(where="./src"), + entry_points=entry_points, + # Dependencies. + install_requires=list(base_requirements), + extras_require={ + "ignore": [], # This is a dummy extra to allow for trailing commas in the list. 
+ "dev": list(dev_requirements), + "integration-tests": list(integration_test_requirements), + }, +) diff --git a/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/__init__.py b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/__init__.py new file mode 100644 index 00000000000000..a7689be82a5d99 --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/__init__.py @@ -0,0 +1,21 @@ +# Published at https://pypi.org/project/acryl-datahub/. +__package_name__ = "acryl-datahub-gx-plugin" +__version__ = "1!0.0.0.dev0" + + +def is_dev_mode() -> bool: + return __version__.endswith("dev0") + + +def nice_version_name() -> str: + if is_dev_mode(): + return "unavailable (installed in develop mode)" + return __version__ + + +def get_provider_info(): + return { + "package-name": f"{__package_name__}", + "name": f"{__package_name__}", + "description": "Datahub metadata collector plugin", + } diff --git a/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py new file mode 100644 index 00000000000000..76e43cf8c2c3db --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py @@ -0,0 +1,871 @@ +import json +import logging +import sys +import time +from dataclasses import dataclass +from datetime import timezone +from decimal import Decimal +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +import datahub.emitter.mce_builder as builder +from datahub.cli.env_utils import get_boolean_env_variable +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.rest_emitter import DatahubRestEmitter +from datahub.emitter.serialization_helper import pre_json_transform +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionInfo, + AssertionResult, + AssertionResultType, + 
AssertionRunEvent, + AssertionRunStatus, + AssertionStdAggregation, + AssertionStdOperator, + AssertionStdParameter, + AssertionStdParameters, + AssertionStdParameterType, + AssertionType, + BatchSpec, + DatasetAssertionInfo, + DatasetAssertionScope, +) +from datahub.metadata.com.linkedin.pegasus2avro.common import DataPlatformInstance +from datahub.metadata.schema_classes import PartitionSpecClass, PartitionTypeClass +from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED +from datahub.utilities.sql_parser import DefaultSQLParser +from great_expectations.checkpoint.actions import ValidationAction +from great_expectations.core.batch import Batch +from great_expectations.core.batch_spec import ( + RuntimeDataBatchSpec, + RuntimeQueryBatchSpec, + SqlAlchemyDatasourceBatchSpec, +) +from great_expectations.core.expectation_validation_result import ( + ExpectationSuiteValidationResult, +) +from great_expectations.data_asset.data_asset import DataAsset +from great_expectations.data_context import AbstractDataContext +from great_expectations.data_context.types.resource_identifiers import ( + ExpectationSuiteIdentifier, + ValidationResultIdentifier, +) +from great_expectations.execution_engine import PandasExecutionEngine +from great_expectations.execution_engine.sqlalchemy_execution_engine import ( + SqlAlchemyExecutionEngine, +) +from great_expectations.validator.validator import Validator +from sqlalchemy.engine.base import Connection, Engine +from sqlalchemy.engine.url import make_url + +if TYPE_CHECKING: + from great_expectations.data_context.types.resource_identifiers import ( + GXCloudIdentifier, + ) + +assert MARKUPSAFE_PATCHED +logger = logging.getLogger(__name__) +if get_boolean_env_variable("DATAHUB_DEBUG", False): + handler = logging.StreamHandler(stream=sys.stdout) + logger.addHandler(handler) + logger.setLevel(logging.DEBUG) + +GE_PLATFORM_NAME = "great-expectations" + + +class DataHubValidationAction(ValidationAction): + def __init__( + self, + 
data_context: AbstractDataContext, + server_url: str, + env: str = builder.DEFAULT_ENV, + platform_alias: Optional[str] = None, + platform_instance_map: Optional[Dict[str, str]] = None, + graceful_exceptions: bool = True, + token: Optional[str] = None, + timeout_sec: Optional[float] = None, + retry_status_codes: Optional[List[int]] = None, + retry_max_times: Optional[int] = None, + extra_headers: Optional[Dict[str, str]] = None, + exclude_dbname: Optional[bool] = None, + parse_table_names_from_sql: bool = False, + convert_urns_to_lowercase: bool = False, + name: str = "DataHubValidationAction", + ): + + super().__init__(data_context) + self.server_url = server_url + self.env = env + self.platform_alias = platform_alias + self.platform_instance_map = platform_instance_map + self.graceful_exceptions = graceful_exceptions + self.token = token + self.timeout_sec = timeout_sec + self.retry_status_codes = retry_status_codes + self.retry_max_times = retry_max_times + self.extra_headers = extra_headers + self.exclude_dbname = exclude_dbname + self.parse_table_names_from_sql = parse_table_names_from_sql + self.convert_urns_to_lowercase = convert_urns_to_lowercase + + def _run( + self, + validation_result_suite: ExpectationSuiteValidationResult, + validation_result_suite_identifier: Union[ + ValidationResultIdentifier, "GXCloudIdentifier" + ], + data_asset: Union[Validator, DataAsset, Batch], + payload: Optional[Any] = None, + expectation_suite_identifier: Optional[ExpectationSuiteIdentifier] = None, + checkpoint_identifier: Optional[Any] = None, + ) -> Dict: + datasets = [] + try: + emitter = DatahubRestEmitter( + gms_server=self.server_url, + token=self.token, + read_timeout_sec=self.timeout_sec, + connect_timeout_sec=self.timeout_sec, + retry_status_codes=self.retry_status_codes, + retry_max_times=self.retry_max_times, + extra_headers=self.extra_headers, + ) + + expectation_suite_name = validation_result_suite.meta.get( + "expectation_suite_name" + ) + run_id = 
validation_result_suite.meta.get("run_id") + if hasattr(data_asset, "active_batch_id"): + batch_identifier = data_asset.active_batch_id + else: + batch_identifier = data_asset.batch_id + + if isinstance( + validation_result_suite_identifier, ValidationResultIdentifier + ): + expectation_suite_name = ( + validation_result_suite_identifier.expectation_suite_identifier.expectation_suite_name + ) + run_id = validation_result_suite_identifier.run_id + batch_identifier = validation_result_suite_identifier.batch_identifier + + # Returns datasets and corresponding batch requests + datasets = self.get_dataset_partitions(batch_identifier, data_asset) + + if len(datasets) == 0 or datasets[0]["dataset_urn"] is None: + warn("Metadata not sent to datahub. No datasets found.") + return {"datahub_notification_result": "none required"} + + # Returns assertion info and assertion results + assertions = self.get_assertions_with_results( + validation_result_suite, + expectation_suite_name, + run_id, + payload, + datasets, + ) + + logger.info("Sending metadata to datahub ...") + logger.info("Dataset URN - {urn}".format(urn=datasets[0]["dataset_urn"])) + + for assertion in assertions: + logger.info( + "Assertion URN - {urn}".format(urn=assertion["assertionUrn"]) + ) + + # Construct a MetadataChangeProposalWrapper object. + assertion_info_mcp = MetadataChangeProposalWrapper( + entityUrn=assertion["assertionUrn"], + aspect=assertion["assertionInfo"], + ) + emitter.emit_mcp(assertion_info_mcp) + + # Construct a MetadataChangeProposalWrapper object. + assertion_platform_mcp = MetadataChangeProposalWrapper( + entityUrn=assertion["assertionUrn"], + aspect=assertion["assertionPlatform"], + ) + emitter.emit_mcp(assertion_platform_mcp) + + for assertionResult in assertion["assertionResults"]: + dataset_assertionResult_mcp = MetadataChangeProposalWrapper( + entityUrn=assertionResult.assertionUrn, + aspect=assertionResult, + ) + + # Emit Result! 
(timeseries aspect) + emitter.emit_mcp(dataset_assertionResult_mcp) + logger.info("Metadata sent to datahub.") + result = "DataHub notification succeeded" + except Exception as e: + result = "DataHub notification failed" + if self.graceful_exceptions: + logger.error(e) + logger.info("Suppressing error because graceful_exceptions is set") + else: + raise + + return {"datahub_notification_result": result} + + def get_assertions_with_results( + self, + validation_result_suite, + expectation_suite_name, + run_id, + payload, + datasets, + ): + dataPlatformInstance = DataPlatformInstance( + platform=builder.make_data_platform_urn(GE_PLATFORM_NAME) + ) + docs_link = None + if payload: + # process the payload + for action_names in payload.keys(): + if payload[action_names]["class"] == "UpdateDataDocsAction": + data_docs_pages = payload[action_names] + for docs_link_key, docs_link_val in data_docs_pages.items(): + if "file://" not in docs_link_val and docs_link_key != "class": + docs_link = docs_link_val + + assertions_with_results = [] + for result in validation_result_suite.results: + expectation_config = result["expectation_config"] + expectation_type = expectation_config["expectation_type"] + success = bool(result["success"]) + kwargs = { + k: v for k, v in expectation_config["kwargs"].items() if k != "batch_id" + } + + result = result["result"] + assertion_datasets = [d["dataset_urn"] for d in datasets] + if len(datasets) == 1 and "column" in kwargs: + assertion_fields = [ + builder.make_schema_field_urn( + datasets[0]["dataset_urn"], kwargs["column"] + ) + ] + else: + assertion_fields = None # type:ignore + + # Be careful what fields to consider for creating assertion urn. 
+ # Any change in fields below would lead to a new assertion + # FIXME - Currently, when using evaluation parameters, new assertion is + # created when runtime resolved kwargs are different, + # possibly for each validation run + assertionUrn = builder.make_assertion_urn( + builder.datahub_guid( + pre_json_transform( + { + "platform": GE_PLATFORM_NAME, + "nativeType": expectation_type, + "nativeParameters": kwargs, + "dataset": assertion_datasets[0], + "fields": assertion_fields, + } + ) + ) + ) + logger.debug( + "GE expectation_suite_name - {name}, expectation_type - {type}, Assertion URN - {urn}".format( + name=expectation_suite_name, type=expectation_type, urn=assertionUrn + ) + ) + assertionInfo: AssertionInfo = self.get_assertion_info( + expectation_type, + kwargs, + assertion_datasets[0], + assertion_fields, + expectation_suite_name, + ) + + # TODO: Understand why their run time is incorrect. + run_time = run_id.run_time.astimezone(timezone.utc) + evaluation_parameters = ( + { + k: convert_to_string(v) + for k, v in validation_result_suite.evaluation_parameters.items() + if k and v + } + if validation_result_suite.evaluation_parameters + else None + ) + + nativeResults = { + k: convert_to_string(v) + for k, v in result.items() + if ( + k + in [ + "observed_value", + "partial_unexpected_list", + "partial_unexpected_counts", + "details", + ] + and v + ) + } + + actualAggValue = ( + result.get("observed_value") + if isinstance(result.get("observed_value"), (int, float)) + else None + ) + + ds = datasets[0] + # https://docs.greatexpectations.io/docs/reference/expectations/result_format/ + assertionResult = AssertionRunEvent( + timestampMillis=int(round(time.time() * 1000)), + assertionUrn=assertionUrn, + asserteeUrn=ds["dataset_urn"], + runId=run_time.strftime("%Y-%m-%dT%H:%M:%SZ"), + result=AssertionResult( + type=( + AssertionResultType.SUCCESS + if success + else AssertionResultType.FAILURE + ), + rowCount=parse_int_or_default(result.get("element_count")), + 
missingCount=parse_int_or_default(result.get("missing_count")), + unexpectedCount=parse_int_or_default( + result.get("unexpected_count") + ), + actualAggValue=actualAggValue, + externalUrl=docs_link, + nativeResults=nativeResults, + ), + batchSpec=ds["batchSpec"], + status=AssertionRunStatus.COMPLETE, + runtimeContext=evaluation_parameters, + ) + if ds.get("partitionSpec") is not None: + assertionResult.partitionSpec = ds.get("partitionSpec") + assertionResults = [assertionResult] + assertions_with_results.append( + { + "assertionUrn": assertionUrn, + "assertionInfo": assertionInfo, + "assertionPlatform": dataPlatformInstance, + "assertionResults": assertionResults, + } + ) + return assertions_with_results + + def get_assertion_info( + self, expectation_type, kwargs, dataset, fields, expectation_suite_name + ): + # TODO - can we find exact type of min and max value + def get_min_max(kwargs, type=AssertionStdParameterType.UNKNOWN): + return AssertionStdParameters( + minValue=AssertionStdParameter( + value=convert_to_string(kwargs.get("min_value")), + type=type, + ), + maxValue=AssertionStdParameter( + value=convert_to_string(kwargs.get("max_value")), + type=type, + ), + ) + + known_expectations: Dict[str, DataHubStdAssertion] = { + # column aggregate expectations + "expect_column_min_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.MIN, + parameters=get_min_max(kwargs), + ), + "expect_column_max_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.MAX, + parameters=get_min_max(kwargs), + ), + "expect_column_median_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.MEDIAN, + parameters=get_min_max(kwargs), + ), + 
"expect_column_stdev_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.STDDEV, + parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), + ), + "expect_column_mean_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.MEAN, + parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), + ), + "expect_column_unique_value_count_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.UNIQUE_COUNT, + parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), + ), + "expect_column_proportion_of_unique_values_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.UNIQUE_PROPOTION, + parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), + ), + "expect_column_sum_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.SUM, + parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), + ), + "expect_column_quantile_values_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation._NATIVE_, + ), + # column map expectations + "expect_column_values_to_not_be_null": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.NOT_NULL, + aggregation=AssertionStdAggregation.IDENTITY, + ), + "expect_column_values_to_be_in_set": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.IN, + 
aggregation=AssertionStdAggregation.IDENTITY, + parameters=AssertionStdParameters( + value=AssertionStdParameter( + value=convert_to_string(kwargs.get("value_set")), + type=AssertionStdParameterType.SET, + ) + ), + ), + "expect_column_values_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.IDENTITY, + parameters=get_min_max(kwargs), + ), + "expect_column_values_to_match_regex": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.REGEX_MATCH, + aggregation=AssertionStdAggregation.IDENTITY, + parameters=AssertionStdParameters( + value=AssertionStdParameter( + value=kwargs.get("regex"), + type=AssertionStdParameterType.STRING, + ) + ), + ), + "expect_column_values_to_match_regex_list": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_COLUMN, + operator=AssertionStdOperator.REGEX_MATCH, + aggregation=AssertionStdAggregation.IDENTITY, + parameters=AssertionStdParameters( + value=AssertionStdParameter( + value=convert_to_string(kwargs.get("regex_list")), + type=AssertionStdParameterType.LIST, + ) + ), + ), + "expect_table_columns_to_match_ordered_list": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_SCHEMA, + operator=AssertionStdOperator.EQUAL_TO, + aggregation=AssertionStdAggregation.COLUMNS, + parameters=AssertionStdParameters( + value=AssertionStdParameter( + value=convert_to_string(kwargs.get("column_list")), + type=AssertionStdParameterType.LIST, + ) + ), + ), + "expect_table_columns_to_match_set": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_SCHEMA, + operator=AssertionStdOperator.EQUAL_TO, + aggregation=AssertionStdAggregation.COLUMNS, + parameters=AssertionStdParameters( + value=AssertionStdParameter( + value=convert_to_string(kwargs.get("column_set")), + type=AssertionStdParameterType.SET, + ) + ), + ), + "expect_table_column_count_to_be_between": DataHubStdAssertion( 
+ scope=DatasetAssertionScope.DATASET_SCHEMA, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.COLUMN_COUNT, + parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), + ), + "expect_table_column_count_to_equal": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_SCHEMA, + operator=AssertionStdOperator.EQUAL_TO, + aggregation=AssertionStdAggregation.COLUMN_COUNT, + parameters=AssertionStdParameters( + value=AssertionStdParameter( + value=convert_to_string(kwargs.get("value")), + type=AssertionStdParameterType.NUMBER, + ) + ), + ), + "expect_column_to_exist": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_SCHEMA, + operator=AssertionStdOperator._NATIVE_, + aggregation=AssertionStdAggregation._NATIVE_, + ), + "expect_table_row_count_to_equal": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_ROWS, + operator=AssertionStdOperator.EQUAL_TO, + aggregation=AssertionStdAggregation.ROW_COUNT, + parameters=AssertionStdParameters( + value=AssertionStdParameter( + value=convert_to_string(kwargs.get("value")), + type=AssertionStdParameterType.NUMBER, + ) + ), + ), + "expect_table_row_count_to_be_between": DataHubStdAssertion( + scope=DatasetAssertionScope.DATASET_ROWS, + operator=AssertionStdOperator.BETWEEN, + aggregation=AssertionStdAggregation.ROW_COUNT, + parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), + ), + } + + datasetAssertionInfo = DatasetAssertionInfo( + dataset=dataset, + fields=fields, + operator=AssertionStdOperator._NATIVE_, + aggregation=AssertionStdAggregation._NATIVE_, + nativeType=expectation_type, + nativeParameters={k: convert_to_string(v) for k, v in kwargs.items()}, + scope=DatasetAssertionScope.DATASET_ROWS, + ) + + if expectation_type in known_expectations.keys(): + assertion = known_expectations[expectation_type] + datasetAssertionInfo.scope = assertion.scope + datasetAssertionInfo.aggregation = assertion.aggregation + datasetAssertionInfo.operator = 
assertion.operator + datasetAssertionInfo.parameters = assertion.parameters + + # Heuristically mapping other expectations + else: + if "column" in kwargs and expectation_type.startswith( + "expect_column_value" + ): + datasetAssertionInfo.scope = DatasetAssertionScope.DATASET_COLUMN + datasetAssertionInfo.aggregation = AssertionStdAggregation.IDENTITY + elif "column" in kwargs: + datasetAssertionInfo.scope = DatasetAssertionScope.DATASET_COLUMN + datasetAssertionInfo.aggregation = AssertionStdAggregation._NATIVE_ + + return AssertionInfo( + type=AssertionType.DATASET, + datasetAssertion=datasetAssertionInfo, + customProperties={"expectation_suite_name": expectation_suite_name}, + ) + + def get_dataset_partitions(self, batch_identifier, data_asset): + dataset_partitions = [] + + logger.debug("Finding datasets being validated") + + # for now, we support only v3-api and sqlalchemy execution engine and Pandas engine + is_sql_alchemy = isinstance(data_asset, Validator) and ( + isinstance(data_asset.execution_engine, SqlAlchemyExecutionEngine) + ) + is_pandas = isinstance(data_asset.execution_engine, PandasExecutionEngine) + if is_sql_alchemy or is_pandas: + ge_batch_spec = data_asset.active_batch_spec + partitionSpec = None + batchSpecProperties = { + "data_asset_name": str( + data_asset.active_batch_definition.data_asset_name + ), + "datasource_name": str( + data_asset.active_batch_definition.datasource_name + ), + } + sqlalchemy_uri = None + if is_sql_alchemy and isinstance( + data_asset.execution_engine.engine, Engine + ): + sqlalchemy_uri = data_asset.execution_engine.engine.url + # For snowflake sqlalchemy_execution_engine.engine is actually instance of Connection + elif is_sql_alchemy and isinstance( + data_asset.execution_engine.engine, Connection + ): + sqlalchemy_uri = data_asset.execution_engine.engine.engine.url + + if isinstance(ge_batch_spec, SqlAlchemyDatasourceBatchSpec): + # e.g. 
ConfiguredAssetSqlDataConnector with splitter_method or sampling_method + schema_name = ge_batch_spec.get("schema_name") + table_name = ge_batch_spec.get("table_name") + + dataset_urn = make_dataset_urn_from_sqlalchemy_uri( + sqlalchemy_uri, + schema_name, + table_name, + self.env, + self.get_platform_instance( + data_asset.active_batch_definition.datasource_name + ), + self.exclude_dbname, + self.platform_alias, + self.convert_urns_to_lowercase, + ) + batchSpec = BatchSpec( + nativeBatchId=batch_identifier, + customProperties=batchSpecProperties, + ) + + splitter_method = ge_batch_spec.get("splitter_method") + if ( + splitter_method is not None + and splitter_method != "_split_on_whole_table" + ): + batch_identifiers = ge_batch_spec.get("batch_identifiers", {}) + partitionSpec = PartitionSpecClass( + partition=convert_to_string(batch_identifiers) + ) + sampling_method = ge_batch_spec.get("sampling_method", "") + if sampling_method == "_sample_using_limit": + batchSpec.limit = ge_batch_spec["sampling_kwargs"]["n"] + + dataset_partitions.append( + { + "dataset_urn": dataset_urn, + "partitionSpec": partitionSpec, + "batchSpec": batchSpec, + } + ) + elif isinstance(ge_batch_spec, RuntimeQueryBatchSpec): + if not self.parse_table_names_from_sql: + warn( + "Enable parse_table_names_from_sql in DatahubValidationAction config\ + to try to parse the tables being asserted from SQL query" + ) + return [] + query = data_asset.batches[ + batch_identifier + ].batch_request.runtime_parameters["query"] + partitionSpec = PartitionSpecClass( + type=PartitionTypeClass.QUERY, + partition=f"Query_{builder.datahub_guid(pre_json_transform(query))}", + ) + + batchSpec = BatchSpec( + nativeBatchId=batch_identifier, + query=query, + customProperties=batchSpecProperties, + ) + try: + tables = DefaultSQLParser(query).get_tables() + except Exception as e: + logger.warning(f"Sql parser failed on {query} with {e}") + tables = [] + + if len(set(tables)) != 1: + warn( + "DataHubValidationAction 
does not support cross dataset assertions." + ) + return [] + for table in tables: + dataset_urn = make_dataset_urn_from_sqlalchemy_uri( + sqlalchemy_uri, + None, + table, + self.env, + self.get_platform_instance( + data_asset.active_batch_definition.datasource_name + ), + self.exclude_dbname, + self.platform_alias, + self.convert_urns_to_lowercase, + ) + dataset_partitions.append( + { + "dataset_urn": dataset_urn, + "partitionSpec": partitionSpec, + "batchSpec": batchSpec, + } + ) + elif isinstance(ge_batch_spec, RuntimeDataBatchSpec): + data_platform = self.get_platform_instance( + data_asset.active_batch_definition.datasource_name + ) + dataset_urn = builder.make_dataset_urn_with_platform_instance( + platform=( + data_platform + if self.platform_alias is None + else self.platform_alias + ), + name=data_asset.active_batch_definition.datasource_name, + platform_instance="", + env=self.env, + ) + batchSpec = BatchSpec( + nativeBatchId=batch_identifier, + query="", + customProperties=batchSpecProperties, + ) + dataset_partitions.append( + { + "dataset_urn": dataset_urn, + "partitionSpec": partitionSpec, + "batchSpec": batchSpec, + } + ) + else: + warn( + "DataHubValidationAction does not recognize this GE batch spec type- {batch_spec_type}.".format( + batch_spec_type=type(ge_batch_spec) + ) + ) + else: + # TODO - v2-spec - SqlAlchemyDataset support + warn( + "DataHubValidationAction does not recognize this GE data asset type - {asset_type}. 
This is either using v2-api or execution engine other than sqlalchemy.".format( + asset_type=type(data_asset) + ) + ) + + return dataset_partitions + + def get_platform_instance(self, datasource_name): + if self.platform_instance_map and datasource_name in self.platform_instance_map: + return self.platform_instance_map[datasource_name] + else: + warn( + f"Datasource {datasource_name} is not present in platform_instance_map" + ) + return None + + +def parse_int_or_default(value, default_value=None): + if value is None: + return default_value + else: + return int(value) + + +def make_dataset_urn_from_sqlalchemy_uri( + sqlalchemy_uri, + schema_name, + table_name, + env, + platform_instance=None, + exclude_dbname=None, + platform_alias=None, + convert_urns_to_lowercase=False, +): + data_platform = get_platform_from_sqlalchemy_uri(str(sqlalchemy_uri)) + url_instance = make_url(sqlalchemy_uri) + + if schema_name is None and "." in table_name: + schema_name, table_name = table_name.split(".")[-2:] + + if data_platform in ["redshift", "postgres"]: + schema_name = schema_name or "public" + if url_instance.database is None: + warn( + f"DataHubValidationAction failed to locate database name for {data_platform}." + ) + return None + schema_name = ( + schema_name if exclude_dbname else f"{url_instance.database}.{schema_name}" + ) + elif data_platform == "mssql": + schema_name = schema_name or "dbo" + if url_instance.database is None: + warn( + f"DataHubValidationAction failed to locate database name for {data_platform}." 
+ ) + return None + schema_name = ( + schema_name if exclude_dbname else f"{url_instance.database}.{schema_name}" + ) + elif data_platform in ["trino", "snowflake"]: + if schema_name is None or url_instance.database is None: + warn( + "DataHubValidationAction failed to locate schema name and/or database name for {data_platform}.".format( + data_platform=data_platform + ) + ) + return None + # If data platform is snowflake, we artificially lowercase the Database name. + # This is because DataHub also does this during ingestion. + # Ref: https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py#L155 + database_name = ( + url_instance.database.lower() + if data_platform == "snowflake" + else url_instance.database + ) + if database_name.endswith(f"/{schema_name}"): + database_name = database_name[: -len(f"/{schema_name}")] + schema_name = ( + schema_name if exclude_dbname else f"{database_name}.{schema_name}" + ) + + elif data_platform == "bigquery": + if url_instance.host is None or url_instance.database is None: + warn( + "DataHubValidationAction failed to locate host and/or database name for {data_platform}. ".format( + data_platform=data_platform + ) + ) + return None + schema_name = f"{url_instance.host}.{url_instance.database}" + + schema_name = schema_name or url_instance.database + if schema_name is None: + warn( + f"DataHubValidationAction failed to locate schema name for {data_platform}." 
+ ) + return None + + dataset_name = f"{schema_name}.{table_name}" + + if convert_urns_to_lowercase: + dataset_name = dataset_name.lower() + + dataset_urn = builder.make_dataset_urn_with_platform_instance( + platform=data_platform if platform_alias is None else platform_alias, + name=dataset_name, + platform_instance=platform_instance, + env=env, + ) + + return dataset_urn + + +@dataclass +class DataHubStdAssertion: + scope: Union[str, DatasetAssertionScope] + operator: Union[str, AssertionStdOperator] + aggregation: Union[str, AssertionStdAggregation] + parameters: Optional[AssertionStdParameters] = None + + +class DecimalEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, Decimal): + return str(o) + return super().default(o) + + +def convert_to_string(var: Any) -> str: + try: + tmp = ( + str(var) + if isinstance(var, (str, int, float)) + else json.dumps(var, cls=DecimalEncoder) + ) + except TypeError as e: + logger.debug(e) + tmp = str(var) + return tmp + + +def warn(msg): + logger.warning(msg) diff --git a/metadata-ingestion-modules/gx-plugin/tests/__init__.py b/metadata-ingestion-modules/gx-plugin/tests/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion-modules/gx-plugin/tests/conftest.py b/metadata-ingestion-modules/gx-plugin/tests/conftest.py new file mode 100644 index 00000000000000..c99230fba30949 --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/tests/conftest.py @@ -0,0 +1 @@ +from datahub.testing.docker_utils import docker_compose_runner # noqa: F401 diff --git a/metadata-ingestion/tests/integration/great-expectations/docker-compose.yml b/metadata-ingestion-modules/gx-plugin/tests/integration/docker-compose.yml similarity index 100% rename from metadata-ingestion/tests/integration/great-expectations/docker-compose.yml rename to metadata-ingestion-modules/gx-plugin/tests/integration/docker-compose.yml diff --git 
a/metadata-ingestion/tests/integration/great-expectations/ge_mcps_golden.json b/metadata-ingestion-modules/gx-plugin/tests/integration/ge_mcps_golden.json similarity index 100% rename from metadata-ingestion/tests/integration/great-expectations/ge_mcps_golden.json rename to metadata-ingestion-modules/gx-plugin/tests/integration/ge_mcps_golden.json diff --git a/metadata-ingestion/tests/integration/great-expectations/ge_mcps_golden_2.json b/metadata-ingestion-modules/gx-plugin/tests/integration/ge_mcps_golden_2.json similarity index 100% rename from metadata-ingestion/tests/integration/great-expectations/ge_mcps_golden_2.json rename to metadata-ingestion-modules/gx-plugin/tests/integration/ge_mcps_golden_2.json diff --git a/metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/checkpoints/test_checkpoint.yml b/metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/checkpoints/test_checkpoint.yml similarity index 97% rename from metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/checkpoints/test_checkpoint.yml rename to metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/checkpoints/test_checkpoint.yml index 466cbfe39a4ab0..0e6fa886d57847 100644 --- a/metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/checkpoints/test_checkpoint.yml +++ b/metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/checkpoints/test_checkpoint.yml @@ -19,7 +19,7 @@ action_list: site_names: [] - name: datahub_action action: - module_name: datahub.integrations.great_expectations.action + module_name: datahub_gx_plugin.action class_name: DataHubValidationAction server_url: http://localhost:8080 graceful_exceptions: False diff --git a/metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/checkpoints/test_checkpoint_2.yml 
b/metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/checkpoints/test_checkpoint_2.yml similarity index 97% rename from metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/checkpoints/test_checkpoint_2.yml rename to metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/checkpoints/test_checkpoint_2.yml index 409d93f64db160..d0fa2a8c179920 100644 --- a/metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/checkpoints/test_checkpoint_2.yml +++ b/metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/checkpoints/test_checkpoint_2.yml @@ -19,7 +19,7 @@ action_list: site_names: [] - name: datahub_action action: - module_name: datahub.integrations.great_expectations.action + module_name: datahub_gx_plugin.action class_name: DataHubValidationAction server_url: http://localhost:8080 graceful_exceptions: False diff --git a/metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/expectations/.ge_store_backend_id b/metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/expectations/.ge_store_backend_id similarity index 100% rename from metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/expectations/.ge_store_backend_id rename to metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/expectations/.ge_store_backend_id diff --git a/metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/expectations/test_suite.json b/metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/expectations/test_suite.json similarity index 100% rename from metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/expectations/test_suite.json rename to metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/expectations/test_suite.json diff --git 
a/metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/great_expectations.yml b/metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/great_expectations.yml similarity index 100% rename from metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/great_expectations.yml rename to metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/great_expectations.yml diff --git a/metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css b/metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css similarity index 100% rename from metadata-ingestion/tests/integration/great-expectations/setup/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css rename to metadata-ingestion-modules/gx-plugin/tests/integration/setup/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css diff --git a/metadata-ingestion/tests/integration/great-expectations/setup/setup.sql b/metadata-ingestion-modules/gx-plugin/tests/integration/setup/setup.sql similarity index 100% rename from metadata-ingestion/tests/integration/great-expectations/setup/setup.sql rename to metadata-ingestion-modules/gx-plugin/tests/integration/setup/setup.sql diff --git a/metadata-ingestion/tests/integration/great-expectations/test_great_expectations.py b/metadata-ingestion-modules/gx-plugin/tests/integration/test_great_expectations.py similarity index 68% rename from metadata-ingestion/tests/integration/great-expectations/test_great_expectations.py rename to metadata-ingestion-modules/gx-plugin/tests/integration/test_great_expectations.py index 0bb87b993e6b06..b03681dc780584 100644 --- a/metadata-ingestion/tests/integration/great-expectations/test_great_expectations.py +++ 
b/metadata-ingestion-modules/gx-plugin/tests/integration/test_great_expectations.py @@ -1,17 +1,30 @@ +import os import shutil from typing import List from unittest import mock +import packaging.version import pytest -from freezegun import freeze_time -from great_expectations.data_context.data_context.file_data_context import ( - FileDataContext, -) - from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.sink.file import write_metadata_file -from tests.test_helpers import mce_helpers -from tests.test_helpers.docker_helpers import wait_for_port +from datahub.testing.compare_metadata_json import assert_metadata_files_equal +from datahub.testing.docker_utils import wait_for_port +from freezegun import freeze_time +from great_expectations.data_context import FileDataContext + +try: + from great_expectations import __version__ as GX_VERSION # type: ignore + + use_gx_folder = packaging.version.parse(GX_VERSION) > packaging.version.Version( + "0.17.0" + ) +except Exception: + use_gx_folder = False + + +def should_update_golden_file() -> bool: + return bool(os.getenv("DATAHUB_GOLDEN_FILE_UPDATE", False)) + FROZEN_TIME = "2021-12-28 12:00:00" @@ -40,12 +53,11 @@ def test_ge_ingest( docker_compose_runner, pytestconfig, tmp_path, - mock_time, checkpoint, golden_json, **kwargs, ): - test_resources_dir = pytestconfig.rootpath / "tests/integration/great-expectations" + test_resources_dir = pytestconfig.rootpath / "tests/integration" with docker_compose_runner( test_resources_dir / "docker-compose.yml", "great-expectations" @@ -57,18 +69,21 @@ def test_ge_ingest( emitter = MockDatahubEmitter("") mock_emit_mcp.side_effect = emitter.emit_mcp + gx_context_folder_name = "gx" if use_gx_folder else "great_expectations" shutil.copytree( test_resources_dir / "setup/great_expectations", - tmp_path / "great_expectations", + tmp_path / gx_context_folder_name, ) + context = FileDataContext.create(tmp_path) context.run_checkpoint(checkpoint_name=checkpoint) 
emitter.write_to_file(tmp_path / "ge_mcps.json") - mce_helpers.check_golden_file( - pytestconfig, + assert_metadata_files_equal( output_path=tmp_path / "ge_mcps.json", golden_path=test_resources_dir / golden_json, + copy_output=False, + update_golden=should_update_golden_file(), ignore_paths=[], ) diff --git a/metadata-ingestion/tests/unit/test_great_expectations_action.py b/metadata-ingestion-modules/gx-plugin/tests/unit/test_great_expectations_action.py similarity index 98% rename from metadata-ingestion/tests/unit/test_great_expectations_action.py rename to metadata-ingestion-modules/gx-plugin/tests/unit/test_great_expectations_action.py index 2e23949d296893..c870a4449abea1 100644 --- a/metadata-ingestion/tests/unit/test_great_expectations_action.py +++ b/metadata-ingestion-modules/gx-plugin/tests/unit/test_great_expectations_action.py @@ -4,6 +4,22 @@ import pandas as pd import pytest +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionResultClass, + AssertionResultTypeClass, + AssertionRunEventClass, + AssertionRunStatusClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionTypeClass, + BatchSpecClass, + DataPlatformInstanceClass, + DatasetAssertionInfoClass, + DatasetAssertionScopeClass, + PartitionSpecClass, +) from great_expectations.core.batch import Batch, BatchDefinition, BatchRequest from great_expectations.core.batch_spec import ( RuntimeDataBatchSpec, @@ -14,10 +30,7 @@ ) from great_expectations.core.id_dict import IDDict from great_expectations.core.run_identifier import RunIdentifier -from great_expectations.data_context import DataContext -from great_expectations.data_context.data_context.file_data_context import ( - FileDataContext, -) +from great_expectations.data_context import DataContext, FileDataContext from great_expectations.data_context.types.resource_identifiers import ( ExpectationSuiteIdentifier, ValidationResultIdentifier, 
@@ -33,23 +46,7 @@ ) from great_expectations.validator.validator import Validator -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.integrations.great_expectations.action import DataHubValidationAction -from datahub.metadata.schema_classes import ( - AssertionInfoClass, - AssertionResultClass, - AssertionResultTypeClass, - AssertionRunEventClass, - AssertionRunStatusClass, - AssertionStdParameterClass, - AssertionStdParametersClass, - AssertionTypeClass, - BatchSpecClass, - DataPlatformInstanceClass, - DatasetAssertionInfoClass, - DatasetAssertionScopeClass, - PartitionSpecClass, -) +from datahub_gx_plugin.action import DataHubValidationAction logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md index e0dbc7c8d4b145..b37c4e5ad96738 100644 --- a/metadata-ingestion/developing.md +++ b/metadata-ingestion/developing.md @@ -68,6 +68,18 @@ cd metadata-ingestion-modules/dagster-plugin source venv/bin/activate datahub version # should print "DataHub CLI version: unavailable (installed in develop mode)" ``` + +### (Optional) Set up your Python environment for developing on GX Plugin + +From the repository root: + +```shell +cd metadata-ingestion-modules/gx-plugin +../../gradlew :metadata-ingestion-modules:gx-plugin:installDev +source venv/bin/activate +datahub version # should print "DataHub CLI version: unavailable (installed in develop mode)" +``` + ### Common setup issues Common issues (click to expand): diff --git a/metadata-ingestion/integration_docs/great-expectations.md b/metadata-ingestion/integration_docs/great-expectations.md index 80f5bedf42661a..9a4097a8f3af35 100644 --- a/metadata-ingestion/integration_docs/great-expectations.md +++ b/metadata-ingestion/integration_docs/great-expectations.md @@ -23,7 +23,7 @@ This integration does not support 1. Install the required dependency in your Great Expectations environment. 
   ```shell
-   pip install 'acryl-datahub[great-expectations]'
+   pip install 'acryl-datahub-gx-plugin'
   ```
@@ -32,7 +32,7 @@ This integration does not support
    action_list:
      - name: datahub_action
        action:
-         module_name: datahub.integrations.great_expectations.action
+         module_name: datahub_gx_plugin.action
          class_name: DataHubValidationAction
          server_url: http://localhost:8080 #datahub server url
    ```
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 7fb83fb6a83253..88c60e00c2e901 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -8,7 +8,9 @@
 _version: str = package_metadata["__version__"]
 _self_pin = (
-    f"=={_version}" if not (_version.endswith("dev0") or "docker" in _version) else ""
+    f"=={_version}"
+    if not (_version.endswith(("dev0", "dev1")) or "docker" in _version)
+    else ""
 )
 base_requirements = {
@@ -173,7 +175,7 @@
     *sqlglot_lib,
     "GitPython>2",
     "python-liquid",
-    "deepmerge>=1.1.1"
+    "deepmerge>=1.1.1",
 }
 bigquery_common = {
@@ -181,6 +183,7 @@
     "google-cloud-logging<=3.5.0",
     "google-cloud-bigquery",
     "google-cloud-datacatalog>=1.5.0",
+    "google-cloud-resource-manager",
     "more-itertools>=8.12.0",
     "sqlalchemy-bigquery>=1.4.1",
 }
@@ -331,7 +334,9 @@
         "gql[requests]>=3.3.0",
     },
     "datahub": mysql | kafka_common,
-    "great-expectations": sql_common | sqllineage_lib,
+    "great-expectations": {
+        f"acryl-datahub-gx-plugin{_self_pin}",
+    },
     # Misc plugins.
     "sql-parser": sqlglot_lib,
     # Source plugins
@@ -481,6 +486,9 @@
     # The Airflow extra is only retained for compatibility, but new users should
     # be using the datahub-airflow-plugin package instead.
     "airflow",
+    # The great-expectations extra is only retained for compatibility, but new users should
+    # be using the datahub-gx-plugin package instead.
+    "great-expectations",
     # SQL Server ODBC requires additional drivers, and so we don't want to keep
     # it included in the default "all" installation.
"mssql-odbc", @@ -526,9 +534,12 @@ } -pytest_dep = "pytest>=6.2.2" -deepdiff_dep = "deepdiff" -test_api_requirements = {pytest_dep, deepdiff_dep, "PyYAML"} +test_api_requirements = { + "pytest>=6.2.2", + "deepdiff", + "PyYAML", + "pytest-docker>=1.1.0", +} debug_requirements = { "memray", @@ -550,12 +561,9 @@ "isort>=5.7.0", "mypy==1.10.1", *test_api_requirements, - pytest_dep, "pytest-asyncio>=0.16.0", "pytest-cov>=2.8.1", - "pytest-docker>=1.1.0", "pytest-random-order~=1.1.0", - deepdiff_dep, "requests-mock", "freezegun", "jsonpickle", @@ -589,7 +597,6 @@ "kafka", "datahub-rest", "datahub-lite", - "great-expectations", "presto", "redash", "redshift", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 7a96b2f0643ab0..0d73c9ad028972 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -113,8 +113,9 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX = "" self.bigquery_data_dictionary = BigQuerySchemaApi( - self.report.schema_api_perf, - self.config.get_bigquery_client(), + report=BigQueryV2Report().schema_api_perf, + projects_client=config.get_projects_client(), + client=config.get_bigquery_client(), ) if self.config.extract_policy_tags_from_catalog: self.bigquery_data_dictionary.datacatalog_client = ( @@ -257,14 +258,37 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: def _get_projects(self) -> List[BigqueryProject]: logger.info("Getting projects") + if self.config.project_ids or self.config.project_id: project_ids = self.config.project_ids or [self.config.project_id] # type: ignore return [ BigqueryProject(id=project_id, name=project_id) for project_id in project_ids ] - else: - return list(self._query_project_list()) + + if self.config.project_labels: + 
return list(self._query_project_list_from_labels()) + + return list(self._query_project_list()) + + def _query_project_list_from_labels(self) -> Iterable[BigqueryProject]: + projects = self.bigquery_data_dictionary.get_projects_with_labels( + self.config.project_labels + ) + + if not projects: # Report failure on exception and if empty list is returned + self.report.report_failure( + "metadata-extraction", + "Get projects didn't return any project with any of the specified label(s). " + "Maybe resourcemanager.projects.list permission is missing for the service account. " + "You can assign predefined roles/bigquery.metadataViewer role to your service account.", + ) + + for project in projects: + if self.config.project_id_pattern.allowed(project.id): + yield project + else: + self.report.report_dropped(project.id) def _query_project_list(self) -> Iterable[BigqueryProject]: try: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index fe961dbd780f6f..af9256d8877f50 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -3,7 +3,7 @@ from datetime import timedelta from typing import Any, Dict, List, Optional, Union -from google.cloud import bigquery, datacatalog_v1 +from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3 from google.cloud.logging_v2.client import Client as GCPLoggingClient from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator @@ -34,12 +34,16 @@ class BigQueryUsageConfig(BaseUsageConfig): max_query_duration: timedelta = Field( default=timedelta(minutes=15), - description="Correction to pad start_time and end_time with. 
For handling the case where the read happens within our time range but the query completion event is delayed and happens after the configured end time.", + description="Correction to pad start_time and end_time with. For handling the case where the read happens " + "within our time range but the query completion event is delayed and happens after the configured" + " end time.", ) apply_view_usage_to_tables: bool = Field( default=False, - description="Whether to apply view's usage to its base tables. If set to False, uses sql parser and applies usage to views / tables mentioned in the query. If set to True, usage is applied to base tables only.", + description="Whether to apply view's usage to its base tables. If set to False, uses sql parser and applies " + "usage to views / tables mentioned in the query. If set to True, usage is applied to base tables " + "only.", ) @@ -74,6 +78,9 @@ def get_bigquery_client(self) -> bigquery.Client: client_options = self.extra_client_options return bigquery.Client(self.project_on_behalf, **client_options) + def get_projects_client(self) -> resourcemanager_v3.ProjectsClient: + return resourcemanager_v3.ProjectsClient() + def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient: return datacatalog_v1.PolicyTagManagerClient() @@ -143,12 +150,14 @@ class BigQueryV2Config( dataset_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), - description="Regex patterns for dataset to filter in ingestion. Specify regex to only match the schema name. e.g. to match all tables in schema analytics, use the regex 'analytics'", + description="Regex patterns for dataset to filter in ingestion. Specify regex to only match the schema name. " + "e.g. 
to match all tables in schema analytics, use the regex 'analytics'", ) match_fully_qualified_names: bool = Field( default=True, - description="[deprecated] Whether `dataset_pattern` is matched against fully qualified dataset name `.`.", + description="[deprecated] Whether `dataset_pattern` is matched against fully qualified dataset name " + "`.`.", ) include_external_url: bool = Field( @@ -169,7 +178,9 @@ class BigQueryV2Config( table_snapshot_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), - description="Regex patterns for table snapshots to filter in ingestion. Specify regex to match the entire snapshot name in database.schema.snapshot format. e.g. to match all snapshots starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'", + description="Regex patterns for table snapshots to filter in ingestion. Specify regex to match the entire " + "snapshot name in database.schema.snapshot format. e.g. to match all snapshots starting with " + "customer in Customer database and public schema, use the regex 'Customer.public.customer.*'", ) debug_include_full_payloads: bool = Field( @@ -180,17 +191,22 @@ class BigQueryV2Config( number_of_datasets_process_in_batch: int = Field( hidden_from_docs=True, default=10000, - description="Number of table queried in batch when getting metadata. This is a low level config property which should be touched with care.", + description="Number of table queried in batch when getting metadata. This is a low level config property " + "which should be touched with care.", ) number_of_datasets_process_in_batch_if_profiling_enabled: int = Field( default=1000, - description="Number of partitioned table queried in batch when getting metadata. This is a low level config property which should be touched with care. 
This restriction is needed because we query partitions system view which throws error if we try to touch too many tables.", + description="Number of partitioned table queried in batch when getting metadata. This is a low level config " + "property which should be touched with care. This restriction is needed because we query " + "partitions system view which throws error if we try to touch too many tables.", ) use_tables_list_query_v2: bool = Field( default=False, - description="List tables using an improved query that extracts partitions and last modified timestamps more accurately. Requires the ability to read table data. Automatically enabled when profiling is enabled.", + description="List tables using an improved query that extracts partitions and last modified timestamps more " + "accurately. Requires the ability to read table data. Automatically enabled when profiling is " + "enabled.", ) @property @@ -199,7 +215,9 @@ def have_table_data_read_permission(self) -> bool: column_limit: int = Field( default=300, - description="Maximum number of columns to process in a table. This is a low level config property which should be touched with care. This restriction is needed because excessively wide tables can result in failure to ingest the schema.", + description="Maximum number of columns to process in a table. This is a low level config property which " + "should be touched with care. This restriction is needed because excessively wide tables can " + "result in failure to ingest the schema.", ) # The inheritance hierarchy is wonky here, but these options need modifications. project_id: Optional[str] = Field( @@ -214,6 +232,15 @@ def have_table_data_read_permission(self) -> bool: "Overrides `project_id_pattern`." ), ) + project_labels: List[str] = Field( + default_factory=list, + description=( + "Ingests projects with the specified labels. Set value in the format of `key:value`. 
Use this property to "
+            "define which projects to ingest based "
+            "on project-level labels. If project_ids or project_id is set, this configuration has no effect. The "
+            "ingestion process filters projects by label first, and then applies the project_id_pattern."
+        ),
+    )
     storage_project_id: None = Field(default=None, hidden_from_docs=True)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
index 4cfcc3922ddc3d..807e99604f0133 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -31,6 +31,7 @@ class BigQuerySchemaApiPerfReport(Report):
     num_get_snapshots_for_dataset_api_requests: int = 0
     list_projects: PerfTimer = field(default_factory=PerfTimer)
+    list_projects_with_labels: PerfTimer = field(default_factory=PerfTimer)
     list_datasets: PerfTimer = field(default_factory=PerfTimer)
     get_columns_for_dataset_sec: float = 0
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
index d73ac46c862ea1..4326ff7a35527f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, Iterable, Iterator, List, Optional
 from google.api_core import retry
-from google.cloud import bigquery, datacatalog_v1
+from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
 from google.cloud.bigquery.table import (
     RowIterator,
     TableListItem,
@@ -144,9 +144,11 @@ def __init__(
         self,
         report: BigQuerySchemaApiPerfReport,
         client: bigquery.Client,
+        projects_client: resourcemanager_v3.ProjectsClient,
         datacatalog_client: Optional[datacatalog_v1.PolicyTagManagerClient] =
None, ) -> None: self.bq_client = client + self.projects_client = projects_client self.report = report self.datacatalog_client = datacatalog_client @@ -175,7 +177,7 @@ def _should_retry(exc: BaseException) -> bool: # 'Quota exceeded: Your user exceeded quota for concurrent project.lists requests.' # Hence, added the api request retry of 15 min. # We already tried adding rate_limit externally, proving max_result and page_size - # to restrict the request calls inside list_project but issue still occured. + # to restrict the request calls inside list_project but issue still occurred. projects_iterator = self.bq_client.list_projects( max_results=max_results_per_page, page_token=page_token, @@ -202,6 +204,26 @@ def _should_retry(exc: BaseException) -> bool: return [] return projects + def get_projects_with_labels(self, labels: List[str]) -> List[BigqueryProject]: + with self.report.list_projects_with_labels: + try: + projects = [] + labels_query = " OR ".join([f"labels.{label}" for label in labels]) + for project in self.projects_client.search_projects(query=labels_query): + projects.append( + BigqueryProject( + id=project.project_id, name=project.display_name + ) + ) + + return projects + + except Exception as e: + logger.error( + f"Error getting projects with labels: {labels}. 
{e}", exc_info=True + ) + return [] + def get_datasets_for_project_id( self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py index 3aac78c154b2ee..e21aadd91d7d52 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py @@ -96,7 +96,9 @@ def metadata_read_capability_test( client: bigquery.Client = config.get_bigquery_client() assert client bigquery_data_dictionary = BigQuerySchemaApi( - BigQueryV2Report().schema_api_perf, client + report=BigQueryV2Report().schema_api_perf, + projects_client=config.get_projects_client(), + client=client, ) result = bigquery_data_dictionary.get_datasets_for_project_id( project_id, 10 diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 496bd64d3b4fe2..9d156914917402 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -479,7 +479,9 @@ def lineage_via_catalog_lineage_api( lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() data_dictionary = BigQuerySchemaApi( - self.report.schema_api_perf, self.config.get_bigquery_client() + self.report.schema_api_perf, + self.config.get_bigquery_client(), + self.config.get_projects_client(), ) # Filtering datasets diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_constant.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_constant.py index 5f47d361abb37c..920efeaa709e59 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_constant.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_constant.py @@ -1,4 +1,7 @@ IMPORTED_PROJECTS = "imported_projects" +DIMENSIONS = "dimensions" +MEASURES = "measures" +DIMENSION_GROUPS = "dimension_groups" SQL_TABLE_NAME = "sql_table_name" DATAHUB_TRANSFORMED_SQL_TABLE_NAME = "datahub_transformed_sql_table_name" DERIVED_TABLE = "derived_table" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py index 69b9f842ac14db..bf24f4b84679b1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py @@ -8,6 +8,11 @@ find_view_from_resolved_includes, ) from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition +from datahub.ingestion.source.looker.looker_constant import ( + DIMENSION_GROUPS, + DIMENSIONS, + MEASURES, +) from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.lookml_config import ( @@ -23,6 +28,39 @@ logger = logging.getLogger(__name__) +def merge_parent_and_child_fields( + child_fields: List[dict], parent_fields: List[dict] +) -> List[Dict]: + # Fetch the fields from the parent view, i.e., the view name mentioned in view.extends, and include those + # fields in child_fields. This inclusion will resolve the fields according to the precedence rules mentioned + # in the LookML documentation: https://cloud.google.com/looker/docs/reference/param-view-extends. 
+ + # Create a map field-name vs field + child_field_map: dict = {} + for field in child_fields: + assert ( + NAME in field + ), "A lookml view must have a name field" # name is required field of lookml field array + + child_field_map[field[NAME]] = field + + for field in parent_fields: + assert ( + NAME in field + ), "A lookml view must have a name field" # name is required field of lookml field array + + if field[NAME] in child_field_map: + # Fields defined in the child view take higher precedence. + # This is an override case where the child has redefined the parent field. + # There are some additive attributes; however, we are not consuming them in metadata ingestion + # and hence not adding them to the child field. + continue + + child_fields.append(field) + + return child_fields + + class LookerFieldContext: raw_field: Dict[Any, Any] @@ -248,23 +286,21 @@ def resolve_extends_view_name( ) return None - def get_including_extends( + def _get_parent_attribute( self, - field: str, + attribute_name: str, ) -> Optional[Any]: + """ + Search for the attribute_name in the parent views of the current view and return its value. + """ extends = list( itertools.chain.from_iterable( self.raw_view.get("extends", self.raw_view.get("extends__all", [])) ) ) - # First, check the current view. - if field in self.raw_view: - return self.raw_view[field] - - # The field might be defined in another view and this view is extending that view, - # so we resolve this field while taking that into account. - # following Looker's precedence rules. + # Following Looker's precedence rules. 
+ # reversed the view-names mentioned in `extends` attribute for extend in reversed(extends): assert extend != self.raw_view[NAME], "a view cannot extend itself" extend_view = self.resolve_extends_view_name( @@ -275,8 +311,33 @@ def get_including_extends( f"failed to resolve extends view {extend} in view {self.raw_view[NAME]} of" f" file {self.view_file.absolute_file_path}" ) - if field in extend_view: - return extend_view[field] + if attribute_name in extend_view: + return extend_view[attribute_name] + + return None + + def get_including_extends( + self, + field: str, + ) -> Optional[Any]: + + # According to Looker's inheritance rules, we need to merge the fields(i.e. dimensions, measures and + # dimension_groups) from both the child and parent. + if field in [DIMENSIONS, DIMENSION_GROUPS, MEASURES]: + # Get the child fields + child_fields = self._get_list_dict(field) + # merge parent and child fields + return merge_parent_and_child_fields( + child_fields=child_fields, + parent_fields=self._get_parent_attribute(attribute_name=field) or [], + ) + else: + # Return the field from the current view if it exists. 
+ if field in self.raw_view: + return self.raw_view[field] + + # The field might be defined in another view, and this view is extending that view, + return self._get_parent_attribute(field) return None @@ -383,13 +444,13 @@ def _get_list_dict(self, attribute_name: str) -> List[Dict]: return [] def dimensions(self) -> List[Dict]: - return self._get_list_dict("dimensions") + return self.get_including_extends(field=DIMENSIONS) or [] def measures(self) -> List[Dict]: - return self._get_list_dict("measures") + return self.get_including_extends(field=MEASURES) or [] def dimension_groups(self) -> List[Dict]: - return self._get_list_dict("dimension_groups") + return self.get_including_extends(field=DIMENSION_GROUPS) or [] def is_materialized_derived_view(self) -> bool: for k in self.derived_table(): @@ -433,7 +494,7 @@ def is_sql_based_derived_case(self) -> bool: return False def is_native_derived_case(self) -> bool: - # It is pattern 5 + # It is pattern 5, mentioned in Class documentation if ( "derived_table" in self.raw_view and "explore_source" in self.raw_view["derived_table"] @@ -443,7 +504,7 @@ def is_native_derived_case(self) -> bool: return False def is_sql_based_derived_view_without_fields_case(self) -> bool: - # Pattern 6 + # Pattern 6, mentioned in Class documentation fields: List[Dict] = [] fields.extend(self.dimensions()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py index d5929b52aea3a3..0917a9e9faafee 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py @@ -385,7 +385,7 @@ def get_upstream_column_ref( config=self.config, ) - return upstreams_column_refs + return _drop_hive_dot_from_upstream(upstreams_column_refs) def get_upstream_dataset_urn(self) -> List[Urn]: return self._get_upstream_dataset_urn() diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index 7ce3b5bc34da2f..e4dadaf602852c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -18,9 +18,13 @@ from datahub.emitter.mce_builder import ( make_data_platform_urn, make_dataplatform_instance_urn, - make_dataset_urn_with_platform_instance, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.mcp_builder import ( + DatabaseKey, + add_dataset_to_container, + gen_containers, +) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SourceCapability, @@ -32,6 +36,7 @@ ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes from datahub.ingestion.source.schema_inference.object import ( SchemaDescription, construct_schema, @@ -64,6 +69,7 @@ DataPlatformInstanceClass, DatasetPropertiesClass, ) +from datahub.metadata.urns import DatasetUrn logger = logging.getLogger(__name__) @@ -263,6 +269,7 @@ class MongoDBSource(StatefulIngestionSourceBase): config: MongoDBConfig report: MongoDBSourceReport mongo_client: MongoClient + platform: str = "mongodb" def __init__(self, ctx: PipelineContext, config: MongoDBConfig): super().__init__(config, ctx) @@ -282,7 +289,9 @@ def __init__(self, ctx: PipelineContext, config: MongoDBConfig): } # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes - self.mongo_client = MongoClient(self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options) # type: ignore + self.mongo_client = MongoClient( + self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options + ) # type: ignore # This cheaply tests the connection. 
For details, see # https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient @@ -351,8 +360,6 @@ def get_field_type( return SchemaFieldDataType(type=TypeClass()) def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - platform = "mongodb" - database_names: List[str] = self.mongo_client.list_database_names() # traverse databases in sorted order so output is consistent @@ -364,8 +371,19 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: continue database = self.mongo_client[database_name] - collection_names: List[str] = database.list_collection_names() + database_key = DatabaseKey( + database=database_name, + platform=self.platform, + instance=self.config.platform_instance, + env=self.config.env, + ) + yield from gen_containers( + container_key=database_key, + name=database_name, + sub_types=[DatasetContainerSubTypes.DATABASE], + ) + collection_names: List[str] = database.list_collection_names() # traverse collections in sorted order so output is consistent for collection_name in sorted(collection_names): dataset_name = f"{database_name}.{collection_name}" @@ -374,9 +392,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.report_dropped(dataset_name) continue - dataset_urn = make_dataset_urn_with_platform_instance( - platform=platform, - name=dataset_name, + dataset_urn = DatasetUrn.create_from_ids( + platform_id=self.platform, + table_name=dataset_name, env=self.config.env, platform_instance=self.config.platform_instance, ) @@ -385,9 +403,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: data_platform_instance = None if self.config.platform_instance: data_platform_instance = DataPlatformInstanceClass( - platform=make_data_platform_urn(platform), + platform=make_data_platform_urn(self.platform), instance=make_dataplatform_instance_urn( - platform, self.config.platform_instance + self.platform, self.config.platform_instance ), ) @@ -397,83 +415,21 @@ 
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) schema_metadata: Optional[SchemaMetadata] = None - if self.config.enableSchemaInference: - assert self.config.maxDocumentSize is not None - collection_schema = construct_schema_pymongo( - database[collection_name], - delimiter=".", - use_random_sampling=self.config.useRandomSampling, - max_document_size=self.config.maxDocumentSize, - should_add_document_size_filter=self.should_add_document_size_filter(), - sample_size=self.config.schemaSamplingSize, - ) - - # initialize the schema for the collection - canonical_schema: List[SchemaField] = [] - max_schema_size = self.config.maxSchemaSize - collection_schema_size = len(collection_schema.values()) - collection_fields: Union[ - List[SchemaDescription], ValuesView[SchemaDescription] - ] = collection_schema.values() - assert max_schema_size is not None - if collection_schema_size > max_schema_size: - # downsample the schema, using frequency as the sort key - self.report.report_warning( - title="Too many schema fields", - message=f"Downsampling the collection schema because it has too many schema fields. 
Configured threshold is {max_schema_size}", - context=f"Schema Size: {collection_schema_size}, Collection: {dataset_urn}", - ) - # Add this information to the custom properties so user can know they are looking at downsampled schema - dataset_properties.customProperties[ - "schema.downsampled" - ] = "True" - dataset_properties.customProperties[ - "schema.totalFields" - ] = f"{collection_schema_size}" - - logger.debug( - f"Size of collection fields = {len(collection_fields)}" - ) - # append each schema field (sort so output is consistent) - for schema_field in sorted( - collection_fields, - key=lambda x: ( - -x["count"], - x["delimited_name"], - ), # Negate `count` for descending order, `delimited_name` stays the same for ascending - )[0:max_schema_size]: - field = SchemaField( - fieldPath=schema_field["delimited_name"], - nativeDataType=self.get_pymongo_type_string( - schema_field["type"], dataset_name - ), - type=self.get_field_type( - schema_field["type"], dataset_name - ), - description=None, - nullable=schema_field["nullable"], - recursive=False, - ) - canonical_schema.append(field) - - # create schema metadata object for collection - schema_metadata = SchemaMetadata( - schemaName=collection_name, - platform=f"urn:li:dataPlatform:{platform}", - version=0, - hash="", - platformSchema=SchemalessClass(), - fields=canonical_schema, + schema_metadata = self._infer_schema_metadata( + collection=database[collection_name], + dataset_urn=dataset_urn, + dataset_properties=dataset_properties, ) # TODO: use list_indexes() or index_information() to get index information # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes. 
+ yield from add_dataset_to_container(database_key, dataset_urn.urn()) yield from [ mcp.as_workunit() for mcp in MetadataChangeProposalWrapper.construct_many( - entityUrn=dataset_urn, + entityUrn=dataset_urn.urn(), aspects=[ schema_metadata, dataset_properties, @@ -482,6 +438,74 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) ] + def _infer_schema_metadata( + self, + collection: pymongo.collection.Collection, + dataset_urn: DatasetUrn, + dataset_properties: DatasetPropertiesClass, + ) -> SchemaMetadata: + assert self.config.maxDocumentSize is not None + collection_schema = construct_schema_pymongo( + collection, + delimiter=".", + use_random_sampling=self.config.useRandomSampling, + max_document_size=self.config.maxDocumentSize, + should_add_document_size_filter=self.should_add_document_size_filter(), + sample_size=self.config.schemaSamplingSize, + ) + + # initialize the schema for the collection + canonical_schema: List[SchemaField] = [] + max_schema_size = self.config.maxSchemaSize + collection_schema_size = len(collection_schema.values()) + collection_fields: Union[ + List[SchemaDescription], ValuesView[SchemaDescription] + ] = collection_schema.values() + assert max_schema_size is not None + if collection_schema_size > max_schema_size: + # downsample the schema, using frequency as the sort key + self.report.report_warning( + title="Too many schema fields", + message=f"Downsampling the collection schema because it has too many schema fields. 
Configured threshold is {max_schema_size}", + context=f"Schema Size: {collection_schema_size}, Collection: {dataset_urn}", + ) + # Add this information to the custom properties so user can know they are looking at downsampled schema + dataset_properties.customProperties["schema.downsampled"] = "True" + dataset_properties.customProperties[ + "schema.totalFields" + ] = f"{collection_schema_size}" + + logger.debug(f"Size of collection fields = {len(collection_fields)}") + # append each schema field (sort so output is consistent) + for schema_field in sorted( + collection_fields, + key=lambda x: ( + -x["count"], + x["delimited_name"], + ), # Negate `count` for descending order, `delimited_name` stays the same for ascending + )[0:max_schema_size]: + field = SchemaField( + fieldPath=schema_field["delimited_name"], + nativeDataType=self.get_pymongo_type_string( + schema_field["type"], dataset_urn.name + ), + type=self.get_field_type(schema_field["type"], dataset_urn.name), + description=None, + nullable=schema_field["nullable"], + recursive=False, + ) + canonical_schema.append(field) + + # create schema metadata object for collection + return SchemaMetadata( + schemaName=collection.name, + platform=f"urn:li:dataPlatform:{self.platform}", + version=0, + hash="", + platformSchema=SchemalessClass(), + fields=canonical_schema, + ) + def is_server_version_gte_4_4(self) -> bool: try: server_version = self.mongo_client.server_info().get("versionArray") diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py index 2e628269edbc37..594f88dd521ad5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py @@ -313,23 +313,19 @@ def get_table_stats(enriched_tables, field_names, schema, table): size_in_bytes: Optional[int] = None rows_count: Optional[int] = None if 
schema in enriched_tables and table_name in enriched_tables[schema]: - if enriched_tables[schema][table_name].last_accessed is not None: - # Mypy seems to be not clever enough to understand the above check - last_accessed = enriched_tables[schema][table_name].last_accessed - assert last_accessed + if ( + last_accessed := enriched_tables[schema][table_name].last_accessed + ) is not None: last_altered = last_accessed.replace(tzinfo=timezone.utc) elif creation_time: last_altered = creation_time - if enriched_tables[schema][table_name].size is not None: - # Mypy seems to be not clever enough to understand the above check - size = enriched_tables[schema][table_name].size - if size: - size_in_bytes = size * 1024 * 1024 + if (size := enriched_tables[schema][table_name].size) is not None: + size_in_bytes = size * 1024 * 1024 - if enriched_tables[schema][table_name].estimated_visible_rows is not None: - rows = enriched_tables[schema][table_name].estimated_visible_rows - assert rows + if ( + rows := enriched_tables[schema][table_name].estimated_visible_rows + ) is not None: rows_count = int(rows) else: # The object was not found in the enriched data. 
diff --git a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py index 94501b0d499b75..cdc8c8268b4883 100644 --- a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py +++ b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py @@ -1,867 +1,3 @@ -from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED +from datahub_gx_plugin.action import DataHubValidationAction -import json -import logging -import sys -import time -from dataclasses import dataclass -from datetime import timezone -from decimal import Decimal -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union - -from great_expectations.checkpoint.actions import ValidationAction -from great_expectations.core.batch import Batch -from great_expectations.core.batch_spec import ( - RuntimeDataBatchSpec, - RuntimeQueryBatchSpec, - SqlAlchemyDatasourceBatchSpec, -) -from great_expectations.core.expectation_validation_result import ( - ExpectationSuiteValidationResult, -) -from great_expectations.data_asset.data_asset import DataAsset -from great_expectations.data_context.data_context import DataContext -from great_expectations.data_context.types.resource_identifiers import ( - ExpectationSuiteIdentifier, - ValidationResultIdentifier, -) -from great_expectations.execution_engine import PandasExecutionEngine -from great_expectations.execution_engine.sqlalchemy_execution_engine import ( - SqlAlchemyExecutionEngine, -) -from great_expectations.validator.validator import Validator -from sqlalchemy.engine.base import Connection, Engine -from sqlalchemy.engine.url import make_url - -import datahub.emitter.mce_builder as builder -from datahub.cli.env_utils import get_boolean_env_variable -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.emitter.serialization_helper import 
pre_json_transform -from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( - get_platform_from_sqlalchemy_uri, -) -from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( - AssertionInfo, - AssertionResult, - AssertionResultType, - AssertionRunEvent, - AssertionRunStatus, - AssertionStdAggregation, - AssertionStdOperator, - AssertionStdParameter, - AssertionStdParameters, - AssertionStdParameterType, - AssertionType, - BatchSpec, - DatasetAssertionInfo, - DatasetAssertionScope, -) -from datahub.metadata.com.linkedin.pegasus2avro.common import DataPlatformInstance -from datahub.metadata.schema_classes import PartitionSpecClass, PartitionTypeClass -from datahub.utilities.sql_parser import DefaultSQLParser - -if TYPE_CHECKING: - from great_expectations.data_context.types.resource_identifiers import ( - GXCloudIdentifier, - ) - -assert MARKUPSAFE_PATCHED -logger = logging.getLogger(__name__) -if get_boolean_env_variable("DATAHUB_DEBUG", False): - handler = logging.StreamHandler(stream=sys.stdout) - logger.addHandler(handler) - logger.setLevel(logging.DEBUG) - -GE_PLATFORM_NAME = "great-expectations" - - -class DataHubValidationAction(ValidationAction): - def __init__( - self, - data_context: DataContext, - server_url: str, - env: str = builder.DEFAULT_ENV, - platform_alias: Optional[str] = None, - platform_instance_map: Optional[Dict[str, str]] = None, - graceful_exceptions: bool = True, - token: Optional[str] = None, - timeout_sec: Optional[float] = None, - retry_status_codes: Optional[List[int]] = None, - retry_max_times: Optional[int] = None, - extra_headers: Optional[Dict[str, str]] = None, - exclude_dbname: Optional[bool] = None, - parse_table_names_from_sql: bool = False, - convert_urns_to_lowercase: bool = False, - ): - super().__init__(data_context) - self.server_url = server_url - self.env = env - self.platform_alias = platform_alias - self.platform_instance_map = platform_instance_map - self.graceful_exceptions = graceful_exceptions - 
self.token = token - self.timeout_sec = timeout_sec - self.retry_status_codes = retry_status_codes - self.retry_max_times = retry_max_times - self.extra_headers = extra_headers - self.exclude_dbname = exclude_dbname - self.parse_table_names_from_sql = parse_table_names_from_sql - self.convert_urns_to_lowercase = convert_urns_to_lowercase - - def _run( - self, - validation_result_suite: ExpectationSuiteValidationResult, - validation_result_suite_identifier: Union[ - ValidationResultIdentifier, "GXCloudIdentifier" - ], - data_asset: Union[Validator, DataAsset, Batch], - payload: Optional[Any] = None, - expectation_suite_identifier: Optional[ExpectationSuiteIdentifier] = None, - checkpoint_identifier: Optional[Any] = None, - ) -> Dict: - datasets = [] - try: - emitter = DatahubRestEmitter( - gms_server=self.server_url, - token=self.token, - read_timeout_sec=self.timeout_sec, - connect_timeout_sec=self.timeout_sec, - retry_status_codes=self.retry_status_codes, - retry_max_times=self.retry_max_times, - extra_headers=self.extra_headers, - ) - - expectation_suite_name = validation_result_suite.meta.get( - "expectation_suite_name" - ) - run_id = validation_result_suite.meta.get("run_id") - if hasattr(data_asset, "active_batch_id"): - batch_identifier = data_asset.active_batch_id - else: - batch_identifier = data_asset.batch_id - - if isinstance( - validation_result_suite_identifier, ValidationResultIdentifier - ): - expectation_suite_name = ( - validation_result_suite_identifier.expectation_suite_identifier.expectation_suite_name - ) - run_id = validation_result_suite_identifier.run_id - batch_identifier = validation_result_suite_identifier.batch_identifier - - # Returns datasets and corresponding batch requests - datasets = self.get_dataset_partitions(batch_identifier, data_asset) - - if len(datasets) == 0 or datasets[0]["dataset_urn"] is None: - warn("Metadata not sent to datahub. 
No datasets found.") - return {"datahub_notification_result": "none required"} - - # Returns assertion info and assertion results - assertions = self.get_assertions_with_results( - validation_result_suite, - expectation_suite_name, - run_id, - payload, - datasets, - ) - - logger.info("Sending metadata to datahub ...") - logger.info("Dataset URN - {urn}".format(urn=datasets[0]["dataset_urn"])) - - for assertion in assertions: - logger.info( - "Assertion URN - {urn}".format(urn=assertion["assertionUrn"]) - ) - - # Construct a MetadataChangeProposalWrapper object. - assertion_info_mcp = MetadataChangeProposalWrapper( - entityUrn=assertion["assertionUrn"], - aspect=assertion["assertionInfo"], - ) - emitter.emit_mcp(assertion_info_mcp) - - # Construct a MetadataChangeProposalWrapper object. - assertion_platform_mcp = MetadataChangeProposalWrapper( - entityUrn=assertion["assertionUrn"], - aspect=assertion["assertionPlatform"], - ) - emitter.emit_mcp(assertion_platform_mcp) - - for assertionResult in assertion["assertionResults"]: - dataset_assertionResult_mcp = MetadataChangeProposalWrapper( - entityUrn=assertionResult.assertionUrn, - aspect=assertionResult, - ) - - # Emit Result! 
(timeseries aspect) - emitter.emit_mcp(dataset_assertionResult_mcp) - logger.info("Metadata sent to datahub.") - result = "DataHub notification succeeded" - except Exception as e: - result = "DataHub notification failed" - if self.graceful_exceptions: - logger.error(e) - logger.info("Suppressing error because graceful_exceptions is set") - else: - raise - - return {"datahub_notification_result": result} - - def get_assertions_with_results( - self, - validation_result_suite, - expectation_suite_name, - run_id, - payload, - datasets, - ): - dataPlatformInstance = DataPlatformInstance( - platform=builder.make_data_platform_urn(GE_PLATFORM_NAME) - ) - docs_link = None - if payload: - # process the payload - for action_names in payload.keys(): - if payload[action_names]["class"] == "UpdateDataDocsAction": - data_docs_pages = payload[action_names] - for docs_link_key, docs_link_val in data_docs_pages.items(): - if "file://" not in docs_link_val and docs_link_key != "class": - docs_link = docs_link_val - - assertions_with_results = [] - for result in validation_result_suite.results: - expectation_config = result["expectation_config"] - expectation_type = expectation_config["expectation_type"] - success = bool(result["success"]) - kwargs = { - k: v for k, v in expectation_config["kwargs"].items() if k != "batch_id" - } - - result = result["result"] - assertion_datasets = [d["dataset_urn"] for d in datasets] - if len(datasets) == 1 and "column" in kwargs: - assertion_fields = [ - builder.make_schema_field_urn( - datasets[0]["dataset_urn"], kwargs["column"] - ) - ] - else: - assertion_fields = None # type:ignore - - # Be careful what fields to consider for creating assertion urn. 
- # Any change in fields below would lead to a new assertion - # FIXME - Currently, when using evaluation parameters, new assertion is - # created when runtime resolved kwargs are different, - # possibly for each validation run - assertionUrn = builder.make_assertion_urn( - builder.datahub_guid( - pre_json_transform( - { - "platform": GE_PLATFORM_NAME, - "nativeType": expectation_type, - "nativeParameters": kwargs, - "dataset": assertion_datasets[0], - "fields": assertion_fields, - } - ) - ) - ) - logger.debug( - "GE expectation_suite_name - {name}, expectation_type - {type}, Assertion URN - {urn}".format( - name=expectation_suite_name, type=expectation_type, urn=assertionUrn - ) - ) - assertionInfo: AssertionInfo = self.get_assertion_info( - expectation_type, - kwargs, - assertion_datasets[0], - assertion_fields, - expectation_suite_name, - ) - - # TODO: Understand why their run time is incorrect. - run_time = run_id.run_time.astimezone(timezone.utc) - evaluation_parameters = ( - { - k: convert_to_string(v) - for k, v in validation_result_suite.evaluation_parameters.items() - if k and v - } - if validation_result_suite.evaluation_parameters - else None - ) - - nativeResults = { - k: convert_to_string(v) - for k, v in result.items() - if ( - k - in [ - "observed_value", - "partial_unexpected_list", - "partial_unexpected_counts", - "details", - ] - and v - ) - } - - actualAggValue = ( - result.get("observed_value") - if isinstance(result.get("observed_value"), (int, float)) - else None - ) - - ds = datasets[0] - # https://docs.greatexpectations.io/docs/reference/expectations/result_format/ - assertionResult = AssertionRunEvent( - timestampMillis=int(round(time.time() * 1000)), - assertionUrn=assertionUrn, - asserteeUrn=ds["dataset_urn"], - runId=run_time.strftime("%Y-%m-%dT%H:%M:%SZ"), - result=AssertionResult( - type=AssertionResultType.SUCCESS - if success - else AssertionResultType.FAILURE, - rowCount=parse_int_or_default(result.get("element_count")), - 
missingCount=parse_int_or_default(result.get("missing_count")), - unexpectedCount=parse_int_or_default( - result.get("unexpected_count") - ), - actualAggValue=actualAggValue, - externalUrl=docs_link, - nativeResults=nativeResults, - ), - batchSpec=ds["batchSpec"], - status=AssertionRunStatus.COMPLETE, - runtimeContext=evaluation_parameters, - ) - if ds.get("partitionSpec") is not None: - assertionResult.partitionSpec = ds.get("partitionSpec") - assertionResults = [assertionResult] - assertions_with_results.append( - { - "assertionUrn": assertionUrn, - "assertionInfo": assertionInfo, - "assertionPlatform": dataPlatformInstance, - "assertionResults": assertionResults, - } - ) - return assertions_with_results - - def get_assertion_info( - self, expectation_type, kwargs, dataset, fields, expectation_suite_name - ): - # TODO - can we find exact type of min and max value - def get_min_max(kwargs, type=AssertionStdParameterType.UNKNOWN): - return AssertionStdParameters( - minValue=AssertionStdParameter( - value=convert_to_string(kwargs.get("min_value")), - type=type, - ), - maxValue=AssertionStdParameter( - value=convert_to_string(kwargs.get("max_value")), - type=type, - ), - ) - - known_expectations: Dict[str, DataHubStdAssertion] = { - # column aggregate expectations - "expect_column_min_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.MIN, - parameters=get_min_max(kwargs), - ), - "expect_column_max_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.MAX, - parameters=get_min_max(kwargs), - ), - "expect_column_median_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.MEDIAN, - parameters=get_min_max(kwargs), - ), - 
"expect_column_stdev_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.STDDEV, - parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), - ), - "expect_column_mean_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.MEAN, - parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), - ), - "expect_column_unique_value_count_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.UNIQUE_COUNT, - parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), - ), - "expect_column_proportion_of_unique_values_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.UNIQUE_PROPOTION, - parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), - ), - "expect_column_sum_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.SUM, - parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), - ), - "expect_column_quantile_values_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation._NATIVE_, - ), - # column map expectations - "expect_column_values_to_not_be_null": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.NOT_NULL, - aggregation=AssertionStdAggregation.IDENTITY, - ), - "expect_column_values_to_be_in_set": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.IN, - 
aggregation=AssertionStdAggregation.IDENTITY, - parameters=AssertionStdParameters( - value=AssertionStdParameter( - value=convert_to_string(kwargs.get("value_set")), - type=AssertionStdParameterType.SET, - ) - ), - ), - "expect_column_values_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.IDENTITY, - parameters=get_min_max(kwargs), - ), - "expect_column_values_to_match_regex": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.REGEX_MATCH, - aggregation=AssertionStdAggregation.IDENTITY, - parameters=AssertionStdParameters( - value=AssertionStdParameter( - value=kwargs.get("regex"), - type=AssertionStdParameterType.STRING, - ) - ), - ), - "expect_column_values_to_match_regex_list": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_COLUMN, - operator=AssertionStdOperator.REGEX_MATCH, - aggregation=AssertionStdAggregation.IDENTITY, - parameters=AssertionStdParameters( - value=AssertionStdParameter( - value=convert_to_string(kwargs.get("regex_list")), - type=AssertionStdParameterType.LIST, - ) - ), - ), - "expect_table_columns_to_match_ordered_list": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_SCHEMA, - operator=AssertionStdOperator.EQUAL_TO, - aggregation=AssertionStdAggregation.COLUMNS, - parameters=AssertionStdParameters( - value=AssertionStdParameter( - value=convert_to_string(kwargs.get("column_list")), - type=AssertionStdParameterType.LIST, - ) - ), - ), - "expect_table_columns_to_match_set": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_SCHEMA, - operator=AssertionStdOperator.EQUAL_TO, - aggregation=AssertionStdAggregation.COLUMNS, - parameters=AssertionStdParameters( - value=AssertionStdParameter( - value=convert_to_string(kwargs.get("column_set")), - type=AssertionStdParameterType.SET, - ) - ), - ), - "expect_table_column_count_to_be_between": DataHubStdAssertion( 
- scope=DatasetAssertionScope.DATASET_SCHEMA, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.COLUMN_COUNT, - parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), - ), - "expect_table_column_count_to_equal": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_SCHEMA, - operator=AssertionStdOperator.EQUAL_TO, - aggregation=AssertionStdAggregation.COLUMN_COUNT, - parameters=AssertionStdParameters( - value=AssertionStdParameter( - value=convert_to_string(kwargs.get("value")), - type=AssertionStdParameterType.NUMBER, - ) - ), - ), - "expect_column_to_exist": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_SCHEMA, - operator=AssertionStdOperator._NATIVE_, - aggregation=AssertionStdAggregation._NATIVE_, - ), - "expect_table_row_count_to_equal": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_ROWS, - operator=AssertionStdOperator.EQUAL_TO, - aggregation=AssertionStdAggregation.ROW_COUNT, - parameters=AssertionStdParameters( - value=AssertionStdParameter( - value=convert_to_string(kwargs.get("value")), - type=AssertionStdParameterType.NUMBER, - ) - ), - ), - "expect_table_row_count_to_be_between": DataHubStdAssertion( - scope=DatasetAssertionScope.DATASET_ROWS, - operator=AssertionStdOperator.BETWEEN, - aggregation=AssertionStdAggregation.ROW_COUNT, - parameters=get_min_max(kwargs, AssertionStdParameterType.NUMBER), - ), - } - - datasetAssertionInfo = DatasetAssertionInfo( - dataset=dataset, - fields=fields, - operator=AssertionStdOperator._NATIVE_, - aggregation=AssertionStdAggregation._NATIVE_, - nativeType=expectation_type, - nativeParameters={k: convert_to_string(v) for k, v in kwargs.items()}, - scope=DatasetAssertionScope.DATASET_ROWS, - ) - - if expectation_type in known_expectations.keys(): - assertion = known_expectations[expectation_type] - datasetAssertionInfo.scope = assertion.scope - datasetAssertionInfo.aggregation = assertion.aggregation - datasetAssertionInfo.operator = 
assertion.operator - datasetAssertionInfo.parameters = assertion.parameters - - # Heuristically mapping other expectations - else: - if "column" in kwargs and expectation_type.startswith( - "expect_column_value" - ): - datasetAssertionInfo.scope = DatasetAssertionScope.DATASET_COLUMN - datasetAssertionInfo.aggregation = AssertionStdAggregation.IDENTITY - elif "column" in kwargs: - datasetAssertionInfo.scope = DatasetAssertionScope.DATASET_COLUMN - datasetAssertionInfo.aggregation = AssertionStdAggregation._NATIVE_ - - return AssertionInfo( - type=AssertionType.DATASET, - datasetAssertion=datasetAssertionInfo, - customProperties={"expectation_suite_name": expectation_suite_name}, - ) - - def get_dataset_partitions(self, batch_identifier, data_asset): - dataset_partitions = [] - - logger.debug("Finding datasets being validated") - - # for now, we support only v3-api and sqlalchemy execution engine and Pandas engine - is_sql_alchemy = isinstance(data_asset, Validator) and ( - isinstance(data_asset.execution_engine, SqlAlchemyExecutionEngine) - ) - is_pandas = isinstance(data_asset.execution_engine, PandasExecutionEngine) - if is_sql_alchemy or is_pandas: - ge_batch_spec = data_asset.active_batch_spec - partitionSpec = None - batchSpecProperties = { - "data_asset_name": str( - data_asset.active_batch_definition.data_asset_name - ), - "datasource_name": str( - data_asset.active_batch_definition.datasource_name - ), - } - sqlalchemy_uri = None - if is_sql_alchemy and isinstance( - data_asset.execution_engine.engine, Engine - ): - sqlalchemy_uri = data_asset.execution_engine.engine.url - # For snowflake sqlalchemy_execution_engine.engine is actually instance of Connection - elif is_sql_alchemy and isinstance( - data_asset.execution_engine.engine, Connection - ): - sqlalchemy_uri = data_asset.execution_engine.engine.engine.url - - if isinstance(ge_batch_spec, SqlAlchemyDatasourceBatchSpec): - # e.g. 
ConfiguredAssetSqlDataConnector with splitter_method or sampling_method - schema_name = ge_batch_spec.get("schema_name") - table_name = ge_batch_spec.get("table_name") - - dataset_urn = make_dataset_urn_from_sqlalchemy_uri( - sqlalchemy_uri, - schema_name, - table_name, - self.env, - self.get_platform_instance( - data_asset.active_batch_definition.datasource_name - ), - self.exclude_dbname, - self.platform_alias, - self.convert_urns_to_lowercase, - ) - batchSpec = BatchSpec( - nativeBatchId=batch_identifier, - customProperties=batchSpecProperties, - ) - - splitter_method = ge_batch_spec.get("splitter_method") - if ( - splitter_method is not None - and splitter_method != "_split_on_whole_table" - ): - batch_identifiers = ge_batch_spec.get("batch_identifiers", {}) - partitionSpec = PartitionSpecClass( - partition=convert_to_string(batch_identifiers) - ) - sampling_method = ge_batch_spec.get("sampling_method", "") - if sampling_method == "_sample_using_limit": - batchSpec.limit = ge_batch_spec["sampling_kwargs"]["n"] - - dataset_partitions.append( - { - "dataset_urn": dataset_urn, - "partitionSpec": partitionSpec, - "batchSpec": batchSpec, - } - ) - elif isinstance(ge_batch_spec, RuntimeQueryBatchSpec): - if not self.parse_table_names_from_sql: - warn( - "Enable parse_table_names_from_sql in DatahubValidationAction config\ - to try to parse the tables being asserted from SQL query" - ) - return [] - query = data_asset.batches[ - batch_identifier - ].batch_request.runtime_parameters["query"] - partitionSpec = PartitionSpecClass( - type=PartitionTypeClass.QUERY, - partition=f"Query_{builder.datahub_guid(pre_json_transform(query))}", - ) - - batchSpec = BatchSpec( - nativeBatchId=batch_identifier, - query=query, - customProperties=batchSpecProperties, - ) - try: - tables = DefaultSQLParser(query).get_tables() - except Exception as e: - logger.warning(f"Sql parser failed on {query} with {e}") - tables = [] - - if len(set(tables)) != 1: - warn( - "DataHubValidationAction 
does not support cross dataset assertions." - ) - return [] - for table in tables: - dataset_urn = make_dataset_urn_from_sqlalchemy_uri( - sqlalchemy_uri, - None, - table, - self.env, - self.get_platform_instance( - data_asset.active_batch_definition.datasource_name - ), - self.exclude_dbname, - self.platform_alias, - self.convert_urns_to_lowercase, - ) - dataset_partitions.append( - { - "dataset_urn": dataset_urn, - "partitionSpec": partitionSpec, - "batchSpec": batchSpec, - } - ) - elif isinstance(ge_batch_spec, RuntimeDataBatchSpec): - data_platform = self.get_platform_instance( - data_asset.active_batch_definition.datasource_name - ) - dataset_urn = builder.make_dataset_urn_with_platform_instance( - platform=data_platform - if self.platform_alias is None - else self.platform_alias, - name=data_asset.active_batch_definition.datasource_name, - platform_instance="", - env=self.env, - ) - batchSpec = BatchSpec( - nativeBatchId=batch_identifier, - query="", - customProperties=batchSpecProperties, - ) - dataset_partitions.append( - { - "dataset_urn": dataset_urn, - "partitionSpec": partitionSpec, - "batchSpec": batchSpec, - } - ) - else: - warn( - "DataHubValidationAction does not recognize this GE batch spec type- {batch_spec_type}.".format( - batch_spec_type=type(ge_batch_spec) - ) - ) - else: - # TODO - v2-spec - SqlAlchemyDataset support - warn( - "DataHubValidationAction does not recognize this GE data asset type - {asset_type}. 
This is either using v2-api or execution engine other than sqlalchemy.".format( - asset_type=type(data_asset) - ) - ) - - return dataset_partitions - - def get_platform_instance(self, datasource_name): - if self.platform_instance_map and datasource_name in self.platform_instance_map: - return self.platform_instance_map[datasource_name] - else: - warn( - f"Datasource {datasource_name} is not present in platform_instance_map" - ) - return None - - -def parse_int_or_default(value, default_value=None): - if value is None: - return default_value - else: - return int(value) - - -def make_dataset_urn_from_sqlalchemy_uri( - sqlalchemy_uri, - schema_name, - table_name, - env, - platform_instance=None, - exclude_dbname=None, - platform_alias=None, - convert_urns_to_lowercase=False, -): - data_platform = get_platform_from_sqlalchemy_uri(str(sqlalchemy_uri)) - url_instance = make_url(sqlalchemy_uri) - - if schema_name is None and "." in table_name: - schema_name, table_name = table_name.split(".")[-2:] - - if data_platform in ["redshift", "postgres"]: - schema_name = schema_name or "public" - if url_instance.database is None: - warn( - f"DataHubValidationAction failed to locate database name for {data_platform}." - ) - return None - schema_name = ( - schema_name if exclude_dbname else f"{url_instance.database}.{schema_name}" - ) - elif data_platform == "mssql": - schema_name = schema_name or "dbo" - if url_instance.database is None: - warn( - f"DataHubValidationAction failed to locate database name for {data_platform}." 
- ) - return None - schema_name = ( - schema_name if exclude_dbname else f"{url_instance.database}.{schema_name}" - ) - elif data_platform in ["trino", "snowflake"]: - if schema_name is None or url_instance.database is None: - warn( - "DataHubValidationAction failed to locate schema name and/or database name for {data_platform}.".format( - data_platform=data_platform - ) - ) - return None - # If data platform is snowflake, we artificially lowercase the Database name. - # This is because DataHub also does this during ingestion. - # Ref: https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py#L155 - database_name = ( - url_instance.database.lower() - if data_platform == "snowflake" - else url_instance.database - ) - if database_name.endswith(f"/{schema_name}"): - database_name = database_name[: -len(f"/{schema_name}")] - schema_name = ( - schema_name if exclude_dbname else f"{database_name}.{schema_name}" - ) - - elif data_platform == "bigquery": - if url_instance.host is None or url_instance.database is None: - warn( - "DataHubValidationAction failed to locate host and/or database name for {data_platform}. ".format( - data_platform=data_platform - ) - ) - return None - schema_name = f"{url_instance.host}.{url_instance.database}" - - schema_name = schema_name or url_instance.database - if schema_name is None: - warn( - f"DataHubValidationAction failed to locate schema name for {data_platform}." 
- ) - return None - - dataset_name = f"{schema_name}.{table_name}" - - if convert_urns_to_lowercase: - dataset_name = dataset_name.lower() - - dataset_urn = builder.make_dataset_urn_with_platform_instance( - platform=data_platform if platform_alias is None else platform_alias, - name=dataset_name, - platform_instance=platform_instance, - env=env, - ) - - return dataset_urn - - -@dataclass -class DataHubStdAssertion: - scope: Union[str, DatasetAssertionScope] - operator: Union[str, AssertionStdOperator] - aggregation: Union[str, AssertionStdAggregation] - parameters: Optional[AssertionStdParameters] = None - - -class DecimalEncoder(json.JSONEncoder): - def default(self, o): - if isinstance(o, Decimal): - return str(o) - return super().default(o) - - -def convert_to_string(var: Any) -> str: - try: - tmp = ( - str(var) - if isinstance(var, (str, int, float)) - else json.dumps(var, cls=DecimalEncoder) - ) - except TypeError as e: - logger.debug(e) - tmp = str(var) - return tmp - - -def warn(msg): - logger.warning(msg) +__all__ = ["DataHubValidationAction"] diff --git a/metadata-ingestion/src/datahub/testing/docker_utils.py b/metadata-ingestion/src/datahub/testing/docker_utils.py new file mode 100644 index 00000000000000..7c1c0304f480e6 --- /dev/null +++ b/metadata-ingestion/src/datahub/testing/docker_utils.py @@ -0,0 +1,70 @@ +import contextlib +import logging +import subprocess +from typing import Callable, Iterator, List, Optional, Union + +import pytest +import pytest_docker.plugin + +logger = logging.getLogger(__name__) + + +def is_responsive(container_name: str, port: int, hostname: Optional[str]) -> bool: + """A cheap way to figure out if a port is responsive on a container""" + if hostname: + cmd = f"docker exec {container_name} /bin/bash -c 'echo -n > /dev/tcp/{hostname}/{port}'" + else: + # use the hostname of the container + cmd = f"docker exec {container_name} /bin/bash -c 'c_host=`hostname`;echo -n > /dev/tcp/$c_host/{port}'" + ret = subprocess.run( + cmd, 
+ shell=True, + ) + return ret.returncode == 0 + + +def wait_for_port( + docker_services: pytest_docker.plugin.Services, + container_name: str, + container_port: int, + hostname: Optional[str] = None, + timeout: float = 30.0, + pause: float = 0.5, + checker: Optional[Callable[[], bool]] = None, +) -> None: + try: + docker_services.wait_until_responsive( + timeout=timeout, + pause=pause, + check=( + checker + if checker + else lambda: is_responsive(container_name, container_port, hostname) + ), + ) + logger.info(f"Container {container_name} is ready!") + finally: + # use check=True to raise an error if command gave bad exit code + subprocess.run(f"docker logs {container_name}", shell=True, check=True) + + +@pytest.fixture(scope="module") +def docker_compose_runner( + docker_compose_command, docker_compose_project_name, docker_setup, docker_cleanup +): + @contextlib.contextmanager + def run( + compose_file_path: Union[str, List[str]], key: str, cleanup: bool = True + ) -> Iterator[pytest_docker.plugin.Services]: + with pytest_docker.plugin.get_docker_services( + docker_compose_command=docker_compose_command, + # We can remove the type ignore once this is merged: + # https://github.com/avast/pytest-docker/pull/108 + docker_compose_file=compose_file_path, # type: ignore + docker_compose_project_name=f"{docker_compose_project_name}-{key}", + docker_setup=docker_setup, + docker_cleanup=docker_cleanup if cleanup else [], + ) as docker_services: + yield docker_services + + return run diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json new file mode 100644 index 00000000000000..a529ddc6221a7a --- /dev/null +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json @@ -0,0 +1,452 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e", + "changeType": 
"UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "bigquery", + "env": "PROD", + "project_id": "dev" + }, + "name": "dev" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ce17940c2d64e7e315e68f8d7d071b1e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { 
+ "platform": "bigquery", + "env": "PROD", + "project_id": "dev", + "dataset_id": "bigquery-dataset-1" + }, + "externalUrl": "https://console.cloud.google.com/bigquery?project=dev&ws=!1m4!1m3!3m2!1sdev!2sbigquery-dataset-1", + "name": "bigquery-dataset-1" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ce17940c2d64e7e315e68f8d7d071b1e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ce17940c2d64e7e315e68f8d7d071b1e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ce17940c2d64e7e315e68f8d7d071b1e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ce17940c2d64e7e315e68f8d7d071b1e", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:ce17940c2d64e7e315e68f8d7d071b1e", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e", + "urn": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,dev.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,dev.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "dev.bigquery-dataset-1.table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Age" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + 
"com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Email_Address" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,dev.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=dev&ws=!1m5!1m4!4m3!1sdev!2sbigquery-dataset-1!3stable-1", + "name": "table-1", + "qualifiedName": "dev.bigquery-dataset-1.table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,dev.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ce17940c2d64e7e315e68f8d7d071b1e" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,dev.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,dev)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": 
"bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,dev.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,dev.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e", + "urn": "urn:li:container:f284164f9a7db03ca6bbdb7bb17d5a7e" + }, + { + "id": "urn:li:container:ce17940c2d64e7e315e68f8d7d071b1e", + "urn": "urn:li:container:ce17940c2d64e7e315e68f8d7d071b1e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:Age", + "changeType": "UPSERT", + "aspectName": "glossaryTermKey", + "aspect": { + "json": { + "name": "Age" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:Email_Address", + "changeType": "UPSERT", + "aspectName": "glossaryTermKey", + "aspect": { + "json": { + "name": "Email_Address" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Test Policy Tag", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Test Policy Tag" + } + }, + "systemMetadata": 
{ + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 762c73d2a55c60..dff7f18db6135c 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -15,6 +15,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, BigqueryDataset, + BigqueryProject, BigQuerySchemaApi, BigqueryTable, ) @@ -39,6 +40,33 @@ def random_email(): ) +def recipe(mcp_output_path: str, override: dict = {}) -> dict: + return { + "source": { + "type": "bigquery", + "config": { + "project_ids": ["project-id-1"], + "include_usage_statistics": False, + "include_table_lineage": False, + "include_data_platform_instance": True, + "classification": ClassificationConfig( + enabled=True, + classifiers=[ + DynamicTypedClassifierConfig( + type="datahub", + config=DataHubClassifierConfig( + minimum_values_threshold=1, + ), + ) + ], + max_workers=1, + ).dict(), + }, + }, + "sink": {"type": "file", "config": {"filename": mcp_output_path}}, + } + + @freeze_time(FROZEN_TIME) @patch.object(BigQuerySchemaApi, "get_tables_for_dataset") @patch.object(BigQuerySchemaGenerator, "get_core_table_details") @@ -47,9 +75,11 @@ def random_email(): @patch.object(BigQueryDataReader, "get_sample_data_for_table") @patch("google.cloud.bigquery.Client") @patch("google.cloud.datacatalog_v1.PolicyTagManagerClient") +@patch("google.cloud.resourcemanager_v3.ProjectsClient") def test_bigquery_v2_ingest( client, policy_tag_manager_client, + projects_client, get_sample_data_for_table, get_columns_for_dataset, get_datasets_for_project_id, @@ -111,33 +141,105 @@ def test_bigquery_v2_ingest( ) get_tables_for_dataset.return_value = iter([bigquery_table]) - 
source_config_dict: Dict[str, Any] = { - "project_ids": ["project-id-1"], - "include_usage_statistics": False, - "include_table_lineage": False, - "include_data_platform_instance": True, - "classification": ClassificationConfig( - enabled=True, - classifiers=[ - DynamicTypedClassifierConfig( - type="datahub", - config=DataHubClassifierConfig( - minimum_values_threshold=1, - ), - ) - ], - max_workers=1, - ).dict(), - } + pipeline_config_dict: Dict[str, Any] = recipe(mcp_output_path=mcp_output_path) - pipeline_config_dict: Dict[str, Any] = { - "source": { - "type": "bigquery", - "config": source_config_dict, - }, - "sink": {"type": "file", "config": {"filename": mcp_output_path}}, + run_and_get_pipeline(pipeline_config_dict) + + mce_helpers.check_golden_file( + pytestconfig, + output_path=mcp_output_path, + golden_path=mcp_golden_path, + ) + + +@freeze_time(FROZEN_TIME) +@patch.object(BigQuerySchemaApi, attribute="get_projects_with_labels") +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigQuerySchemaGenerator, "get_core_table_details") +@patch.object(BigQuerySchemaApi, "get_datasets_for_project_id") +@patch.object(BigQuerySchemaApi, "get_columns_for_dataset") +@patch.object(BigQueryDataReader, "get_sample_data_for_table") +@patch("google.cloud.bigquery.Client") +@patch("google.cloud.datacatalog_v1.PolicyTagManagerClient") +@patch("google.cloud.resourcemanager_v3.ProjectsClient") +def test_bigquery_v2_project_labels_ingest( + client, + policy_tag_manager_client, + projects_client, + get_sample_data_for_table, + get_columns_for_dataset, + get_datasets_for_project_id, + get_core_table_details, + get_tables_for_dataset, + get_projects_with_labels, + pytestconfig, + tmp_path, +): + test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2" + mcp_golden_path = f"{test_resources_dir}/bigquery_project_label_mcp_golden.json" + mcp_output_path = "{}/{}".format(tmp_path, "bigquery_project_label_mcp_output.json") + + 
get_datasets_for_project_id.return_value = [ + BigqueryDataset(name="bigquery-dataset-1") + ] + + get_projects_with_labels.return_value = [ + BigqueryProject(id="dev", name="development") + ] + + table_list_item = TableListItem( + {"tableReference": {"projectId": "", "datasetId": "", "tableId": ""}} + ) + table_name = "table-1" + get_core_table_details.return_value = {table_name: table_list_item} + get_columns_for_dataset.return_value = { + table_name: [ + BigqueryColumn( + name="age", + ordinal_position=1, + is_nullable=False, + field_path="col_1", + data_type="INT", + comment="comment", + is_partition_column=False, + cluster_column_position=None, + policy_tags=["Test Policy Tag"], + ), + BigqueryColumn( + name="email", + ordinal_position=1, + is_nullable=False, + field_path="col_2", + data_type="STRING", + comment="comment", + is_partition_column=False, + cluster_column_position=None, + ), + ] + } + get_sample_data_for_table.return_value = { + "age": [random.randint(1, 80) for i in range(20)], + "email": [random_email() for i in range(20)], } + bigquery_table = BigqueryTable( + name=table_name, + comment=None, + created=None, + last_altered=None, + size_in_bytes=None, + rows_count=None, + ) + get_tables_for_dataset.return_value = iter([bigquery_table]) + + pipeline_config_dict: Dict[str, Any] = recipe(mcp_output_path=mcp_output_path) + + del pipeline_config_dict["source"]["config"]["project_ids"] + + pipeline_config_dict["source"]["config"]["project_labels"] = [ + "environment:development" + ] + run_and_get_pipeline(pipeline_config_dict) mce_helpers.check_golden_file( diff --git a/metadata-ingestion/tests/integration/lookml/drop_hive_dot/data.model.lkml b/metadata-ingestion/tests/integration/lookml/drop_hive_dot/data.model.lkml new file mode 100644 index 00000000000000..95391f6a73e635 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/drop_hive_dot/data.model.lkml @@ -0,0 +1,6 @@ +connection: "my_connection" + +include: 
"top_10_employee_income_source.view.lkml" + +explore: top_10_employee_income_source { +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/drop_hive_dot/top_10_employee_income_source.view.lkml b/metadata-ingestion/tests/integration/lookml/drop_hive_dot/top_10_employee_income_source.view.lkml new file mode 100644 index 00000000000000..149ce9219b54b8 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/drop_hive_dot/top_10_employee_income_source.view.lkml @@ -0,0 +1,26 @@ +view: top_10_employee_income_source { + derived_table: { + sql: SELECT id, + name, + source + FROM hive.employee_db.income_source + ORDER BY source desc + LIMIT 10 + ;; + } + + dimension: id { + type: number + sql: ${TABLE}.id ;; + } + + dimension: name { + type: string + sql: ${TABLE}.name ;; + } + + dimension: source { + type: string + sql: ${TABLE}.source ;; + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/drop_hive_dot_golden.json b/metadata-ingestion/tests/integration/lookml/drop_hive_dot_golden.json new file mode 100644 index 00000000000000..e1dad2e91b7353 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/drop_hive_dot_golden.json @@ -0,0 +1,357 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "looker", + "env": "PROD", + "project_name": "lkml_samples" + }, + "name": "lkml_samples" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "LookML Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Folders" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT id,\n name,\n source\n FROM hive.employee_db.income_source\n ORDER BY source desc\n LIMIT 10", + "viewLanguage": "sql" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,employee_db.income_source,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,employee_db.income_source,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,employee_db.income_source,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,employee_db.income_source,PROD),source)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD),source)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "top_10_employee_income_source", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "source", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "top_10_employee_income_source.view.lkml", + "looker.model": "data" + }, + "name": 
"top_10_employee_income_source", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Dimension", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Dimension" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json index 70f48953a06adb..c5b1d44772deab 100644 --- a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json @@ -485,9 +485,195 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_refinement_sample1.model_1.view.extend_book,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + 
"upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_refinement_sample1.model_1.view.extend_book,PROD),date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_refinement_sample1.model_1.view.extend_book,PROD),issue_date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_refinement_sample1.model_1.view.extend_book,PROD),issue_date_3)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD),count)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_refinement_sample1.model_1.view.extend_book,PROD),count)" + ], + "confidenceScore": 1.0 + } ] } }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "extend_book", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "name", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": 
{} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "date", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "issue_date", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "issue_date_3", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "count", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "count", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Measure" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 9e051995d0b940..a5d838cb16d73a 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -1032,3 +1032,30 @@ def test_field_tag_ingest(pytestconfig, tmp_path, 
mock_time): output_path=tmp_path / mce_out_file, golden_path=golden_path, ) + + +@freeze_time(FROZEN_TIME) +def test_drop_hive(pytestconfig, tmp_path, mock_time): + test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" + mce_out_file = "drop_hive_dot.json" + + new_recipe = get_default_recipe( + f"{tmp_path}/{mce_out_file}", + f"{test_resources_dir}/drop_hive_dot", + ) + + new_recipe["source"]["config"]["connection_to_platform_map"] = { + "my_connection": "hive" + } + + pipeline = Pipeline.create(new_recipe) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status(raise_warnings=True) + + golden_path = test_resources_dir / "drop_hive_dot_golden.json" + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / mce_out_file, + golden_path=golden_path, + ) diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/child_view.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/child_view.view.lkml new file mode 100644 index 00000000000000..5d8b51527b0fe7 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/child_view.view.lkml @@ -0,0 +1,16 @@ +include: "parent_view.view.lkml" + +view: child_view { + extends: [parent_view] + + dimension: id { + primary_key: yes + type: integer + sql: ${TABLE}.id ;; + } + + dimension: child_dimension_1 { + type: string + sql: ${TABLE}.child_dimension_1 ;; + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml index a87381dd0bf759..d570e0ecdb5b22 100644 --- a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml @@ -9,6 +9,7 @@ include: 
"employee_salary_rating.view.lkml" include: "environment_activity_logs.view.lkml" include: "employee_income_source_as_per_env.view.lkml" include: "rent_as_employee_income_source.view.lkml" +include: "child_view.view.lkml" explore: activity_logs { } @@ -35,4 +36,7 @@ explore: employee_income_source_as_per_env { } explore: rent_as_employee_income_source { +} + +explore: child_view { } \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/parent_view.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/parent_view.view.lkml new file mode 100644 index 00000000000000..c2f18924351c29 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/parent_view.view.lkml @@ -0,0 +1,18 @@ +view: parent_view { + sql_table_name: `dataset.table` ;; + + dimension: id { + primary_key: yes + type: string + sql: ${TABLE}.id ;; + } + + dimension: parent_dimension_1 { + type: string + sql: ${TABLE}.parent_dimension_1 ;; + } + + measure: parent_count { + type: count + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json b/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json index b723aff080bc44..dd6917c112579e 100644 --- a/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json +++ b/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json @@ -2294,6 +2294,538 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.parent_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + 
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.parent_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "view: parent_view {\n sql_table_name: `dataset.table` ;;\n\n dimension: id {\n primary_key: yes\n type: string\n sql: ${TABLE}.id ;;\n }\n\n dimension: parent_dimension_1 {\n type: string\n sql: ${TABLE}.parent_dimension_1 ;;\n }\n\n measure: parent_count {\n type: count\n }\n}", + "viewLanguage": "lookml" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.parent_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.parent_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.dataset.table,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.dataset.table,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.parent_view,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.dataset.table,PROD),parent_dimension_1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.parent_view,PROD),parent_dimension_1)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.dataset.table,PROD),parent_count)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.parent_view,PROD),parent_count)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "parent_view", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": true + }, + { + "fieldPath": "parent_dimension_1", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": 
"parent_count", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "count", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Measure" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [ + "id" + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "parent_view.view.lkml", + "looker.model": "data" + }, + "name": "parent_view", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.parent_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.child_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.child_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "include: \"parent_view.view.lkml\"\n\nview: child_view {\n extends: [parent_view]\n\n dimension: id {\n primary_key: yes\n type: integer\n sql: 
${TABLE}.id ;;\n }\n\n dimension: child_dimension_1 {\n type: string\n sql: ${TABLE}.child_dimension_1 ;;\n }\n}", + "viewLanguage": "lookml" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.child_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.child_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.dataset.table,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.dataset.table,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.child_view,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.dataset.table,PROD),child_dimension_1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.child_view,PROD),child_dimension_1)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.dataset.table,PROD),parent_dimension_1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.child_view,PROD),parent_dimension_1)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.dataset.table,PROD),parent_count)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.child_view,PROD),parent_count)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "child_view", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": true + }, + { + "fieldPath": "child_dimension_1", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + 
"fieldPath": "parent_dimension_1", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "parent_count", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "count", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Measure" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [ + "id" + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "child_view.view.lkml", + "looker.model": "data" + }, + "name": "child_view", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.child_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/mongodb/mongodb_mces_golden.json b/metadata-ingestion/tests/integration/mongodb/mongodb_mces_golden.json index ec3fd80e6a6ea4..bba160984eed87 100644 --- a/metadata-ingestion/tests/integration/mongodb/mongodb_mces_golden.json +++ b/metadata-ingestion/tests/integration/mongodb/mongodb_mces_golden.json @@ -1,4 
+1,59 @@ [ +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mongodb", + "instance": "instance", + "env": "PROD", + "database": "mngdb" + }, + "name": "mngdb" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mongodb", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", @@ -47,6 +102,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", @@ -64,6 +135,70 @@ 
"lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + }, + { + "id": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "urn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", @@ -377,6 +512,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": 
"urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", @@ -4026,6 +4177,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", @@ -4046,6 +4213,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + }, + { + "id": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "urn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", @@ -4063,6 +4255,47 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", + "changeType": "UPSERT", + "aspectName": 
"browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + }, + { + "id": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "urn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", @@ -4237,6 +4470,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + }, + { + "id": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "urn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", diff --git a/metadata-ingestion/tests/integration/mongodb/mongodb_mces_small_schema_size_golden.json 
b/metadata-ingestion/tests/integration/mongodb/mongodb_mces_small_schema_size_golden.json index 72b5fee49a0dbd..b2a1ba03dab768 100644 --- a/metadata-ingestion/tests/integration/mongodb/mongodb_mces_small_schema_size_golden.json +++ b/metadata-ingestion/tests/integration/mongodb/mongodb_mces_small_schema_size_golden.json @@ -1,4 +1,59 @@ [ +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mongodb", + "instance": "instance", + "env": "PROD", + "database": "mngdb" + }, + "name": "mngdb" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mongodb", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", @@ -47,6 +102,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", + "changeType": "UPSERT", + "aspectName": 
"container", + "aspect": { + "json": { + "container": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", @@ -64,6 +135,70 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + }, + { + "id": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "urn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + 
} +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", @@ -236,6 +371,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", @@ -405,6 +556,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", @@ -425,6 +592,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + }, + { + "id": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "urn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 
1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", @@ -442,6 +634,47 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + }, + { + "id": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "urn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", @@ -616,6 +849,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + }, + { + "id": 
"urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed", + "urn": "urn:li:container:f5ff6ace1ed73cb3fd4c73dc718c39ed" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test-small-schema-size", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", diff --git a/metadata-ingestion/tests/test_helpers/docker_helpers.py b/metadata-ingestion/tests/test_helpers/docker_helpers.py index bacb8d80b9e721..20aec975787e4e 100644 --- a/metadata-ingestion/tests/test_helpers/docker_helpers.py +++ b/metadata-ingestion/tests/test_helpers/docker_helpers.py @@ -1,52 +1,16 @@ -import contextlib import logging import os import subprocess -from typing import Callable, Iterator, List, Optional, Union import pytest -import pytest_docker.plugin -logger = logging.getLogger(__name__) - - -def is_responsive(container_name: str, port: int, hostname: Optional[str]) -> bool: - """A cheap way to figure out if a port is responsive on a container""" - if hostname: - cmd = f"docker exec {container_name} /bin/bash -c 'echo -n > /dev/tcp/{hostname}/{port}'" - else: - # use the hostname of the container - cmd = f"docker exec {container_name} /bin/bash -c 'c_host=`hostname`;echo -n > /dev/tcp/$c_host/{port}'" - ret = subprocess.run( - cmd, - shell=True, - ) - return ret.returncode == 0 +from datahub.testing.docker_utils import ( # noqa: F401 + docker_compose_runner, + is_responsive, + wait_for_port, +) - -def wait_for_port( - docker_services: pytest_docker.plugin.Services, - container_name: str, - container_port: int, - hostname: Optional[str] = None, - timeout: float = 30.0, - pause: float = 0.5, - checker: Optional[Callable[[], bool]] = None, -) -> None: - try: - docker_services.wait_until_responsive( - timeout=timeout, - pause=pause, - check=( - checker - if checker - else lambda: is_responsive(container_name, container_port, hostname) - ), - ) - 
logger.info(f"Container {container_name} is ready!") - finally: - # use check=True to raise an error if command gave bad exit code - subprocess.run(f"docker logs {container_name}", shell=True, check=True) +logger = logging.getLogger(__name__) @pytest.fixture(scope="session") @@ -58,28 +22,6 @@ def docker_compose_command(): return "docker compose" -@pytest.fixture(scope="module") -def docker_compose_runner( - docker_compose_command, docker_compose_project_name, docker_setup, docker_cleanup -): - @contextlib.contextmanager - def run( - compose_file_path: Union[str, List[str]], key: str, cleanup: bool = True - ) -> Iterator[pytest_docker.plugin.Services]: - with pytest_docker.plugin.get_docker_services( - docker_compose_command=docker_compose_command, - # We can remove the type ignore once this is merged: - # https://github.com/avast/pytest-docker/pull/108 - docker_compose_file=compose_file_path, # type: ignore - docker_compose_project_name=f"{docker_compose_project_name}-{key}", - docker_setup=docker_setup, - docker_cleanup=docker_cleanup if cleanup else [], - ) as docker_services: - yield docker_services - - return run - - def cleanup_image(image_name: str) -> None: assert ":" not in image_name, "image_name should not contain a tag" diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 746cf9b0acfc3e..d12ffbcbbcf10b 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -170,7 +170,11 @@ def test_bigquery_uri_with_credential(): @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_get_projects_with_project_ids(get_bq_client_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_get_projects_with_project_ids( + get_projects_client, + get_bq_client_mock, +): client_mock = MagicMock() get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj( @@ -197,8 +201,10 @@ def 
test_get_projects_with_project_ids(get_bq_client_mock): @patch.object(BigQueryV2Config, "get_bigquery_client") +@patch.object(BigQueryV2Config, "get_projects_client") def test_get_projects_with_project_ids_overrides_project_id_pattern( - get_bq_client_mock, + get_projects_client, + get_bigquery_client, ): config = BigQueryV2Config.parse_obj( { @@ -226,7 +232,11 @@ def test_platform_instance_config_always_none(): @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_get_dataplatform_instance_aspect_returns_project_id(get_bq_client_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_get_dataplatform_instance_aspect_returns_project_id( + get_projects_client, + get_bq_client_mock, +): project_id = "project_id" expected_instance = ( f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" @@ -247,7 +257,11 @@ def test_get_dataplatform_instance_aspect_returns_project_id(get_bq_client_mock) @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_get_dataplatform_instance_default_no_instance(get_bq_client_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_get_dataplatform_instance_default_no_instance( + get_projects_client, + get_bq_client_mock, +): config = BigQueryV2Config.parse_obj({}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) schema_gen = source.bq_schema_extractor @@ -263,7 +277,11 @@ def test_get_dataplatform_instance_default_no_instance(get_bq_client_mock): @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_get_projects_with_single_project_id(get_bq_client_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_get_projects_with_single_project_id( + get_projects_client, + get_bq_client_mock, +): client_mock = MagicMock() get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) @@ -275,9 +293,10 @@ def test_get_projects_with_single_project_id(get_bq_client_mock): 
@patch.object(BigQueryV2Config, "get_bigquery_client") -def test_get_projects_by_list(get_bq_client_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_get_projects_by_list(get_projects_client, get_bigquery_client): client_mock = MagicMock() - get_bq_client_mock.return_value = client_mock + get_bigquery_client.return_value = client_mock first_page = MagicMock() first_page.__iter__.return_value = iter( @@ -296,6 +315,7 @@ def test_get_projects_by_list(get_bq_client_mock): ] ) second_page.next_page_token = None + client_mock.list_projects.side_effect = [first_page, second_page] config = BigQueryV2Config.parse_obj({}) @@ -311,7 +331,10 @@ def test_get_projects_by_list(get_bq_client_mock): @patch.object(BigQuerySchemaApi, "get_projects") @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_get_projects_filter_by_pattern(get_bq_client_mock, get_projects_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_get_projects_filter_by_pattern( + get_projects_client, get_bq_client_mock, get_projects_mock +): get_projects_mock.return_value = [ BigqueryProject("test-project", "Test Project"), BigqueryProject("test-project-2", "Test Project 2"), @@ -329,7 +352,10 @@ def test_get_projects_filter_by_pattern(get_bq_client_mock, get_projects_mock): @patch.object(BigQuerySchemaApi, "get_projects") @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_get_projects_list_empty(get_bq_client_mock, get_projects_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_get_projects_list_empty( + get_projects_client, get_bq_client_mock, get_projects_mock +): get_projects_mock.return_value = [] config = BigQueryV2Config.parse_obj( @@ -342,7 +368,9 @@ def test_get_projects_list_empty(get_bq_client_mock, get_projects_mock): @patch.object(BigQueryV2Config, "get_bigquery_client") +@patch.object(BigQueryV2Config, "get_projects_client") def test_get_projects_list_failure( + get_projects_client: MagicMock, 
get_bq_client_mock: MagicMock, caplog: pytest.LogCaptureFixture, ) -> None: @@ -366,7 +394,10 @@ def test_get_projects_list_failure( @patch.object(BigQuerySchemaApi, "get_projects") @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_get_projects_list_fully_filtered(get_projects_mock, get_bq_client_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_get_projects_list_fully_filtered( + get_projects_mock, get_bq_client_mock, get_projects_client +): get_projects_mock.return_value = [BigqueryProject("test-project", "Test Project")] config = BigQueryV2Config.parse_obj( @@ -399,7 +430,10 @@ def bigquery_table() -> BigqueryTable: @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_gen_table_dataset_workunits(get_bq_client_mock, bigquery_table): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_gen_table_dataset_workunits( + get_projects_client, get_bq_client_mock, bigquery_table +): project_id = "test-project" dataset_name = "test-dataset" config = BigQueryV2Config.parse_obj( @@ -471,7 +505,8 @@ def test_gen_table_dataset_workunits(get_bq_client_mock, bigquery_table): @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_simple_upstream_table_generation(get_bq_client_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_simple_upstream_table_generation(get_bq_client_mock, get_projects_client): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -503,8 +538,10 @@ def test_simple_upstream_table_generation(get_bq_client_mock): @patch.object(BigQueryV2Config, "get_bigquery_client") +@patch.object(BigQueryV2Config, "get_projects_client") def test_upstream_table_generation_with_temporary_table_without_temp_upstream( get_bq_client_mock, + get_projects_client, ): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( @@ -536,7 +573,10 @@ def 
test_upstream_table_generation_with_temporary_table_without_temp_upstream( @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_upstream_table_column_lineage_with_temp_table(get_bq_client_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_upstream_table_column_lineage_with_temp_table( + get_bq_client_mock, get_projects_client +): from datahub.ingestion.api.common import PipelineContext a: BigQueryTableRef = BigQueryTableRef( @@ -611,8 +651,9 @@ def test_upstream_table_column_lineage_with_temp_table(get_bq_client_mock): @patch.object(BigQueryV2Config, "get_bigquery_client") +@patch.object(BigQueryV2Config, "get_projects_client") def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream( - get_bq_client_mock, + get_bq_client_mock, get_projects_client ): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( @@ -675,7 +716,10 @@ def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstr @patch.object(BigQuerySchemaApi, "get_tables_for_dataset") @patch.object(BigQueryV2Config, "get_bigquery_client") -def test_table_processing_logic(get_bq_client_mock, data_dictionary_mock): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_table_processing_logic( + get_projects_client, get_bq_client_mock, data_dictionary_mock +): client_mock = MagicMock() get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj( @@ -747,8 +791,9 @@ def test_table_processing_logic(get_bq_client_mock, data_dictionary_mock): @patch.object(BigQuerySchemaApi, "get_tables_for_dataset") @patch.object(BigQueryV2Config, "get_bigquery_client") +@patch.object(BigQueryV2Config, "get_projects_client") def test_table_processing_logic_date_named_tables( - get_bq_client_mock, data_dictionary_mock + get_projects_client, get_bq_client_mock, data_dictionary_mock ): client_mock = MagicMock() get_bq_client_mock.return_value = client_mock @@ -859,8 +904,10 @@ def bigquery_view_2() -> 
BigqueryView: @patch.object(BigQuerySchemaApi, "get_query_result") @patch.object(BigQueryV2Config, "get_bigquery_client") +@patch.object(BigQueryV2Config, "get_projects_client") def test_get_views_for_dataset( get_bq_client_mock: Mock, + get_projects_client: MagicMock, query_mock: Mock, bigquery_view_1: BigqueryView, bigquery_view_2: BigqueryView, @@ -889,7 +936,9 @@ def test_get_views_for_dataset( ) query_mock.return_value = [row1, row2] bigquery_data_dictionary = BigQuerySchemaApi( - BigQueryV2Report().schema_api_perf, client_mock + report=BigQueryV2Report().schema_api_perf, + client=client_mock, + projects_client=MagicMock(), ) views = bigquery_data_dictionary.get_views_for_dataset( @@ -905,8 +954,9 @@ def test_get_views_for_dataset( BigQuerySchemaGenerator, "gen_dataset_workunits", lambda *args, **kwargs: [] ) @patch.object(BigQueryV2Config, "get_bigquery_client") +@patch.object(BigQueryV2Config, "get_projects_client") def test_gen_view_dataset_workunits( - get_bq_client_mock, bigquery_view_1, bigquery_view_2 + get_projects_client, get_bq_client_mock, bigquery_view_1, bigquery_view_2 ): project_id = "test-project" dataset_name = "test-dataset" @@ -963,7 +1013,9 @@ def bigquery_snapshot() -> BigqueryTableSnapshot: @patch.object(BigQuerySchemaApi, "get_query_result") @patch.object(BigQueryV2Config, "get_bigquery_client") +@patch.object(BigQueryV2Config, "get_projects_client") def test_get_snapshots_for_dataset( + get_projects_client: MagicMock, get_bq_client_mock: Mock, query_mock: Mock, bigquery_snapshot: BigqueryTableSnapshot, @@ -988,7 +1040,9 @@ def test_get_snapshots_for_dataset( ) query_mock.return_value = [row1] bigquery_data_dictionary = BigQuerySchemaApi( - BigQueryV2Report().schema_api_perf, client_mock + report=BigQueryV2Report().schema_api_perf, + client=client_mock, + projects_client=MagicMock(), ) snapshots = bigquery_data_dictionary.get_snapshots_for_dataset( @@ -1001,7 +1055,10 @@ def test_get_snapshots_for_dataset( @patch.object(BigQueryV2Config, 
"get_bigquery_client") -def test_gen_snapshot_dataset_workunits(get_bq_client_mock, bigquery_snapshot): +@patch.object(BigQueryV2Config, "get_projects_client") +def test_gen_snapshot_dataset_workunits( + get_bq_client_mock, get_projects_client, bigquery_snapshot +): project_id = "test-project" dataset_name = "test-dataset" config = BigQueryV2Config.parse_obj( @@ -1140,7 +1197,9 @@ def test_default_config_for_excluding_projects_and_datasets(): @patch.object(BigQueryConnectionConfig, "get_bigquery_client", new=lambda self: None) @patch.object(BigQuerySchemaApi, "get_datasets_for_project_id") +@patch.object(BigQueryV2Config, "get_projects_client") def test_excluding_empty_projects_from_ingestion( + get_projects_client, get_datasets_for_project_id_mock, ): project_id_with_datasets = "project-id-with-datasets" @@ -1173,3 +1232,32 @@ def get_datasets_for_project_id_side_effect( config = BigQueryV2Config.parse_obj({**base_config, "exclude_empty_projects": True}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test-2")) assert len({wu.metadata.entityUrn for wu in source.get_workunits()}) == 1 # type: ignore + + +@patch.object(BigQueryV2Config, "get_bigquery_client") +@patch.object(BigQueryV2Config, "get_projects_client") +def test_get_projects_with_project_labels( + get_projects_client, + get_bq_client_mock, +): + client_mock = MagicMock() + + get_projects_client.return_value = client_mock + + client_mock.search_projects.return_value = [ + SimpleNamespace(project_id="dev", display_name="dev_project"), + SimpleNamespace(project_id="qa", display_name="qa_project"), + ] + + config = BigQueryV2Config.parse_obj( + { + "project_labels": ["environment:dev", "environment:qa"], + } + ) + + source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) + + assert source._get_projects() == [ + BigqueryProject("dev", "dev_project"), + BigqueryProject("qa", "qa_project"), + ] diff --git a/settings.gradle b/settings.gradle index 
b850816ab5e6b7..899ca8f6f869b5 100644 --- a/settings.gradle +++ b/settings.gradle @@ -61,6 +61,7 @@ include 'metadata-integration:java:openlineage-converter' include 'metadata-integration:java:acryl-spark-lineage' include 'ingestion-scheduler' include 'metadata-ingestion-modules:airflow-plugin' +include 'metadata-ingestion-modules:gx-plugin' include 'metadata-ingestion-modules:dagster-plugin' include 'smoke-test' include 'metadata-auth:auth-api'