diff --git a/.github/.codecov.yml b/.github/.codecov.yml new file mode 100644 index 00000000000000..1faf5a6bab4644 --- /dev/null +++ b/.github/.codecov.yml @@ -0,0 +1,65 @@ +comment: + layout: "header, files, footer" # remove "new" from "header" and "footer" + hide_project_coverage: true # set to false + require_changes: false # if true: only post the comment if coverage changes + +codecov: + #due to ci-optimization, reports for modules that have not changed may be quite old + max_report_age: off + +flag_management: + default_rules: # the rules that will be followed for any flag added, generally + carryforward: true + statuses: + - type: project + target: auto + threshold: 0% #Not enforcing project coverage yet. + - type: patch + target: 90% + individual_flags: # exceptions to the default rules above, stated flag by flag + - name: frontend + paths: + - "datahub-frontend/**" + - "datahub-web-react/**" + - name: backend + paths: + - "metadata-models/**" + - "datahub-upgrade/**" + - "entity-registry/**" + - "li-utils/**" + - "metadata-auth/**" + - "metadata-dao-impl/**" + - "metadata-events/**" + - "metadata-jobs/**" + - "metadata-service/**" + - "metadata-utils/**" + - "metadata-operation-context/**" + - "datahub-graphql-core/**" + - name: metadata-io + paths: + - "metadata-io/**" + - name: ingestion + paths: + - "metadata-ingestion/**" + - name: ingestion-airflow + paths: + - "metadata-ingestion-modules/airflow-plugin/**" + - name: ingestion-dagster + paths: + - "metadata-ingestion-modules/dagster-plugin/**" + - name: ingestion-gx-plugin + paths: + - "metadata-ingestion-modules/gx-plugin/**" + - name: ingestion-prefect + paths: + - "metadata-ingestion-modules/prefect-plugin/**" +coverage: + status: + project: + default: + target: 0% # no threshold enforcement yet + only_pulls: true + patch: + default: + target: 90% # for new code added in the patch + only_pulls: true diff --git a/.github/actions/ci-optimization/action.yml b/.github/actions/ci-optimization/action.yml index 0d435963382675..8a81859ae903a8 100644 --- a/.github/actions/ci-optimization/action.yml +++ b/.github/actions/ci-optimization/action.yml @@ -13,16 +13,16 @@ outputs: value: ${{ steps.filter.outputs.frontend == 'false' && steps.filter.outputs.ingestion == 'false' && steps.filter.outputs.backend == 'true' }} backend-change: description: "Backend code has changed" - value: ${{ steps.filter.outputs.backend == 'true' }} + value: ${{ steps.filter.outputs.backend == 'true' || steps.trigger.outputs.trigger == 'manual' }} ingestion-change: description: "Ingestion code has changed" - value: ${{ steps.filter.outputs.ingestion == 'true' }} + value: ${{ steps.filter.outputs.ingestion == 'true' || steps.trigger.outputs.trigger == 'manual' }} ingestion-base-change: description: "Ingestion base image docker image has changed" value: ${{ steps.filter.outputs.ingestion-base == 'true' }} frontend-change: description: "Frontend code has changed" - value: ${{ steps.filter.outputs.frontend == 'true' }} + value: ${{ steps.filter.outputs.frontend == 'true' || steps.trigger.outputs.trigger == 'manual' }} docker-change: description: "Docker code has changed" value: ${{ steps.filter.outputs.docker == 'true' }} @@ -44,6 +44,15 @@ outputs: runs: using: "composite" steps: + - name: Check trigger type + id: trigger # Add an ID to reference this step + shell: bash + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "trigger=manual" >> $GITHUB_OUTPUT + else + echo "trigger=pr" >> $GITHUB_OUTPUT + fi - uses: 
dorny/paths-filter@v3 id: filter with: diff --git a/.github/scripts/generate_pre_commit.py b/.github/scripts/generate_pre_commit.py index 740d3c20d263b0..2db73fd357ff5f 100755 --- a/.github/scripts/generate_pre_commit.py +++ b/.github/scripts/generate_pre_commit.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from enum import Enum, auto from pathlib import Path +import datetime import yaml @@ -188,6 +189,7 @@ def _generate_lint_fix_hook(self, project: Project) -> dict: "entry": f"./gradlew {project.gradle_path}:lintFix", "language": "system", "files": f"^{project.path}/.*\\.py$", + "pass_filenames": False, } def _generate_spotless_hook(self, project: Project) -> dict: @@ -198,6 +200,7 @@ def _generate_spotless_hook(self, project: Project) -> dict: "entry": f"./gradlew {project.gradle_path}:spotlessApply", "language": "system", "files": f"^{project.path}/.*\\.java$", + "pass_filenames": False, } @@ -209,8 +212,19 @@ def increase_indent(self, flow=False, *args, **kwargs): def write_yaml_with_spaces(file_path: str, data: dict): - """Write YAML file with extra spacing between hooks.""" + """Write YAML file with extra spacing between hooks and a timestamp header.""" with open(file_path, "w") as f: + # Add timestamp header + current_time = datetime.datetime.now(datetime.timezone.utc) + formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S %Z") + header = f"# Auto-generated by .github/scripts/generate_pre_commit.py at {formatted_time}\n" + f.write(header) + header = f"# Do not edit this file directly. Run the script to regenerate.\n" + f.write(header) + header = f"# Add additional hooks in .github/scripts/pre-commit-override.yaml\n" + f.write(header) + + # Write the YAML content yaml_str = yaml.dump( data, Dumper=PrecommitDumper, sort_keys=False, default_flow_style=False ) diff --git a/.github/scripts/pre-commit-override.yaml b/.github/scripts/pre-commit-override.yaml index a085d9ea3ee93b..961134bebe2c98 100644 --- a/.github/scripts/pre-commit-override.yaml +++ b/.github/scripts/pre-commit-override.yaml @@ -5,4 +5,5 @@ repos: name: smoke-test cypress Lint Fix entry: ./gradlew :smoke-test:cypressLintFix language: system - files: ^smoke-test/tests/cypress/.*$ \ No newline at end of file + files: ^smoke-test/tests/cypress/.*$ + pass_filenames: false \ No newline at end of file diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index 89e0c9e2513d8b..e1e0fb0a85e977 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -18,6 +18,7 @@ on: - "metadata-models/**" release: types: [published] + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -77,15 +78,15 @@ jobs: **/build/test-results/test/** **/junit.*.xml !**/binary/** - - name: Upload coverage to Codecov + - name: Upload coverage to Codecov with ingestion flag if: always() uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - directory: ./build/coverage-reports/ + directory: ./build/coverage-reports/metadata-ingestion-modules/airflow-plugin/ fail_ci_if_error: false - flags: airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_extras }} - name: pytest-airflow + flags: ingestion-airflow + name: pytest-airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_requirements }} verbose: true - name: Upload test results to Codecov if: ${{ !cancelled() }} diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 
058ac4a5c9b1e5..86545946d6afea 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -12,6 +12,7 @@ on: paths-ignore: - "docs/**" - "**.md" + workflow_dispatch: release: types: [published] @@ -113,10 +114,16 @@ jobs: if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} run: | ./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava + - name: Gather coverage files + run: | + echo "BACKEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(metadata-models|entity-registry|datahub-graphql-core|metadata-io|metadata-jobs|metadata-utils|metadata-service|metadata-dao-impl|metadata-operation|li-utils|metadata-integration|metadata-events|metadata-auth|ingestion-scheduler|notifications|datahub-upgrade)' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV + echo "FRONTEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(datahub-frontend|datahub-web-react).*\.(xml|json)$' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV + - name: Generate tz artifact name + run: echo "NAME_TZ=$(echo ${{ matrix.timezone }} | tr '/' '-')" >> $GITHUB_ENV - uses: actions/upload-artifact@v4 if: always() with: - name: Test Results (build) + name: Test Results (build) - ${{ matrix.command}}-${{ env.NAME_TZ }} path: | **/build/reports/tests/test/** **/build/test-results/test/** @@ -124,14 +131,28 @@ !**/binary/** - name: Ensure codegen is updated uses: ./.github/actions/ensure-codegen-updated - - name: Upload coverage to Codecov - if: always() + - name: Upload backend coverage to Codecov + if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + files: ${{ env.BACKEND_FILES }} + disable_search: true + #handle_no_reports_found: true + fail_ci_if_error: false + flags: backend + name: ${{ matrix.command }} + verbose: true + - name: Upload frontend coverage to Codecov + if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }} uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - directory: ./build/coverage-reports/ + files: ${{ env.FRONTEND_FILES }} + disable_search: true + #handle_no_reports_found: true fail_ci_if_error: false - flags: ${{ matrix.timezone }} + flags: frontend name: ${{ matrix.command }} verbose: true - name: Upload test results to Codecov diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index c29e72367c53c5..a2ac59d6989a9f 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -18,6 +18,7 @@ on: - "metadata-models/**" release: types: [published] + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -64,14 +65,14 @@ jobs: **/build/reports/tests/test/** **/build/test-results/test/** **/junit.*.xml - - name: Upload coverage to Codecov + - name: Upload coverage to Codecov with ingestion flag if: always() uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - directory: ./build/coverage-reports/ + directory: ./build/coverage-reports/metadata-ingestion-modules/dagster-plugin/ fail_ci_if_error: false - flags: dagster-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} + flags: ingestion-dagster-plugin name: pytest-dagster verbose: true - name: Upload test results to Codecov diff --git a/.github/workflows/gx-plugin.yml
b/.github/workflows/gx-plugin.yml index 825f8beda2f561..c28bdbb30eb36d 100644 --- a/.github/workflows/gx-plugin.yml +++ b/.github/workflows/gx-plugin.yml @@ -18,6 +18,7 @@ on: - "metadata-models/**" release: types: [published] + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -68,14 +69,14 @@ jobs: **/build/reports/tests/test/** **/build/test-results/test/** **/junit.*.xml - - name: Upload coverage to Codecov + - name: Upload coverage to Codecov with ingestion flag if: always() uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - directory: ./build/coverage-reports/ + directory: ./build/coverage-reports/metadata-ingestion-modules/gx-plugin/ fail_ci_if_error: false - flags: gx-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} + flags: ingestion-gx-plugin name: pytest-gx verbose: true - name: Upload test results to Codecov diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index aa404c4c35c505..be6026098ce420 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -18,6 +18,7 @@ on: - "metadata-models/**" release: types: [published] + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -88,15 +89,15 @@ jobs: **/build/test-results/test/** **/junit.*.xml !**/binary/** - - name: Upload coverage to Codecov - if: ${{ always() }} + - name: Upload coverage to Codecov with ingestion flag + if: ${{ always() && matrix.python-version == '3.11' }} uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - directory: ./build/coverage-reports/ + directory: ./build/coverage-reports/metadata-ingestion/ fail_ci_if_error: false - flags: ingestion-${{ matrix.python-version }}-${{ matrix.command }} - name: pytest-ingestion + flags: ingestion + name: pytest-${{ matrix.python-version }}-${{ matrix.command }} verbose: true - name: Upload test results to Codecov if: ${{ !cancelled() }} diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml index bcadc641ee2f7c..80af03e77eef82 100644 --- a/.github/workflows/metadata-io.yml +++ b/.github/workflows/metadata-io.yml @@ -20,6 +20,7 @@ on: - ".github/workflows/metadata-io.yml" release: types: [published] + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -86,8 +87,9 @@ jobs: uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - directory: ./build/coverage-reports/ + directory: ./build/coverage-reports/metadata-io/ fail_ci_if_error: false + flags: metadata-io name: metadata-io-test verbose: true - name: Upload test results to Codecov diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml index 0bce4d5ef19f31..401efa340ae8ca 100644 --- a/.github/workflows/prefect-plugin.yml +++ b/.github/workflows/prefect-plugin.yml @@ -18,6 +18,7 @@ on: - "metadata-models/**" release: types: [published] + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -60,15 +61,15 @@ jobs: **/build/test-results/test/** **/junit.*.xml !**/binary/** - - name: Upload coverage to Codecov + - name: Upload coverage to Codecov with ingestion flag if: always() uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - directory: ./build/coverage-reports/ + directory: 
./build/coverage-reports/metadata-ingestion-modules/prefect-plugin/ fail_ci_if_error: false - flags: prefect-${{ matrix.python-version }} - name: pytest-prefect + flags: ingestion-prefect-plugin + name: pytest-prefect-${{ matrix.python-version }} verbose: true - name: Upload test results to Codecov if: ${{ !cancelled() }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c4edc2cc176355..3697efa37770e7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,6 @@ +# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-01-09 10:08:09 UTC +# Do not edit this file directly. Run the script to regenerate. +# Add additional hooks in .github/scripts/pre-commit-override.yaml repos: - repo: local hooks: @@ -6,372 +9,434 @@ repos: entry: ./gradlew :datahub-graphql-core:spotlessApply language: system files: ^datahub-graphql-core/.*\.java$ + pass_filenames: false - id: datahub-upgrade-spotless name: datahub-upgrade Spotless Apply entry: ./gradlew :datahub-upgrade:spotlessApply language: system files: ^datahub-upgrade/.*\.java$ + pass_filenames: false - id: entity-registry-spotless name: entity-registry Spotless Apply entry: ./gradlew :entity-registry:spotlessApply language: system files: ^entity-registry/.*\.java$ + pass_filenames: false - id: ingestion-scheduler-spotless name: ingestion-scheduler Spotless Apply entry: ./gradlew :ingestion-scheduler:spotlessApply language: system files: ^ingestion-scheduler/.*\.java$ + pass_filenames: false - id: li-utils-spotless name: li-utils Spotless Apply entry: ./gradlew :li-utils:spotlessApply language: system files: ^li-utils/.*\.java$ + pass_filenames: false - id: metadata-auth-auth-api-spotless name: metadata-auth/auth-api Spotless Apply entry: ./gradlew :metadata-auth:auth-api:spotlessApply language: system files: ^metadata-auth/auth-api/.*\.java$ + pass_filenames: false - id: metadata-dao-impl-kafka-producer-spotless name: metadata-dao-impl/kafka-producer Spotless Apply entry: ./gradlew :metadata-dao-impl:kafka-producer:spotlessApply language: system files: ^metadata-dao-impl/kafka-producer/.*\.java$ + pass_filenames: false - id: metadata-events-mxe-avro-spotless name: metadata-events/mxe-avro Spotless Apply entry: ./gradlew :metadata-events:mxe-avro:spotlessApply language: system files: ^metadata-events/mxe-avro/.*\.java$ + pass_filenames: false - id: metadata-events-mxe-registration-spotless name: metadata-events/mxe-registration Spotless Apply entry: ./gradlew :metadata-events:mxe-registration:spotlessApply language: system files: ^metadata-events/mxe-registration/.*\.java$ + pass_filenames: false - id: metadata-events-mxe-schemas-spotless name: metadata-events/mxe-schemas Spotless Apply entry: ./gradlew :metadata-events:mxe-schemas:spotlessApply language: system files: ^metadata-events/mxe-schemas/.*\.java$ + pass_filenames: false - id: metadata-events-mxe-utils-avro-spotless name: metadata-events/mxe-utils-avro Spotless Apply entry: ./gradlew :metadata-events:mxe-utils-avro:spotlessApply language: system files: ^metadata-events/mxe-utils-avro/.*\.java$ + pass_filenames: false - id: metadata-ingestion-lint-fix name: metadata-ingestion Lint Fix entry: ./gradlew :metadata-ingestion:lintFix language: system files: ^metadata-ingestion/.*\.py$ + pass_filenames: false - id: metadata-ingestion-modules-airflow-plugin-lint-fix name: metadata-ingestion-modules/airflow-plugin Lint Fix entry: ./gradlew :metadata-ingestion-modules:airflow-plugin:lintFix language: system files: 
^metadata-ingestion-modules/airflow-plugin/.*\.py$ + pass_filenames: false - id: metadata-ingestion-modules-dagster-plugin-lint-fix name: metadata-ingestion-modules/dagster-plugin Lint Fix entry: ./gradlew :metadata-ingestion-modules:dagster-plugin:lintFix language: system files: ^metadata-ingestion-modules/dagster-plugin/.*\.py$ + pass_filenames: false - id: metadata-ingestion-modules-gx-plugin-lint-fix name: metadata-ingestion-modules/gx-plugin Lint Fix entry: ./gradlew :metadata-ingestion-modules:gx-plugin:lintFix language: system files: ^metadata-ingestion-modules/gx-plugin/.*\.py$ + pass_filenames: false - id: metadata-ingestion-modules-prefect-plugin-lint-fix name: metadata-ingestion-modules/prefect-plugin Lint Fix entry: ./gradlew :metadata-ingestion-modules:prefect-plugin:lintFix language: system files: ^metadata-ingestion-modules/prefect-plugin/.*\.py$ + pass_filenames: false - id: metadata-integration-java-acryl-spark-lineage-spotless name: metadata-integration/java/acryl-spark-lineage Spotless Apply entry: ./gradlew :metadata-integration:java:acryl-spark-lineage:spotlessApply language: system files: ^metadata-integration/java/acryl-spark-lineage/.*\.java$ + pass_filenames: false - id: metadata-integration-java-datahub-client-spotless name: metadata-integration/java/datahub-client Spotless Apply entry: ./gradlew :metadata-integration:java:datahub-client:spotlessApply language: system files: ^metadata-integration/java/datahub-client/.*\.java$ + pass_filenames: false - id: metadata-integration-java-datahub-event-spotless name: metadata-integration/java/datahub-event Spotless Apply entry: ./gradlew :metadata-integration:java:datahub-event:spotlessApply language: system files: ^metadata-integration/java/datahub-event/.*\.java$ + pass_filenames: false - id: metadata-integration-java-datahub-protobuf-spotless name: metadata-integration/java/datahub-protobuf Spotless Apply entry: ./gradlew :metadata-integration:java:datahub-protobuf:spotlessApply language: system files: ^metadata-integration/java/datahub-protobuf/.*\.java$ + pass_filenames: false - id: metadata-integration-java-datahub-schematron-cli-spotless name: metadata-integration/java/datahub-schematron/cli Spotless Apply entry: ./gradlew :metadata-integration:java:datahub-schematron:cli:spotlessApply language: system files: ^metadata-integration/java/datahub-schematron/cli/.*\.java$ + pass_filenames: false - id: metadata-integration-java-datahub-schematron-lib-spotless name: metadata-integration/java/datahub-schematron/lib Spotless Apply entry: ./gradlew :metadata-integration:java:datahub-schematron:lib:spotlessApply language: system files: ^metadata-integration/java/datahub-schematron/lib/.*\.java$ + pass_filenames: false - id: metadata-integration-java-examples-spotless name: metadata-integration/java/examples Spotless Apply entry: ./gradlew :metadata-integration:java:examples:spotlessApply language: system files: ^metadata-integration/java/examples/.*\.java$ + pass_filenames: false - id: metadata-integration-java-openlineage-converter-spotless name: metadata-integration/java/openlineage-converter Spotless Apply entry: ./gradlew :metadata-integration:java:openlineage-converter:spotlessApply language: system files: ^metadata-integration/java/openlineage-converter/.*\.java$ + pass_filenames: false - id: metadata-integration-java-spark-lineage-legacy-spotless name: metadata-integration/java/spark-lineage-legacy Spotless Apply entry: ./gradlew :metadata-integration:java:spark-lineage-legacy:spotlessApply language: system files: 
^metadata-integration/java/spark-lineage-legacy/.*\.java$ + pass_filenames: false - id: metadata-io-spotless name: metadata-io Spotless Apply entry: ./gradlew :metadata-io:spotlessApply language: system files: ^metadata-io/.*\.java$ + pass_filenames: false - id: metadata-io-metadata-io-api-spotless name: metadata-io/metadata-io-api Spotless Apply entry: ./gradlew :metadata-io:metadata-io-api:spotlessApply language: system files: ^metadata-io/metadata-io-api/.*\.java$ + pass_filenames: false - id: metadata-jobs-common-spotless name: metadata-jobs/common Spotless Apply entry: ./gradlew :metadata-jobs:common:spotlessApply language: system files: ^metadata-jobs/common/.*\.java$ + pass_filenames: false - id: metadata-jobs-mae-consumer-spotless name: metadata-jobs/mae-consumer Spotless Apply entry: ./gradlew :metadata-jobs:mae-consumer:spotlessApply language: system files: ^metadata-jobs/mae-consumer/.*\.java$ + pass_filenames: false - id: metadata-jobs-mae-consumer-job-spotless name: metadata-jobs/mae-consumer-job Spotless Apply entry: ./gradlew :metadata-jobs:mae-consumer-job:spotlessApply language: system files: ^metadata-jobs/mae-consumer-job/.*\.java$ + pass_filenames: false - id: metadata-jobs-mce-consumer-spotless name: metadata-jobs/mce-consumer Spotless Apply entry: ./gradlew :metadata-jobs:mce-consumer:spotlessApply language: system files: ^metadata-jobs/mce-consumer/.*\.java$ + pass_filenames: false - id: metadata-jobs-mce-consumer-job-spotless name: metadata-jobs/mce-consumer-job Spotless Apply entry: ./gradlew :metadata-jobs:mce-consumer-job:spotlessApply language: system files: ^metadata-jobs/mce-consumer-job/.*\.java$ + pass_filenames: false - id: metadata-jobs-pe-consumer-spotless name: metadata-jobs/pe-consumer Spotless Apply entry: ./gradlew :metadata-jobs:pe-consumer:spotlessApply language: system files: ^metadata-jobs/pe-consumer/.*\.java$ + pass_filenames: false - id: metadata-models-spotless name: metadata-models Spotless Apply entry: ./gradlew :metadata-models:spotlessApply language: system files: ^metadata-models/.*\.java$ + pass_filenames: false - id: metadata-models-custom-spotless name: metadata-models-custom Spotless Apply entry: ./gradlew :metadata-models-custom:spotlessApply language: system files: ^metadata-models-custom/.*\.java$ + pass_filenames: false - id: metadata-models-validator-spotless name: metadata-models-validator Spotless Apply entry: ./gradlew :metadata-models-validator:spotlessApply language: system files: ^metadata-models-validator/.*\.java$ + pass_filenames: false - id: metadata-operation-context-spotless name: metadata-operation-context Spotless Apply entry: ./gradlew :metadata-operation-context:spotlessApply language: system files: ^metadata-operation-context/.*\.java$ + pass_filenames: false - id: metadata-service-auth-config-spotless name: metadata-service/auth-config Spotless Apply entry: ./gradlew :metadata-service:auth-config:spotlessApply language: system files: ^metadata-service/auth-config/.*\.java$ + pass_filenames: false - id: metadata-service-auth-filter-spotless name: metadata-service/auth-filter Spotless Apply entry: ./gradlew :metadata-service:auth-filter:spotlessApply language: system files: ^metadata-service/auth-filter/.*\.java$ + pass_filenames: false - id: metadata-service-auth-impl-spotless name: metadata-service/auth-impl Spotless Apply entry: ./gradlew :metadata-service:auth-impl:spotlessApply language: system files: ^metadata-service/auth-impl/.*\.java$ + pass_filenames: false - id: 
metadata-service-auth-servlet-impl-spotless name: metadata-service/auth-servlet-impl Spotless Apply entry: ./gradlew :metadata-service:auth-servlet-impl:spotlessApply language: system files: ^metadata-service/auth-servlet-impl/.*\.java$ + pass_filenames: false - id: metadata-service-configuration-spotless name: metadata-service/configuration Spotless Apply entry: ./gradlew :metadata-service:configuration:spotlessApply language: system files: ^metadata-service/configuration/.*\.java$ + pass_filenames: false - id: metadata-service-factories-spotless name: metadata-service/factories Spotless Apply entry: ./gradlew :metadata-service:factories:spotlessApply language: system files: ^metadata-service/factories/.*\.java$ + pass_filenames: false - id: metadata-service-graphql-servlet-impl-spotless name: metadata-service/graphql-servlet-impl Spotless Apply entry: ./gradlew :metadata-service:graphql-servlet-impl:spotlessApply language: system files: ^metadata-service/graphql-servlet-impl/.*\.java$ + pass_filenames: false - id: metadata-service-openapi-analytics-servlet-spotless name: metadata-service/openapi-analytics-servlet Spotless Apply entry: ./gradlew :metadata-service:openapi-analytics-servlet:spotlessApply language: system files: ^metadata-service/openapi-analytics-servlet/.*\.java$ + pass_filenames: false - id: metadata-service-openapi-entity-servlet-spotless name: metadata-service/openapi-entity-servlet Spotless Apply entry: ./gradlew :metadata-service:openapi-entity-servlet:spotlessApply language: system files: ^metadata-service/openapi-entity-servlet/.*\.java$ + pass_filenames: false - id: metadata-service-openapi-entity-servlet-generators-spotless name: metadata-service/openapi-entity-servlet/generators Spotless Apply entry: ./gradlew :metadata-service:openapi-entity-servlet:generators:spotlessApply language: system files: ^metadata-service/openapi-entity-servlet/generators/.*\.java$ + pass_filenames: false - id: metadata-service-openapi-servlet-spotless name: metadata-service/openapi-servlet Spotless Apply entry: ./gradlew :metadata-service:openapi-servlet:spotlessApply language: system files: ^metadata-service/openapi-servlet/.*\.java$ + pass_filenames: false - id: metadata-service-openapi-servlet-models-spotless name: metadata-service/openapi-servlet/models Spotless Apply entry: ./gradlew :metadata-service:openapi-servlet:models:spotlessApply language: system files: ^metadata-service/openapi-servlet/models/.*\.java$ + pass_filenames: false - id: metadata-service-plugin-spotless name: metadata-service/plugin Spotless Apply entry: ./gradlew :metadata-service:plugin:spotlessApply language: system files: ^metadata-service/plugin/.*\.java$ + pass_filenames: false - id: metadata-service-plugin-src-test-sample-test-plugins-spotless name: metadata-service/plugin/src/test/sample-test-plugins Spotless Apply entry: ./gradlew :metadata-service:plugin:src:test:sample-test-plugins:spotlessApply language: system files: ^metadata-service/plugin/src/test/sample-test-plugins/.*\.java$ + pass_filenames: false - id: metadata-service-restli-client-spotless name: metadata-service/restli-client Spotless Apply entry: ./gradlew :metadata-service:restli-client:spotlessApply language: system files: ^metadata-service/restli-client/.*\.java$ + pass_filenames: false - id: metadata-service-restli-client-api-spotless name: metadata-service/restli-client-api Spotless Apply entry: ./gradlew :metadata-service:restli-client-api:spotlessApply language: system files: ^metadata-service/restli-client-api/.*\.java$ + 
pass_filenames: false - id: metadata-service-restli-servlet-impl-spotless name: metadata-service/restli-servlet-impl Spotless Apply entry: ./gradlew :metadata-service:restli-servlet-impl:spotlessApply language: system files: ^metadata-service/restli-servlet-impl/.*\.java$ + pass_filenames: false - id: metadata-service-schema-registry-api-spotless name: metadata-service/schema-registry-api Spotless Apply entry: ./gradlew :metadata-service:schema-registry-api:spotlessApply language: system files: ^metadata-service/schema-registry-api/.*\.java$ + pass_filenames: false - id: metadata-service-schema-registry-servlet-spotless name: metadata-service/schema-registry-servlet Spotless Apply entry: ./gradlew :metadata-service:schema-registry-servlet:spotlessApply language: system files: ^metadata-service/schema-registry-servlet/.*\.java$ + pass_filenames: false - id: metadata-service-services-spotless name: metadata-service/services Spotless Apply entry: ./gradlew :metadata-service:services:spotlessApply language: system files: ^metadata-service/services/.*\.java$ + pass_filenames: false - id: metadata-service-servlet-spotless name: metadata-service/servlet Spotless Apply entry: ./gradlew :metadata-service:servlet:spotlessApply language: system files: ^metadata-service/servlet/.*\.java$ + pass_filenames: false - id: metadata-utils-spotless name: metadata-utils Spotless Apply entry: ./gradlew :metadata-utils:spotlessApply language: system files: ^metadata-utils/.*\.java$ + pass_filenames: false - id: mock-entity-registry-spotless name: mock-entity-registry Spotless Apply entry: ./gradlew :mock-entity-registry:spotlessApply language: system files: ^mock-entity-registry/.*\.java$ + pass_filenames: false - id: smoke-test-lint-fix name: smoke-test Lint Fix entry: ./gradlew :smoke-test:lintFix language: system files: ^smoke-test/.*\.py$ + pass_filenames: false - id: test-models-spotless name: test-models Spotless Apply entry: ./gradlew :test-models:spotlessApply language: system files: ^test-models/.*\.java$ + pass_filenames: false - id: smoke-test-cypress-lint-fix name: smoke-test cypress Lint Fix diff --git a/build.gradle b/build.gradle index 3c36feadc5f4bb..284092e2b14f49 100644 --- a/build.gradle +++ b/build.gradle @@ -211,7 +211,7 @@ project.ext.externalDependency = [ 'mockitoInline': 'org.mockito:mockito-inline:4.11.0', 'mockServer': 'org.mock-server:mockserver-netty:5.11.2', 'mockServerClient': 'org.mock-server:mockserver-client-java:5.11.2', - 'mysqlConnector': 'mysql:mysql-connector-java:8.0.28', + 'mysqlConnector': 'com.mysql:mysql-connector-j:8.4.0', 'neo4jHarness': 'org.neo4j.test:neo4j-harness:' + neo4jTestVersion, 'neo4jJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jVersion, 'neo4jTestJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jTestVersion, @@ -235,7 +235,7 @@ project.ext.externalDependency = [ 'playFilters': "com.typesafe.play:filters-helpers_$playScalaVersion:$playVersion", 'pac4j': 'org.pac4j:pac4j-oidc:6.0.6', 'playPac4j': "org.pac4j:play-pac4j_$playScalaVersion:12.0.0-PLAY2.8", - 'postgresql': 'org.postgresql:postgresql:42.3.9', + 'postgresql': 'org.postgresql:postgresql:42.7.4', 'protobuf': 'com.google.protobuf:protobuf-java:3.25.5', 'grpcProtobuf': 'io.grpc:grpc-protobuf:1.53.0', 'rangerCommons': 'org.apache.ranger:ranger-plugins-common:2.3.0', @@ -286,7 +286,8 @@ project.ext.externalDependency = [ 'annotationApi': 'javax.annotation:javax.annotation-api:1.3.2', 'jakartaAnnotationApi': 'jakarta.annotation:jakarta.annotation-api:3.0.0', 'classGraph': 
'io.github.classgraph:classgraph:4.8.172', - 'mustache': 'com.github.spullara.mustache.java:compiler:0.9.14' + 'mustache': 'com.github.spullara.mustache.java:compiler:0.9.14', + 'javaxMail': 'com.sun.mail:jakarta.mail:1.6.7' ] allprojects { @@ -374,9 +375,11 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) { exclude group: "org.slf4j", module: "slf4j-nop" exclude group: "org.slf4j", module: "slf4j-ext" exclude group: "org.codehaus.jackson", module: "jackson-mapper-asl" + exclude group: "javax.mail", module: "mail" resolutionStrategy.force externalDependency.antlr4Runtime resolutionStrategy.force externalDependency.antlr4 + resolutionStrategy.force 'org.apache.mina:mina-core:2.2.4' } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index 94f0e8a055b701..59335ba605a741 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -2377,6 +2377,17 @@ private void configureDataJobResolvers(final RuntimeWiring.Builder builder) { ? dataJob.getDataPlatformInstance().getUrn() : null; })) + .dataFetcher( + "container", + new LoadableTypeResolver<>( + containerType, + (env) -> { + final DataJob dataJob = env.getSource(); + return dataJob.getContainer() != null + ? dataJob.getContainer().getUrn() + : null; + })) + .dataFetcher("parentContainers", new ParentContainersResolver(entityClient)) .dataFetcher("runs", new DataJobRunsResolver(entityClient)) .dataFetcher("privileges", new EntityPrivilegesResolver(entityClient)) .dataFetcher("exists", new EntityExistsResolver(entityService)) @@ -2454,6 +2465,17 @@ private void configureDataFlowResolvers(final RuntimeWiring.Builder builder) { ? dataFlow.getDataPlatformInstance().getUrn() : null; })) + .dataFetcher( + "container", + new LoadableTypeResolver<>( + containerType, + (env) -> { + final DataFlow dataFlow = env.getSource(); + return dataFlow.getContainer() != null + ? 
dataFlow.getContainer().getUrn() + : null; + })) + .dataFetcher("parentContainers", new ParentContainersResolver(entityClient)) .dataFetcher( "health", new EntityHealthResolver( diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/DataFlowType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/DataFlowType.java index 3a697517bdecee..f2d38aadf49656 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/DataFlowType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/DataFlowType.java @@ -74,6 +74,7 @@ public class DataFlowType DOMAINS_ASPECT_NAME, DEPRECATION_ASPECT_NAME, DATA_PLATFORM_INSTANCE_ASPECT_NAME, + CONTAINER_ASPECT_NAME, DATA_PRODUCTS_ASPECT_NAME, BROWSE_PATHS_V2_ASPECT_NAME, STRUCTURED_PROPERTIES_ASPECT_NAME, diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapper.java index 44bc6a99eae4bb..0902d6f2080b8f 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapper.java @@ -16,6 +16,7 @@ import com.linkedin.data.DataMap; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; +import com.linkedin.datahub.graphql.generated.Container; import com.linkedin.datahub.graphql.generated.DataFlow; import com.linkedin.datahub.graphql.generated.DataFlowEditableProperties; import com.linkedin.datahub.graphql.generated.DataFlowInfo; @@ -106,6 +107,7 @@ public DataFlow apply( (dataset, dataMap) -> dataset.setDataPlatformInstance( DataPlatformInstanceAspectMapper.map(context, new DataPlatformInstance(dataMap)))); + mappingHelper.mapToResult(context, CONTAINER_ASPECT_NAME, DataFlowMapper::mapContainers); mappingHelper.mapToResult( BROWSE_PATHS_V2_ASPECT_NAME, (dataFlow, dataMap) -> @@ -206,6 +208,17 @@ private static void mapGlobalTags( dataFlow.setTags(globalTags); } + private static void mapContainers( + @Nullable final QueryContext context, @Nonnull DataFlow dataFlow, @Nonnull DataMap dataMap) { + final com.linkedin.container.Container gmsContainer = + new com.linkedin.container.Container(dataMap); + dataFlow.setContainer( + Container.builder() + .setType(EntityType.CONTAINER) + .setUrn(gmsContainer.getContainer().toString()) + .build()); + } + private static void mapDomains( @Nullable final QueryContext context, @Nonnull DataFlow dataFlow, @Nonnull DataMap dataMap) { final Domains domains = new Domains(dataMap); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java index 8d55ca6dbf7ac9..317ee39ea565e5 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java @@ -75,6 +75,7 @@ public class DataJobType DOMAINS_ASPECT_NAME, DEPRECATION_ASPECT_NAME, DATA_PLATFORM_INSTANCE_ASPECT_NAME, + CONTAINER_ASPECT_NAME, DATA_PRODUCTS_ASPECT_NAME, BROWSE_PATHS_V2_ASPECT_NAME, SUB_TYPES_ASPECT_NAME, diff --git 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java index ec57c95ce151e2..3403d1f8f7b7f2 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java @@ -9,6 +9,7 @@ import com.linkedin.data.DataMap; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; +import com.linkedin.datahub.graphql.generated.Container; import com.linkedin.datahub.graphql.generated.DataFlow; import com.linkedin.datahub.graphql.generated.DataJob; import com.linkedin.datahub.graphql.generated.DataJobEditableProperties; @@ -112,6 +113,14 @@ public DataJob apply( } else if (DATA_PLATFORM_INSTANCE_ASPECT_NAME.equals(name)) { result.setDataPlatformInstance( DataPlatformInstanceAspectMapper.map(context, new DataPlatformInstance(data))); + } else if (CONTAINER_ASPECT_NAME.equals(name)) { + final com.linkedin.container.Container gmsContainer = + new com.linkedin.container.Container(data); + result.setContainer( + Container.builder() + .setType(EntityType.CONTAINER) + .setUrn(gmsContainer.getContainer().toString()) + .build()); } else if (BROWSE_PATHS_V2_ASPECT_NAME.equals(name)) { result.setBrowsePathV2(BrowsePathsV2Mapper.map(context, new BrowsePathsV2(data))); } else if (SUB_TYPES_ASPECT_NAME.equals(name)) { diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index a5cb0893a64fae..adb24d92587b58 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -6275,6 +6275,16 @@ type DataFlow implements EntityWithRelationships & Entity & BrowsableEntity { """ dataPlatformInstance: DataPlatformInstance + """ + The parent container in which the entity resides + """ + container: Container + + """ + Recursively get the lineage of containers for this entity + """ + parentContainers: ParentContainersResult + """ Granular API for querying edges extending from this entity """ @@ -6457,6 +6467,16 @@ type DataJob implements EntityWithRelationships & Entity & BrowsableEntity { """ dataPlatformInstance: DataPlatformInstance + """ + The parent container in which the entity resides + """ + container: Container + + """ + Recursively get the lineage of containers for this entity + """ + parentContainers: ParentContainersResult + """ Additional read write properties associated with the Data Job """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapperTest.java new file mode 100644 index 00000000000000..a49f063f94d336 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapperTest.java @@ -0,0 +1,42 @@ +package com.linkedin.datahub.graphql.types.dataflow.mappers; + +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.generated.DataFlow; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.metadata.Constants; +import 
java.net.URISyntaxException; +import java.util.HashMap; +import java.util.Map; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class DataFlowMapperTest { + private static final Urn TEST_DATA_FLOW_URN = + Urn.createFromTuple(Constants.DATA_FLOW_ENTITY_NAME, "dataflow1"); + private static final Urn TEST_CONTAINER_URN = + Urn.createFromTuple(Constants.CONTAINER_ENTITY_NAME, "container1"); + + @Test + public void testMapDataFlowContainer() throws URISyntaxException { + com.linkedin.container.Container input = new com.linkedin.container.Container(); + input.setContainer(TEST_CONTAINER_URN); + + final Map<String, EnvelopedAspect> containerAspect = new HashMap<>(); + containerAspect.put( + Constants.CONTAINER_ASPECT_NAME, + new com.linkedin.entity.EnvelopedAspect().setValue(new Aspect(input.data()))); + final EntityResponse response = + new EntityResponse() + .setEntityName(Constants.DATA_FLOW_ENTITY_NAME) + .setUrn(TEST_DATA_FLOW_URN) + .setAspects(new EnvelopedAspectMap(containerAspect)); + + final DataFlow actual = DataFlowMapper.map(null, response); + + Assert.assertEquals(actual.getUrn(), TEST_DATA_FLOW_URN.toString()); + Assert.assertEquals(actual.getContainer().getUrn(), TEST_CONTAINER_URN.toString()); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapperTest.java new file mode 100644 index 00000000000000..d7fc0f198977eb --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapperTest.java @@ -0,0 +1,42 @@ +package com.linkedin.datahub.graphql.types.datajob.mappers; + +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.generated.DataJob; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.metadata.Constants; +import java.net.URISyntaxException; +import java.util.HashMap; +import java.util.Map; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class DataJobMapperTest { + private static final Urn TEST_DATA_JOB_URN = + Urn.createFromTuple(Constants.DATA_JOB_ENTITY_NAME, "datajob1"); + private static final Urn TEST_CONTAINER_URN = + Urn.createFromTuple(Constants.CONTAINER_ENTITY_NAME, "container1"); + + @Test + public void testMapDataJobContainer() throws URISyntaxException { + com.linkedin.container.Container input = new com.linkedin.container.Container(); + input.setContainer(TEST_CONTAINER_URN); + + final Map<String, EnvelopedAspect> containerAspect = new HashMap<>(); + containerAspect.put( + Constants.CONTAINER_ASPECT_NAME, + new com.linkedin.entity.EnvelopedAspect().setValue(new Aspect(input.data()))); + final EntityResponse response = + new EntityResponse() + .setEntityName(Constants.DATA_JOB_ENTITY_NAME) + .setUrn(TEST_DATA_JOB_URN) + .setAspects(new EnvelopedAspectMap(containerAspect)); + + final DataJob actual = DataJobMapper.map(null, response); + + Assert.assertEquals(actual.getUrn(), TEST_DATA_JOB_URN.toString()); + Assert.assertEquals(actual.getContainer().getUrn(), TEST_CONTAINER_URN.toString()); + } +} diff --git a/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx b/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx index 3c03dfb65ccbcd..9e26bbadaca070 100644 --- a/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx +++ 
b/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx @@ -184,6 +184,7 @@ export class DataFlowEntity implements Entity { degree={(result as any).degree} paths={(result as any).paths} health={data.health} + parentContainers={data.parentContainers} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx index f210f7c985ebf7..0c86e745eba29f 100644 --- a/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx @@ -10,6 +10,7 @@ import { GlobalTags, Health, Owner, + ParentContainersResult, SearchInsight, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; @@ -40,6 +41,7 @@ export const Preview = ({ degree, paths, health, + parentContainers, }: { urn: string; name: string; @@ -59,6 +61,7 @@ export const Preview = ({ degree?: number; paths?: EntityPath[]; health?: Health[] | null; + parentContainers?: ParentContainersResult | null; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -91,6 +94,7 @@ export const Preview = ({ degree={degree} paths={paths} health={health || undefined} + parentContainers={parentContainers} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx b/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx index 5b1aaeaef76d5b..ff6490ebc91b0c 100644 --- a/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx +++ b/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx @@ -205,6 +205,7 @@ export class DataJobEntity implements Entity { degree={(result as any).degree} paths={(result as any).paths} health={data.health} + parentContainers={data.parentContainers} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx index b163722b5151c7..07ff81effbbc65 100644 --- a/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx @@ -12,6 +12,7 @@ import { GlobalTags, Health, Owner, + ParentContainersResult, SearchInsight, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; @@ -44,6 +45,7 @@ export const Preview = ({ degree, paths, health, + parentContainers, }: { urn: string; name: string; @@ -64,6 +66,7 @@ export const Preview = ({ degree?: number; paths?: EntityPath[]; health?: Health[] | null; + parentContainers?: ParentContainersResult | null; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -98,6 +101,7 @@ export const Preview = ({ degree={degree} paths={paths} health={health || undefined} + parentContainers={parentContainers} /> ); }; diff --git a/datahub-web-react/src/graphql/dataFlow.graphql b/datahub-web-react/src/graphql/dataFlow.graphql index 2441ce600c3c55..199c47811ce08e 100644 --- a/datahub-web-react/src/graphql/dataFlow.graphql +++ b/datahub-web-react/src/graphql/dataFlow.graphql @@ -50,6 +50,9 @@ fragment dataFlowFields on DataFlow { dataPlatformInstance { ...dataPlatformInstanceFields } + parentContainers { + ...parentContainersFields + } browsePathV2 { ...browsePathV2Fields } diff --git a/datahub-web-react/src/graphql/fragments.graphql b/datahub-web-react/src/graphql/fragments.graphql index 788c68349b4268..68c57c5cb5db55 100644 --- a/datahub-web-react/src/graphql/fragments.graphql +++ b/datahub-web-react/src/graphql/fragments.graphql @@ 
-403,6 +403,9 @@ fragment dataJobFields on DataJob { dataPlatformInstance { ...dataPlatformInstanceFields } + parentContainers { + ...parentContainersFields + } privileges { canEditLineage } diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index 58c9a51f3d7e90..72e7d347187828 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -128,6 +128,9 @@ fragment autoCompleteFields on Entity { dataPlatformInstance { ...dataPlatformInstanceFields } + parentContainers { + ...parentContainersFields + } } ... on DataJob { dataFlow { @@ -146,6 +149,9 @@ fragment autoCompleteFields on Entity { dataPlatformInstance { ...dataPlatformInstanceFields } + parentContainers { + ...parentContainersFields + } } ... on GlossaryTerm { name @@ -626,6 +632,9 @@ fragment searchResultsWithoutSchemaField on Entity { dataPlatformInstance { ...dataPlatformInstanceFields } + parentContainers { + ...parentContainersFields + } domain { ...entityDomain } @@ -677,6 +686,9 @@ fragment searchResultsWithoutSchemaField on Entity { dataPlatformInstance { ...dataPlatformInstanceFields } + parentContainers { + ...parentContainersFields + } subTypes { typeNames } diff --git a/docker/datahub-frontend/Dockerfile b/docker/datahub-frontend/Dockerfile index 89974e56575b07..16e6477c37ce69 100644 --- a/docker/datahub-frontend/Dockerfile +++ b/docker/datahub-frontend/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=prod -FROM alpine:3.20 AS base +FROM alpine:3.21 AS base # Configurable repositories ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile index 47b10535f8deea..52cc507f9268d1 100644 --- a/docker/datahub-gms/Dockerfile +++ b/docker/datahub-gms/Dockerfile @@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION -FROM alpine:3.20 AS base +FROM alpine:3.21 AS base ENV JMX_VERSION=0.18.0 ENV JETTY_VERSION=11.0.21 diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile index 74375072761d89..4ddec56311fb96 100644 --- a/docker/datahub-mae-consumer/Dockerfile +++ b/docker/datahub-mae-consumer/Dockerfile @@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION -FROM alpine:3.20 AS base +FROM alpine:3.21 AS base # Re-declaring args from above to make them available in this stage (will inherit default values) ARG ALPINE_REPO_URL diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile index 3adef53cd06068..42e40cd5942144 100644 --- a/docker/datahub-mce-consumer/Dockerfile +++ b/docker/datahub-mce-consumer/Dockerfile @@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION -FROM alpine:3.20 AS base +FROM alpine:3.21 AS base # Re-declaring args from above to make them available in this stage (will inherit default values) ARG ALPINE_REPO_URL diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile index a8ef4e8034fdd5..d63ceb83dc5295 100644 --- a/docker/datahub-upgrade/Dockerfile +++ b/docker/datahub-upgrade/Dockerfile @@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION -FROM alpine:3.20 AS base +FROM alpine:3.21 AS base # Re-declaring args from above to make them 
available in this stage (will inherit default values) ARG ALPINE_REPO_URL @@ -34,16 +34,12 @@ ARG MAVEN_CENTRAL_REPO_URL RUN if [ "${ALPINE_REPO_URL}" != "http://dl-cdn.alpinelinux.org/alpine" ] ; then sed -i "s#http.*://dl-cdn.alpinelinux.org/alpine#${ALPINE_REPO_URL}#g" /etc/apk/repositories ; fi ENV JMX_VERSION=0.18.0 -ENV JETTY_VERSION=11.0.21 # Upgrade Alpine and base packages # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ && apk --no-cache add curl bash coreutils gcompat sqlite libc6-compat snappy \ && apk --no-cache add openjdk17-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \ - && curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-runner/${JETTY_VERSION}/jetty-runner-${JETTY_VERSION}.jar --output jetty-runner.jar \ - && curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-jmx/${JETTY_VERSION}/jetty-jmx-${JETTY_VERSION}.jar --output jetty-jmx.jar \ - && curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-util/${JETTY_VERSION}/jetty-util-${JETTY_VERSION}.jar --output jetty-util.jar \ && wget --no-verbose ${GITHUB_REPO_URL}/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \ && wget --no-verbose ${MAVEN_CENTRAL_REPO_URL}/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_VERSION}/jmx_prometheus_javaagent-${JMX_VERSION}.jar -O jmx_prometheus_javaagent.jar \ && cp /usr/lib/jvm/java-17-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks diff --git a/docker/elasticsearch-setup/Dockerfile b/docker/elasticsearch-setup/Dockerfile index 1a6fe5bee6c840..584007a5fb0a9c 100644 --- a/docker/elasticsearch-setup/Dockerfile +++ b/docker/elasticsearch-setup/Dockerfile @@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION -FROM alpine:3.20 AS base +FROM alpine:3.21 AS base ARG ALPINE_REPO_URL diff --git a/docker/mysql-setup/Dockerfile b/docker/mysql-setup/Dockerfile index 8a2d42bc233180..21b696a1b906fe 100644 --- a/docker/mysql-setup/Dockerfile +++ b/docker/mysql-setup/Dockerfile @@ -17,7 +17,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION -FROM alpine:3.20 +FROM alpine:3.21 COPY --from=binary /go/bin/dockerize /usr/local/bin ARG ALPINE_REPO_URL diff --git a/docker/mysql-setup/init.sh b/docker/mysql-setup/init.sh index b5ee294ddd6559..2760da86a9a33f 100755 --- a/docker/mysql-setup/init.sh +++ b/docker/mysql-setup/init.sh @@ -1,6 +1,7 @@ #!/bin/bash : ${MYSQL_PORT:=3306} +: ${MYSQL_ARGS:=--ssl=0} sed -e "s/DATAHUB_DB_NAME/${DATAHUB_DB_NAME}/g" /init.sql | tee -a /tmp/init-final.sql -mysql -u $MYSQL_USERNAME -p"$MYSQL_PASSWORD" -h $MYSQL_HOST -P $MYSQL_PORT < /tmp/init-final.sql \ No newline at end of file +mariadb -u $MYSQL_USERNAME -p"$MYSQL_PASSWORD" -h $MYSQL_HOST -P $MYSQL_PORT $MYSQL_ARGS < /tmp/init-final.sql \ No newline at end of file diff --git a/docker/postgres-setup/Dockerfile b/docker/postgres-setup/Dockerfile index 31e9687cea15e8..5362e0d787c15d 100644 --- a/docker/postgres-setup/Dockerfile +++ b/docker/postgres-setup/Dockerfile @@ -17,7 +17,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION -FROM alpine:3.20 +FROM alpine:3.21 COPY --from=binary /go/bin/dockerize /usr/local/bin ARG ALPINE_REPO_URL diff --git a/docs-website/README.md b/docs-website/README.md index 
3b24cb869a444d..b40e4636422781 100644 --- a/docs-website/README.md +++ b/docs-website/README.md @@ -130,7 +130,6 @@ The purpose of this section is to provide developers & technical users with conc This section aims to provide plain-language feature overviews for both technical and non-technical readers alike. - ## Docs Generation Features **Includes all markdown files** @@ -145,16 +144,33 @@ You can suppress this check by adding the path to the file in a comment in `side Use an "inline" directive to include code snippets from other files. The `show_path_as_comment` option will include the path to the file as a comment at the top of the snippet. - ```python - {{ inline /metadata-ingestion/examples/library/data_quality_mcpw_rest.py show_path_as_comment }} - ``` + ```python + {{ inline /metadata-ingestion/examples/library/data_quality_mcpw_rest.py show_path_as_comment }} + ``` + +**Command Output** + +Use the `{{ command-output cmd }}` directive to run subprocesses and inject the outputs into the final markdown. + + {{ command-output python -c 'print("Hello world")' }} +This also works for multi-line scripts. + + {{ command-output + source metadata-ingestion/venv/bin/activate + python -m + }} + +Regardless of the location of the markdown file, the subcommands will be executed with the working directory set to the repo root. + +Only the stdout of the subprocess will be outputted. The stderr, if any, will be included as a comment in the markdown. ## Docs site generation process This process is orchestrated by a combination of Gradle and Yarn tasks. The main entrypoint is via the `docs-website:yarnGenerate` task, which in turn eventually runs `yarn run generate`. Steps: + 1. Generate the GraphQL combined schema using the gradle's `docs-website:generateGraphQLSchema` task. This generates `./graphql/combined.graphql`. 2. Generate docs for ingestion sources using the `:metadata-ingestion:docGen` gradle task. 3. Generate docs for our metadata model using the `:metadata-ingestion:modelDocGen` gradle task. diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index ad82a85f9e5672..3a14baee073c2a 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -439,6 +439,42 @@ function markdown_process_inline_directives( contents.content = new_content; } +function markdown_process_command_output( + contents: matter.GrayMatterFile<string>, + filepath: string +): void { + const new_content = contents.content.replace( + /^{{\s*command-output\s*([\s\S]*?)\s*}}$/gm, + (_, command: string) => { + try { + // Change to repo root directory before executing command + const repoRoot = path.resolve(__dirname, ".."); + + console.log(`Executing command: ${command}`); + + // Execute the command and capture output + const output = execSync(command, { + cwd: repoRoot, + encoding: "utf8", + stdio: ["pipe", "pipe", "pipe"], + }); + + // Return the command output + return output.trim(); + } catch (error: any) { + // If there's an error, include it as a comment + const errorMessage = error.stderr + ? error.stderr.toString() + : error.message; + return `${ + error.stdout ? error.stdout.toString().trim() : "" + }\n<!-- ${errorMessage} -->`; + } + } + ); + contents.content = new_content; +} + function markdown_sanitize_and_linkify(content: string): string { // MDX escaping content = content.replace(/ -Install the relevant CLI version. Forms are available as of CLI version `0.13.1`. The corresponding DataHub Cloud release version is `v0.2.16.5` +Install the relevant CLI version. 
+Structured Properties were introduced in version `0.13.1`, but we continuously improve and add new functionality, so you should always [upgrade](https://datahubproject.io/docs/cli/#installation) to the latest cli for best results. Connect to your instance via [init](https://datahubproject.io/docs/cli/#init): - Run `datahub init` to update the instance you want to load into. @@ -56,33 +58,8 @@ Requirements for OpenAPI are: The following code will create a structured property `io.acryl.privacy.retentionTime`. - -```graphql -mutation createStructuredProperty { - createStructuredProperty( - input: { - id: "retentionTime", - qualifiedName:"retentionTime", - displayName: "Retention Time", - description: "Retention Time is used to figure out how long to retain records in a dataset", - valueType: "urn:li:dataType:datahub.number", - allowedValues: [ - {numberValue: 30, description: "30 days, usually reserved for datasets that are ephemeral and contain pii"}, - {numberValue: 90, description:"description: Use this for datasets that drive monthly reporting but contain pii"}, - {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"} - ], - cardinality: SINGLE, - entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"], - } - ) { - urn - } -} -``` - - - + Create a yaml file representing the properties you’d like to load. For example, below file represents a property `io.acryl.privacy.retentionTime`. You can see the full example [here](https://github.com/datahub-project/datahub/blob/example-yaml-sp/metadata-ingestion/examples/structured_properties/struct_props.yaml). @@ -108,13 +85,41 @@ For example, below file represents a property `io.acryl.privacy.retentionTime`. ``` Use the CLI to create your properties: -```commandline +```shell datahub properties upsert -f {properties_yaml} ``` If successful, you should see `Created structured property urn:li:structuredProperty:...` + + + +```graphql +mutation createStructuredProperty { + createStructuredProperty( + input: { + id: "retentionTime", + qualifiedName:"retentionTime", + displayName: "Retention Time", + description: "Retention Time is used to figure out how long to retain records in a dataset", + valueType: "urn:li:dataType:datahub.number", + allowedValues: [ + {numberValue: 30, description: "30 days, usually reserved for datasets that are ephemeral and contain pii"}, + {numberValue: 90, description:"description: Use this for datasets that drive monthly reporting but contain pii"}, + {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"} + ], + cardinality: SINGLE, + entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"], + } + ) { + urn + } +} +``` + + + ```shell @@ -236,9 +241,182 @@ Example Response: -## Read Structured Properties +## List Structured Properties + +You can list all structured properties in your DataHub instance using the following methods: + + + + +```shell +datahub properties list +``` + +This will show all properties with their full details. 
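If you prefer the Python SDK, the same listing can be done programmatically. Below is a minimal sketch mirroring the `list_structured_properties.py` example added under `metadata-ingestion/examples/structured_properties/` in this change:

```python
# Minimal sketch: list all structured properties via the DataHub Python SDK.
from datahub.api.entities.structuredproperties.structuredproperties import (
    StructuredProperties,
)
from datahub.ingestion.graph.client import get_default_graph

# get_default_graph() picks up the connection configured via `datahub init`.
with get_default_graph() as graph:
    for structured_property in StructuredProperties.list(graph):
        print(structured_property.dict())
```

Returning to the CLI, `datahub properties list` prints one JSON document per property.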
+ +Example Response: +```json +{ + "urn": "urn:li:structuredProperty:clusterName", + "qualified_name": "clusterName", + "type": "urn:li:dataType:datahub.string", + "description": "Test Cluster Name Property", + "display_name": "Cluster's name", + "entity_types": [ + "urn:li:entityType:datahub.dataset" + ], + "cardinality": "SINGLE" +} +{ + "urn": "urn:li:structuredProperty:projectNames", + "qualified_name": "projectNames", + "type": "urn:li:dataType:datahub.string", + "description": "Test property for project name", + "display_name": "Project Name", + "entity_types": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.dataFlow" + ], + "cardinality": "MULTIPLE", + "allowed_values": [ + { + "value": "Tracking", + "description": "test value 1 for project" + }, + { + "value": "DataHub", + "description": "test value 2 for project" + } + ] +} +``` + + +If you only want to see the URNs, you can use: + +```shell +datahub properties list --no-details +``` + +Example Response: +``` +[2025-01-08 22:23:00,625] INFO {datahub.cli.specific.structuredproperties_cli:134} - Listing structured property urns only, use --details for more information +urn:li:structuredProperty:clusterName +urn:li:structuredProperty:clusterType +urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate +urn:li:structuredProperty:projectNames +``` + +To download all the structured property definitions into a single file that you can use with the `upsert` command as described in the [create section](#create-structured-properties), you can run the list command with the `--to-file` option. + +```shell +datahub properties list --to-file structured_properties.yaml +``` + +Example Response: +```yaml + - urn: urn:li:structuredProperty:clusterName + qualified_name: clusterName + type: urn:li:dataType:datahub.string + description: Test Cluster Name Property + display_name: Cluster's name + entity_types: + - urn:li:entityType:datahub.dataset + cardinality: SINGLE + - urn: urn:li:structuredProperty:clusterType + qualified_name: clusterType + type: urn:li:dataType:datahub.string + description: Test Cluster Type Property + display_name: Cluster's type + entity_types: + - urn:li:entityType:datahub.dataset + cardinality: SINGLE + - urn: urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate + qualified_name: io.acryl.dataManagement.deprecationDate + type: urn:li:dataType:datahub.date + display_name: Deprecation Date + entity_types: + - urn:li:entityType:datahub.dataset + - urn:li:entityType:datahub.dataFlow + - urn:li:entityType:datahub.dataJob + - urn:li:entityType:datahub.schemaField + cardinality: SINGLE + - urn: urn:li:structuredProperty:io.acryl.privacy.enumProperty5712 + qualified_name: io.acryl.privacy.enumProperty5712 + type: urn:li:dataType:datahub.string + description: The retention policy for the dataset + entity_types: + - urn:li:entityType:datahub.dataset + cardinality: MULTIPLE + allowed_values: + - value: foo + - value: bar +... etc. 
+``` + + + + + +Example Request: +```bash +curl -X 'GET' \ + 'http://localhost:9002/openapi/v3/entity/structuredproperty?systemMetadata=false&includeSoftDelete=false&skipCache=false&aspects=structuredPropertySettings&aspects=propertyDefinition&aspects=institutionalMemory&aspects=structuredPropertyKey&aspects=status&count=10&sortCriteria=urn&sortOrder=ASCENDING&query=*' \ + -H 'accept: application/json' +``` + +Example Response: +```json +{ + "scrollId": "...", + "entities": [ + { + "urn": "urn:li:structuredProperty:clusterName", + "propertyDefinition": { + "value": { + "immutable": false, + "qualifiedName": "clusterName", + "displayName": "Cluster's name", + "valueType": "urn:li:dataType:datahub.string", + "description": "Test Cluster Name Property", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "cardinality": "SINGLE" + } + }, + "structuredPropertyKey": { + "value": { + "id": "clusterName" + } + } + } + ] +} +``` + +Key Query Parameters: +- `count`: Number of results to return per page (default: 10) +- `sortCriteria`: Field to sort by (default: urn) +- `sortOrder`: Sort order (ASCENDING or DESCENDING) +- `query`: Search query to filter properties (* for all) + + + + +The list endpoint returns all structured properties in your DataHub instance. Each property includes: +- URN: Unique identifier for the property +- Qualified Name: The property's qualified name +- Type: The data type of the property (string, number, date, etc.) +- Description: A description of the property's purpose +- Display Name: Human-readable name for the property +- Entity Types: The types of entities this property can be applied to +- Cardinality: Whether the property accepts single (SINGLE) or multiple (MULTIPLE) values +- Allowed Values: If specified, the list of allowed values for this property -You can see the properties you created by running the following command: +## Read a single Structured Property + +You can read an individual property you created by running the following command: @@ -279,6 +457,91 @@ If successful, you should see metadata about your properties returned. } ``` + + + +Example Request: +```graphql +query { + structuredProperty(urn: "urn:li:structuredProperty:projectNames") { + urn + type + definition { + qualifiedName + displayName + description + cardinality + allowedValues { + value { + ... on StringValue { + stringValue + } + ... on NumberValue { + numberValue + } + } + description + } + entityTypes { + urn + info { + type + qualifiedName + } + } + } + } +} +``` + +Example Response: +```json +{ + "data": { + "structuredProperty": { + "urn": "urn:li:structuredProperty:projectNames", + "type": "STRUCTURED_PROPERTY", + "definition": { + "qualifiedName": "projectNames", + "displayName": "Project Name", + "description": "Test property for project name", + "cardinality": "MULTIPLE", + "allowedValues": [ + { + "value": { + "stringValue": "Tracking" + }, + "description": "test value 1 for project" + }, + { + "value": { + "stringValue": "DataHub" + }, + "description": "test value 2 for project" + } + ], + "entityTypes": [ + { + "urn": "urn:li:entityType:datahub.dataset", + "info": { + "type": "DATASET", + "qualifiedName": "datahub.dataset" + } + }, + { + "urn": "urn:li:entityType:datahub.dataFlow", + "info": { + "type": "DATA_FLOW", + "qualifiedName": "datahub.dataFlow" + } + } + ] + } + } + }, + "extensions": {} +} +``` @@ -389,7 +652,7 @@ Example Response: This action will set/replace all structured properties on the entity. See PATCH operations to add/remove a single property. 
- + ```graphql mutation upsertStructuredProperties { @@ -537,7 +800,7 @@ datahub dataset get --urn {urn} For reading all structured properties from a dataset: - + ```graphql query getDataset { diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 07577079d66d12..68b41c907c6ad6 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -44,6 +44,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - OpenAPI Update: PIT Keep Alive parameter added to scroll. NOTE: This parameter requires the `pointInTimeCreationEnabled` feature flag to be enabled and the `elasticSearch.implementation` configuration to be `elasticsearch`. This feature is not supported for OpenSearch at this time and the parameter will not be respected without both of these set. - OpenAPI Update 2: Previously there was an incorrectly marked parameter named `sort` on the generic list entities endpoint for v3. This parameter is deprecated and only supports a single string value while the documentation indicates it supports a list of strings. This documentation error has been fixed and the correct field, `sortCriteria`, is now documented which supports a list of strings. - #12223: For dbt Cloud ingestion, the "View in dbt" link will point at the "Explore" page in the dbt Cloud UI. You can revert to the old behavior of linking to the dbt Cloud IDE by setting `external_url_mode: ide`. +- #12236: Data flow and data job entities may additionally produce a container aspect, which requires a corresponding server upgrade; otherwise the server may reject the aspect. ### Breaking Changes diff --git a/docs/managed-datahub/subscription-and-notification.md b/docs/managed-datahub/subscription-and-notification.md index c3c31d5fed7e61..c27754a6371265 100644 --- a/docs/managed-datahub/subscription-and-notification.md +++ b/docs/managed-datahub/subscription-and-notification.md @@ -17,9 +17,30 @@ Email will work out of box. For installing the DataHub Slack App, see: This feature is especially useful in helping you stay on top of any upstream changes that could impact the assets you or your stakeholders rely on. It eliminates the need for you and your team to manually check for upstream changes, or for upstream stakeholders to identify and notify impacted users. As a user, you can subscribe to and receive notifications about changes such as deprecations, schema changes, changes in ownership, assertions, or incidents. You’ll always be in the know about potential data quality issues so you can proactively manage your data resources. + +## Platform Admin Notifications + +DataHub provides three levels of notifications: + +- **Platform-level** +- **Group-level** (described in other sections) +- **User-level** (described in other sections) + +**Setting Platform-Level Notifications:** +This requires appropriate permissions. Go to `Settings` > `Notifications` (under the `Platform` section, not `My Notifications`). + +**Platform-level Notifications:** +Platform-level notifications are applied to all assets within DataHub. +Example: If "An owner is added or removed from a data asset" is ticked, the designated Slack channel or email will receive notifications for any such changes across all assets. + +**Our Recommendations:** + +Notifying on tag changes for every asset in the platform would be noisy, so we recommend using these platform-level notifications only where appropriate.
For example, we recommend routing ingestion failure notifications to a central Slack channel or email. This helps you proactively ensure your DataHub metadata stays fresh. + ## Prerequisites Once you have [configured Slack within your DataHub instance](slack/saas-slack-setup.md), you will be able to subscribe to any Entity in DataHub and begin receiving notifications via DM. + To begin receiving personal notifications, go to Settings > "My Notifications". From here, toggle on Slack Notifications and input your Slack Member ID. If you want to create and manage group-level Subscriptions for your team, you will need [the following privileges](../../docs/authorization/roles.md#role-privileges): @@ -162,6 +183,21 @@ You can unsubscribe from any asset to stop receiving notifications about it. On What if I want to be notified about different changes? To modify your subscription, use the dropdown menu next to the Subscribe button to modify the changes you want to be notified about. +
+ +I want to configure multiple channels. How many Slack channels or emails can I configure to get notified? + +At the platform level, you can configure one email and one Slack channel. + +At the user and group levels, you can configure one default email and Slack channel, and you can override that default when subscribing to a specific asset. + +To configure multiple channels, first ensure you have the appropriate privileges, then: +1. Create a DataHub group for each channel you want notifications for. +2. Add yourself as a member to each of the groups. +3. Now, when you visit an asset and go to subscribe, you'll see the option "Manage Group Subscriptions". +
## Reference diff --git a/gradle/coverage/java-coverage.gradle b/gradle/coverage/java-coverage.gradle index 17260c1a309788..fe8bc65336a983 100644 --- a/gradle/coverage/java-coverage.gradle +++ b/gradle/coverage/java-coverage.gradle @@ -22,7 +22,7 @@ afterEvaluate { Tools that aggregate and analyse coverage tools search for the coverage result files. Keeping them under one folder will minimize the time spent searching through the full source tree. */ - outputLocation = rootProject.layout.buildDirectory.file("coverage-reports/jacoco-${project.name}.xml") + outputLocation = rootProject.layout.buildDirectory.file("coverage-reports/${rootProject.relativePath(project.projectDir)}/jacoco-${project.name}.xml") } csv.required = false html.required = false diff --git a/gradle/coverage/python-coverage.gradle b/gradle/coverage/python-coverage.gradle index 23d6e37387ed83..05eb79cf5659e2 100644 --- a/gradle/coverage/python-coverage.gradle +++ b/gradle/coverage/python-coverage.gradle @@ -7,7 +7,7 @@ ext.get_coverage_args = { test_name = "" -> Tools that aggregate and analyse coverage tools search for the coverage result files. Keeping them under one folder will minimize the time spent searching through the full source tree. */ - def base_path = "${rootProject.buildDir}/coverage-reports" + def base_path = "${rootProject.buildDir}/coverage-reports/${rootProject.relativePath(project.projectDir)}/" /* --cov=src was added via setup.cfg in many of the python projects but for some reason, was not getting picked up diff --git a/metadata-ingestion/docs/sources/metadata-file/metadata-file_recipe.yml b/metadata-ingestion/docs/sources/metadata-file/file_recipe.yml similarity index 100% rename from metadata-ingestion/docs/sources/metadata-file/metadata-file_recipe.yml rename to metadata-ingestion/docs/sources/metadata-file/file_recipe.yml diff --git a/metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_pre.md b/metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_pre.md new file mode 100644 index 00000000000000..ae9812b2a48ad3 --- /dev/null +++ b/metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_pre.md @@ -0,0 +1,16 @@ +### Configuration Notes + +See the + +1. [Microsoft Grant user access to a Report Server doc](https://docs.microsoft.com/en-us/sql/reporting-services/security/grant-user-access-to-a-report-server?view=sql-server-ver16) +2. Use your user credentials from previous step in yaml file + +### Concept mapping + +| Power BI Report Server | Datahub | +| ---------------------- | ----------- | +| `Paginated Report` | `Dashboard` | +| `Power BI Report` | `Dashboard` | +| `Mobile Report` | `Dashboard` | +| `Linked Report` | `Dashboard` | +| `Dataset, Datasource` | `N/A` | diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi-report-server_recipe.yml b/metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_recipe.yml similarity index 100% rename from metadata-ingestion/docs/sources/powerbi/powerbi-report-server_recipe.yml rename to metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_recipe.yml diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi-report-server_pre.md b/metadata-ingestion/docs/sources/powerbi/powerbi-report-server_pre.md deleted file mode 100644 index ca600f10786758..00000000000000 --- a/metadata-ingestion/docs/sources/powerbi/powerbi-report-server_pre.md +++ /dev/null @@ -1,13 +0,0 @@ -## Configuration Notes -See the -1. 
[Microsoft Grant user access to a Report Server doc](https://docs.microsoft.com/en-us/sql/reporting-services/security/grant-user-access-to-a-report-server?view=sql-server-ver16) -2. Use your user credentials from previous step in yaml file -## Concept mapping - -| Power BI Report Server | Datahub | -| ------------------------- | ------------------- | -| `Paginated Report` | `Dashboard` | -| `Power BI Report` | `Dashboard` | -| `Mobile Report` | `Dashboard` | -| `Linked Report` | `Dashboard` | -| `Dataset, Datasource` | `N/A` | diff --git a/metadata-ingestion/examples/structured_properties/list_structured_properties.py b/metadata-ingestion/examples/structured_properties/list_structured_properties.py new file mode 100644 index 00000000000000..66ac90c1228a37 --- /dev/null +++ b/metadata-ingestion/examples/structured_properties/list_structured_properties.py @@ -0,0 +1,12 @@ +# Usage: python3 list_structured_properties.py +# Expected Output: List of structured properties +# This script lists all structured properties in DataHub +from datahub.api.entities.structuredproperties.structuredproperties import ( + StructuredProperties, +) +from datahub.ingestion.graph.client import get_default_graph + +with get_default_graph() as graph: + structuredproperties = StructuredProperties.list(graph) + for structuredproperty in structuredproperties: + print(structuredproperty.dict()) diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index 2841985ad07808..0fe79a2c6a8e47 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -346,7 +346,7 @@ def write_urn_classes(key_aspects: List[dict], urn_dir: Path) -> None: code = """ # This file contains classes corresponding to entity URNs. -from typing import ClassVar, List, Optional, Type, TYPE_CHECKING +from typing import ClassVar, List, Optional, Type, TYPE_CHECKING, Union import functools from deprecated.sphinx import deprecated as _sphinx_deprecated @@ -547,10 +547,31 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str: assert fields[0]["type"] == ["null", "string"] fields[0]["type"] = "string" + field_urn_type_classes = {} + for field in fields: + # Figure out if urn types are valid for each field. + field_urn_type_class = None + if field_name(field) == "platform": + field_urn_type_class = "DataPlatformUrn" + elif field.get("Urn"): + if len(field.get("entityTypes", [])) == 1: + field_entity_type = field["entityTypes"][0] + field_urn_type_class = f"{capitalize_entity_name(field_entity_type)}Urn" + else: + field_urn_type_class = "Urn" + + field_urn_type_classes[field_name(field)] = field_urn_type_class + _init_arg_parts: List[str] = [] for field in fields: + field_urn_type_class = field_urn_type_classes[field_name(field)] + default = '"PROD"' if field_name(field) == "env" else None - _arg_part = f"{field_name(field)}: {field_type(field)}" + + type_hint = field_type(field) + if field_urn_type_class: + type_hint = f'Union["{field_urn_type_class}", str]' + _arg_part = f"{field_name(field)}: {type_hint}" if default: _arg_part += f" = {default}" _init_arg_parts.append(_arg_part) @@ -579,16 +600,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str: init_validation += f'if not {field_name(field)}:\n raise InvalidUrnError("{class_name} {field_name(field)} cannot be empty")\n' # Generalized mechanism for validating embedded urns. 
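# Illustration (hypothetical usage, not generated code): with the Union type hints above, a
# generated urn class can now accept either a plain string or an Urn instance for urn-typed
# fields, e.g. DatasetUrn(platform=DataPlatformUrn("snowflake"), name="db.schema.table")
# as well as DatasetUrn(platform="snowflake", name="db.schema.table"); both forms are
# normalized to the same platform urn string by the coercion logic updated below.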
- field_urn_type_class = None - if field_name(field) == "platform": - field_urn_type_class = "DataPlatformUrn" - elif field.get("Urn"): - if len(field.get("entityTypes", [])) == 1: - field_entity_type = field["entityTypes"][0] - field_urn_type_class = f"{capitalize_entity_name(field_entity_type)}Urn" - else: - field_urn_type_class = "Urn" - + field_urn_type_class = field_urn_type_classes[field_name(field)] if field_urn_type_class: init_validation += f"{field_name(field)} = str({field_name(field)})\n" init_validation += ( @@ -608,7 +620,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str: init_coercion += " platform_name = DataPlatformUrn.from_string(platform_name).platform_name\n" if field_name(field) == "platform": - init_coercion += "platform = DataPlatformUrn(platform).urn()\n" + init_coercion += "platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()\n" elif field_urn_type_class is None: # For all non-urns, run the value through the UrnEncoder. init_coercion += ( diff --git a/metadata-ingestion/scripts/docgen.py b/metadata-ingestion/scripts/docgen.py index 402cd8a8141990..71eef96f5b9262 100644 --- a/metadata-ingestion/scripts/docgen.py +++ b/metadata-ingestion/scripts/docgen.py @@ -1,381 +1,25 @@ +import dataclasses import glob -import html import json import logging import os +import pathlib import re import sys import textwrap from importlib.metadata import metadata, requires -from typing import Any, Dict, Iterable, List, Optional +from typing import Dict, List, Optional import click -from pydantic import BaseModel, Field +from docgen_types import Platform, Plugin +from docs_config_table import gen_md_table_from_json_schema from datahub.configuration.common import ConfigModel -from datahub.ingestion.api.decorators import ( - CapabilitySetting, - SourceCapability, - SupportStatus, -) +from datahub.ingestion.api.decorators import SourceCapability, SupportStatus from datahub.ingestion.source.source_registry import source_registry -from datahub.metadata.schema_classes import SchemaFieldClass logger = logging.getLogger(__name__) -DEFAULT_VALUE_MAX_LENGTH = 50 -DEFAULT_VALUE_TRUNCATION_MESSAGE = "..." - - -def _truncate_default_value(value: str) -> str: - if len(value) > DEFAULT_VALUE_MAX_LENGTH: - return value[:DEFAULT_VALUE_MAX_LENGTH] + DEFAULT_VALUE_TRUNCATION_MESSAGE - return value - - -def _format_path_component(path: str) -> str: - """ - Given a path like 'a.b.c', adds css tags to the components. - """ - path_components = path.rsplit(".", maxsplit=1) - if len(path_components) == 1: - return f'{path_components[0]}' - - return ( - f'{path_components[0]}.' - f'{path_components[1]}' - ) - - -def _format_type_name(type_name: str) -> str: - return f'{type_name}' - - -def _format_default_line(default_value: str, has_desc_above: bool) -> str: - default_value = _truncate_default_value(default_value) - escaped_value = ( - html.escape(default_value) - # Replace curly braces to avoid JSX issues. - .replace("{", "{") - .replace("}", "}") - # We also need to replace markdown special characters. - .replace("*", "*") - .replace("_", "_") - .replace("[", "[") - .replace("]", "]") - .replace("|", "|") - .replace("`", "`") - ) - value_elem = f'{escaped_value}' - return f'
Default: {value_elem}
' - - -class FieldRow(BaseModel): - path: str - parent: Optional[str] - type_name: str - required: bool - has_default: bool - default: str - description: str - inner_fields: List["FieldRow"] = Field(default_factory=list) - discriminated_type: Optional[str] = None - - class Component(BaseModel): - type: str - field_name: Optional[str] - - # matches any [...] style section inside a field path - _V2_FIELD_PATH_TOKEN_MATCHER = r"\[[\w.]*[=]*[\w\(\-\ \_\).]*\][\.]*" - # matches a .?[...] style section inside a field path anchored to the beginning - _V2_FIELD_PATH_TOKEN_MATCHER_PREFIX = rf"^[\.]*{_V2_FIELD_PATH_TOKEN_MATCHER}" - _V2_FIELD_PATH_FIELD_NAME_MATCHER = r"^\w+" - - @staticmethod - def map_field_path_to_components(field_path: str) -> List[Component]: - m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path) - v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path) - components: List[FieldRow.Component] = [] - while m or v: - token = m.group() if m else v.group() # type: ignore - if v: - if components: - if components[-1].field_name is None: - components[-1].field_name = token - else: - components.append( - FieldRow.Component(type="non_map_type", field_name=token) - ) - else: - components.append( - FieldRow.Component(type="non_map_type", field_name=token) - ) - - if m: - if token.startswith("[version="): - pass - elif "[type=" in token: - type_match = re.match(r"[\.]*\[type=(.*)\]", token) - if type_match: - type_string = type_match.group(1) - if components and components[-1].type == "map": - if components[-1].field_name is None: - pass - else: - new_component = FieldRow.Component( - type="map_key", field_name="`key`" - ) - components.append(new_component) - new_component = FieldRow.Component( - type=type_string, field_name=None - ) - components.append(new_component) - if type_string == "map": - new_component = FieldRow.Component( - type=type_string, field_name=None - ) - components.append(new_component) - - field_path = field_path[m.span()[1] :] if m else field_path[v.span()[1] :] # type: ignore - m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path) - v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path) - - return components - - @staticmethod - def field_path_to_components(field_path: str) -> List[str]: - """ - Inverts the field_path v2 format to get the canonical field path - [version=2.0].[type=x].foo.[type=string(format=uri)].bar => ["foo","bar"] - """ - if "type=map" not in field_path: - return re.sub(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER, "", field_path).split( - "." 
- ) - else: - # fields with maps in them need special handling to insert the `key` fragment - return [ - c.field_name - for c in FieldRow.map_field_path_to_components(field_path) - if c.field_name - ] - - @classmethod - def from_schema_field(cls, schema_field: SchemaFieldClass) -> "FieldRow": - path_components = FieldRow.field_path_to_components(schema_field.fieldPath) - - parent = path_components[-2] if len(path_components) >= 2 else None - if parent == "`key`": - # the real parent node is one index above - parent = path_components[-3] - json_props = ( - json.loads(schema_field.jsonProps) if schema_field.jsonProps else {} - ) - - required = json_props.get("required", True) - has_default = "default" in json_props - default_value = str(json_props.get("default")) - - field_path = ".".join(path_components) - - return FieldRow( - path=field_path, - parent=parent, - type_name=str(schema_field.nativeDataType), - required=required, - has_default=has_default, - default=default_value, - description=schema_field.description, - inner_fields=[], - discriminated_type=schema_field.nativeDataType, - ) - - def get_checkbox(self) -> str: - if self.required and not self.has_default: - # Using a non-breaking space to prevent the checkbox from being - # broken into a new line. - if not self.parent: # None and empty string both count - return ' ' - else: - return f' ' - else: - return "" - - def to_md_line(self) -> str: - if self.inner_fields: - if len(self.inner_fields) == 1: - type_name = self.inner_fields[0].type_name or self.type_name - else: - # To deal with unions that have essentially the same simple field path, - # we combine the type names into a single string. - type_name = "One of " + ", ".join( - [x.type_name for x in self.inner_fields if x.discriminated_type] - ) - else: - type_name = self.type_name - - description = self.description.strip() - description = self.description.replace( - "\n", "
" - ) # descriptions with newlines in them break markdown rendering - - md_line = ( - f'|
{_format_path_component(self.path)}' - f"{self.get_checkbox()}
" - f'
{_format_type_name(type_name)}
' - f"| {description} " - f"{_format_default_line(self.default, bool(description)) if self.has_default else ''} |\n" - ) - return md_line - - -class FieldHeader(FieldRow): - def to_md_line(self) -> str: - return "\n".join( - [ - "| Field | Description |", - "|:--- |:--- |", - "", - ] - ) - - def __init__(self): - pass - - -def get_prefixed_name(field_prefix: Optional[str], field_name: Optional[str]) -> str: - assert ( - field_prefix or field_name - ), "One of field_prefix or field_name should be present" - return ( - f"{field_prefix}.{field_name}" # type: ignore - if field_prefix and field_name - else field_name - if not field_prefix - else field_prefix - ) - - -def custom_comparator(path: str) -> str: - """ - Projects a string onto a separate space - Low_prio string will start with Z else start with A - Number of field paths will add the second set of letters: 00 - 99 - - """ - opt1 = path - prio_value = priority_value(opt1) - projection = f"{prio_value}" - projection = f"{projection}{opt1}" - return projection - - -class FieldTree: - """ - A helper class that re-constructs the tree hierarchy of schema fields - to help sort fields by importance while keeping nesting intact - """ - - def __init__(self, field: Optional[FieldRow] = None): - self.field = field - self.fields: Dict[str, "FieldTree"] = {} - - def add_field(self, row: FieldRow, path: Optional[str] = None) -> "FieldTree": - # logger.warn(f"Add field: path:{path}, row:{row}") - if self.field and self.field.path == row.path: - # we have an incoming field with the same path as us, this is probably a union variant - # attach to existing field - self.field.inner_fields.append(row) - else: - path = path if path is not None else row.path - top_level_field = path.split(".")[0] - if top_level_field in self.fields: - self.fields[top_level_field].add_field( - row, ".".join(path.split(".")[1:]) - ) - else: - self.fields[top_level_field] = FieldTree(field=row) - # logger.warn(f"{self}") - return self - - def sort(self): - # Required fields before optionals - required_fields = { - k: v for k, v in self.fields.items() if v.field and v.field.required - } - optional_fields = { - k: v for k, v in self.fields.items() if v.field and not v.field.required - } - - self.sorted_fields = [] - for field_map in [required_fields, optional_fields]: - # Top-level fields before fields with nesting - self.sorted_fields.extend( - sorted( - [f for f, val in field_map.items() if val.fields == {}], - key=custom_comparator, - ) - ) - self.sorted_fields.extend( - sorted( - [f for f, val in field_map.items() if val.fields != {}], - key=custom_comparator, - ) - ) - - for field_tree in self.fields.values(): - field_tree.sort() - - def get_fields(self) -> Iterable[FieldRow]: - if self.field: - yield self.field - for key in self.sorted_fields: - yield from self.fields[key].get_fields() - - def __repr__(self) -> str: - result = {} - if self.field: - result["_self"] = json.loads(json.dumps(self.field.dict())) - for f in self.fields: - result[f] = json.loads(str(self.fields[f])) - return json.dumps(result, indent=2) - - -def priority_value(path: str) -> str: - # A map of low value tokens to their relative importance - low_value_token_map = {"env": "X", "profiling": "Y", "stateful_ingestion": "Z"} - tokens = path.split(".") - for low_value_token in low_value_token_map: - if low_value_token in tokens: - return low_value_token_map[low_value_token] - - # everything else high-prio - return "A" - - -def gen_md_table_from_struct(schema_dict: Dict[str, Any]) -> List[str]: - from 
datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator - - # we don't want default field values to be injected into the description of the field - JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False - schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict)) - result: List[str] = [FieldHeader().to_md_line()] - - field_tree = FieldTree(field=None) - for field in schema_fields: - row: FieldRow = FieldRow.from_schema_field(field) - field_tree.add_field(row) - - field_tree.sort() - - for row in field_tree.get_fields(): - result.append(row.to_md_line()) - - # Wrap with a .config-table div. - result = ["\n
\n\n", *result, "\n
\n"] - - return result - def get_snippet(long_string: str, max_length: int = 100) -> str: snippet = "" @@ -424,19 +68,6 @@ def get_capability_text(src_capability: SourceCapability) -> str: ) -def create_or_update( - something: Dict[Any, Any], path: List[str], value: Any -) -> Dict[Any, Any]: - dict_under_operation = something - for p in path[:-1]: - if p not in dict_under_operation: - dict_under_operation[p] = {} - dict_under_operation = dict_under_operation[p] - - dict_under_operation[path[-1]] = value - return something - - def does_extra_exist(extra_name: str) -> bool: for key, value in metadata("acryl-datahub").items(): if key == "Provides-Extra" and value == extra_name: @@ -498,6 +129,102 @@ def new_url(original_url: str, file_path: str) -> str: return new_content +def load_plugin(plugin_name: str, out_dir: str) -> Plugin: + logger.debug(f"Loading {plugin_name}") + class_or_exception = source_registry._ensure_not_lazy(plugin_name) + if isinstance(class_or_exception, Exception): + raise class_or_exception + source_type = source_registry.get(plugin_name) + logger.debug(f"Source class is {source_type}") + + if hasattr(source_type, "get_platform_name"): + platform_name = source_type.get_platform_name() + else: + platform_name = ( + plugin_name.title() + ) # we like platform names to be human readable + + platform_id = None + if hasattr(source_type, "get_platform_id"): + platform_id = source_type.get_platform_id() + if platform_id is None: + raise ValueError(f"Platform ID not found for {plugin_name}") + + plugin = Plugin( + name=plugin_name, + platform_id=platform_id, + platform_name=platform_name, + classname=".".join([source_type.__module__, source_type.__name__]), + ) + + if hasattr(source_type, "get_platform_doc_order"): + platform_doc_order = source_type.get_platform_doc_order() + plugin.doc_order = platform_doc_order + + plugin_file_name = "src/" + "/".join(source_type.__module__.split(".")) + if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name): + plugin_file_name = plugin_file_name + "/__init__.py" + else: + plugin_file_name = plugin_file_name + ".py" + if os.path.exists(plugin_file_name): + plugin.filename = plugin_file_name + else: + logger.info( + f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}, but that doesn't exist" + ) + + if hasattr(source_type, "__doc__"): + plugin.source_docstring = textwrap.dedent(source_type.__doc__ or "") + + if hasattr(source_type, "get_support_status"): + plugin.support_status = source_type.get_support_status() + + if hasattr(source_type, "get_capabilities"): + capabilities = list(source_type.get_capabilities()) + capabilities.sort(key=lambda x: x.capability.value) + plugin.capabilities = capabilities + + try: + extra_plugin = plugin_name if does_extra_exist(plugin_name) else None + plugin.extra_deps = ( + get_additional_deps_for_extra(extra_plugin) if extra_plugin else [] + ) + except Exception as e: + logger.info( + f"Failed to load extras for {plugin_name} due to exception {e}", exc_info=e + ) + + if hasattr(source_type, "get_config_class"): + source_config_class: ConfigModel = source_type.get_config_class() + + plugin.config_json_schema = source_config_class.schema_json(indent=2) + plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema()) + + # Write the config json schema to the out_dir. 
+ config_dir = pathlib.Path(out_dir) / "config_schemas" + config_dir.mkdir(parents=True, exist_ok=True) + (config_dir / f"{plugin_name}_config.json").write_text( + plugin.config_json_schema + ) + + return plugin + + +@dataclasses.dataclass +class PluginMetrics: + discovered: int = 0 + loaded: int = 0 + generated: int = 0 + failed: int = 0 + + +@dataclasses.dataclass +class PlatformMetrics: + discovered: int = 0 + generated: int = 0 + warnings: List[str] = dataclasses.field(default_factory=list) + + @click.command() @click.option("--out-dir", type=str, required=True) @click.option("--extra-docs", type=str, required=False) @@ -505,239 +232,111 @@ def new_url(original_url: str, file_path: str) -> str: def generate( out_dir: str, extra_docs: Optional[str] = None, source: Optional[str] = None ) -> None: # noqa: C901 - source_documentation: Dict[str, Any] = {} - metrics = {} - metrics["source_platforms"] = {"discovered": 0, "generated": 0, "warnings": []} - metrics["plugins"] = {"discovered": 0, "generated": 0, "failed": 0} - - if extra_docs: - for path in glob.glob(f"{extra_docs}/**/*[.md|.yaml|.yml]", recursive=True): - m = re.search("/docs/sources/(.*)/(.*).md", path) - if m: - platform_name = m.group(1).lower() - file_name = m.group(2) - destination_md: str = ( - f"../docs/generated/ingestion/sources/{platform_name}.md" - ) - - with open(path, "r") as doc_file: - file_contents = doc_file.read() - final_markdown = rewrite_markdown( - file_contents, path, destination_md - ) - - if file_name == "README": - # README goes as platform level docs - # all other docs are assumed to be plugin level - create_or_update( - source_documentation, - [platform_name, "custom_docs"], - final_markdown, - ) - else: - if "_" in file_name: - plugin_doc_parts = file_name.split("_") - if len(plugin_doc_parts) != 2 or plugin_doc_parts[ - 1 - ] not in ["pre", "post"]: - raise Exception( - f"{file_name} needs to be of the form _pre.md or _post.md" - ) - - docs_key_name = f"custom_docs_{plugin_doc_parts[1]}" - create_or_update( - source_documentation, - [ - platform_name, - "plugins", - plugin_doc_parts[0], - docs_key_name, - ], - final_markdown, - ) - else: - create_or_update( - source_documentation, - [ - platform_name, - "plugins", - file_name, - "custom_docs_post", - ], - final_markdown, - ) - else: - yml_match = re.search("/docs/sources/(.*)/(.*)_recipe.yml", path) - if yml_match: - platform_name = yml_match.group(1).lower() - plugin_name = yml_match.group(2) - with open(path, "r") as doc_file: - file_contents = doc_file.read() - create_or_update( - source_documentation, - [platform_name, "plugins", plugin_name, "recipe"], - file_contents, - ) + plugin_metrics = PluginMetrics() + platform_metrics = PlatformMetrics() + platforms: Dict[str, Platform] = {} for plugin_name in sorted(source_registry.mapping.keys()): if source and source != plugin_name: continue if plugin_name in { "snowflake-summary", + "snowflake-queries", + "bigquery-queries", }: logger.info(f"Skipping {plugin_name} as it is on the deny list") continue - metrics["plugins"]["discovered"] = metrics["plugins"]["discovered"] + 1 # type: ignore - # We want to attempt to load all plugins before printing a summary. 
- source_type = None + plugin_metrics.discovered += 1 try: - # output = subprocess.check_output( - # ["/bin/bash", "-c", f"pip install -e '.[{key}]'"] - # ) - class_or_exception = source_registry._ensure_not_lazy(plugin_name) - if isinstance(class_or_exception, Exception): - raise class_or_exception - logger.debug(f"Processing {plugin_name}") - source_type = source_registry.get(plugin_name) - logger.debug(f"Source class is {source_type}") - extra_plugin = plugin_name if does_extra_exist(plugin_name) else None - extra_deps = ( - get_additional_deps_for_extra(extra_plugin) if extra_plugin else [] - ) + plugin = load_plugin(plugin_name, out_dir=out_dir) except Exception as e: - logger.warning( - f"Failed to process {plugin_name} due to exception {e}", exc_info=e + logger.error( + f"Failed to load {plugin_name} due to exception {e}", exc_info=e ) - metrics["plugins"]["failed"] = metrics["plugins"].get("failed", 0) + 1 # type: ignore - - if source_type and hasattr(source_type, "get_config_class"): - try: - source_config_class: ConfigModel = source_type.get_config_class() - support_status = SupportStatus.UNKNOWN - capabilities = [] - if hasattr(source_type, "__doc__"): - source_doc = textwrap.dedent(source_type.__doc__ or "") - if hasattr(source_type, "get_platform_name"): - platform_name = source_type.get_platform_name() - else: - platform_name = ( - plugin_name.title() - ) # we like platform names to be human readable - - if hasattr(source_type, "get_platform_id"): - platform_id = source_type.get_platform_id() - - if hasattr(source_type, "get_platform_doc_order"): - platform_doc_order = source_type.get_platform_doc_order() - create_or_update( - source_documentation, - [platform_id, "plugins", plugin_name, "doc_order"], - platform_doc_order, - ) - - source_documentation[platform_id] = ( - source_documentation.get(platform_id) or {} - ) - - create_or_update( - source_documentation, - [platform_id, "plugins", plugin_name, "classname"], - ".".join([source_type.__module__, source_type.__name__]), - ) - plugin_file_name = "src/" + "/".join(source_type.__module__.split(".")) - if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name): - plugin_file_name = plugin_file_name + "/__init__.py" - else: - plugin_file_name = plugin_file_name + ".py" - if os.path.exists(plugin_file_name): - create_or_update( - source_documentation, - [platform_id, "plugins", plugin_name, "filename"], - plugin_file_name, - ) - else: - logger.info( - f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}" - ) - - if hasattr(source_type, "get_support_status"): - support_status = source_type.get_support_status() - - if hasattr(source_type, "get_capabilities"): - capabilities = list(source_type.get_capabilities()) - capabilities.sort(key=lambda x: x.capability.value) - - create_or_update( - source_documentation, - [platform_id, "plugins", plugin_name, "capabilities"], - capabilities, - ) - - create_or_update( - source_documentation, [platform_id, "name"], platform_name - ) - - create_or_update( - source_documentation, - [platform_id, "plugins", plugin_name, "extra_deps"], - extra_deps, - ) + plugin_metrics.failed += 1 + continue + else: + plugin_metrics.loaded += 1 - config_dir = f"{out_dir}/config_schemas" - os.makedirs(config_dir, exist_ok=True) - with open(f"{config_dir}/{plugin_name}_config.json", "w") as f: - f.write(source_config_class.schema_json(indent=2)) + # Add to the platform list if not already present. 
+ platforms.setdefault( + plugin.platform_id, + Platform( + id=plugin.platform_id, + name=plugin.platform_name, + ), + ).add_plugin(plugin_name=plugin.name, plugin=plugin) - create_or_update( - source_documentation, - [platform_id, "plugins", plugin_name, "config_schema"], - source_config_class.schema_json(indent=2) or "", + if extra_docs: + for path in glob.glob(f"{extra_docs}/**/*[.md|.yaml|.yml]", recursive=True): + if m := re.search("/docs/sources/(.*)/(.*).md", path): + platform_name = m.group(1).lower() # TODO: rename this to platform_id + file_name = m.group(2) + destination_md: str = ( + f"../docs/generated/ingestion/sources/{platform_name}.md" ) - table_md = gen_md_table_from_struct(source_config_class.schema()) - create_or_update( - source_documentation, - [platform_id, "plugins", plugin_name, "source_doc"], - source_doc or "", - ) - create_or_update( - source_documentation, - [platform_id, "plugins", plugin_name, "config"], - table_md, - ) - create_or_update( - source_documentation, - [platform_id, "plugins", plugin_name, "support_status"], - support_status, - ) + with open(path, "r") as doc_file: + file_contents = doc_file.read() + final_markdown = rewrite_markdown(file_contents, path, destination_md) + + if file_name == "README": + # README goes as platform level docs + # all other docs are assumed to be plugin level + platforms[platform_name].custom_docs_pre = final_markdown + + elif "_" in file_name: + plugin_doc_parts = file_name.split("_") + if len(plugin_doc_parts) != 2: + raise ValueError( + f"{file_name} needs to be of the form _pre.md or _post.md" + ) + plugin_name, suffix = plugin_doc_parts + if suffix == "pre": + platforms[platform_name].plugins[ + plugin_name + ].custom_docs_pre = final_markdown + elif suffix == "post": + platforms[platform_name].plugins[ + plugin_name + ].custom_docs_post = final_markdown + else: + raise ValueError( + f"{file_name} needs to be of the form _pre.md or _post.md" + ) - except Exception as e: - raise e + else: # assume this is the platform post. + # TODO: Probably need better error checking here. + platforms[platform_name].plugins[ + file_name + ].custom_docs_post = final_markdown + elif yml_match := re.search("/docs/sources/(.*)/(.*)_recipe.yml", path): + platform_name = yml_match.group(1).lower() + plugin_name = yml_match.group(2) + platforms[platform_name].plugins[ + plugin_name + ].starter_recipe = pathlib.Path(path).read_text() sources_dir = f"{out_dir}/sources" os.makedirs(sources_dir, exist_ok=True) + # Sort platforms by platform name. + platforms = dict(sorted(platforms.items(), key=lambda x: x[1].name.casefold())) + i = 0 - for platform_id, platform_docs in sorted( - source_documentation.items(), - key=lambda x: (x[1]["name"].casefold(), x[1]["name"]) - if "name" in x[1] - else (x[0].casefold(), x[0]), - ): + for platform_id, platform in platforms.items(): if source and platform_id != source: continue - metrics["source_platforms"]["discovered"] = ( - metrics["source_platforms"]["discovered"] + 1 # type: ignore - ) + platform_metrics.discovered += 1 platform_doc_file = f"{sources_dir}/{platform_id}.md" - if "name" not in platform_docs: - # We seem to have discovered written docs that corresponds to a platform, but haven't found linkage to it from the source classes - warning_msg = f"Failed to find source classes for platform {platform_id}. Did you remember to annotate your source class with @platform_name({platform_id})?" 
- logger.error(warning_msg) - metrics["source_platforms"]["warnings"].append(warning_msg) # type: ignore - continue + # if "name" not in platform_docs: + # # We seem to have discovered written docs that corresponds to a platform, but haven't found linkage to it from the source classes + # warning_msg = f"Failed to find source classes for platform {platform_id}. Did you remember to annotate your source class with @platform_name({platform_id})?" + # logger.error(warning_msg) + # metrics["source_platforms"]["warnings"].append(warning_msg) # type: ignore + # continue with open(platform_doc_file, "w") as f: i += 1 @@ -745,12 +344,12 @@ def generate( f.write( "import Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\n" ) - f.write(f"# {platform_docs['name']}\n") + f.write(f"# {platform.name}\n") - if len(platform_docs["plugins"].keys()) > 1: + if len(platform.plugins) > 1: # More than one plugin used to provide integration with this platform f.write( - f"There are {len(platform_docs['plugins'].keys())} sources that provide integration with {platform_docs['name']}\n" + f"There are {len(platform.plugins)} sources that provide integration with {platform.name}\n" ) f.write("\n") f.write("\n") @@ -759,18 +358,22 @@ def generate( f.write(f"") f.write("") + # Sort plugins in the platform. + # It's a dict, so we need to recreate it. + platform.plugins = dict( + sorted( + platform.plugins.items(), + key=lambda x: str(x[1].doc_order) if x[1].doc_order else x[0], + ) + ) + # f.write("| Source Module | Documentation |\n") # f.write("| ------ | ---- |\n") - for plugin, plugin_docs in sorted( - platform_docs["plugins"].items(), - key=lambda x: str(x[1].get("doc_order")) - if x[1].get("doc_order") - else x[0], - ): + for plugin_name, plugin in platform.plugins.items(): f.write("\n") - f.write(f"\n") + f.write(f"\n") f.write( - f"\n" + f"\n" ) f.write("\n") # f.write( @@ -778,43 +381,33 @@ def generate( # ) f.write("
{col_header}
\n\n`{plugin}`\n\n\n\n`{plugin_name}`\n\n\n\n\n{platform_docs['plugins'][plugin].get('source_doc') or ''} [Read more...](#module-{plugin})\n\n\n\n\n\n{plugin.source_docstring or ''} [Read more...](#module-{plugin_name})\n\n\n
\n\n") # insert platform level custom docs before plugin section - f.write(platform_docs.get("custom_docs") or "") + f.write(platform.custom_docs_pre or "") # all_plugins = platform_docs["plugins"].keys() - for plugin, plugin_docs in sorted( - platform_docs["plugins"].items(), - key=lambda x: str(x[1].get("doc_order")) - if x[1].get("doc_order") - else x[0], - ): - if len(platform_docs["plugins"].keys()) > 1: + for plugin_name, plugin in platform.plugins.items(): + if len(platform.plugins) > 1: # We only need to show this if there are multiple modules. - f.write(f"\n\n## Module `{plugin}`\n") + f.write(f"\n\n## Module `{plugin_name}`\n") - if "support_status" in plugin_docs: - f.write( - get_support_status_badge(plugin_docs["support_status"]) + "\n\n" - ) - if "capabilities" in plugin_docs and len(plugin_docs["capabilities"]): + if plugin.support_status != SupportStatus.UNKNOWN: + f.write(get_support_status_badge(plugin.support_status) + "\n\n") + if plugin.capabilities and len(plugin.capabilities): f.write("\n### Important Capabilities\n") f.write("| Capability | Status | Notes |\n") f.write("| ---------- | ------ | ----- |\n") - plugin_capabilities: List[CapabilitySetting] = plugin_docs[ - "capabilities" - ] - for cap_setting in plugin_capabilities: + for cap_setting in plugin.capabilities: f.write( f"| {get_capability_text(cap_setting.capability)} | {get_capability_supported_badge(cap_setting.supported)} | {cap_setting.description} |\n" ) f.write("\n") - f.write(f"{plugin_docs.get('source_doc') or ''}\n") + f.write(f"{plugin.source_docstring or ''}\n") # Insert custom pre section - f.write(plugin_docs.get("custom_docs_pre", "")) + f.write(plugin.custom_docs_pre or "") f.write("\n### CLI based Ingestion\n") - if "extra_deps" in plugin_docs: + if plugin.extra_deps and len(plugin.extra_deps): f.write("\n#### Install the Plugin\n") - if plugin_docs["extra_deps"] != []: + if plugin.extra_deps != []: f.write("```shell\n") f.write(f"pip install 'acryl-datahub[{plugin}]'\n") f.write("```\n") @@ -822,7 +415,7 @@ def generate( f.write( f"The `{plugin}` source works out of the box with `acryl-datahub`.\n" ) - if "recipe" in plugin_docs: + if plugin.starter_recipe: f.write("\n### Starter Recipe\n") f.write( "Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.\n\n\n" @@ -831,9 +424,10 @@ def generate( "For general pointers on writing and running a recipe, see our [main recipe guide](../../../../metadata-ingestion/README.md#recipes).\n" ) f.write("```yaml\n") - f.write(plugin_docs["recipe"]) + f.write(plugin.starter_recipe) f.write("\n```\n") - if "config" in plugin_docs: + if plugin.config_json_schema: + assert plugin.config_md is not None f.write("\n### Config Details\n") f.write( """ @@ -845,8 +439,8 @@ def generate( # f.write( # "\n
\nView All Configuration Options\n\n" # ) - for doc in plugin_docs["config"]: - f.write(doc) + f.write(plugin.config_md) + f.write("\n\n") # f.write("\n
\n\n") f.write( f"""
@@ -854,39 +448,49 @@ def generate( The [JSONSchema](https://json-schema.org/) for this configuration is inlined below.\n\n ```javascript -{plugin_docs['config_schema']} +{plugin.config_json_schema} ```\n\n
\n\n""" ) + # insert custom plugin docs after config details - f.write(plugin_docs.get("custom_docs_post", "")) - if "classname" in plugin_docs: + f.write(plugin.custom_docs_post or "") + if plugin.classname: f.write("\n### Code Coordinates\n") - f.write(f"- Class Name: `{plugin_docs['classname']}`\n") - if "filename" in plugin_docs: + f.write(f"- Class Name: `{plugin.classname}`\n") + if plugin.filename: f.write( - f"- Browse on [GitHub](../../../../metadata-ingestion/{plugin_docs['filename']})\n\n" + f"- Browse on [GitHub](../../../../metadata-ingestion/{plugin.filename})\n\n" ) - metrics["plugins"]["generated"] = metrics["plugins"]["generated"] + 1 # type: ignore + plugin_metrics.generated += 1 # Using an h2 tag to prevent this from showing up in page's TOC sidebar. f.write("\n

Questions

\n\n") f.write( - f"If you've got any questions on configuring ingestion for {platform_docs.get('name',platform_id)}, feel free to ping us on [our Slack](https://slack.datahubproject.io).\n" - ) - metrics["source_platforms"]["generated"] = ( - metrics["source_platforms"]["generated"] + 1 # type: ignore + f"If you've got any questions on configuring ingestion for {platform.name}, feel free to ping us on [our Slack](https://slack.datahubproject.io).\n" ) + platform_metrics.generated += 1 print("Ingestion Documentation Generation Complete") print("############################################") - print(json.dumps(metrics, indent=2)) + print( + json.dumps( + { + "plugin_metrics": dataclasses.asdict(plugin_metrics), + "platform_metrics": dataclasses.asdict(platform_metrics), + }, + indent=2, + ) + ) print("############################################") - if metrics["plugins"].get("failed", 0) > 0: # type: ignore + if plugin_metrics.failed > 0: sys.exit(1) - ### Create Lineage doc + # Create Lineage doc + generate_lineage_doc(platforms) + +def generate_lineage_doc(platforms: Dict[str, Platform]) -> None: source_dir = "../docs/generated/lineage" os.makedirs(source_dir, exist_ok=True) doc_file = f"{source_dir}/lineage-feature-guide.md" @@ -894,7 +498,7 @@ def generate( f.write( "import FeatureAvailability from '@site/src/components/FeatureAvailability';\n\n" ) - f.write(f"# About DataHub Lineage\n\n") + f.write("# About DataHub Lineage\n\n") f.write("\n") f.write( @@ -996,30 +600,24 @@ def generate( ) f.write("| ---------- | ------ | ----- |----- |\n") - for platform_id, platform_docs in sorted( - source_documentation.items(), - key=lambda x: (x[1]["name"].casefold(), x[1]["name"]) - if "name" in x[1] - else (x[0].casefold(), x[0]), - ): - for plugin, plugin_docs in sorted( - platform_docs["plugins"].items(), - key=lambda x: str(x[1].get("doc_order")) - if x[1].get("doc_order") - else x[0], + for platform_id, platform in platforms.items(): + for plugin in sorted( + platform.plugins.values(), + key=lambda x: str(x.doc_order) if x.doc_order else x.name, ): - platform_name = platform_docs["name"] - if len(platform_docs["plugins"].keys()) > 1: + if len(platform.plugins) > 1: # We only need to show this if there are multiple modules. - platform_name = f"{platform_name} `{plugin}`" + platform_plugin_name = f"{platform.name} `{plugin.name}`" + else: + platform_plugin_name = platform.name # Initialize variables table_level_supported = "❌" column_level_supported = "❌" config_names = "" - if "capabilities" in plugin_docs: - plugin_capabilities = plugin_docs["capabilities"] + if plugin.capabilities and len(plugin.capabilities): + plugin_capabilities = plugin.capabilities for cap_setting in plugin_capabilities: capability_text = get_capability_text(cap_setting.capability) @@ -1040,10 +638,10 @@ def generate( column_level_supported = "✅" if not (table_level_supported == "❌" and column_level_supported == "❌"): - if "config_schema" in plugin_docs: - config_properties = json.loads( - plugin_docs["config_schema"] - ).get("properties", {}) + if plugin.config_json_schema: + config_properties = json.loads(plugin.config_json_schema).get( + "properties", {} + ) config_names = "
".join( [ f"- {property_name}" @@ -1065,7 +663,7 @@ def generate( ] if platform_id not in lineage_not_applicable_sources: f.write( - f"| [{platform_name}](../../generated/ingestion/sources/{platform_id}.md) | {table_level_supported} | {column_level_supported} | {config_names}|\n" + f"| [{platform_plugin_name}](../../generated/ingestion/sources/{platform_id}.md) | {table_level_supported} | {column_level_supported} | {config_names}|\n" ) f.write( diff --git a/metadata-ingestion/scripts/docgen_types.py b/metadata-ingestion/scripts/docgen_types.py new file mode 100644 index 00000000000000..c96ab955e8cce2 --- /dev/null +++ b/metadata-ingestion/scripts/docgen_types.py @@ -0,0 +1,45 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +from datahub.ingestion.api.decorators import CapabilitySetting, SupportStatus + + +@dataclass +class Plugin: + # Required fields + name: str + platform_id: str + platform_name: str + classname: str + + # Optional documentation fields + source_docstring: Optional[str] = None + config_json_schema: Optional[str] = None + config_md: Optional[str] = None + custom_docs_pre: Optional[str] = None + custom_docs_post: Optional[str] = None + starter_recipe: Optional[str] = None + + # Optional metadata fields + support_status: SupportStatus = SupportStatus.UNKNOWN + filename: Optional[str] = None + doc_order: Optional[int] = None + + # Lists with empty defaults + capabilities: List[CapabilitySetting] = field(default_factory=list) + extra_deps: List[str] = field(default_factory=list) + + +@dataclass +class Platform: + # Required fields + id: str + name: str + + # Optional fields + custom_docs_pre: Optional[str] = None + plugins: Dict[str, Plugin] = field(default_factory=dict) + + def add_plugin(self, plugin_name: str, plugin: Plugin) -> None: + """Helper method to add a plugin to the platform""" + self.plugins[plugin_name] = plugin diff --git a/metadata-ingestion/scripts/docs_config_table.py b/metadata-ingestion/scripts/docs_config_table.py new file mode 100644 index 00000000000000..3c5d9d0b0a2ba5 --- /dev/null +++ b/metadata-ingestion/scripts/docs_config_table.py @@ -0,0 +1,376 @@ +import html +import json +import re +from typing import Any, Dict, Iterable, List, Optional, Type + +from pydantic import BaseModel, Field + +from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator +from datahub.metadata.schema_classes import SchemaFieldClass + +DEFAULT_VALUE_MAX_LENGTH = 50 +DEFAULT_VALUE_TRUNCATION_MESSAGE = "..." + + +def _truncate_default_value(value: str) -> str: + if len(value) > DEFAULT_VALUE_MAX_LENGTH: + return value[:DEFAULT_VALUE_MAX_LENGTH] + DEFAULT_VALUE_TRUNCATION_MESSAGE + return value + + +def _format_path_component(path: str) -> str: + """ + Given a path like 'a.b.c', adds css tags to the components. + """ + path_components = path.rsplit(".", maxsplit=1) + if len(path_components) == 1: + return f'{path_components[0]}' + + return ( + f'{path_components[0]}.' + f'{path_components[1]}' + ) + + +def _format_type_name(type_name: str) -> str: + return f'{type_name}' + + +def _format_default_line(default_value: str, has_desc_above: bool) -> str: + default_value = _truncate_default_value(default_value) + escaped_value = ( + html.escape(default_value) + # Replace curly braces to avoid JSX issues. + .replace("{", "{") + .replace("}", "}") + # We also need to replace markdown special characters. 
+ .replace("*", "*") + .replace("_", "_") + .replace("[", "[") + .replace("]", "]") + .replace("|", "|") + .replace("`", "`") + ) + value_elem = f'{escaped_value}' + return f'
Default: {value_elem}
' + + +class FieldRow(BaseModel): + path: str + parent: Optional[str] + type_name: str + required: bool + has_default: bool + default: str + description: str + inner_fields: List["FieldRow"] = Field(default_factory=list) + discriminated_type: Optional[str] = None + + class Component(BaseModel): + type: str + field_name: Optional[str] + + # matches any [...] style section inside a field path + _V2_FIELD_PATH_TOKEN_MATCHER = r"\[[\w.]*[=]*[\w\(\-\ \_\).]*\][\.]*" + # matches a .?[...] style section inside a field path anchored to the beginning + _V2_FIELD_PATH_TOKEN_MATCHER_PREFIX = rf"^[\.]*{_V2_FIELD_PATH_TOKEN_MATCHER}" + _V2_FIELD_PATH_FIELD_NAME_MATCHER = r"^\w+" + + @staticmethod + def map_field_path_to_components(field_path: str) -> List[Component]: + m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path) + v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path) + components: List[FieldRow.Component] = [] + while m or v: + token = m.group() if m else v.group() # type: ignore + if v: + if components: + if components[-1].field_name is None: + components[-1].field_name = token + else: + components.append( + FieldRow.Component(type="non_map_type", field_name=token) + ) + else: + components.append( + FieldRow.Component(type="non_map_type", field_name=token) + ) + + if m: + if token.startswith("[version="): + pass + elif "[type=" in token: + type_match = re.match(r"[\.]*\[type=(.*)\]", token) + if type_match: + type_string = type_match.group(1) + if components and components[-1].type == "map": + if components[-1].field_name is None: + pass + else: + new_component = FieldRow.Component( + type="map_key", field_name="`key`" + ) + components.append(new_component) + new_component = FieldRow.Component( + type=type_string, field_name=None + ) + components.append(new_component) + if type_string == "map": + new_component = FieldRow.Component( + type=type_string, field_name=None + ) + components.append(new_component) + + field_path = field_path[m.span()[1] :] if m else field_path[v.span()[1] :] # type: ignore + m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path) + v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path) + + return components + + @staticmethod + def field_path_to_components(field_path: str) -> List[str]: + """ + Inverts the field_path v2 format to get the canonical field path + [version=2.0].[type=x].foo.[type=string(format=uri)].bar => ["foo","bar"] + """ + if "type=map" not in field_path: + return re.sub(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER, "", field_path).split( + "." 
+ ) + else: + # fields with maps in them need special handling to insert the `key` fragment + return [ + c.field_name + for c in FieldRow.map_field_path_to_components(field_path) + if c.field_name + ] + + @classmethod + def from_schema_field(cls, schema_field: SchemaFieldClass) -> "FieldRow": + path_components = FieldRow.field_path_to_components(schema_field.fieldPath) + + parent = path_components[-2] if len(path_components) >= 2 else None + if parent == "`key`": + # the real parent node is one index above + parent = path_components[-3] + json_props = ( + json.loads(schema_field.jsonProps) if schema_field.jsonProps else {} + ) + + required = json_props.get("required", True) + has_default = "default" in json_props + default_value = str(json_props.get("default")) + + field_path = ".".join(path_components) + + return FieldRow( + path=field_path, + parent=parent, + type_name=str(schema_field.nativeDataType), + required=required, + has_default=has_default, + default=default_value, + description=schema_field.description, + inner_fields=[], + discriminated_type=schema_field.nativeDataType, + ) + + def get_checkbox(self) -> str: + if self.required and not self.has_default: + # Using a non-breaking space to prevent the checkbox from being + # broken into a new line. + if not self.parent: # None and empty string both count + return ' ' + else: + return f' ' + else: + return "" + + def to_md_line(self) -> str: + if self.inner_fields: + if len(self.inner_fields) == 1: + type_name = self.inner_fields[0].type_name or self.type_name + else: + # To deal with unions that have essentially the same simple field path, + # we combine the type names into a single string. + type_name = "One of " + ", ".join( + [x.type_name for x in self.inner_fields if x.discriminated_type] + ) + else: + type_name = self.type_name + + description = self.description.strip() + description = self.description.replace( + "\n", "
" + ) # descriptions with newlines in them break markdown rendering + + md_line = ( + f'|
{_format_path_component(self.path)}' + f"{self.get_checkbox()}
" + f'
{_format_type_name(type_name)}
' + f"| {description} " + f"{_format_default_line(self.default, bool(description)) if self.has_default else ''} |\n" + ) + return md_line + + +class FieldHeader(FieldRow): + def to_md_line(self) -> str: + return "\n".join( + [ + "| Field | Description |", + "|:--- |:--- |", + "", + ] + ) + + def __init__(self): + pass + + +def get_prefixed_name(field_prefix: Optional[str], field_name: Optional[str]) -> str: + assert ( + field_prefix or field_name + ), "One of field_prefix or field_name should be present" + return ( + f"{field_prefix}.{field_name}" # type: ignore + if field_prefix and field_name + else field_name + if not field_prefix + else field_prefix + ) + + +def custom_comparator(path: str) -> str: + """ + Projects a string onto a separate space + Low_prio string will start with Z else start with A + Number of field paths will add the second set of letters: 00 - 99 + + """ + opt1 = path + prio_value = priority_value(opt1) + projection = f"{prio_value}" + projection = f"{projection}{opt1}" + return projection + + +class FieldTree: + """ + A helper class that re-constructs the tree hierarchy of schema fields + to help sort fields by importance while keeping nesting intact + """ + + def __init__(self, field: Optional[FieldRow] = None): + self.field = field + self.fields: Dict[str, "FieldTree"] = {} + + def add_field(self, row: FieldRow, path: Optional[str] = None) -> "FieldTree": + # logger.warn(f"Add field: path:{path}, row:{row}") + if self.field and self.field.path == row.path: + # we have an incoming field with the same path as us, this is probably a union variant + # attach to existing field + self.field.inner_fields.append(row) + else: + path = path if path is not None else row.path + top_level_field = path.split(".")[0] + if top_level_field in self.fields: + self.fields[top_level_field].add_field( + row, ".".join(path.split(".")[1:]) + ) + else: + self.fields[top_level_field] = FieldTree(field=row) + # logger.warn(f"{self}") + return self + + def sort(self): + # Required fields before optionals + required_fields = { + k: v for k, v in self.fields.items() if v.field and v.field.required + } + optional_fields = { + k: v for k, v in self.fields.items() if v.field and not v.field.required + } + + self.sorted_fields = [] + for field_map in [required_fields, optional_fields]: + # Top-level fields before fields with nesting + self.sorted_fields.extend( + sorted( + [f for f, val in field_map.items() if val.fields == {}], + key=custom_comparator, + ) + ) + self.sorted_fields.extend( + sorted( + [f for f, val in field_map.items() if val.fields != {}], + key=custom_comparator, + ) + ) + + for field_tree in self.fields.values(): + field_tree.sort() + + def get_fields(self) -> Iterable[FieldRow]: + if self.field: + yield self.field + for key in self.sorted_fields: + yield from self.fields[key].get_fields() + + def __repr__(self) -> str: + result = {} + if self.field: + result["_self"] = json.loads(json.dumps(self.field.dict())) + for f in self.fields: + result[f] = json.loads(str(self.fields[f])) + return json.dumps(result, indent=2) + + +def priority_value(path: str) -> str: + # A map of low value tokens to their relative importance + low_value_token_map = { + "env": "X", + "classification": "Y", + "profiling": "Y", + "stateful_ingestion": "Z", + } + tokens = path.split(".") + for low_value_token in low_value_token_map: + if low_value_token in tokens: + return low_value_token_map[low_value_token] + + # everything else high-prio + return "A" + + +def gen_md_table_from_json_schema(schema_dict: 
Dict[str, Any]) -> str: + # we don't want default field values to be injected into the description of the field + JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False + schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict)) + result: List[str] = [FieldHeader().to_md_line()] + + field_tree = FieldTree(field=None) + for field in schema_fields: + row: FieldRow = FieldRow.from_schema_field(field) + field_tree.add_field(row) + + field_tree.sort() + + for row in field_tree.get_fields(): + result.append(row.to_md_line()) + + # Wrap with a .config-table div. + result = ["\n
\n\n", *result, "\n
\n"] + + return "".join(result) + + +def gen_md_table_from_pydantic(model: Type[BaseModel]) -> str: + return gen_md_table_from_json_schema(model.schema()) + + +if __name__ == "__main__": + # Simple test code. + from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config + + print("".join(gen_md_table_from_pydantic(SnowflakeV2Config))) diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index 619f69b016262d..179dbdb231c912 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -1,7 +1,7 @@ import logging from enum import Enum from pathlib import Path -from typing import List, Optional +from typing import Iterable, List, Optional import yaml from pydantic import validator @@ -226,3 +226,14 @@ def to_yaml( yaml.indent(mapping=2, sequence=4, offset=2) yaml.default_flow_style = False yaml.dump(self.dict(), fp) + + @staticmethod + def list_urns(graph: DataHubGraph) -> Iterable[str]: + return graph.get_urns_by_filter( + entity_types=["structuredProperty"], + ) + + @staticmethod + def list(graph: DataHubGraph) -> Iterable["StructuredProperties"]: + for urn in StructuredProperties.list_urns(graph): + yield StructuredProperties.from_datahub(graph, urn) diff --git a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py index 42285cf13a5ddc..5cd28516a076d9 100644 --- a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py +++ b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py @@ -1,9 +1,11 @@ import json import logging from pathlib import Path +from typing import Iterable import click from click_default_group import DefaultGroup +from ruamel.yaml import YAML from datahub.api.entities.structuredproperties.structuredproperties import ( StructuredProperties, @@ -61,3 +63,85 @@ def get(urn: str, to_file: str) -> None: ) else: click.secho(f"Structured property {urn} does not exist") + + +@properties.command( + name="list", +) +@click.option("--details/--no-details", is_flag=True, default=True) +@click.option("--to-file", required=False, type=str) +@telemetry.with_telemetry() +def list(details: bool, to_file: str) -> None: + """List structured properties in DataHub""" + + def to_yaml_list( + objects: Iterable[StructuredProperties], # iterable of objects to dump + file: Path, + ) -> None: + # if file exists, first we read it + yaml = YAML(typ="rt") # default, if not specfied, is 'rt' (round-trip) + yaml.indent(mapping=2, sequence=4, offset=2) + yaml.default_flow_style = False + serialized_objects = [] + if file.exists(): + with open(file, "r") as fp: + existing_objects = yaml.load(fp) # this is a list of dicts + existing_objects = [ + StructuredProperties.parse_obj(obj) for obj in existing_objects + ] + objects = [obj for obj in objects] + # do a positional update of the existing objects + existing_urns = {obj.urn for obj in existing_objects} + # existing_urns = {obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}" for obj in existing_objects} + for i, obj in enumerate(existing_objects): + # existing_urn = obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}" + existing_urn = obj.urn + # breakpoint() + if existing_urn in {obj.urn for obj 
in objects}: + existing_objects[i] = next( + obj.dict(exclude_unset=True, exclude_none=True) + for obj in objects + if obj.urn == existing_urn + ) + new_objects = [ + obj.dict(exclude_unset=True, exclude_none=True) + for obj in objects + if obj.urn not in existing_urns + ] + serialized_objects = existing_objects + new_objects + else: + serialized_objects = [ + obj.dict(exclude_unset=True, exclude_none=True) for obj in objects + ] + + with open(file, "w") as fp: + yaml.dump(serialized_objects, fp) + + with get_default_graph() as graph: + if details: + logger.info( + "Listing structured properties with details. Use --no-details for urns only" + ) + structuredproperties = StructuredProperties.list(graph) + if to_file: + to_yaml_list(structuredproperties, Path(to_file)) + else: + for structuredproperty in structuredproperties: + click.secho( + f"{json.dumps(structuredproperty.dict(exclude_unset=True, exclude_none=True), indent=2)}" + ) + else: + logger.info( + "Listing structured property urns only, use --details for more information" + ) + structured_property_urns = StructuredProperties.list_urns(graph) + if to_file: + with open(to_file, "w") as f: + for urn in structured_property_urns: + f.write(f"{urn}\n") + click.secho( + f"Structured property urns written to {to_file}", fg="green" + ) + else: + for urn in structured_property_urns: + click.secho(f"{urn}") diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 75dc980e234ac8..53cb1b0ecad4ee 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -334,6 +334,8 @@ def as_obj(self) -> dict: } def compute_stats(self) -> None: + super().compute_stats() + duration = datetime.datetime.now() - self.start_time workunits_produced = self.events_produced if duration.total_seconds() > 0: diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 7de6e8130a7ab6..8c5f894a072d93 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -248,9 +248,11 @@ def make_rest_sink( with DatahubRestSink(PipelineContext(run_id=run_id), sink_config) as sink: yield sink if sink.report.failures: + logger.error( + f"Failed to emit {len(sink.report.failures)} records\n{sink.report.as_string()}" + ) raise OperationalError( - f"Failed to emit {len(sink.report.failures)} records", - info=sink.report.as_obj(), + f"Failed to emit {len(sink.report.failures)} records" ) def emit_all( diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index a0bed4ae9a7581..30e81643837375 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -248,6 +248,9 @@ def report_table_dropped(self, table: str) -> None: "Enabled by default when stateful ingestion is turned on.", ) @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +@capability( + SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field" +) class GlueSource(StatefulIngestionSourceBase): """ Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. 
See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. @@ -284,12 +287,22 @@ class GlueSource(StatefulIngestionSourceBase): "Action": [ "glue:GetDataflowGraph", "glue:GetJobs", + "s3:GetObject", ], "Resource": "*" } ``` - plus `s3:GetObject` for the job script locations. + For profiling datasets, the following additional permissions are required: + ```json + { + "Effect": "Allow", + "Action": [ + "glue:GetPartitions", + ], + "Resource": "*" + } + ``` """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py index 878b8dd1bb9a51..360f18aa448f27 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py @@ -1,6 +1,11 @@ import logging import os -from typing import Optional +from collections import defaultdict +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional + +if TYPE_CHECKING: + from mypy_boto3_s3.service_resource import ObjectSummary + S3_PREFIXES = ["s3://", "s3n://", "s3a://"] @@ -68,3 +73,21 @@ def get_key_prefix(s3_uri: str) -> str: f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}" ) return strip_s3_prefix(s3_uri).split("/", maxsplit=1)[1] + + +def group_s3_objects_by_dirname( + s3_objects: Iterable["ObjectSummary"], +) -> Dict[str, List["ObjectSummary"]]: + """ + Groups S3 objects by their directory name. + + If a s3_object in the root directory (i.e., s3://bucket/file.txt), it is grouped under '/'. + """ + grouped_s3_objs = defaultdict(list) + for obj in s3_objects: + if "/" in obj.key: + dirname = obj.key.rsplit("/", 1)[0] + else: + dirname = "/" + grouped_s3_objs[dirname].append(obj) + return grouped_s3_objs diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index db7b0540e49e71..508b4bbaa277dc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -253,14 +253,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: for project in projects: yield from self.bq_schema_extractor.get_project_workunits(project) - self.report.set_ingestion_stage("*", "View and Snapshot Lineage") - yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots( - [p.id for p in projects], - self.bq_schema_extractor.view_refs_by_project, - self.bq_schema_extractor.view_definitions, - self.bq_schema_extractor.snapshot_refs_by_project, - self.bq_schema_extractor.snapshots_by_ref, - ) + with self.report.new_stage("*: View and Snapshot Lineage"): + yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots( + [p.id for p in projects], + self.bq_schema_extractor.view_refs_by_project, + self.bq_schema_extractor.view_definitions, + self.bq_schema_extractor.snapshot_refs_by_project, + self.bq_schema_extractor.snapshots_by_ref, + ) if self.config.use_queries_v2: # if both usage and lineage are disabled then skip queries extractor piece @@ -270,31 +270,29 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ): return - self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) - - with BigQueryQueriesExtractor( - connection=self.config.get_bigquery_client(), - schema_api=self.bq_schema_extractor.schema_api, - 
config=BigQueryQueriesExtractorConfig( - window=self.config, - user_email_pattern=self.config.usage.user_email_pattern, - include_lineage=self.config.include_table_lineage, - include_usage_statistics=self.config.include_usage_statistics, - include_operations=self.config.usage.include_operational_stats, - include_queries=self.config.include_queries, - include_query_usage_statistics=self.config.include_query_usage_statistics, - top_n_queries=self.config.usage.top_n_queries, - region_qualifiers=self.config.region_qualifiers, - ), - structured_report=self.report, - filters=self.filters, - identifiers=self.identifiers, - schema_resolver=self.sql_parser_schema_resolver, - discovered_tables=self.bq_schema_extractor.table_refs, - ) as queries_extractor: - self.report.queries_extractor = queries_extractor.report - yield from queries_extractor.get_workunits_internal() - + with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"): + with BigQueryQueriesExtractor( + connection=self.config.get_bigquery_client(), + schema_api=self.bq_schema_extractor.schema_api, + config=BigQueryQueriesExtractorConfig( + window=self.config, + user_email_pattern=self.config.usage.user_email_pattern, + include_lineage=self.config.include_table_lineage, + include_usage_statistics=self.config.include_usage_statistics, + include_operations=self.config.usage.include_operational_stats, + include_queries=self.config.include_queries, + include_query_usage_statistics=self.config.include_query_usage_statistics, + top_n_queries=self.config.usage.top_n_queries, + region_qualifiers=self.config.region_qualifiers, + ), + structured_report=self.report, + filters=self.filters, + identifiers=self.identifiers, + schema_resolver=self.sql_parser_schema_resolver, + discovered_tables=self.bq_schema_extractor.table_refs, + ) as queries_extractor: + self.report.queries_extractor = queries_extractor.report + yield from queries_extractor.get_workunits_internal() else: if self.config.include_usage_statistics: yield from self.usage_extractor.get_usage_workunits( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 06842da67f76ca..8e55d81aac5fe3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -190,6 +190,3 @@ class BigQueryV2Report( num_skipped_external_table_lineage: int = 0 queries_extractor: Optional[BigQueryQueriesExtractorReport] = None - - def set_ingestion_stage(self, project_id: str, stage: str) -> None: - self.report_ingestion_stage_start(f"{project_id}: {stage}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index bc2688e6b481ab..56e930dfb811f1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -248,9 +248,9 @@ def modified_base32decode(self, text_to_decode: str) -> str: def get_project_workunits( self, project: BigqueryProject ) -> Iterable[MetadataWorkUnit]: - self.report.set_ingestion_stage(project.id, METADATA_EXTRACTION) - logger.info(f"Processing project: {project.id}") - yield from self._process_project(project) + with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"): + logger.info(f"Processing project: 
{project.id}") + yield from self._process_project(project) def get_dataplatform_instance_aspect( self, dataset_urn: str, project_id: str @@ -405,11 +405,11 @@ def _process_project( if self.config.is_profiling_enabled(): logger.info(f"Starting profiling project {project_id}") - self.report.set_ingestion_stage(project_id, PROFILING) - yield from self.profiler.get_workunits( - project_id=project_id, - tables=db_tables, - ) + with self.report.new_stage(f"{project_id}: {PROFILING}"): + yield from self.profiler.get_workunits( + project_id=project_id, + tables=db_tables, + ) def _process_project_datasets( self, @@ -1203,9 +1203,9 @@ def get_tables_for_dataset( report=self.report, ) - self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = round( - timer.elapsed_seconds(), 2 - ) + self.report.metadata_extraction_sec[ + f"{project_id}.{dataset.name}" + ] = timer.elapsed_seconds(digits=2) def get_core_table_details( self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index ba3357aa8ca20c..433282a21fdb66 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -330,11 +330,11 @@ def get_lineage_workunits( projects = ["*"] # project_id not used when using exported metadata for project in projects: - self.report.set_ingestion_stage(project, LINEAGE_EXTRACTION) - yield from self.generate_lineage( - project, - table_refs, - ) + with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"): + yield from self.generate_lineage( + project, + table_refs, + ) if self.redundant_run_skip_handler: # Update the checkpoint state for this run. 
@@ -368,8 +368,8 @@ def generate_lineage( self.report.lineage_metadata_entries[project_id] = len(lineage) logger.info(f"Built lineage map containing {len(lineage)} entries.") logger.debug(f"lineage metadata is {lineage}") - self.report.lineage_extraction_sec[project_id] = round( - timer.elapsed_seconds(), 2 + self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds( + digits=2 ) self.report.lineage_mem_size[project_id] = humanfriendly.format_size( memory_footprint.total_size(lineage) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 876ffab85ba311..f2f6cc731858d1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -495,62 +495,62 @@ def _ingest_events( def _generate_operational_workunits( self, usage_state: BigQueryUsageState, table_refs: Collection[str] ) -> Iterable[MetadataWorkUnit]: - self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS) - for audit_event in usage_state.standalone_events(): - try: - operational_wu = self._create_operation_workunit( - audit_event, table_refs - ) - if operational_wu: - yield operational_wu - self.report.num_operational_stats_workunits_emitted += 1 - except Exception as e: - self.report.warning( - message="Unable to generate operation workunit", - context=f"{audit_event}", - exc=e, - ) + with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"): + for audit_event in usage_state.standalone_events(): + try: + operational_wu = self._create_operation_workunit( + audit_event, table_refs + ) + if operational_wu: + yield operational_wu + self.report.num_operational_stats_workunits_emitted += 1 + except Exception as e: + self.report.warning( + message="Unable to generate operation workunit", + context=f"{audit_event}", + exc=e, + ) def _generate_usage_workunits( self, usage_state: BigQueryUsageState ) -> Iterable[MetadataWorkUnit]: - self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION) - top_n = ( - self.config.usage.top_n_queries - if self.config.usage.include_top_n_queries - else 0 - ) - for entry in usage_state.usage_statistics(top_n=top_n): - try: - query_freq = [ - ( - self.uuid_to_query.get( - query_hash, usage_state.queries[query_hash] - ), - count, + with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"): + top_n = ( + self.config.usage.top_n_queries + if self.config.usage.include_top_n_queries + else 0 + ) + for entry in usage_state.usage_statistics(top_n=top_n): + try: + query_freq = [ + ( + self.uuid_to_query.get( + query_hash, usage_state.queries[query_hash] + ), + count, + ) + for query_hash, count in entry.query_freq + ] + yield make_usage_workunit( + bucket_start_time=datetime.fromisoformat(entry.timestamp), + resource=BigQueryTableRef.from_string_name(entry.resource), + query_count=entry.query_count, + query_freq=query_freq, + user_freq=entry.user_freq, + column_freq=entry.column_freq, + bucket_duration=self.config.bucket_duration, + resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref, + top_n_queries=self.config.usage.top_n_queries, + format_sql_queries=self.config.usage.format_sql_queries, + queries_character_limit=self.config.usage.queries_character_limit, + ) + self.report.num_usage_workunits_emitted += 1 + except Exception as e: + self.report.warning( + message="Unable to generate usage statistics workunit", + 
context=f"{entry.timestamp}, {entry.resource}", + exc=e, ) - for query_hash, count in entry.query_freq - ] - yield make_usage_workunit( - bucket_start_time=datetime.fromisoformat(entry.timestamp), - resource=BigQueryTableRef.from_string_name(entry.resource), - query_count=entry.query_count, - query_freq=query_freq, - user_freq=entry.user_freq, - column_freq=entry.column_freq, - bucket_duration=self.config.bucket_duration, - resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref, - top_n_queries=self.config.usage.top_n_queries, - format_sql_queries=self.config.usage.format_sql_queries, - queries_character_limit=self.config.usage.queries_character_limit, - ) - self.report.num_usage_workunits_emitted += 1 - except Exception as e: - self.report.warning( - message="Unable to generate usage statistics workunit", - context=f"{entry.timestamp}, {entry.resource}", - exc=e, - ) def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: if self.config.use_exported_bigquery_audit_metadata: @@ -559,10 +559,10 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: for project_id in projects: with PerfTimer() as timer: try: - self.report.set_ingestion_stage( - project_id, USAGE_EXTRACTION_INGESTION - ) - yield from self._get_parsed_bigquery_log_events(project_id) + with self.report.new_stage( + f"{project_id}: {USAGE_EXTRACTION_INGESTION}" + ): + yield from self._get_parsed_bigquery_log_events(project_id) except Exception as e: self.report.usage_failed_extraction.append(project_id) self.report.warning( @@ -572,8 +572,8 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: ) self.report_status(f"usage-extraction-{project_id}", False) - self.report.usage_extraction_sec[project_id] = round( - timer.elapsed_seconds(), 2 + self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds( + digits=2 ) def _store_usage_event( diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py index d8ab62f1d6d91f..7bf1d66f618a4b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py @@ -70,30 +70,30 @@ def get_workunits( ) -> Iterable[MetadataWorkUnit]: for keyspace_name in cassandra_data.keyspaces: tables = cassandra_data.tables.get(keyspace_name, []) - self.report.set_ingestion_stage(keyspace_name, PROFILING) - with ThreadPoolExecutor( - max_workers=self.config.profiling.max_workers - ) as executor: - future_to_dataset = { - executor.submit( - self.generate_profile, - keyspace_name, - table_name, - cassandra_data.columns.get(table_name, []), - ): table_name - for table_name in tables - } - for future in as_completed(future_to_dataset): - table_name = future_to_dataset[future] - try: - yield from future.result() - except Exception as exc: - self.report.profiling_skipped_other[table_name] += 1 - self.report.failure( - message="Failed to profile for table", - context=f"{keyspace_name}.{table_name}", - exc=exc, - ) + with self.report.new_stage(f"{keyspace_name}: {PROFILING}"): + with ThreadPoolExecutor( + max_workers=self.config.profiling.max_workers + ) as executor: + future_to_dataset = { + executor.submit( + self.generate_profile, + keyspace_name, + table_name, + cassandra_data.columns.get(table_name, []), + ): table_name + for table_name in tables + } + for future in 
as_completed(future_to_dataset): + table_name = future_to_dataset[future] + try: + yield from future.result() + except Exception as exc: + self.report.profiling_skipped_other[table_name] += 1 + self.report.failure( + message="Failed to profile for table", + context=f"{keyspace_name}.{table_name}", + exc=exc, + ) def generate_profile( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py index 41d4ac7ced6035..75a0ba0c617734 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py @@ -54,9 +54,6 @@ def report_entity_scanned(self, name: str, ent_type: str = "View") -> None: else: raise KeyError(f"Unknown entity {ent_type}.") - def set_ingestion_stage(self, keyspace: str, stage: str) -> None: - self.report_ingestion_stage_start(f"{keyspace}: {stage}") - # TODO Need to create seperate common config for profiling report profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict) profiling_skipped_table_profile_pattern: TopKDict[str, int] = field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py index 09f38913f11b19..8622e221940317 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -108,6 +108,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern()) + drop_duplicate_schema_fields: bool = Field( + default=False, + description="Whether to drop duplicate schema fields in the schemaMetadata aspect. 
" + "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.", + ) + @root_validator(skip_on_failure=True) def check_ingesting_data(cls, values): if ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py index 12daba298a2014..472abd0a97ec70 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -12,7 +12,10 @@ support_status, ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport -from datahub.ingestion.api.source_helpers import auto_workunit_reporter +from datahub.ingestion.api.source_helpers import ( + auto_fix_duplicate_schema_field_paths, + auto_workunit_reporter, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.datahub.config import DataHubSourceConfig from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader @@ -57,7 +60,14 @@ def get_report(self) -> SourceReport: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: # Exactly replicate data from DataHub source - return [partial(auto_workunit_reporter, self.get_report())] + return [ + ( + auto_fix_duplicate_schema_field_paths + if self.config.drop_duplicate_schema_fields + else None + ), + partial(auto_workunit_reporter, self.get_report()), + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.stop_time = datetime.now(tz=timezone.utc) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py index c8eb035461ca16..9712d4ddc67998 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py @@ -45,6 +45,3 @@ def report_entity_scanned(self, name: str, ent_type: str = "View") -> None: self.views_scanned += 1 else: raise KeyError(f"Unknown entity {ent_type}.") - - def set_ingestion_stage(self, dataset: str, stage: str) -> None: - self.report_ingestion_stage_start(f"{dataset}: {stage}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py index 319290d25169af..6d34e86be6282e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py @@ -472,8 +472,8 @@ def generate_profiles( env=self.config.env, platform_instance=self.config.platform_instance, ) - self.report.set_ingestion_stage(dataset_info.resource_name, PROFILING) - yield from self.profiler.get_workunits(dataset_info, dataset_urn) + with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"): + yield from self.profiler.get_workunits(dataset_info, dataset_urn) def generate_view_lineage( self, dataset_urn: str, parents: List[str] diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index 443368e6d8b4fb..b4cc5423277c5a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -141,40 +141,36 @@ def get_workunits_internal( ) -> 
Iterable[MetadataWorkUnit]: if self.config.cleanup_expired_tokens: try: - self.report.report_ingestion_stage_start("Expired Token Cleanup") - self.revoke_expired_tokens() + with self.report.new_stage("Expired Token Cleanup"): + self.revoke_expired_tokens() except Exception as e: self.report.failure("While trying to cleanup expired token ", exc=e) if self.config.truncate_indices: try: - self.report.report_ingestion_stage_start("Truncate Indices") - self.truncate_indices() + with self.report.new_stage("Truncate Indices"): + self.truncate_indices() except Exception as e: self.report.failure("While trying to truncate indices ", exc=e) if self.config.soft_deleted_entities_cleanup.enabled: try: - self.report.report_ingestion_stage_start( - "Soft Deleted Entities Cleanup" - ) - self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() + with self.report.new_stage("Soft Deleted Entities Cleanup"): + self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() except Exception as e: self.report.failure( "While trying to cleanup soft deleted entities ", exc=e ) if self.config.dataprocess_cleanup.enabled: try: - self.report.report_ingestion_stage_start("Data Process Cleanup") - yield from self.dataprocess_cleanup.get_workunits_internal() + with self.report.new_stage("Data Process Cleanup"): + yield from self.dataprocess_cleanup.get_workunits_internal() except Exception as e: self.report.failure("While trying to cleanup data process ", exc=e) if self.config.execution_request_cleanup.enabled: try: - self.report.report_ingestion_stage_start("Execution request Cleanup") - self.execution_request_cleanup.run() + with self.report.new_stage("Execution request Cleanup"): + self.execution_request_cleanup.run() except Exception as e: self.report.failure("While trying to cleanup execution request ", exc=e) - # Otherwise last stage's duration does not get calculated. - self.report.report_ingestion_stage_start("End") yield from [] def truncate_indices(self) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index f9a00d7f009058..c1763b16f3670f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel): ) keep_history_max_days: int = Field( - 30, + 90, description="Maximum number of days to keep execution requests for, per ingestion source", ) @@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel): description="Maximum runtime in seconds for the cleanup task", ) + limit_entities_delete: Optional[int] = Field( + 10000, description="Max number of execution requests to hard delete." 
+ ) + max_read_errors: int = Field( default=10, description="Maximum number of read errors before aborting", @@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport): ergc_delete_errors: int = 0 ergc_start_time: Optional[datetime.datetime] = None ergc_end_time: Optional[datetime.datetime] = None + ergc_delete_limit_reached: bool = False + ergc_runtime_limit_reached: bool = False class CleanupRecord(BaseModel): @@ -85,12 +91,20 @@ def __init__( self.graph = graph self.report = report self.instance_id = int(time.time()) + self.last_print_time = 0.0 if config is not None: self.config = config else: self.config = DatahubExecutionRequestCleanupConfig() + def _print_report(self) -> None: + time_taken = round(time.time() - self.last_print_time, 1) + # Print report every 2 minutes + if time_taken > 120: + self.last_print_time = time.time() + logger.info(f"\n{self.report.as_string()}") + def _to_cleanup_record(self, entry: Dict) -> CleanupRecord: input_aspect = ( entry.get("aspects", {}) @@ -175,6 +189,7 @@ def _scroll_garbage_records(self): running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000 for entry in self._scroll_execution_requests(): + self._print_report() self.report.ergc_records_read += 1 key = entry.ingestion_source @@ -225,15 +240,12 @@ def _scroll_garbage_records(self): f"record timestamp: {entry.requested_at}." ) ) - self.report.ergc_records_deleted += 1 yield entry def _delete_entry(self, entry: CleanupRecord) -> None: try: - logger.info( - f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}" - ) self.graph.delete_entity(entry.urn, True) + self.report.ergc_records_deleted += 1 except Exception as e: self.report.ergc_delete_errors += 1 self.report.failure( @@ -252,10 +264,23 @@ def _reached_runtime_limit(self) -> bool: >= datetime.timedelta(seconds=self.config.runtime_limit_seconds) ) ): + self.report.ergc_runtime_limit_reached = True logger.info(f"ergc({self.instance_id}): max runtime reached.") return True return False + def _reached_delete_limit(self) -> bool: + if ( + self.config.limit_entities_delete + and self.report.ergc_records_deleted >= self.config.limit_entities_delete + ): + logger.info( + f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}." 
+ ) + self.report.ergc_delete_limit_reached = True + return True + return False + def run(self) -> None: if not self.config.enabled: logger.info( @@ -274,7 +299,7 @@ def run(self) -> None: ) for entry in self._scroll_garbage_records(): - if self._reached_runtime_limit(): + if self._reached_runtime_limit() or self._reached_delete_limit(): break self._delete_entry(entry) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py index 0a52b7e17bf714..471eeff0224ed1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py @@ -231,6 +231,15 @@ def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]: def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]: assert self.ctx.graph scroll_id: Optional[str] = None + + batch_size = self.config.batch_size + if entity_type == "DATA_PROCESS_INSTANCE": + # Due to a bug in Data process instance querying this is a temp workaround + # to avoid a giant stacktrace by having a smaller batch size in first call + # This will be remove in future version after server with fix has been + # around for a while + batch_size = 10 + while True: try: result = self.ctx.graph.execute_graphql( @@ -240,7 +249,7 @@ def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[st "types": [entity_type], "query": "*", "scrollId": scroll_id if scroll_id else None, - "count": self.config.batch_size, + "count": batch_size, "orFilters": [ { "and": [ @@ -263,6 +272,10 @@ def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[st scroll_across_entities = result.get("scrollAcrossEntities") if not scroll_across_entities or not scroll_across_entities.get("count"): break + if entity_type == "DATA_PROCESS_INSTANCE": + # Temp workaround. 
See note in beginning of the function + # We make the batch size = config after call has succeeded once + batch_size = self.config.batch_size scroll_id = scroll_across_entities.get("nextScrollId") self.report.num_queries_found += scroll_across_entities.get("count") for query in scroll_across_entities.get("searchResults"): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py index 2a247d0c63957a..4764400215e12a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py @@ -485,7 +485,7 @@ def report_dropped(self, view: str) -> None: self.filtered_reports.append(view) -@platform_name("PowerBI") +@platform_name("PowerBI Report Server") @config_class(PowerBiReportServerDashboardSourceConfig) @support_status(SupportStatus.INCUBATING) @capability(SourceCapability.OWNERSHIP, "Enabled by default") diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 49f7941563c1a7..5371017a2a3212 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -423,10 +423,10 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit database = self.config.database logger.info(f"Processing db {database}") - self.report.report_ingestion_stage_start(METADATA_EXTRACTION) - self.db_tables[database] = defaultdict() - self.db_views[database] = defaultdict() - self.db_schemas.setdefault(database, {}) + with self.report.new_stage(METADATA_EXTRACTION): + self.db_tables[database] = defaultdict() + self.db_views[database] = defaultdict() + self.db_schemas.setdefault(database, {}) # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping # this fallback. For now, this gets us broad coverage quickly. 
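
The `_get_soft_deleted` change in `soft_deleted_entity_cleanup.py` above follows a "small first page, then full page size" pattern: the first scroll request for `DATA_PROCESS_INSTANCE` entities uses a batch size of 10 so a known server-side bug fails cheaply instead of producing a giant stack trace, and the configured batch size is restored once one page has succeeded. A generic sketch of that pattern, with a hypothetical `fetch_page` callable standing in for the GraphQL scroll call (it is not a DataHub API):

```python
from typing import Callable, Iterator, List, Optional, Tuple

# Hypothetical page fetcher: (scroll_id, count) -> (results, next_scroll_id).
PageFetcher = Callable[[Optional[str], int], Tuple[List[str], Optional[str]]]


def scroll_with_small_first_page(
    fetch_page: PageFetcher, batch_size: int, first_page_size: int = 10
) -> Iterator[str]:
    """Issue the first request with a small page so a known server-side issue
    fails cheaply; switch to the configured batch size after one success."""
    count = first_page_size
    scroll_id: Optional[str] = None
    while True:
        results, scroll_id = fetch_page(scroll_id, count)
        if not results:
            break
        count = batch_size  # first page succeeded; resume normal paging
        yield from results
        if scroll_id is None:
            break


# Example with an in-memory fake fetcher:
pages = [["urn:1", "urn:2"], ["urn:3"]]

def fake_fetch(scroll_id: Optional[str], count: int) -> Tuple[List[str], Optional[str]]:
    idx = 0 if scroll_id is None else int(scroll_id)
    nxt = str(idx + 1) if idx + 1 < len(pages) else None
    return pages[idx], nxt

print(list(scroll_with_small_first_page(fake_fetch, batch_size=500)))
```
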
@@ -462,12 +462,12 @@ def _extract_metadata( self.process_schemas(connection, database) ) - self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) - yield from self.extract_lineage_v2( - connection=connection, - database=database, - lineage_extractor=lineage_extractor, - ) + with self.report.new_stage(LINEAGE_EXTRACTION): + yield from self.extract_lineage_v2( + connection=connection, + database=database, + lineage_extractor=lineage_extractor, + ) all_tables = self.get_all_tables() else: @@ -480,25 +480,25 @@ def _extract_metadata( or self.config.include_view_lineage or self.config.include_copy_lineage ): - self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) - yield from self.extract_lineage( - connection=connection, all_tables=all_tables, database=database - ) + with self.report.new_stage(LINEAGE_EXTRACTION): + yield from self.extract_lineage( + connection=connection, all_tables=all_tables, database=database + ) - self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION) if self.config.include_usage_statistics: - yield from self.extract_usage( - connection=connection, all_tables=all_tables, database=database - ) + with self.report.new_stage(USAGE_EXTRACTION_INGESTION): + yield from self.extract_usage( + connection=connection, all_tables=all_tables, database=database + ) if self.config.is_profiling_enabled(): - self.report.report_ingestion_stage_start(PROFILING) - profiler = RedshiftProfiler( - config=self.config, - report=self.report, - state_handler=self.profiling_state_handler, - ) - yield from profiler.get_workunits(self.db_tables) + with self.report.new_stage(PROFILING): + profiler = RedshiftProfiler( + config=self.config, + report=self.report, + state_handler=self.profiling_state_handler, + ) + yield from profiler.get_workunits(self.db_tables) def process_schemas(self, connection, database): for schema in self.data_dictionary.get_schemas( @@ -633,8 +633,8 @@ def process_schema( else: logger.info("View processing disabled, skipping") - self.report.metadata_extraction_sec[report_key] = round( - timer.elapsed_seconds(), 2 + self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds( + digits=2 ) def _process_table( @@ -986,9 +986,7 @@ def extract_usage( yield from usage_extractor.get_usage_workunits(all_tables=all_tables) - self.report.usage_extraction_sec[database] = round( - timer.elapsed_seconds(), 2 - ) + self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2) def extract_lineage( self, @@ -1011,8 +1009,8 @@ def extract_lineage( database=database, connection=connection, all_tables=all_tables ) - self.report.lineage_extraction_sec[f"{database}"] = round( - timer.elapsed_seconds(), 2 + self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds( + digits=2 ) yield from self.generate_lineage( database, lineage_extractor=lineage_extractor @@ -1042,8 +1040,8 @@ def extract_lineage_v2( yield from lineage_extractor.generate() - self.report.lineage_extraction_sec[f"{database}"] = round( - timer.elapsed_seconds(), 2 + self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds( + digits=2 ) if self.redundant_lineage_run_skip_handler: diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py index e0bf8b23dd0f7d..d66a1ee18be40f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py @@ -182,38 +182,38 @@ def 
_get_workunits_internal( self.report.num_operational_stats_filtered = 0 if self.config.include_operational_stats: - self.report.report_ingestion_stage_start(USAGE_EXTRACTION_OPERATIONAL_STATS) - with PerfTimer() as timer: - # Generate operation aspect workunits - yield from self._gen_operation_aspect_workunits( - self.connection, all_tables - ) - self.report.operational_metadata_extraction_sec[ - self.config.database - ] = round(timer.elapsed_seconds(), 2) + with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS): + with PerfTimer() as timer: + # Generate operation aspect workunits + yield from self._gen_operation_aspect_workunits( + self.connection, all_tables + ) + self.report.operational_metadata_extraction_sec[ + self.config.database + ] = timer.elapsed_seconds(digits=2) # Generate aggregate events - self.report.report_ingestion_stage_start(USAGE_EXTRACTION_USAGE_AGGREGATION) - query: str = self.queries.usage_query( - start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT), - end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT), - database=self.config.database, - ) - access_events_iterable: Iterable[ - RedshiftAccessEvent - ] = self._gen_access_events_from_history_query( - query, connection=self.connection, all_tables=all_tables - ) + with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION): + query: str = self.queries.usage_query( + start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT), + end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT), + database=self.config.database, + ) + access_events_iterable: Iterable[ + RedshiftAccessEvent + ] = self._gen_access_events_from_history_query( + query, connection=self.connection, all_tables=all_tables + ) - aggregated_events: AggregatedAccessEvents = self._aggregate_access_events( - access_events_iterable - ) - # Generate usage workunits from aggregated events. - for time_bucket in aggregated_events.values(): - for aggregate in time_bucket.values(): - wu: MetadataWorkUnit = self._make_usage_stat(aggregate) - self.report.num_usage_workunits_emitted += 1 - yield wu + aggregated_events: AggregatedAccessEvents = self._aggregate_access_events( + access_events_iterable + ) + # Generate usage workunits from aggregated events. 
+ for time_bucket in aggregated_events.values(): + for aggregate in time_bucket.values(): + wu: MetadataWorkUnit = self._make_usage_stat(aggregate) + self.report.num_usage_workunits_emitted += 1 + yield wu def _gen_operation_aspect_workunits( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index ceac9e96d1ddd0..989d0d734352a2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -6,9 +6,8 @@ import re import time from datetime import datetime -from itertools import groupby from pathlib import PurePath -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple from urllib.parse import urlparse import smart_open.compression as so_compression @@ -41,6 +40,7 @@ get_bucket_name, get_bucket_relative_path, get_key_prefix, + group_s3_objects_by_dirname, strip_s3_prefix, ) from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator @@ -75,6 +75,9 @@ from datahub.telemetry import stats, telemetry from datahub.utilities.perf_timer import PerfTimer +if TYPE_CHECKING: + from mypy_boto3_s3.service_resource import Bucket + # hide annoying debug errors from py4j logging.getLogger("py4j").setLevel(logging.ERROR) logger: logging.Logger = logging.getLogger(__name__) @@ -842,7 +845,7 @@ def get_dir_to_process( def get_folder_info( self, path_spec: PathSpec, - bucket: Any, # Todo: proper type + bucket: "Bucket", prefix: str, ) -> List[Folder]: """ @@ -857,22 +860,15 @@ def get_folder_info( Parameters: path_spec (PathSpec): The path specification used to determine partitioning. - bucket (Any): The S3 bucket object. + bucket (Bucket): The S3 bucket object. prefix (str): The prefix path in the S3 bucket to list objects from. Returns: List[Folder]: A list of Folder objects representing the partitions found. 
""" - - prefix_to_list = prefix - files = list( - bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE) - ) - files = sorted(files, key=lambda a: a.last_modified) - grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0]) - partitions: List[Folder] = [] - for key, group in grouped_files: + s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE) + for key, group in group_s3_objects_by_dirname(s3_objects).items(): file_size = 0 creation_time = None modification_time = None @@ -904,7 +900,7 @@ def get_folder_info( Folder( partition_id=id, is_partition=bool(id), - creation_time=creation_time if creation_time else None, + creation_time=creation_time if creation_time else None, # type: ignore[arg-type] modification_time=modification_time, sample_file=self.create_s3_path(max_file.bucket_name, max_file.key), size=file_size, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py index 030b2d43be81f9..b24471f8666afa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py @@ -166,6 +166,3 @@ def _is_tag_scanned(self, tag_name: str) -> bool: def report_tag_processed(self, tag_name: str) -> None: self._processed_tags.add(tag_name) - - def set_ingestion_stage(self, database: str, stage: str) -> None: - self.report_ingestion_stage_start(f"{database}: {stage}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 8a1bf15b7a7bc4..6f09c26b08da2d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -216,21 +216,23 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: try: for snowflake_db in self.databases: - self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION) - yield from self._process_database(snowflake_db) + with self.report.new_stage( + f"{snowflake_db.name}: {METADATA_EXTRACTION}" + ): + yield from self._process_database(snowflake_db) - self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE) - discovered_tables: List[str] = [ - self.identifiers.get_dataset_identifier( - table_name, schema.name, db.name - ) - for db in self.databases - for schema in db.schemas - for table_name in schema.tables - ] - if self.aggregator: - for entry in self._external_tables_ddl_lineage(discovered_tables): - self.aggregator.add(entry) + with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"): + discovered_tables: List[str] = [ + self.identifiers.get_dataset_identifier( + table_name, schema.name, db.name + ) + for db in self.databases + for schema in db.schemas + for table_name in schema.tables + ] + if self.aggregator: + for entry in self._external_tables_ddl_lineage(discovered_tables): + self.aggregator.add(entry) except SnowflakePermissionError as e: self.structured_reporter.failure( @@ -332,8 +334,8 @@ def _process_database( yield from self._process_db_schemas(snowflake_db, db_tables) if self.profiler and db_tables: - self.report.set_ingestion_stage(snowflake_db.name, PROFILING) - yield from self.profiler.get_workunits(snowflake_db, db_tables) + with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"): + yield from 
self.profiler.get_workunits(snowflake_db, db_tables) def _process_db_schemas( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index 4bdf559f293b51..85e4071aec07df 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -146,59 +146,58 @@ def get_usage_workunits( if not self._should_ingest_usage(): return - self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION) - if self.report.edition == SnowflakeEdition.STANDARD.value: - logger.info( - "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported." - ) - return + with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"): + if self.report.edition == SnowflakeEdition.STANDARD.value: + logger.info( + "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported." + ) + return - logger.info("Checking usage date ranges") + logger.info("Checking usage date ranges") - self._check_usage_date_ranges() + self._check_usage_date_ranges() - # If permission error, execution returns from here - if ( - self.report.min_access_history_time is None - or self.report.max_access_history_time is None - ): - return + # If permission error, execution returns from here + if ( + self.report.min_access_history_time is None + or self.report.max_access_history_time is None + ): + return - # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation - # Now, we report the usage as well as operation metadata even if user email is absent + # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation + # Now, we report the usage as well as operation metadata even if user email is absent - if self.config.include_usage_stats: - yield from auto_empty_dataset_usage_statistics( - self._get_workunits_internal(discovered_datasets), - config=BaseTimeWindowConfig( - start_time=self.start_time, - end_time=self.end_time, - bucket_duration=self.config.bucket_duration, - ), - dataset_urns={ - self.identifiers.gen_dataset_urn(dataset_identifier) - for dataset_identifier in discovered_datasets - }, - ) + if self.config.include_usage_stats: + yield from auto_empty_dataset_usage_statistics( + self._get_workunits_internal(discovered_datasets), + config=BaseTimeWindowConfig( + start_time=self.start_time, + end_time=self.end_time, + bucket_duration=self.config.bucket_duration, + ), + dataset_urns={ + self.identifiers.gen_dataset_urn(dataset_identifier) + for dataset_identifier in discovered_datasets + }, + ) - self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS) + with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"): + if self.config.include_operational_stats: + # Generate the operation workunits. + access_events = self._get_snowflake_history() + for event in access_events: + yield from self._get_operation_aspect_work_unit( + event, discovered_datasets + ) - if self.config.include_operational_stats: - # Generate the operation workunits. - access_events = self._get_snowflake_history() - for event in access_events: - yield from self._get_operation_aspect_work_unit( - event, discovered_datasets + if self.redundant_run_skip_handler: + # Update the checkpoint state for this run. 
+ self.redundant_run_skip_handler.update_state( + self.config.start_time, + self.config.end_time, + self.config.bucket_duration, ) - if self.redundant_run_skip_handler: - # Update the checkpoint state for this run. - self.redundant_run_skip_handler.update_state( - self.config.start_time, - self.config.end_time, - self.config.bucket_duration, - ) - def _get_workunits_internal( self, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: @@ -386,7 +385,7 @@ def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]: ) self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False) return - self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2) + self.report.access_history_query_secs = timer.elapsed_seconds(digits=2) for row in results: yield from self._process_snowflake_history_row(row) @@ -434,8 +433,8 @@ def _check_usage_date_ranges(self) -> None: self.report.max_access_history_time = db_row["MAX_TIME"].astimezone( tz=timezone.utc ) - self.report.access_history_range_query_secs = round( - timer.elapsed_seconds(), 2 + self.report.access_history_range_query_secs = timer.elapsed_seconds( + digits=2 ) def _get_operation_aspect_work_unit( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index aede3d056709a2..c0385a8d5af30a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -480,8 +480,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, ) - self.report.set_ingestion_stage("*", METADATA_EXTRACTION) - yield from schema_extractor.get_workunits_internal() + with self.report.new_stage(f"*: {METADATA_EXTRACTION}"): + yield from schema_extractor.get_workunits_internal() databases = schema_extractor.databases @@ -513,47 +513,46 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: discovered_datasets = discovered_tables + discovered_views if self.config.use_queries_v2: - self.report.set_ingestion_stage("*", VIEW_PARSING) - yield from auto_workunit(self.aggregator.gen_metadata()) - - self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) - - schema_resolver = self.aggregator._schema_resolver - - queries_extractor = SnowflakeQueriesExtractor( - connection=self.connection, - config=SnowflakeQueriesExtractorConfig( - window=self.config, - temporary_tables_pattern=self.config.temporary_tables_pattern, - include_lineage=self.config.include_table_lineage, - include_usage_statistics=self.config.include_usage_stats, - include_operations=self.config.include_operational_stats, - include_queries=self.config.include_queries, - include_query_usage_statistics=self.config.include_query_usage_statistics, - user_email_pattern=self.config.user_email_pattern, - ), - structured_report=self.report, - filters=self.filters, - identifiers=self.identifiers, - schema_resolver=schema_resolver, - discovered_tables=discovered_datasets, - graph=self.ctx.graph, - ) + with self.report.new_stage(f"*: {VIEW_PARSING}"): + yield from auto_workunit(self.aggregator.gen_metadata()) - # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs - # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors, - # it should be pretty straightforward to refactor this and only initialize the aggregator once. 
- self.report.queries_extractor = queries_extractor.report - yield from queries_extractor.get_workunits_internal() - queries_extractor.close() + with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"): + schema_resolver = self.aggregator._schema_resolver + + queries_extractor = SnowflakeQueriesExtractor( + connection=self.connection, + config=SnowflakeQueriesExtractorConfig( + window=self.config, + temporary_tables_pattern=self.config.temporary_tables_pattern, + include_lineage=self.config.include_table_lineage, + include_usage_statistics=self.config.include_usage_stats, + include_operations=self.config.include_operational_stats, + include_queries=self.config.include_queries, + include_query_usage_statistics=self.config.include_query_usage_statistics, + user_email_pattern=self.config.user_email_pattern, + ), + structured_report=self.report, + filters=self.filters, + identifiers=self.identifiers, + schema_resolver=schema_resolver, + discovered_tables=discovered_datasets, + graph=self.ctx.graph, + ) + + # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs + # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors, + # it should be pretty straightforward to refactor this and only initialize the aggregator once. + self.report.queries_extractor = queries_extractor.report + yield from queries_extractor.get_workunits_internal() + queries_extractor.close() else: if self.lineage_extractor: - self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION) - self.lineage_extractor.add_time_based_lineage_to_aggregator( - discovered_tables=discovered_tables, - discovered_views=discovered_views, - ) + with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"): + self.lineage_extractor.add_time_based_lineage_to_aggregator( + discovered_tables=discovered_tables, + discovered_views=discovered_views, + ) # This would emit view and external table ddl lineage # as well as query lineage via lineage_extractor diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py index e42564975c3d19..5b76fe41d92e97 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -878,7 +878,7 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit urns = self.schema_resolver.get_urns() if self.config.include_table_lineage or self.config.include_usage_statistics: - self.report.report_ingestion_stage_start("audit log extraction") - yield from self.get_audit_log_mcps(urns=urns) + with self.report.new_stage("Audit log extraction"): + yield from self.get_audit_log_mcps(urns=urns) yield from self.builder.gen_workunits() diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index d149402741e82f..2543cbe653ba72 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -118,6 +118,7 @@ ) from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo from datahub.ingestion.source.tableau.tableau_validation import check_user_role +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.metadata.com.linkedin.pegasus2avro.common import ( AuditStamp, ChangeAuditStamps, @@ -170,6 +171,8 @@ 
create_lineage_sql_parsed_result, ) from datahub.utilities import config_clean +from datahub.utilities.perf_timer import PerfTimer +from datahub.utilities.stats_collections import TopKDict from datahub.utilities.urns.dataset_urn import DatasetUrn try: @@ -643,12 +646,41 @@ class SiteIdContentUrl: @dataclass -class TableauSourceReport(StaleEntityRemovalSourceReport): +class TableauSourceReport( + StaleEntityRemovalSourceReport, + IngestionStageReport, +): get_all_datasources_query_failed: bool = False num_get_datasource_query_failures: int = 0 num_datasource_field_skipped_no_name: int = 0 num_csql_field_skipped_no_name: int = 0 num_table_field_skipped_no_name: int = 0 + # timers + extract_usage_stats_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + populate_projects_registry_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + emit_embedded_datasources_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_published_datasources_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_upstream_tables_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) # lineage num_tables_with_upstream_lineage: int = 0 num_upstream_table_lineage: int = 0 @@ -660,6 +692,7 @@ class TableauSourceReport(StaleEntityRemovalSourceReport): num_upstream_fine_grained_lineage_failed_parse_sql: int = 0 num_hidden_assets_skipped: int = 0 logged_in_user: List[UserInfo] = dataclass_field(default_factory=list) + last_authenticated_at: Optional[datetime] = None num_expected_tableau_metadata_queries: int = 0 @@ -834,6 +867,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: platform=self.platform, ) yield from site_source.ingest_tableau_site() + except MetadataQueryException as md_exception: self.report.failure( title="Failed to Retrieve Tableau Metadata", @@ -3489,33 +3523,87 @@ def _create_workbook_properties( return {"permissions": json.dumps(groups)} if len(groups) > 0 else None def ingest_tableau_site(self): - # Initialise the dictionary to later look-up for chart and dashboard stat - if self.config.extract_usage_stats: - self._populate_usage_stat_registry() - - if self.config.permission_ingestion: - self._fetch_groups() - - # Populate the map of database names and database hostnames to be used later to map - # databases to platform instances. 
- if self.config.database_hostname_to_platform_instance_map: - self._populate_database_server_hostname_map() - - self._populate_projects_registry() - - if self.config.add_site_container: - yield from self.emit_site_container() - yield from self.emit_project_containers() - yield from self.emit_workbooks() - if self.sheet_ids: - yield from self.emit_sheets() - if self.dashboard_ids: - yield from self.emit_dashboards() - if self.embedded_datasource_ids_being_used: - yield from self.emit_embedded_datasources() - if self.datasource_ids_being_used: - yield from self.emit_published_datasources() - if self.custom_sql_ids_being_used: - yield from self.emit_custom_sql_datasources() - if self.database_tables: - yield from self.emit_upstream_tables() + with self.report.new_stage( + f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}" + ): + # Initialise the dictionary to later look-up for chart and dashboard stat + if self.config.extract_usage_stats: + with PerfTimer() as timer: + self._populate_usage_stat_registry() + self.report.extract_usage_stats_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.config.permission_ingestion: + with PerfTimer() as timer: + self._fetch_groups() + self.report.fetch_groups_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + # Populate the map of database names and database hostnames to be used later to map + # databases to platform instances. + if self.config.database_hostname_to_platform_instance_map: + with PerfTimer() as timer: + self._populate_database_server_hostname_map() + self.report.populate_database_server_hostname_map_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + with PerfTimer() as timer: + self._populate_projects_registry() + self.report.populate_projects_registry_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.config.add_site_container: + yield from self.emit_site_container() + yield from self.emit_project_containers() + + with PerfTimer() as timer: + yield from self.emit_workbooks() + self.report.emit_workbooks_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.sheet_ids: + with PerfTimer() as timer: + yield from self.emit_sheets() + self.report.emit_sheets_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.dashboard_ids: + with PerfTimer() as timer: + yield from self.emit_dashboards() + self.report.emit_dashboards_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.embedded_datasource_ids_being_used: + with PerfTimer() as timer: + yield from self.emit_embedded_datasources() + self.report.emit_embedded_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.datasource_ids_being_used: + with PerfTimer() as timer: + yield from self.emit_published_datasources() + self.report.emit_published_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.custom_sql_ids_being_used: + with PerfTimer() as timer: + yield from self.emit_custom_sql_datasources() + self.report.emit_custom_sql_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.database_tables: + with PerfTimer() as timer: + yield from self.emit_upstream_tables() + self.report.emit_upstream_tables_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py 
b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 9d9a746580f939..43bd788f809c3e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -263,86 +263,86 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - self.report.report_ingestion_stage_start("Ingestion Setup") - wait_on_warehouse = None - if self.config.include_hive_metastore: - self.report.report_ingestion_stage_start("Start warehouse") - # Can take several minutes, so start now and wait later - wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() - if wait_on_warehouse is None: - self.report.report_failure( - "initialization", - f"SQL warehouse {self.config.profiling.warehouse_id} not found", - ) - return - else: - # wait until warehouse is started - wait_on_warehouse.result() + with self.report.new_stage("Ingestion Setup"): + wait_on_warehouse = None + if self.config.include_hive_metastore: + with self.report.new_stage("Start warehouse"): + # Can take several minutes, so start now and wait later + wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() + if wait_on_warehouse is None: + self.report.report_failure( + "initialization", + f"SQL warehouse {self.config.profiling.warehouse_id} not found", + ) + return + else: + # wait until warehouse is started + wait_on_warehouse.result() if self.config.include_ownership: - self.report.report_ingestion_stage_start("Ingest service principals") - self.build_service_principal_map() - self.build_groups_map() + with self.report.new_stage("Ingest service principals"): + self.build_service_principal_map() + self.build_groups_map() if self.config.include_notebooks: - self.report.report_ingestion_stage_start("Ingest notebooks") - yield from self.process_notebooks() + with self.report.new_stage("Ingest notebooks"): + yield from self.process_notebooks() yield from self.process_metastores() yield from self.get_view_lineage() if self.config.include_notebooks: - self.report.report_ingestion_stage_start("Notebook lineage") - for notebook in self.notebooks.values(): - wu = self._gen_notebook_lineage(notebook) - if wu: - yield wu + with self.report.new_stage("Notebook lineage"): + for notebook in self.notebooks.values(): + wu = self._gen_notebook_lineage(notebook) + if wu: + yield wu if self.config.include_usage_statistics: - self.report.report_ingestion_stage_start("Ingest usage") - usage_extractor = UnityCatalogUsageExtractor( - config=self.config, - report=self.report, - proxy=self.unity_catalog_api_proxy, - table_urn_builder=self.gen_dataset_urn, - user_urn_builder=self.gen_user_urn, - ) - yield from usage_extractor.get_usage_workunits( - self.table_refs | self.view_refs - ) - - if self.config.is_profiling_enabled(): - self.report.report_ingestion_stage_start("Start warehouse") - # Need to start the warehouse again for profiling, - # as it may have been stopped after ingestion might take - # longer time to complete - wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() - if wait_on_warehouse is None: - self.report.report_failure( - "initialization", - f"SQL warehouse {self.config.profiling.warehouse_id} not found", + with self.report.new_stage("Ingest usage"): + usage_extractor = UnityCatalogUsageExtractor( + config=self.config, + report=self.report, + proxy=self.unity_catalog_api_proxy, + table_urn_builder=self.gen_dataset_urn, + 
user_urn_builder=self.gen_user_urn, + ) + yield from usage_extractor.get_usage_workunits( + self.table_refs | self.view_refs ) - return - else: - # wait until warehouse is started - wait_on_warehouse.result() - self.report.report_ingestion_stage_start("Profiling") - if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig): - yield from UnityCatalogAnalyzeProfiler( - self.config.profiling, - self.report, - self.unity_catalog_api_proxy, - self.gen_dataset_urn, - ).get_workunits(self.table_refs) - elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig): - yield from UnityCatalogGEProfiler( - sql_common_config=self.config, - profiling_config=self.config.profiling, - report=self.report, - ).get_workunits(list(self.tables.values())) - else: - raise ValueError("Unknown profiling config method") + if self.config.is_profiling_enabled(): + with self.report.new_stage("Start warehouse"): + # Need to start the warehouse again for profiling, + # as it may have been stopped after ingestion might take + # longer time to complete + wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() + if wait_on_warehouse is None: + self.report.report_failure( + "initialization", + f"SQL warehouse {self.config.profiling.warehouse_id} not found", + ) + return + else: + # wait until warehouse is started + wait_on_warehouse.result() + + with self.report.new_stage("Profiling"): + if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig): + yield from UnityCatalogAnalyzeProfiler( + self.config.profiling, + self.report, + self.unity_catalog_api_proxy, + self.gen_dataset_urn, + ).get_workunits(self.table_refs) + elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig): + yield from UnityCatalogGEProfiler( + sql_common_config=self.config, + profiling_config=self.config.profiling, + report=self.report, + ).get_workunits(list(self.tables.values())) + else: + raise ValueError("Unknown profiling config method") def build_service_principal_map(self) -> None: try: @@ -462,11 +462,11 @@ def process_schemas(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]: self.report.schemas.dropped(schema.id) continue - self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}") - yield from self.gen_schema_containers(schema) - yield from self.process_tables(schema) + with self.report.new_stage(f"Ingest schema {schema.id}"): + yield from self.gen_schema_containers(schema) + yield from self.process_tables(schema) - self.report.schemas.processed(schema.id) + self.report.schemas.processed(schema.id) def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]: for table in self.unity_catalog_api_proxy.tables(schema=schema): diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index ce683e64b3f468..130a36e254fefd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -1,7 +1,7 @@ import logging +from contextlib import AbstractContextManager from dataclasses import dataclass, field from datetime import datetime, timezone -from typing import Optional from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.stats_collections import TopKDict @@ -22,25 +22,29 @@ @dataclass class IngestionStageReport: - ingestion_stage: Optional[str] = None ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict) - _timer: 
Optional[PerfTimer] = field( - default=None, init=False, repr=False, compare=False - ) - - def report_ingestion_stage_start(self, stage: str) -> None: - if self._timer: - elapsed = round(self._timer.elapsed_seconds(), 2) - logger.info( - f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds", - stacklevel=2, - ) - if self.ingestion_stage: - self.ingestion_stage_durations[self.ingestion_stage] = elapsed - else: - self._timer = PerfTimer() - - self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" - logger.info(f"Stage started: {self.ingestion_stage}") + def new_stage(self, stage: str) -> "IngestionStageContext": + return IngestionStageContext(stage, self) + + +@dataclass +class IngestionStageContext(AbstractContextManager): + def __init__(self, stage: str, report: IngestionStageReport): + self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" + self._timer: PerfTimer = PerfTimer() + self._report = report + + def __enter__(self) -> "IngestionStageContext": + logger.info(f"Stage started: {self._ingestion_stage}") self._timer.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + elapsed = self._timer.elapsed_seconds(digits=2) + logger.info( + f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds", + stacklevel=2, + ) + self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed + return None diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 9488683d6d8cac..fc1b1ed58244c3 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -57,7 +57,7 @@ def __exit__( self.finish() return None - def elapsed_seconds(self) -> float: + def elapsed_seconds(self, digits: int = 4) -> float: """ Returns the elapsed time in seconds. """ @@ -65,11 +65,18 @@ def elapsed_seconds(self) -> float: return self._past_active_time if self.end_time is None: - return (time.perf_counter() - self.start_time) + (self._past_active_time) + elapsed = (time.perf_counter() - self.start_time) + (self._past_active_time) else: - return (self.end_time - self.start_time) + self._past_active_time + elapsed = (self.end_time - self.start_time) + self._past_active_time + + return round(elapsed, digits) def assert_timer_is_running(self) -> None: + if not self.is_running(): + self._error_state = True + logger.warning("Did you forget to start the timer ?") + + def is_running(self) -> bool: """ Returns true if timer is in running state. Timer is in NOT in running state if @@ -77,9 +84,7 @@ def assert_timer_is_running(self) -> None: 2. it is in paused state. 3. it had been started and finished in the past but not started again. 
""" - if self.start_time is None or self.paused or self.end_time: - self._error_state = True - logger.warning("Did you forget to start the timer ?") + return self.start_time is not None and not self.paused and self.end_time is None def __repr__(self) -> str: return repr(self.as_obj()) diff --git a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py index 9cb80ff02657bb..24460f38298069 100644 --- a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py +++ b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py @@ -26,14 +26,14 @@ def run_test(): report = BigQueryV2Report() - report.set_ingestion_stage("All", "Seed Data Generation") - seed_metadata = generate_data( - num_containers=2000, - num_tables=20000, - num_views=2000, - time_range=timedelta(days=7), - ) - all_tables = seed_metadata.all_tables + with report.new_stage("All: Seed Data Generation"): + seed_metadata = generate_data( + num_containers=2000, + num_tables=20000, + num_views=2000, + time_range=timedelta(days=7), + ) + all_tables = seed_metadata.all_tables config = BigQueryV2Config( start_time=seed_metadata.start_time, @@ -51,42 +51,45 @@ def run_test(): schema_resolver=SchemaResolver(platform="bigquery"), identifiers=BigQueryIdentifierBuilder(config, report), ) - report.set_ingestion_stage("All", "Event Generation") - - num_projects = 100 - projects = [f"project-{i}" for i in range(num_projects)] - table_to_project = {table.name: random.choice(projects) for table in all_tables} - table_refs = {str(ref_from_table(table, table_to_project)) for table in all_tables} + with report.new_stage("All: Event Generation"): + num_projects = 100 + projects = [f"project-{i}" for i in range(num_projects)] + table_to_project = {table.name: random.choice(projects) for table in all_tables} + table_refs = { + str(ref_from_table(table, table_to_project)) for table in all_tables + } - queries = list( - generate_queries( - seed_metadata, - num_selects=240_000, - num_operations=800_000, - num_unique_queries=50_000, - num_users=2000, - query_length=NormalDistribution(2000, 500), + queries = list( + generate_queries( + seed_metadata, + num_selects=240_000, + num_operations=800_000, + num_unique_queries=50_000, + num_users=2000, + query_length=NormalDistribution(2000, 500), + ) ) - ) - queries.sort(key=lambda q: q.timestamp) - events = list(generate_events(queries, projects, table_to_project, config=config)) - print(f"Events generated: {len(events)}") - pre_mem_usage = psutil.Process(os.getpid()).memory_info().rss - print(f"Test data size: {humanfriendly.format_size(pre_mem_usage)}") + queries.sort(key=lambda q: q.timestamp) + events = list( + generate_events(queries, projects, table_to_project, config=config) + ) + print(f"Events generated: {len(events)}") + pre_mem_usage = psutil.Process(os.getpid()).memory_info().rss + print(f"Test data size: {humanfriendly.format_size(pre_mem_usage)}") - report.set_ingestion_stage("All", "Event Ingestion") - with PerfTimer() as timer: - workunits = usage_extractor._get_workunits_internal(events, table_refs) - num_workunits, peak_memory_usage = workunit_sink(workunits) - report.set_ingestion_stage("All", "Done") - print(f"Workunits Generated: {num_workunits}") - print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + with report.new_stage("All: Event Ingestion"): + with PerfTimer() as timer: + workunits = usage_extractor._get_workunits_internal(events, table_refs) + num_workunits, 
peak_memory_usage = workunit_sink(workunits) + with report.new_stage("All: Done"): + print(f"Workunits Generated: {num_workunits}") + print(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds") - print( - f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" - ) - print(f"Disk Used: {report.processing_perf.usage_state_size}") - print(f"Hash collisions: {report.num_usage_query_hash_collisions}") + print( + f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" + ) + print(f"Disk Used: {report.processing_perf.usage_state_size}") + print(f"Hash collisions: {report.num_usage_query_hash_collisions}") if __name__ == "__main__": diff --git a/metadata-ingestion/tests/performance/databricks/test_unity.py b/metadata-ingestion/tests/performance/databricks/test_unity.py index ddd19804ba1841..71192dc5b509bc 100644 --- a/metadata-ingestion/tests/performance/databricks/test_unity.py +++ b/metadata-ingestion/tests/performance/databricks/test_unity.py @@ -59,7 +59,7 @@ def run_test(): workunits = source.get_workunits() num_workunits, peak_memory_usage = workunit_sink(workunits) print(f"Workunits Generated: {num_workunits}") - print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + print(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds") print( f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py index 984d9e42957452..a940cce46a8f74 100644 --- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py @@ -53,7 +53,7 @@ def run_test(): workunits = source.get_workunits() num_workunits, peak_memory_usage = workunit_sink(workunits) logging.info(f"Workunits Generated: {num_workunits}") - logging.info(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + logging.info(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds") logging.info(source.get_report().as_string()) logging.info( diff --git a/metadata-ingestion/tests/performance/sql/test_sql_formatter.py b/metadata-ingestion/tests/performance/sql/test_sql_formatter.py index 5f783efc559bc9..f09047c0ec4a4f 100644 --- a/metadata-ingestion/tests/performance/sql/test_sql_formatter.py +++ b/metadata-ingestion/tests/performance/sql/test_sql_formatter.py @@ -12,12 +12,14 @@ def run_test() -> None: for i in range(N): if i % 50 == 0: print( - f"Running iteration {i}, elapsed time: {timer.elapsed_seconds():.2f} seconds" + f"Running iteration {i}, elapsed time: {timer.elapsed_seconds(digits=2)} seconds" ) try_format_query.__wrapped__(large_sql_query, platform="snowflake") - print(f"Total time taken for {N} iterations: {timer.elapsed_seconds():.2f} seconds") + print( + f"Total time taken for {N} iterations: {timer.elapsed_seconds(digits=2)} seconds" + ) if __name__ == "__main__": diff --git a/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py new file mode 100644 index 00000000000000..8bae38eaa74446 --- /dev/null +++ b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py @@ -0,0 +1,42 @@ +import time + +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport + + +def test_ingestion_stage_context_records_duration(): + report = IngestionStageReport() + with report.new_stage(stage="Test Stage"): + pass + assert 
len(report.ingestion_stage_durations) == 1 + assert "Test Stage" in next(iter(report.ingestion_stage_durations.keys())) + + +def test_ingestion_stage_context_handles_exceptions(): + report = IngestionStageReport() + try: + with report.new_stage(stage="Test Stage"): + raise ValueError("Test Exception") + except ValueError: + pass + assert len(report.ingestion_stage_durations) == 1 + assert "Test Stage" in next(iter(report.ingestion_stage_durations)) + + +def test_ingestion_stage_context_report_handles_multiple_stages(): + report = IngestionStageReport() + with report.new_stage(stage="Test Stage 1"): + time.sleep(0.1) + with report.new_stage(stage="Test Stage 2"): + time.sleep(0.1) + with report.new_stage(stage="Test Stage 3"): + time.sleep(0.1) + assert len(report.ingestion_stage_durations) == 3 + assert all( + isinstance(duration, float) and duration > 0.0 + for duration in report.ingestion_stage_durations.values() + ) + + sorted_stages = list(sorted(report.ingestion_stage_durations.keys())) + assert "Test Stage 1" in sorted_stages[0] + assert "Test Stage 2" in sorted_stages[1] + assert "Test Stage 3" in sorted_stages[2] diff --git a/metadata-ingestion/tests/unit/s3/test_s3_source.py b/metadata-ingestion/tests/unit/s3/test_s3_source.py index f826cf0179e221..902987213e122f 100644 --- a/metadata-ingestion/tests/unit/s3/test_s3_source.py +++ b/metadata-ingestion/tests/unit/s3/test_s3_source.py @@ -1,12 +1,15 @@ +from datetime import datetime from typing import List, Tuple +from unittest.mock import Mock import pytest from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator from datahub.ingestion.source.data_lake_common.path_spec import PathSpec -from datahub.ingestion.source.s3.source import partitioned_folder_comparator +from datahub.ingestion.source.s3.source import S3Source, partitioned_folder_comparator def test_partition_comparator_numeric_folder_name(): @@ -240,3 +243,63 @@ def container_properties_filter(x: MetadataWorkUnit) -> bool: "folder_abs_path": "my-bucket/my-dir/my-dir2", "platform": "s3", } + + +def test_get_folder_info(): + """ + Test S3Source.get_folder_info returns the latest file in each folder + """ + + def _get_s3_source(path_spec_: PathSpec) -> S3Source: + return S3Source.create( + config_dict={ + "path_spec": { + "include": path_spec_.include, + "table_name": path_spec_.table_name, + }, + }, + ctx=PipelineContext(run_id="test-s3"), + ) + + # arrange + path_spec = PathSpec( + include="s3://my-bucket/{table}/{partition0}/*.csv", + table_name="{table}", + ) + + bucket = Mock() + bucket.objects.filter().page_size = Mock( + return_value=[ + Mock( + bucket_name="my-bucket", + key="my-folder/dir1/0001.csv", + creation_time=datetime(2025, 1, 1, 1), + last_modified=datetime(2025, 1, 1, 1), + size=100, + ), + Mock( + bucket_name="my-bucket", + key="my-folder/dir2/0001.csv", + creation_time=datetime(2025, 1, 1, 2), + last_modified=datetime(2025, 1, 1, 2), + size=100, + ), + Mock( + bucket_name="my-bucket", + key="my-folder/dir1/0002.csv", + creation_time=datetime(2025, 1, 1, 2), + last_modified=datetime(2025, 1, 1, 2), + size=100, + ), + ] + ) + + # act + res = _get_s3_source(path_spec).get_folder_info( + path_spec, bucket, prefix="/my-folder" + ) + + # assert + assert len(res) == 2 + assert res[0].sample_file == "s3://my-bucket/my-folder/dir1/0002.csv" + assert 
res[1].sample_file == "s3://my-bucket/my-folder/dir2/0001.csv" diff --git a/metadata-ingestion/tests/unit/s3/test_s3_util.py b/metadata-ingestion/tests/unit/s3/test_s3_util.py new file mode 100644 index 00000000000000..7850d65ca8b01f --- /dev/null +++ b/metadata-ingestion/tests/unit/s3/test_s3_util.py @@ -0,0 +1,29 @@ +from unittest.mock import Mock + +from datahub.ingestion.source.aws.s3_util import group_s3_objects_by_dirname + + +def test_group_s3_objects_by_dirname(): + s3_objects = [ + Mock(key="/dir1/file1.txt"), + Mock(key="/dir2/file2.txt"), + Mock(key="/dir1/file3.txt"), + ] + + grouped_objects = group_s3_objects_by_dirname(s3_objects) + + assert len(grouped_objects) == 2 + assert grouped_objects["/dir1"] == [s3_objects[0], s3_objects[2]] + assert grouped_objects["/dir2"] == [s3_objects[1]] + + +def test_group_s3_objects_by_dirname_files_in_root_directory(): + s3_objects = [ + Mock(key="file1.txt"), + Mock(key="file2.txt"), + ] + + grouped_objects = group_s3_objects_by_dirname(s3_objects) + + assert len(grouped_objects) == 1 + assert grouped_objects["/"] == s3_objects diff --git a/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py b/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py new file mode 100644 index 00000000000000..d03b08b77d5a96 --- /dev/null +++ b/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py @@ -0,0 +1,213 @@ +from unittest.mock import Mock + +import pytest +import yaml + +from datahub.api.entities.structuredproperties.structuredproperties import ( + AllowedValue, + StructuredProperties, + TypeQualifierAllowedTypes, +) +from datahub.ingestion.graph.client import DataHubGraph +from datahub.metadata.schema_classes import ( + PropertyValueClass, + StructuredPropertyDefinitionClass, +) + + +@pytest.fixture +def sample_yaml_content(): + return """ +- id: test_property + type: string + description: Test description + display_name: Test Property + entity_types: + - dataset + cardinality: SINGLE + allowed_values: + - value: test_value + description: Test value description +""" + + +@pytest.fixture +def sample_yaml_file(tmp_path, sample_yaml_content): + yaml_file = tmp_path / "test_properties.yaml" + yaml_file.write_text(sample_yaml_content) + return str(yaml_file) + + +@pytest.fixture +def mock_graph(): + return Mock(spec=DataHubGraph) + + +def test_structured_properties_basic_creation(): + props = StructuredProperties( + id="test_prop", type="string", description="Test description" + ) + assert props.id == "test_prop" + assert props.type == "urn:li:dataType:datahub.string" + assert props.description == "Test description" + assert props.urn == "urn:li:structuredProperty:test_prop" + + +def test_structured_properties_validate_type(): + # Test valid types + props = StructuredProperties(id="test", type="string") + assert props.type == "urn:li:dataType:datahub.string" + + # Test invalid type + with pytest.raises(ValueError, match="Type .* is not allowed"): + StructuredProperties(id="test", type="invalid_type") + + +def test_structured_properties_validate_entity_types(): + # Test valid entity type + props = StructuredProperties(id="test", type="string", entity_types=["dataset"]) + assert props.entity_types + assert "urn:li:entityType:datahub.dataset" in props.entity_types + + # Test invalid entity type + with pytest.raises(ValueError, match="not a valid entity type"): + StructuredProperties(id="test", type="string", entity_types=["invalid_entity"]) + + +def 
test_structured_properties_from_yaml(sample_yaml_file): + props = StructuredProperties.from_yaml(sample_yaml_file) + assert len(props) == 1 + assert props[0].id == "test_property" + assert props[0].type == "urn:li:dataType:datahub.string" + assert props[0].description == "Test description" + assert props[0].display_name + assert props[0].display_name == "Test Property" + assert props[0].allowed_values + assert len(props[0].allowed_values) == 1 + assert props[0].allowed_values[0].value == "test_value" + + +def test_structured_properties_generate_mcps(): + props = StructuredProperties( + id="test_prop", + type="string", + description="Test description", + display_name="Test Property", + entity_types=["dataset"], + allowed_values=[ + AllowedValue(value="test_value", description="Test value description") + ], + ) + + mcps = props.generate_mcps() + assert len(mcps) == 1 + mcp = mcps[0] + + assert mcp.entityUrn == "urn:li:structuredProperty:test_prop" + assert isinstance(mcp.aspect, StructuredPropertyDefinitionClass) + assert mcp.aspect.valueType == "urn:li:dataType:datahub.string" + assert mcp.aspect.description == "Test description" + assert mcp.aspect.allowedValues + assert len(mcp.aspect.allowedValues) == 1 + assert mcp.aspect.allowedValues[0].value == "test_value" + + +def test_structured_properties_from_datahub(mock_graph): + mock_aspect = StructuredPropertyDefinitionClass( + qualifiedName="test_prop", + valueType="urn:li:dataType:datahub.string", + displayName="Test Property", + description="Test description", + entityTypes=["urn:li:entityType:datahub.dataset"], + cardinality="SINGLE", + allowedValues=[ + PropertyValueClass(value="test_value", description="Test description") + ], + ) + + mock_graph.get_aspect.return_value = mock_aspect + + props = StructuredProperties.from_datahub( + mock_graph, "urn:li:structuredProperty:test_prop" + ) + + assert props.qualified_name == "test_prop" + assert props.type == "urn:li:dataType:datahub.string" + assert props.display_name == "Test Property" + assert props.allowed_values + assert len(props.allowed_values) == 1 + assert props.allowed_values[0].value == "test_value" + + +def test_structured_properties_to_yaml(tmp_path): + props = StructuredProperties( + id="test_prop", + type="string", + description="Test description", + allowed_values=[ + AllowedValue(value="test_value", description="Test value description") + ], + ) + + yaml_file = tmp_path / "output.yaml" + props.to_yaml(yaml_file) + + # Verify the yaml file was created and contains expected content + assert yaml_file.exists() + with open(yaml_file) as f: + content = yaml.safe_load(f) + assert content["id"] == "test_prop" + assert content["type"] == "urn:li:dataType:datahub.string" + assert content["description"] == "Test description" + + +@pytest.mark.parametrize( + "input_type,expected_type", + [ + ("string", "urn:li:dataType:datahub.string"), + ("STRING", "urn:li:dataType:datahub.string"), + ("number", "urn:li:dataType:datahub.number"), + ("date", "urn:li:dataType:datahub.date"), + ], +) +def test_structured_properties_type_normalization(input_type, expected_type): + props = StructuredProperties(id="test_prop", type=input_type) + assert props.type == expected_type + + +def test_structured_properties_type_qualifier(): + props = StructuredProperties( + id="test_prop", + type="urn", + type_qualifier=TypeQualifierAllowedTypes(allowed_types=["dataset"]), + ) + + mcps = props.generate_mcps() + assert mcps[0].aspect + assert mcps[0].aspect.typeQualifier["allowedTypes"] == [ # type: ignore + 
"urn:li:entityType:datahub.dataset" + ] + + +def test_structured_properties_list(mock_graph): + mock_graph.get_urns_by_filter.return_value = [ + "urn:li:structuredProperty:prop1", + "urn:li:structuredProperty:prop2", + ] + + mock_aspect = StructuredPropertyDefinitionClass( + qualifiedName="test_prop", + valueType="urn:li:dataType:string", + entityTypes=["urn:li:entityType:datahub.dataset"], + ) + mock_graph.get_aspect.return_value = mock_aspect + + props = list(StructuredProperties.list(mock_graph)) + + # Verify get_urns_by_filter was called with correct arguments + mock_graph.get_urns_by_filter.assert_called_once_with( + entity_types=["structuredProperty"] + ) + + assert len(props) == 2 + assert all(isinstance(prop, StructuredProperties) for prop in props) diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py index 0c362473c0cf18..bee80ec33148e9 100644 --- a/metadata-ingestion/tests/unit/urns/test_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_urn.py @@ -4,7 +4,13 @@ import pytest -from datahub.metadata.urns import CorpUserUrn, DatasetUrn, Urn +from datahub.metadata.urns import ( + CorpUserUrn, + DataPlatformUrn, + DatasetUrn, + SchemaFieldUrn, + Urn, +) from datahub.utilities.urns.error import InvalidUrnError pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -60,6 +66,20 @@ def test_urn_coercion() -> None: assert urn == Urn.from_string(urn.urn()) +def test_urns_in_init() -> None: + platform = DataPlatformUrn("abc") + assert platform.urn() == "urn:li:dataPlatform:abc" + + dataset_urn = DatasetUrn(platform, "def", "PROD") + assert dataset_urn.urn() == "urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)" + + schema_field = SchemaFieldUrn(dataset_urn, "foo") + assert ( + schema_field.urn() + == "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD),foo)" + ) + + def test_urn_type_dispatch_1() -> None: urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)") assert isinstance(urn, DatasetUrn) diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 0193e5e2c5c6c3..1556b72e4aefb1 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -70,6 +70,7 @@ entities: - glossaryTerms - institutionalMemory - dataPlatformInstance + - container - browsePathsV2 - structuredProperties - forms @@ -93,6 +94,7 @@ entities: - glossaryTerms - institutionalMemory - dataPlatformInstance + - container - browsePathsV2 - structuredProperties - incidentsSummary diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml index c0c5be85b16b1d..8879a2f6549945 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml @@ -21,19 +21,30 @@ truncate_indices: {{truncate_indices}}{{^truncate_indices}}true{{/truncate_indices}} truncate_index_older_than_days: {{truncate_indices_retention_days}}{{^truncate_indices_retention_days}}30{{/truncate_indices_retention_days}} dataprocess_cleanup: + enabled: {{dataprocess_cleanup.enabled}}{{^dataprocess_cleanup.enabled}}false{{/dataprocess_cleanup.enabled}} retention_days: 
{{dataprocess_cleanup.retention_days}}{{^dataprocess_cleanup.retention_days}}10{{/dataprocess_cleanup.retention_days}} - delete_empty_data_jobs: {{dataprocess_cleanup.delete_empty_data_jobs}}{{^dataprocess_cleanup.delete_empty_data_jobs}}true{{/dataprocess_cleanup.delete_empty_data_jobs}} - delete_empty_data_flows: {{dataprocess_cleanup.delete_empty_data_flows}}{{^dataprocess_cleanup.delete_empty_data_flows}}true{{/dataprocess_cleanup.delete_empty_data_flows}} + delete_empty_data_jobs: {{dataprocess_cleanup.delete_empty_data_jobs}}{{^dataprocess_cleanup.delete_empty_data_jobs}}false{{/dataprocess_cleanup.delete_empty_data_jobs}} + delete_empty_data_flows: {{dataprocess_cleanup.delete_empty_data_flows}}{{^dataprocess_cleanup.delete_empty_data_flows}}false{{/dataprocess_cleanup.delete_empty_data_flows}} hard_delete_entities: {{dataprocess_cleanup.hard_delete_entities}}{{^dataprocess_cleanup.hard_delete_entities}}false{{/dataprocess_cleanup.hard_delete_entities}} keep_last_n: {{dataprocess_cleanup.keep_last_n}}{{^dataprocess_cleanup.keep_last_n}}5{{/dataprocess_cleanup.keep_last_n}} + batch_size: {{dataprocess_cleanup.batch_size}}{{^dataprocess_cleanup.batch_size}}500{{/dataprocess_cleanup.batch_size}} + max_workers: {{dataprocess_cleanup.max_workers}}{{^dataprocess_cleanup.max_workers}}10{{/dataprocess_cleanup.max_workers}} soft_deleted_entities_cleanup: retention_days: {{soft_deleted_entities_cleanup.retention_days}}{{^soft_deleted_entities_cleanup.retention_days}}10{{/soft_deleted_entities_cleanup.retention_days}} + enabled: {{soft_deleted_entities_cleanup.enabled}}{{^soft_deleted_entities_cleanup.enabled}}true{{/soft_deleted_entities_cleanup.enabled}} + batch_size: {{soft_deleted_entities_cleanup.batch_size}}{{^soft_deleted_entities_cleanup.batch_size}}500{{/soft_deleted_entities_cleanup.batch_size}} + max_workers: {{soft_deleted_entities_cleanup.max_workers}}{{^soft_deleted_entities_cleanup.max_workers}}10{{/soft_deleted_entities_cleanup.max_workers}} + limit_entities_delete: {{soft_deleted_entities_cleanup.limit_entities_delete}}{{^soft_deleted_entities_cleanup.limit_entities_delete}}25000{{/soft_deleted_entities_cleanup.limit_entities_delete}} + runtime_limit_seconds: {{soft_deleted_entities_cleanup.runtime_limit_seconds}}{{^soft_deleted_entities_cleanup.runtime_limit_seconds}}7200{{/soft_deleted_entities_cleanup.runtime_limit_seconds}} execution_request_cleanup: keep_history_min_count: {{execution_request_cleanup.keep_history_min_count}}{{^execution_request_cleanup.keep_history_min_count}}10{{/execution_request_cleanup.keep_history_min_count}} keep_history_max_count: {{execution_request_cleanup.keep_history_max_count}}{{^execution_request_cleanup.keep_history_max_count}}1000{{/execution_request_cleanup.keep_history_max_count}} - keep_history_max_days: {{execution_request_cleanup.keep_history_max_days}}{{^execution_request_cleanup.keep_history_max_days}}30{{/execution_request_cleanup.keep_history_max_days}} + keep_history_max_days: {{execution_request_cleanup.keep_history_max_days}}{{^execution_request_cleanup.keep_history_max_days}}90{{/execution_request_cleanup.keep_history_max_days}} batch_read_size: {{execution_request_cleanup.batch_read_size}}{{^execution_request_cleanup.batch_read_size}}100{{/execution_request_cleanup.batch_read_size}} - enabled: {{execution_request_cleanup.enabled}}{{^execution_request_cleanup.enabled}}false{{/execution_request_cleanup.enabled}} + enabled: 
{{execution_request_cleanup.enabled}}{{^execution_request_cleanup.enabled}}true{{/execution_request_cleanup.enabled}} + runtime_limit_seconds: {{execution_request_cleanup.runtime_limit_seconds}}{{^execution_request_cleanup.runtime_limit_seconds}}3600{{/execution_request_cleanup.runtime_limit_seconds}} + limit_entities_delete: {{execution_request_cleanup.limit_entities_delete}}{{^execution_request_cleanup.limit_entities_delete}}10000{{/execution_request_cleanup.limit_entities_delete}} + max_read_errors: {{execution_request_cleanup.max_read_errors}}{{^execution_request_cleanup.max_read_errors}}10{{/execution_request_cleanup.max_read_errors}} extraArgs: {} debugMode: false executorId: default diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index 8d4c5e9228a71c..ca775619220831 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -103,10 +103,10 @@ import java.util.stream.StreamSupport; import javax.annotation.Nonnull; import javax.annotation.Nullable; -import javax.mail.MethodNotSupportedException; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.NotImplementedException; +import org.apache.http.MethodNotSupportedException; import org.opensearch.core.common.util.CollectionUtils; @Slf4j @@ -1195,7 +1195,7 @@ public DataMap getRawAspect( @Nonnull String aspect, @Nonnull Long version) throws RemoteInvocationException { - throw new MethodNotSupportedException(); + throw new MethodNotSupportedException("Method not supported"); } @Override diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle index 07ce50993655d2..4b24eeac50b0b7 100644 --- a/metadata-utils/build.gradle +++ b/metadata-utils/build.gradle @@ -26,6 +26,7 @@ dependencies { implementation externalDependency.slf4jApi compileOnly externalDependency.lombok + runtimeOnly externalDependency.javaxMail annotationProcessor externalDependency.lombok @@ -40,6 +41,9 @@ dependencies { implementation(externalDependency.log4jApi) { because("previous versions are vulnerable to CVE-2021-45105") } + implementation(externalDependency.javaxMail) { + because("previous versions are vulnerable") + } } implementation externalDependency.logbackClassic diff --git a/smoke-test/tests/structured_properties/test_structured_properties.py b/smoke-test/tests/structured_properties/test_structured_properties.py index 533a03a55735a1..e3c33aa406efc4 100644 --- a/smoke-test/tests/structured_properties/test_structured_properties.py +++ b/smoke-test/tests/structured_properties/test_structured_properties.py @@ -839,3 +839,49 @@ def validate_search(qualified_name, expected): # Validate search works for property #1 & #2 validate_search(property1.qualified_name, expected=[]) validate_search(property2.qualified_name, expected=[dataset_urns[0]]) + + +def test_structured_properties_list(ingest_cleanup_data, graph_client, caplog): + # Create property, assign value to target dataset urn + def create_property(): + property_name = f"listTest{randint(10, 10000)}Property" + value_type = "string" + property_urn = f"urn:li:structuredProperty:{default_namespace}.{property_name}" + + create_property_definition( + property_name=property_name, + graph=graph_client, + value_type=value_type, + cardinality="SINGLE", + ) + + 
test_property = StructuredProperties.from_datahub( + graph=graph_client, urn=property_urn + ) + assert test_property is not None + + return test_property + + # create 2 structured properties + property1 = create_property() + property2 = create_property() + wait_for_writes_to_sync() + + # validate that urns are in the list + structured_properties_urns = [ + u for u in StructuredProperties.list_urns(graph_client) + ] + assert property1.urn in structured_properties_urns + assert property2.urn in structured_properties_urns + + # list structured properties (full) + structured_properties = StructuredProperties.list(graph_client) + matched_properties = [ + p for p in structured_properties if p.urn in [property1.urn, property2.urn] + ] + assert len(matched_properties) == 2 + retrieved_property1 = next(p for p in matched_properties if p.urn == property1.urn) + retrieved_property2 = next(p for p in matched_properties if p.urn == property2.urn) + + assert property1.dict() == retrieved_property1.dict() + assert property2.dict() == retrieved_property2.dict()
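
The recurring change across the Snowflake, Teradata, Tableau, Unity Catalog, and BigQuery sources in this patch is the switch from `report_ingestion_stage_start()` / `set_ingestion_stage()` calls to the `IngestionStageReport.new_stage()` context manager, which records a stage duration even when the wrapped block raises. A minimal usage sketch, mirroring the new unit tests in `tests/unit/reporting/test_ingestion_stage.py` (the stage label and the sleep are purely illustrative):

```python
import time

from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

report = IngestionStageReport()

# The context manager starts a PerfTimer on __enter__ and, on __exit__,
# stores the rounded duration in report.ingestion_stage_durations, keyed by
# "<stage> at <utc timestamp>".
with report.new_stage("my-db: Metadata Extraction"):
    time.sleep(0.1)  # stand-in for the real extraction work

for stage, seconds in report.ingestion_stage_durations.items():
    print(f"{stage}: {seconds}s")
```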
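
Related to the above, `PerfTimer.elapsed_seconds()` now rounds its result (4 digits by default), which is why call sites in this patch replace `round(timer.elapsed_seconds(), 2)` and `:.2f` formatting with `elapsed_seconds(digits=2)`; `is_running()` is the new public check that `assert_timer_is_running()` delegates to. A short sketch of the updated API (the sleep is only for illustration):

```python
import time

from datahub.utilities.perf_timer import PerfTimer

timer = PerfTimer()
assert not timer.is_running()  # not started yet

with timer:  # __enter__ starts the timer, __exit__ finishes it
    assert timer.is_running()
    time.sleep(0.05)

# Rounding now happens inside elapsed_seconds(); digits defaults to 4.
print(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds")
```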
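
Finally, `get_folder_info` in the S3 source now delegates to `group_s3_objects_by_dirname` from `datahub.ingestion.source.aws.s3_util` instead of sorting the listing and using `itertools.groupby`. Its contract, as exercised by the new `tests/unit/s3/test_s3_util.py`, is a mapping from directory name to the objects under it, with root-level keys grouped under "/"; the `Mock` objects below stand in for boto3 `ObjectSummary` instances:

```python
from unittest.mock import Mock

from datahub.ingestion.source.aws.s3_util import group_s3_objects_by_dirname

s3_objects = [
    Mock(key="/dir1/file1.txt"),
    Mock(key="/dir2/file2.txt"),
    Mock(key="/dir1/file3.txt"),
]

grouped = group_s3_objects_by_dirname(s3_objects)

# One group per directory; within a group, objects keep their input order.
assert grouped["/dir1"] == [s3_objects[0], s3_objects[2]]
assert grouped["/dir2"] == [s3_objects[1]]
```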