diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index eefa02be4f1af..26fcceb8aeab7 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -27,7 +27,6 @@ jobs: airflow-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -69,7 +68,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10' && matrix.extra_pip_requirements == 'apache-airflow>=2.7.0' }} with: name: Test Results (Airflow Plugin ${{ matrix.python-version}}) @@ -93,7 +92,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index bee1ec95e7774..d8a9cd7bfd6a3 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -27,7 +27,6 @@ jobs: dagster-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -44,7 +43,8 @@ jobs: with: distribution: "zulu" java-version: 17 - - uses: actions/checkout@v4 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -56,7 +56,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/dagster-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }} with: name: Test Results (dagster Plugin ${{ matrix.python-version}}) @@ -79,7 +79,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/gx-plugin.yml b/.github/workflows/gx-plugin.yml index 595438bd6e4a9..2fd814a076485 100644 --- a/.github/workflows/gx-plugin.yml +++ b/.github/workflows/gx-plugin.yml @@ -27,7 +27,6 @@ jobs: gx-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -48,7 +47,8 @@ jobs: with: distribution: "zulu" java-version: 17 - - uses: actions/checkout@v4 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -60,7 +60,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/gx-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.11' && matrix.extraPythonRequirement == 'great-expectations~=0.17.0' }} with: name: Test Results (GX Plugin ${{ matrix.python-version}}) @@ -83,7 +83,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 49def2a863c56..ad00c6d1551d1 
100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -28,7 +28,6 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 40 env: - SPARK_VERSION: 3.3.2 DATAHUB_TELEMETRY_ENABLED: false # TODO: Enable this once the test is fixed. # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} @@ -84,9 +83,9 @@ jobs: df -hl docker image ls docker system df - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: - name: Test Results (metadata ingestion ${{ matrix.python-version }}) + name: Test Results (metadata ingestion ${{ matrix.python-version }} ${{ matrix.command }}) path: | **/build/reports/tests/test/** **/build/test-results/test/** @@ -100,14 +99,14 @@ jobs: directory: ./build/coverage-reports/ fail_ci_if_error: false flags: pytest-${{ matrix.command }} - name: pytest-${{ matrix.command }} + name: pytest-${{ matrix.python-version }}-${{ matrix.command }} verbose: true event-file: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml index 3c75e8fe9a62f..e4a70426f3a61 100644 --- a/.github/workflows/prefect-plugin.yml +++ b/.github/workflows/prefect-plugin.yml @@ -27,25 +27,20 @@ jobs: prefect-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: python-version: ["3.8", "3.9", "3.10"] - include: - - python-version: "3.8" - - python-version: "3.9" - - python-version: "3.10" fail-fast: false steps: - name: Set up JDK 17 - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: "zulu" java-version: 17 - uses: gradle/actions/setup-gradle@v3 - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: "pip" @@ -56,7 +51,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/prefect-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10'}} with: name: Test Results (Prefect Plugin ${{ matrix.python-version}}) @@ -72,7 +67,7 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} directory: ./build/coverage-reports/ fail_ci_if_error: false - flags: prefect,prefect-${{ matrix.extra_pip_extras }} + flags: prefect,prefect-${{ matrix.python-version }} name: pytest-prefect-${{ matrix.python-version }} verbose: true @@ -80,7 +75,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/python-build-pages.yml b/.github/workflows/python-build-pages.yml new file mode 100644 index 0000000000000..8971722c374fb --- /dev/null +++ b/.github/workflows/python-build-pages.yml @@ -0,0 +1,64 @@ +name: Python Build +on: + push: + branches: + - master + paths: + - ".github/workflows/python-build-pages.yml" + - "metadata-ingestion/**" + - "metadata-ingestion-modules/**" + - "metadata-models/**" + pull_request: + branches: + - "**" + paths: + - ".github/workflows/python-build-pages.yml" + - "metadata-ingestion/**" + - "metadata-ingestion-modules/**" + - "metadata-models/**" + +concurrency: + group: ${{ github.workflow 
}}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + deploy-pages: + runs-on: ubuntu-latest + if: ${{ vars.CLOUDFLARE_WHEELS_PROJECT_NAME != '' }} + + name: Python Wheels + permissions: + contents: read + pull-requests: read + deployments: write + steps: + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + distribution: "zulu" + java-version: 17 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: "pip" + - uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }} + - name: Build Python wheel site + run: | + ./gradlew :python-build:buildSite + env: + GITHUB_TOKEN: ${{ github.token }} + - name: Publish + uses: cloudflare/pages-action@v1 + with: + apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} + accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + projectName: ${{ vars.CLOUDFLARE_WHEELS_PROJECT_NAME }} + workingDirectory: python-build + directory: site + gitHubToken: ${{ github.token }} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/TimeStampMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/TimeStampMapper.java new file mode 100644 index 0000000000000..9792079e66f64 --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/TimeStampMapper.java @@ -0,0 +1,28 @@ +package com.linkedin.datahub.graphql.types.common.mappers; + +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.TimeStamp; +import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +public class TimeStampMapper implements ModelMapper<com.linkedin.common.TimeStamp, TimeStamp> { + + public static final TimeStampMapper INSTANCE = new TimeStampMapper(); + + public static TimeStamp map( + @Nullable QueryContext context, @Nonnull final com.linkedin.common.TimeStamp timestamp) { + return INSTANCE.apply(context, timestamp); + } + + @Override + public TimeStamp apply( + @Nullable QueryContext context, @Nonnull final com.linkedin.common.TimeStamp timestamp) { + final TimeStamp result = new TimeStamp(); + result.setTime(timestamp.getTime()); + if (timestamp.hasActor()) { + result.setActor(timestamp.getActor().toString()); + } + return result; + } +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java new file mode 100644 index 0000000000000..62e7c90ab9b0e --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java @@ -0,0 +1,12 @@ +package com.linkedin.datahub.graphql.types.mappers; + +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.QueryContext; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +/** Made for models that are embedded in other models and thus do not encode their own URN.
*/ +public interface EmbeddedModelMapper<I, O> { O apply( @Nullable final QueryContext context, @Nonnull final I input, @Nonnull final Urn entityUrn); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java index d5eb1a15624dc..74076fd2f4ee9 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java @@ -75,7 +75,8 @@ public MLFeature apply( mlFeature.setOwnership( OwnershipMapper.map(context, new Ownership(dataMap), entityUrn))); mappingHelper.mapToResult( - context, ML_FEATURE_PROPERTIES_ASPECT_NAME, MLFeatureMapper::mapMLFeatureProperties); + ML_FEATURE_PROPERTIES_ASPECT_NAME, + (entity, dataMap) -> mapMLFeatureProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( INSTITUTIONAL_MEMORY_ASPECT_NAME, (mlFeature, dataMap) -> @@ -138,10 +139,13 @@ private static void mapMLFeatureKey(@Nonnull MLFeature mlFeature, @Nonnull DataM } private static void mapMLFeatureProperties( @Nullable final QueryContext context, @Nonnull MLFeature mlFeature, - @Nonnull DataMap dataMap) { + @Nonnull DataMap dataMap, + @Nonnull Urn entityUrn) { MLFeatureProperties featureProperties = new MLFeatureProperties(dataMap); - mlFeature.setFeatureProperties(MLFeaturePropertiesMapper.map(context, featureProperties)); - mlFeature.setProperties(MLFeaturePropertiesMapper.map(context, featureProperties)); + com.linkedin.datahub.graphql.generated.MLFeatureProperties graphqlProperties = + MLFeaturePropertiesMapper.map(context, featureProperties, entityUrn); + mlFeature.setFeatureProperties(graphqlProperties); + mlFeature.setProperties(graphqlProperties); mlFeature.setDescription(featureProperties.getDescription()); if (featureProperties.getDataType() != null) { mlFeature.setDataType(MLFeatureDataType.valueOf(featureProperties.getDataType().toString())); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java index 92d090275867d..08ac3a1b5f138 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java @@ -1,29 +1,34 @@ package com.linkedin.datahub.graphql.types.mlmodel.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.MLFeatureDataType; import com.linkedin.datahub.graphql.generated.MLFeatureProperties; -import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; public class MLFeaturePropertiesMapper - implements ModelMapper<com.linkedin.ml.metadata.MLFeatureProperties, MLFeatureProperties> { + implements EmbeddedModelMapper< com.linkedin.ml.metadata.MLFeatureProperties, MLFeatureProperties> { public static final
MLFeaturePropertiesMapper INSTANCE = new MLFeaturePropertiesMapper(); public static MLFeatureProperties map( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties) { - return INSTANCE.apply(context, mlFeatureProperties); + @Nonnull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties, + @Nonnull Urn entityUrn) { + return INSTANCE.apply(context, mlFeatureProperties, entityUrn); } @Override public MLFeatureProperties apply( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties) { + @Nonnull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties, + @Nonnull Urn entityUrn) { final MLFeatureProperties result = new MLFeatureProperties(); result.setDescription(mlFeatureProperties.getDescription()); @@ -45,6 +50,9 @@ public MLFeatureProperties apply( .collect(Collectors.toList())); } + result.setCustomProperties( + CustomPropertiesMapper.map(mlFeatureProperties.getCustomProperties(), entityUrn)); + return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java index 51d3004d97a61..65bc8e84f7bbb 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java @@ -76,7 +76,7 @@ public MLFeatureTable apply( mappingHelper.mapToResult(ML_FEATURE_TABLE_KEY_ASPECT_NAME, this::mapMLFeatureTableKey); mappingHelper.mapToResult( ML_FEATURE_TABLE_PROPERTIES_ASPECT_NAME, - (entity, dataMap) -> this.mapMLFeatureTableProperties(context, entity, dataMap, entityUrn)); + (entity, dataMap) -> mapMLFeatureTableProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( INSTITUTIONAL_MEMORY_ASPECT_NAME, (mlFeatureTable, dataMap) -> @@ -146,10 +146,10 @@ private static void mapMLFeatureTableProperties( @Nonnull DataMap dataMap, Urn entityUrn) { MLFeatureTableProperties featureTableProperties = new MLFeatureTableProperties(dataMap); - mlFeatureTable.setFeatureTableProperties( - MLFeatureTablePropertiesMapper.map(context, featureTableProperties, entityUrn)); - mlFeatureTable.setProperties( - MLFeatureTablePropertiesMapper.map(context, featureTableProperties, entityUrn)); + com.linkedin.datahub.graphql.generated.MLFeatureTableProperties graphqlProperties = + MLFeatureTablePropertiesMapper.map(context, featureTableProperties, entityUrn); + mlFeatureTable.setFeatureTableProperties(graphqlProperties); + mlFeatureTable.setProperties(graphqlProperties); mlFeatureTable.setDescription(featureTableProperties.getDescription()); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java index d9fed13ed0d0b..3c054cb6a9a5b 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java @@ -8,26 +8,30 @@ import com.linkedin.datahub.graphql.generated.MLFeatureTableProperties; import 
com.linkedin.datahub.graphql.generated.MLPrimaryKey; import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; -public class MLFeatureTablePropertiesMapper { +public class MLFeatureTablePropertiesMapper + implements EmbeddedModelMapper< + com.linkedin.ml.metadata.MLFeatureTableProperties, MLFeatureTableProperties> { public static final MLFeatureTablePropertiesMapper INSTANCE = new MLFeatureTablePropertiesMapper(); public static MLFeatureTableProperties map( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, - Urn entityUrn) { + @Nonnull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, + @Nonnull Urn entityUrn) { return INSTANCE.apply(context, mlFeatureTableProperties, entityUrn); } - public static MLFeatureTableProperties apply( + @Override + public MLFeatureTableProperties apply( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, - Urn entityUrn) { + @Nonnull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, + @Nonnull Urn entityUrn) { final MLFeatureTableProperties result = new MLFeatureTableProperties(); result.setDescription(mlFeatureTableProperties.getDescription()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java index 6e3da1c153392..9009972a47616 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java @@ -75,9 +75,8 @@ public MLModelGroup apply( mappingHelper.mapToResult( ML_MODEL_GROUP_KEY_ASPECT_NAME, MLModelGroupMapper::mapToMLModelGroupKey); mappingHelper.mapToResult( - context, ML_MODEL_GROUP_PROPERTIES_ASPECT_NAME, - MLModelGroupMapper::mapToMLModelGroupProperties); + (entity, dataMap) -> mapToMLModelGroupProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( STATUS_ASPECT_NAME, (mlModelGroup, dataMap) -> @@ -136,9 +135,13 @@ private static void mapToMLModelGroupKey(MLModelGroup mlModelGroup, DataMap data } private static void mapToMLModelGroupProperties( - @Nullable final QueryContext context, MLModelGroup mlModelGroup, DataMap dataMap) { + @Nullable final QueryContext context, + MLModelGroup mlModelGroup, + DataMap dataMap, + @Nonnull Urn entityUrn) { MLModelGroupProperties modelGroupProperties = new MLModelGroupProperties(dataMap); - mlModelGroup.setProperties(MLModelGroupPropertiesMapper.map(context, modelGroupProperties)); + mlModelGroup.setProperties( + MLModelGroupPropertiesMapper.map(context, modelGroupProperties, entityUrn)); if (modelGroupProperties.getDescription() != null) { mlModelGroup.setDescription(modelGroupProperties.getDescription()); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java index 9f1918f9ec489..a6cfded9865d9 100644 --- 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java @@ -1,27 +1,31 @@ package com.linkedin.datahub.graphql.types.mlmodel.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.MLModelGroupProperties; -import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; public class MLModelGroupPropertiesMapper - implements ModelMapper< + implements EmbeddedModelMapper< com.linkedin.ml.metadata.MLModelGroupProperties, MLModelGroupProperties> { public static final MLModelGroupPropertiesMapper INSTANCE = new MLModelGroupPropertiesMapper(); public static MLModelGroupProperties map( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties) { - return INSTANCE.apply(context, mlModelGroupProperties); + @Nonnull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties, + @Nonnull Urn entityUrn) { + return INSTANCE.apply(context, mlModelGroupProperties, entityUrn); } @Override public MLModelGroupProperties apply( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties) { + @Nonnull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties, + @Nonnull Urn entityUrn) { final MLModelGroupProperties result = new MLModelGroupProperties(); result.setDescription(mlModelGroupProperties.getDescription()); @@ -30,6 +34,9 @@ public MLModelGroupProperties apply( } result.setCreatedAt(mlModelGroupProperties.getCreatedAt()); + result.setCustomProperties( + CustomPropertiesMapper.map(mlModelGroupProperties.getCustomProperties(), entityUrn)); + return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java index a89904b3ab915..265005c2caa9e 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java @@ -7,25 +7,27 @@ import com.linkedin.datahub.graphql.generated.MLModelGroup; import com.linkedin.datahub.graphql.generated.MLModelProperties; import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; -public class MLModelPropertiesMapper { +public class MLModelPropertiesMapper + implements EmbeddedModelMapper<com.linkedin.ml.metadata.MLModelProperties, MLModelProperties> { public static final MLModelPropertiesMapper INSTANCE = new MLModelPropertiesMapper(); public static MLModelProperties map( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, + @Nonnull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, Urn entityUrn) {
return INSTANCE.apply(context, mlModelProperties, entityUrn); } public MLModelProperties apply( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, - Urn entityUrn) { + @Nonnull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, + @Nonnull Urn entityUrn) { final MLModelProperties result = new MLModelProperties(); result.setDate(mlModelProperties.getDate()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java index c446c892cb223..d48d93ede9c1a 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java @@ -74,9 +74,8 @@ public MLPrimaryKey apply( mappingHelper.mapToResult( ML_PRIMARY_KEY_KEY_ASPECT_NAME, MLPrimaryKeyMapper::mapMLPrimaryKeyKey); mappingHelper.mapToResult( - context, ML_PRIMARY_KEY_PROPERTIES_ASPECT_NAME, - MLPrimaryKeyMapper::mapMLPrimaryKeyProperties); + (entity, dataMap) -> mapMLPrimaryKeyProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( INSTITUTIONAL_MEMORY_ASPECT_NAME, (mlPrimaryKey, dataMap) -> @@ -132,11 +131,15 @@ private static void mapMLPrimaryKeyKey(MLPrimaryKey mlPrimaryKey, DataMap dataMa } private static void mapMLPrimaryKeyProperties( - @Nullable final QueryContext context, MLPrimaryKey mlPrimaryKey, DataMap dataMap) { + @Nullable final QueryContext context, + MLPrimaryKey mlPrimaryKey, + DataMap dataMap, + @Nonnull Urn entityUrn) { MLPrimaryKeyProperties primaryKeyProperties = new MLPrimaryKeyProperties(dataMap); - mlPrimaryKey.setPrimaryKeyProperties( - MLPrimaryKeyPropertiesMapper.map(context, primaryKeyProperties)); - mlPrimaryKey.setProperties(MLPrimaryKeyPropertiesMapper.map(context, primaryKeyProperties)); + com.linkedin.datahub.graphql.generated.MLPrimaryKeyProperties graphqlProperties = + MLPrimaryKeyPropertiesMapper.map(context, primaryKeyProperties, entityUrn); + mlPrimaryKey.setPrimaryKeyProperties(graphqlProperties); + mlPrimaryKey.setProperties(graphqlProperties); mlPrimaryKey.setDescription(primaryKeyProperties.getDescription()); if (primaryKeyProperties.getDataType() != null) { mlPrimaryKey.setDataType( diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java index 09e41fe7ee4e8..0bbe8f53f3271 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java @@ -1,30 +1,34 @@ package com.linkedin.datahub.graphql.types.mlmodel.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.MLFeatureDataType; import com.linkedin.datahub.graphql.generated.MLPrimaryKeyProperties; -import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import 
com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; public class MLPrimaryKeyPropertiesMapper - implements ModelMapper< + implements EmbeddedModelMapper< com.linkedin.ml.metadata.MLPrimaryKeyProperties, MLPrimaryKeyProperties> { public static final MLPrimaryKeyPropertiesMapper INSTANCE = new MLPrimaryKeyPropertiesMapper(); public static MLPrimaryKeyProperties map( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties) { - return INSTANCE.apply(context, mlPrimaryKeyProperties); + @Nonnull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties, + @Nonnull Urn entityUrn) { + return INSTANCE.apply(context, mlPrimaryKeyProperties, entityUrn); } @Override public MLPrimaryKeyProperties apply( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties) { + @Nonnull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties, + @Nonnull Urn entityUrn) { final MLPrimaryKeyProperties result = new MLPrimaryKeyProperties(); result.setDescription(mlPrimaryKeyProperties.getDescription()); @@ -45,6 +49,9 @@ public MLPrimaryKeyProperties apply( }) .collect(Collectors.toList())); + result.setCustomProperties( + CustomPropertiesMapper.map(mlPrimaryKeyProperties.getCustomProperties(), entityUrn)); + return result; } } diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 049527e5d77e3..b003cbd9b23dc 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -14,6 +14,11 @@ Root type used for fetching DataHub Metadata Coming soon listEntity queries for listing all entities of a given type """ type Query { + _empty: String +} + + +extend type Query { """ Fetch a Data Platform by primary key (urn) """ @@ -262,8 +267,16 @@ type Query { Fetch all Business Attributes """ listBusinessAttributes(input: ListBusinessAttributesInput!): ListBusinessAttributesResult + + """ + Fetch a Data Process Instance by primary key (urn) + """ + dataProcessInstance(urn: String!): DataProcessInstance + + } + """ An ERModelRelationship is a high-level abstraction that dictates what datasets fields are erModelRelationshiped. """ @@ -9483,6 +9496,18 @@ type AuditStamp { actor: String } +type TimeStamp { + """ + When did the event occur + """ + time: Long! 
+ + """ + The actor involved in the event + """ + actor: String +} + """ Input for creating a new group """ @@ -9827,13 +9852,47 @@ type MLModelGroup implements EntityWithRelationships & Entity & BrowsableEntity privileges: EntityPrivileges } +""" +Properties describing a group of related ML models +""" type MLModelGroupProperties { + """ + Display name of the model group + """ + name: String -description: String + """ + Detailed description of the model group's purpose and contents + """ + description: String - createdAt: Long + """ + When this model group was created + """ + created: TimeStamp + """ + When this model group was last modified + """ + lastModified: TimeStamp + + """ + Version identifier for this model group + """ version: VersionTag + + """ + Deprecated creation timestamp + @deprecated Use the 'created' field instead + """ + createdAt: Long @deprecated(reason: "Use `created` instead") + + customProperties: [CustomPropertiesEntry!] } """ @@ -9983,40 +10042,103 @@ description: String } type MLMetric { + """ + Name of the metric (e.g. accuracy, precision, recall) + """ name: String + """ + Description of what this metric measures + """ description: String + """ + The computed value of the metric + """ value: String + """ + Timestamp when this metric was recorded + """ createdAt: Long } type MLModelProperties { + """ + The display name of the model used in the UI + """ + name: String! + """ + Detailed description of the model's purpose and characteristics + """ description: String - date: Long + """ + When the model was last modified + """ + lastModified: TimeStamp + """ + Version identifier for this model + """ version: String + """ + The type/category of ML model (e.g. classification, regression) + """ type: String + """ + Mapping of hyperparameter configurations + """ hyperParameters: HyperParameterMap - hyperParams: [MLHyperParam] + """ + List of hyperparameter settings used to train this model + """ + hyperParams: [MLHyperParam] + """ + Performance metrics from model training + """ trainingMetrics: [MLMetric] + """ + Names of ML features used by this model + """ mlFeatures: [String!] + """ + Tags for categorizing and searching models + """ tags: [String!] + """ + Model groups this model belongs to + """ groups: [MLModelGroup] + """ + Additional custom properties specific to this model + """ customProperties: [CustomPropertiesEntry!] + """ + URL to view this model in external system + """ externalUrl: String + + """ + When this model was created + """ + created: TimeStamp + + """ + Deprecated timestamp for model creation + @deprecated Use 'created' field instead + """ + date: Long @deprecated(reason: "Use `created` instead") } type MLFeatureProperties { @@ -10028,6 +10150,8 @@ type MLFeatureProperties { version: VersionTag sources: [Dataset] + + customProperties: [CustomPropertiesEntry!] } """ @@ -10164,13 +10288,15 @@ type MLPrimaryKey implements EntityWithRelationships & Entity { type MLPrimaryKeyProperties { -description: String + description: String dataType: MLFeatureDataType version: VersionTag sources: [Dataset] + + customProperties: [CustomPropertiesEntry!]
} """ @@ -10347,7 +10473,7 @@ type MLModelGroupEditableProperties { type MLFeatureTableProperties { -description: String + description: String mlFeatures: [MLFeature] @@ -12793,3 +12919,92 @@ type CronSchedule { """ timezone: String! } + + +""" +Properties describing a data process instance's execution metadata +""" +type DataProcessInstanceProperties { + """ + The display name of this process instance + """ + name: String! + + """ + URL to view this process instance in the external system + """ + externalUrl: String + + """ + When this process instance was created + """ + created: TimeStamp + + """ + Additional custom properties specific to this process instance + """ + customProperties: [CustomPropertiesEntry!] +} + +""" +Properties specific to an ML model training run instance +""" +type MLTrainingRunProperties { + """ + Unique identifier for this training run + """ + id: String + + """ + List of URLs to access training run outputs (e.g. model artifacts, logs) + """ + outputUrls: [String] + + """ + Hyperparameters used in this training run + """ + hyperParams: [MLHyperParam] + + """ + Performance metrics recorded during this training run + """ + trainingMetrics: [MLMetric] +} + +extend type DataProcessInstance { + + """ + Additional read only properties associated with the Data Job + """ + properties: DataProcessInstanceProperties + + """ + The specific instance of the data platform that this entity belongs to + """ + dataPlatformInstance: DataPlatformInstance + + """ + Sub Types that this entity implements + """ + subTypes: SubTypes + + """ + The parent container in which the entity resides + """ + container: Container + + """ + Standardized platform urn where the data process instance is defined + """ + platform: DataPlatform! + + """ + Recursively get the lineage of containers for this entity + """ + parentContainers: ParentContainersResult + + """ + Additional properties when subtype is Training Run + """ + mlTrainingRunProperties: MLTrainingRunProperties +} \ No newline at end of file diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index ef482de9256a3..f19faa227ca61 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" - revision = 7 // increment to trigger rebuild + revision = 8 // increment to trigger rebuild } docker { diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index 113a6dcf0a1bd..b236a53c288f7 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? 
'-slim' : ''}" - revision = 8 // increment to trigger rebuild + revision = 9 // increment to trigger rebuild } dependencies { diff --git a/docker/profiles/docker-compose.actions.yml b/docker/profiles/docker-compose.actions.yml index c2985f4299326..459fffdd8acf3 100644 --- a/docker/profiles/docker-compose.actions.yml +++ b/docker/profiles/docker-compose.actions.yml @@ -6,7 +6,7 @@ x-search-datastore-elasticsearch-env: &search-datastore-env x-datahub-actions-service: &datahub-actions-service hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:-${DATAHUB_ACTIONS_REPO:-acryldata}/datahub-actions}:${ACTIONS_VERSION:-v0.1.1} + image: ${DATAHUB_ACTIONS_IMAGE:-${DATAHUB_ACTIONS_REPO:-acryldata}/datahub-actions}:${ACTIONS_VERSION:-v0.1.6} env_file: - datahub-actions/env/docker.env - ${DATAHUB_LOCAL_COMMON_ENV:-empty.env} diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 1860b4a49ae23..797863d2019fb 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -83,11 +83,7 @@ task yarnInstall(type: YarnTask) { task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLSchema, generateJsonSchema, ':metadata-ingestion:modelDocGen', ':metadata-ingestion:docGen', - ':metadata-ingestion:buildWheel', - ':metadata-ingestion-modules:airflow-plugin:buildWheel', - ':metadata-ingestion-modules:dagster-plugin:buildWheel', - ':metadata-ingestion-modules:prefect-plugin:buildWheel', - ':metadata-ingestion-modules:gx-plugin:buildWheel', + ':python-build:buildWheels', ]) { inputs.files(projectMdFiles) outputs.cacheIf { true } diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index 0f7e347da64eb..ad82a85f9e567 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -573,26 +573,20 @@ function write_markdown_file( function copy_python_wheels(): void { // Copy the built wheel files to the static directory. - const wheel_dirs = [ - "../metadata-ingestion/dist", - "../metadata-ingestion-modules/airflow-plugin/dist", - "../metadata-ingestion-modules/dagster-plugin/dist", - "../metadata-ingestion-modules/prefect-plugin/dist", - "../metadata-ingestion-modules/gx-plugin/dist", - ]; + // Everything is copied to the python-build directory first, so + // we just need to copy from there. 
+ const wheel_dir = "../python-build/wheels"; const wheel_output_directory = path.join(STATIC_DIRECTORY, "wheels"); fs.mkdirSync(wheel_output_directory, { recursive: true }); - for (const wheel_dir of wheel_dirs) { - const wheel_files = fs.readdirSync(wheel_dir); - for (const wheel_file of wheel_files) { - const src = path.join(wheel_dir, wheel_file); - const dest = path.join(wheel_output_directory, wheel_file); + const wheel_files = fs.readdirSync(wheel_dir); + for (const wheel_file of wheel_files) { + const src = path.join(wheel_dir, wheel_file); + const dest = path.join(wheel_output_directory, wheel_file); - // console.log(`Copying artifact ${src} to ${dest}...`); - fs.copyFileSync(src, dest); - } + // console.log(`Copying artifact ${src} to ${dest}...`); + fs.copyFileSync(src, dest); } } diff --git a/docs/advanced/patch.md b/docs/advanced/patch.md index 601d055659313..24e8c68a9168d 100644 --- a/docs/advanced/patch.md +++ b/docs/advanced/patch.md @@ -1,69 +1,120 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# But First, Semantics: Upsert versus Patch +# Emitting Patch Updates to DataHub ## Why Would You Use Patch -By default, most of the SDK tutorials and API-s involve applying full upserts at the aspect level. This means that typically, when you want to change one field within an aspect without modifying others, you need to do a read-modify-write to not overwrite existing fields. -To support these scenarios, DataHub supports PATCH based operations so that targeted changes to single fields or values within arrays of fields are possible without impacting other existing metadata. +By default, most of the SDK tutorials and APIs involve applying full upserts at the aspect level, i.e. replacing the aspect entirely. +This means that when you want to change even a single field within an aspect without modifying others, you need to do a read-modify-write to avoid overwriting existing fields. +To support these scenarios, DataHub supports `PATCH` operations that make targeted changes to individual fields, or to values within arrays of fields, without impacting other existing metadata. :::note -Currently, PATCH support is only available for a selected set of aspects, so before pinning your hopes on using PATCH as a way to make modifications to aspect values, confirm whether your aspect supports PATCH semantics. The complete list of Aspects that are supported are maintained [here](https://github.com/datahub-project/datahub/blob/9588440549f3d99965085e97b214a7dabc181ed2/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java#L24). In the near future, we do have plans to automatically support PATCH semantics for aspects by default. +Currently, PATCH support is only available for a selected set of aspects, so before pinning your hopes on using PATCH as a way to make modifications to aspect values, confirm whether your aspect supports PATCH semantics. The complete list of supported aspects is maintained [here](https://github.com/datahub-project/datahub/blob/9588440549f3d99965085e97b214a7dabc181ed2/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java#L24). ::: -## How To Use Patch +## How To Use Patches -Examples for using Patch are sprinkled throughout the API guides. Here's how to find the appropriate classes for the language for your choice.
- - + -The Java Patch builders are aspect-oriented and located in the [datahub-client](https://github.com/datahub-project/datahub/tree/master/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch) module under the `datahub.client.patch` namespace. +The Python Patch builders are entity-oriented and located in the [metadata-ingestion](https://github.com/datahub-project/datahub/tree/9588440549f3d99965085e97b214a7dabc181ed2/metadata-ingestion/src/datahub/specific) module and located in the `datahub.specific` module. +Patch builder helper classes exist for -Here are a few illustrative examples using the Java Patch builders: +- [Datasets](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dataset.py) +- [Charts](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/chart.py) +- [Dashboards](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dashboard.py) +- [Data Jobs (Tasks)](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/datajob.py) +- [Data Products](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dataproduct.py) +And we are gladly accepting contributions for Containers, Data Flows (Pipelines), Tags, Glossary Terms, Domains, and ML Models. -### Add Custom Properties +### Add & Remove Owners for Dataset -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAdd.java show_path_as_comment }} +To add & remove specific owners for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_owner_patch.py show_path_as_comment }} ``` -### Add and Remove Custom Properties +### Add & Remove Tags for Dataset -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAddRemove.java show_path_as_comment }} +To add & remove specific tags for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_tag_patch.py show_path_as_comment }} ``` -### Add Data Job Lineage +And for a specific schema field within the Dataset: -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DataJobLineageAdd.java show_path_as_comment }} +```python +{{ inline /metadata-ingestion/examples/library/dataset_field_add_tag_patch.py show_path_as_comment }} ``` - - +### Add & Remove Glossary Terms for Dataset + +To add & remove specific glossary terms for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py show_path_as_comment }} +``` + +And for a specific schema field within the Dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py show_path_as_comment }} +``` + +### Add & Remove Structured Properties for Dataset -The Python Patch builders are entity-oriented and located in the [metadata-ingestion](https://github.com/datahub-project/datahub/tree/9588440549f3d99965085e97b214a7dabc181ed2/metadata-ingestion/src/datahub/specific) module and located in the `datahub.specific` module. 
+To add & remove structured properties for a dataset: -Here are a few illustrative examples using the Python Patch builders: +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py show_path_as_comment }} +``` -### Add Properties to Dataset +### Add & Remove Upstream Lineage for Dataset + +To add & remove a lineage edge connecting a dataset to it's upstream or input at both the dataset and schema field level: ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py show_path_as_comment }} +``` + +### Add & Remove Read-Only Custom Properties for Dataset + +To add & remove specific custom properties for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py show_path_as_comment }} +``` + + + + +The Java Patch builders are aspect-oriented and located in the [datahub-client](https://github.com/datahub-project/datahub/tree/master/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch) module under the `datahub.client.patch` namespace. + +### Add & Remove Read-Only Custom Properties + +```java +{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAddRemove.java show_path_as_comment }} +``` + +### Add Data Job Lineage + +```java +{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DataJobLineageAdd.java show_path_as_comment }} ``` -## How Patch works +## Advanced: How Patch works To understand how patching works, it's important to understand a bit about our [models](../what/aspect.md). Entities are comprised of Aspects which can be reasoned about as JSON representations of the object models. To be able to patch these we utilize [JsonPatch](https://jsonpatch.com/). The components of a JSON Patch are the path, operation, and value. @@ -73,9 +124,6 @@ which can be reasoned about as JSON representations of the object models. To be The JSON path refers to a value within the schema. This can be a single field or can be an entire object reference depending on what the path is. For our patches we are primarily targeting single fields or even single array elements within a field. To be able to target array elements by id, we go through a translation process of the schema to transform arrays into maps. This allows a path to reference a particular array element by key rather than by index, for example a specific tag urn being added to a dataset. -This is important to note that for some fields in our schema that are arrays which do not necessarily restrict uniqueness, this puts a uniqueness constraint on the key. -The key for objects stored in arrays is determined manually by examining the schema and a long term goal is to make these keys annotation driven to reduce the amount of code needed to support -additional aspects to be patched. There is a generic patch endpoint, but it requires any array field keys to be specified at request time, putting a lot of burden on the API user. 
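The `{{ inline ... }}` references in the Python tab above are expanded from the metadata-ingestion example files at docs build time. For orientation only, here is a minimal sketch of the entity-oriented builder flow those examples follow, assuming a DataHub instance reachable at `http://localhost:8080`; the dataset and tag names are placeholders, not values from this PR:

```python
# Hedged sketch: emit a tag patch for a dataset without touching its other metadata.
from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import TagAssociationClass
from datahub.specific.dataset import DatasetPatchBuilder

# Placeholder URNs -- substitute your own dataset and tag.
dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD")
tag_urn = make_tag_urn("needs-documentation")

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

# The builder produces MetadataChangeProposals that target only the
# referenced array element (the tag association keyed by its URN).
for patch_mcp in (
    DatasetPatchBuilder(dataset_urn)
    .add_tag(TagAssociationClass(tag=tag_urn))
    .build()
):
    emitter.emit(patch_mcp)
```

Because each patch addresses an array element by key rather than by index, existing tags, owners, and properties on the dataset are left untouched, which is exactly the array-to-map keying described in the path discussion above.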
#### Examples @@ -87,8 +135,7 @@ Breakdown: * `/upstreams` -> References the upstreams field of the UpstreamLineage aspect, this is an array of Upstream objects where the key is the Urn * `/urn:...` -> The dataset to be targeted by the operation - -A patch path for targeting a fine grained lineage upstream: +A patch path for targeting a fine-grained lineage upstream: `/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD),foo)/urn:li:query:queryId/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created_upstream,PROD),bar)` @@ -118,7 +165,6 @@ using adds, but generally the most useful use case for patch is to add elements Remove operations require the path specified to be present, or an error will be thrown, otherwise they operate as one would expect. The specified path will be removed from the aspect. - ### Value Value is the actual information that will be stored at a path. If the path references an object then this will include the JSON key value pairs for that object. diff --git a/docs/api/tutorials/custom-properties.md b/docs/api/tutorials/custom-properties.md index fe0d7e62dcde8..86b1b2c0c54da 100644 --- a/docs/api/tutorials/custom-properties.md +++ b/docs/api/tutorials/custom-properties.md @@ -74,7 +74,7 @@ The following code adds custom properties `cluster_name` and `retention_time` to ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py show_path_as_comment }} ``` @@ -128,7 +128,7 @@ The following code shows you how can add and remove custom properties in the sam ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_remove_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py show_path_as_comment }} ``` diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md index be3a2d97514ef..75f5ac21224c2 100644 --- a/docs/managed-datahub/release-notes/v_0_3_7.md +++ b/docs/managed-datahub/release-notes/v_0_3_7.md @@ -13,12 +13,43 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies ## Known Issues +### v0.3.7.7 + * Postgres regression, non-functional when using postgres + ### v0.3.7.3 * Search page fails to render when filters are applied with a query which returns zero results. 
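Stepping back to the custom-properties tutorial above, which now points at `dataset_add_custom_properties_patch.py` and `dataset_add_remove_custom_properties_patch.py`: a rough sketch of that flow, assuming the `DatasetPatchBuilder` custom-property helpers (`add_custom_property` / `remove_custom_property`) and purely illustrative key names, might look like the following:

```python
# Hedged sketch: add and remove read-only custom properties via a patch.
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.specific.dataset import DatasetPatchBuilder

dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD")
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

# Only the named keys are modified; any other custom properties stay intact.
for patch_mcp in (
    DatasetPatchBuilder(dataset_urn)
    .add_custom_property("cluster_name", "datahubproject.acryl.io")
    .remove_custom_property("retention_time")
    .build()
):
    emitter.emit(patch_mcp)
```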
## Release Changelog --- +### v0.3.7.8 + +- [Postgres] Fix regression from MySQL fix in v0.3.7.7 + +### v0.3.7.7 + +- [UI] Fix bug showing upstream lineage dbt source leaves +- [UI] Show column-level lineage through transformational home node +- [UI] Browse nodes titles expand to full width of panel +- [UI] Data product preview cards display correctly +- [UI] Fix elasticsearch usage sort field names +- [UI] Add structured property display settings feature +- [Executor] Fix false errors on cli ingestions +- [Search] Schema field boost reduced +- [Search] Search usage ranking null_fill fix +- [Search] Single term with underscores by default no longer considered quoted +- [Metadata Tests] Metadata Test shutdown actions flush +- [Metadata Tests] Add deduplicate logic for MCP batches +- [Metadata Tests] Prevent mutation of systemMetadata in patch batches +- [MAE Consumer] Fix graph edge on container delete exception +- [Notifications] Filter out system ingestion source notifications +- [MySQL] Fix index gap lock deadlock +- [API] DataJobInputOutput finegrained lineage fix + +### v0.3.7.6 + +- [UI] fix(automations): white screen automations with dbt sync + ### v0.3.7.5 - [GMS] Fix upstream lineage patching when path contained encoded slash diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 2c5d8e6c9646a..fc1409fbed74e 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -23,8 +23,8 @@ task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { inputs.file file('setup.py') outputs.file(sentinel_file) commandLine 'bash', '-c', - "${python_executable} -m venv ${venv_name} && " + - "${venv_name}/bin/python -m pip install --upgrade pip uv wheel 'setuptools>=63.0.0' && " + + "${python_executable} -m venv ${venv_name} && set -x && " + + "${venv_name}/bin/python -m pip install --upgrade uv && " + "touch ${sentinel_file}" } diff --git a/metadata-ingestion/examples/ai/data_job_instance.py b/metadata-ingestion/examples/ai/data_job_instance.py new file mode 100644 index 0000000000000..8dcc6150fdd9a --- /dev/null +++ b/metadata-ingestion/examples/ai/data_job_instance.py @@ -0,0 +1,155 @@ +from dataclasses import dataclass +from typing import Iterable, Optional, Union + +import datahub.metadata.schema_classes as models +from datahub.api.entities.datajob import DataFlow, DataJob +from datahub.api.entities.dataprocess.dataprocess_instance import ( + DataProcessInstance, + InstanceRunResult, +) +from datahub.api.entities.dataset.dataset import Dataset +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.graph.client import get_default_graph +from datahub.metadata.urns import DatasetUrn, DataPlatformUrn +from datahub.emitter.mcp_builder import ContainerKey + +ORCHESTRATOR_MLFLOW = "mlflow" +ORCHESTRATOR_AIRFLOW = "airflow" + + +class ContainerKeyWithId(ContainerKey): + id: str + + +@dataclass +class Container: + key: ContainerKeyWithId + subtype: str + name: Optional[str] = None + description: Optional[str] = None + + def generate_mcp( + self, + ) -> Iterable[ + Union[models.MetadataChangeProposalClass, MetadataChangeProposalWrapper] + ]: + container_urn = self.key.as_urn() + + container_subtype = models.SubTypesClass(typeNames=[self.subtype]) + + container_info = models.ContainerPropertiesClass( + name=self.name or self.key.id, + description=self.description, + customProperties={}, + ) + + browse_path = models.BrowsePathsV2Class(path=[]) + + dpi = models.DataPlatformInstanceClass( + 
platform=self.key.platform, + instance=self.key.instance, + ) + + yield from MetadataChangeProposalWrapper.construct_many( + entityUrn=container_urn, + aspects=[container_subtype, container_info, browse_path, dpi], + ) + + +def generate_pipeline( + pipeline_name: str, + orchestrator: str, +) -> Iterable[Union[models.MetadataChangeProposalClass, MetadataChangeProposalWrapper]]: + data_flow = DataFlow( + id=pipeline_name, + orchestrator=orchestrator, + cluster="default", + name=pipeline_name, + ) + + data_job = DataJob(id="training", flow_urn=data_flow.urn, name="Training") + + dataset_1 = Dataset( + id="input_data", + name="input_data", + description="Input data", + properties={}, + platform="s3", + schema=None, + ) + + dataset_2 = Dataset( + id="output_data", + name="output_data", + description="Output data", + properties={}, + platform="s3", + schema=None, + ) + + if orchestrator == ORCHESTRATOR_MLFLOW: + # For Mlflow we create an experiment and a run + + experiment = Container( + key=ContainerKeyWithId( + platform=str(DataPlatformUrn.create_from_id("mlflow")), + id="experiment_1", + ), + subtype="Experiment", + name="Experiment 1", + description="Experiment 1 description", + ) + + yield from experiment.generate_mcp() + + data_process_instance = DataProcessInstance.from_container( + container_key=experiment.key, id="training_2024_01_01" + ) + + if orchestrator == ORCHESTRATOR_AIRFLOW: + # For Airflow we create a DAG and a task + data_process_instance = DataProcessInstance.from_datajob( + datajob=data_job, id="training_2024_01_01" + ) + yield from data_flow.generate_mcp() + yield from data_job.generate_mcp() + + # data_process_instance = DataProcessInstance.from_datajob( + # datajob=data_job, id="training_2024_01_01" + # ) + data_process_instance.subtype = "ML Training Run" + data_process_instance.inlets = [DatasetUrn.from_string(dataset_1.urn)] + data_process_instance.outlets = [DatasetUrn.from_string(dataset_2.urn)] + + yield from dataset_1.generate_mcp() + yield from dataset_2.generate_mcp() + print(f"Generating for {data_process_instance.urn}") + yield from data_process_instance.generate_mcp( + created_ts_millis=None, materialize_iolets=False + ) + # Finally generate the start and end events + # start date is Dec 3rd 2024 at 10am UTC + start_time_millis = 1735689600000 + # the job ran for 1 hour + end_time_millis = start_time_millis + 60 * 60 * 1000 + yield from data_process_instance.start_event_mcp( + # 5 days ago + start_timestamp_millis=start_time_millis + ) + yield from data_process_instance.end_event_mcp( + end_timestamp_millis=end_time_millis, + result=InstanceRunResult.SUCCESS, + start_timestamp_millis=start_time_millis, + ) + + +if __name__ == "__main__": + with get_default_graph() as graph: + for mcp in generate_pipeline( + "training_pipeline_mlflow", orchestrator=ORCHESTRATOR_MLFLOW + ): + graph.emit(mcp) + for mcp in generate_pipeline( + "training_pipeline_airflow", orchestrator=ORCHESTRATOR_AIRFLOW + ): + graph.emit(mcp) diff --git a/metadata-ingestion/examples/ai/demo_script.py b/metadata-ingestion/examples/ai/demo_script.py new file mode 100644 index 0000000000000..1503f00b0e14b --- /dev/null +++ b/metadata-ingestion/examples/ai/demo_script.py @@ -0,0 +1,373 @@ +import time +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import Iterable, List, Optional, Union +import random + +import datahub.metadata.schema_classes as models +from datahub.api.entities.datajob import DataFlow, DataJob +from 
datahub.api.entities.dataprocess.dataprocess_instance import ( + DataProcessInstance, + InstanceRunResult, +) +from datahub.api.entities.dataset.dataset import Dataset +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.graph.client import get_default_graph +from datahub.metadata.urns import ( + DatasetUrn, + DataPlatformUrn, + MlModelGroupUrn, + MlModelUrn, +) +from datahub.emitter.mcp_builder import ContainerKey + +ORCHESTRATOR_MLFLOW = "mlflow" +ORCHESTRATOR_AIRFLOW = "airflow" + + +class ContainerKeyWithId(ContainerKey): + id: str + + +@dataclass +class Container: + key: ContainerKeyWithId + subtype: str + name: Optional[str] = None + description: Optional[str] = None + + def generate_mcp( + self, + ) -> Iterable[ + Union[models.MetadataChangeProposalClass, MetadataChangeProposalWrapper] + ]: + container_urn = self.key.as_urn() + + container_subtype = models.SubTypesClass(typeNames=[self.subtype]) + + container_info = models.ContainerPropertiesClass( + name=self.name or self.key.id, + description=self.description, + customProperties={}, + ) + + browse_path = models.BrowsePathsV2Class(path=[]) + + dpi = models.DataPlatformInstanceClass( + platform=self.key.platform, + instance=self.key.instance, + ) + + yield from MetadataChangeProposalWrapper.construct_many( + entityUrn=container_urn, + aspects=[container_subtype, container_info, browse_path, dpi], + ) + + +def create_model( + model_name: str, + model_group_urn: str, + tags: List[str], + version_aliases: List[str], + index: int, + training_metrics: List[models.MLMetricClass], + hyper_params: List[models.MLHyperParamClass], + model_description: str, + created_at: int, +) -> Iterable[MetadataChangeProposalWrapper]: + model_urn = MlModelUrn(platform="mlflow", name=model_name) + model_info = models.MLModelPropertiesClass( + displayName=f"{model_name}", + description=model_description, + version=models.VersionTagClass(versionTag=f"{index}"), + groups=[str(model_group_urn)], + date=created_at, + lastModified=created_at, + createdBy=f"user_{index}", + versionAliases=[ + models.VersionAssociationClass( + version=models.VersionTagClass(versionTag=alias), + ) + for alias in version_aliases + ], + tags=tags, + # trainingMetrics=training_metrics, + hyperParams=hyper_params, + ) + + yield MetadataChangeProposalWrapper( + entityUrn=model_urn, + aspect=model_info, + ) + + +def generate_pipeline( + pipeline_name: str, + orchestrator: str, +) -> Iterable[Union[models.MetadataChangeProposalClass, MetadataChangeProposalWrapper]]: + data_flow = DataFlow( + id=pipeline_name, + orchestrator=orchestrator, + cluster="default", + name=pipeline_name, + ) + + data_job = DataJob(id="training", flow_urn=data_flow.urn, name="Training") + + input_dataset = Dataset( + id="airline_passengers", + name="Airline Passengers", + description="Monthly airline passenger data", + properties={}, + platform="s3", + schema=None, + ) + + if orchestrator == ORCHESTRATOR_MLFLOW: + experiment = Container( + key=ContainerKeyWithId( + platform=str(DataPlatformUrn.create_from_id("mlflow")), + id="airline_forecast_experiment", + ), + subtype="ML Experiment", + name="Airline Forecast Experiment", + description="Experiment for forecasting airline passengers", + ) + + yield from experiment.generate_mcp() + + model_group_urn = MlModelGroupUrn( + platform="mlflow", name="airline_forecast_models" + ) + current_time = int(time.time() * 1000) + model_group_info = models.MLModelGroupPropertiesClass( + description="ML models for airline passenger forecasting", + 
customProperties={ + "stage": "production", + "team": "data_science", + }, + createdAt=current_time, + lastModified=current_time, + createdBy="john_doe", + ) + + yield MetadataChangeProposalWrapper( + entityUrn=model_group_urn, + aspect=model_group_info, + ) + + print("model_group_urn: ", model_group_urn) + + model_aliases = [ + "challenger", + "champion", + "production", + "experimental", + "deprecated", + ] + model_tags = [ + "stage:production", + "stage:development", + "team:data_science", + "team:ml_engineering", + "team:analytics", + ] + + model_dict = { + "arima_model_1": "ARIMA model for airline passenger forecasting", + "arima_model_2": "Enhanced ARIMA model with seasonal components", + "arima_model_3": "ARIMA model optimized for long-term forecasting", + "arima_model_4": "ARIMA model with hyperparameter tuning", + "arima_model_5": "ARIMA model trained on extended dataset", + } + + # Generate run timestamps within the last month + end_time = int(time.time() * 1000) # Current timestamp in milliseconds + start_time = end_time - ( + 30 * 24 * 60 * 60 * 1000 + ) # 30 days ago in milliseconds + run_timestamps = [ + start_time + (i * 5 * 24 * 60 * 60 * 1000) # 5-day intervals + for i in range(5) + ] + + run_dict = { + "run_1": { + "start_time": run_timestamps[0], + "duration": 45, + "result": InstanceRunResult.SUCCESS, + }, + "run_2": { + "start_time": run_timestamps[1], + "duration": 60, + "result": InstanceRunResult.FAILURE, + }, + "run_3": { + "start_time": run_timestamps[2], + "duration": 55, + "result": InstanceRunResult.SUCCESS, + }, + "run_4": { + "start_time": run_timestamps[3], + "duration": 70, + "result": InstanceRunResult.SUCCESS, + }, + "run_5": { + "start_time": run_timestamps[4], + "duration": 50, + "result": InstanceRunResult.FAILURE, + }, + } + + for i, (model_name, model_description) in enumerate( + model_dict.items(), start=1 + ): + run_id = f"run_{i}" + data_process_instance = DataProcessInstance.from_container( + container_key=experiment.key, id=run_id + ) + + data_process_instance.subtype = "ML Training Run" + data_process_instance.inlets = [DatasetUrn.from_string(input_dataset.urn)] + + output_dataset = Dataset( + id=f"passenger_forecast_24_12_0{i}", + name=f"Passenger Forecast 24_12_0{i}", + description=f"Forecasted airline passenger numbers for run {i}", + properties={}, + platform="s3", + schema=None, + ) + yield from output_dataset.generate_mcp() + + data_process_instance.outlets = [DatasetUrn.from_string(output_dataset.urn)] + + # Training metrics and hyperparameters + training_metrics = [ + models.MLMetricClass( + name="accuracy", + value=str(random.uniform(0.7, 0.99)), + description="Test accuracy", + ), + models.MLMetricClass( + name="f1_score", + value=str(random.uniform(0.7, 0.99)), + description="Test F1 score", + ), + ] + hyper_params = [ + models.MLHyperParamClass( + name="n_estimators", + value=str(random.randint(50, 200)), + description="Number of trees", + ), + models.MLHyperParamClass( + name="max_depth", + value=str(random.randint(5, 15)), + description="Maximum tree depth", + ), + ] + + # DPI properties + created_at = int(time.time() * 1000) + print(start_time) + dpi_props = models.DataProcessInstancePropertiesClass( + name=f"Training {run_id}", + created=models.AuditStampClass( + time=created_at, actor="urn:li:corpuser:datahub" + ), + createdAt=int(created_at / 1000), + createdBy="jane_doe", + loggedModels=["sklearn"], + artifactsLocation="s3://mlflow/artifacts", + externalUrl="http://mlflow:5000", + customProperties={ + "framework": 
"statsmodels", + "python_version": "3.8", + }, + id=run_id, + trainingMetrics=training_metrics, + hyperParams=hyper_params, + ) + + yield from data_process_instance.generate_mcp( + created_ts_millis=created_at, materialize_iolets=True + ) + + yield MetadataChangeProposalWrapper( + entityUrn=str(data_process_instance.urn), + aspect=dpi_props, + ) + + dpi_output_model = models.DataProcessInstanceOutputClass( + name="model", + description="Trained model", + + ) + + yield MetadataChangeProposalWrapper( + entityUrn=str(data_process_instance.urn), + aspect=dpi_output_model, + ) + + # Generate start and end events + start_time_millis = run_dict[run_id]["start_time"] + duration_minutes = run_dict[run_id]["duration"] + end_time_millis = start_time_millis + duration_minutes * 60000 + result = run_dict[run_id]["result"] + result_type = ( + "SUCCESS" if result == InstanceRunResult.SUCCESS else "FAILURE" + ) + + yield from data_process_instance.start_event_mcp( + start_timestamp_millis=start_time_millis + ) + yield from data_process_instance.end_event_mcp( + end_timestamp_millis=end_time_millis, + result=result, + result_type=result_type, + start_timestamp_millis=start_time_millis, + ) + + print("data_process_instance.urn: ", data_process_instance.urn) + print("start Time:", start_time_millis) + print( + "start Time:", + time.strftime( + "%Y-%m-%d %H:%M:%S", time.localtime(start_time_millis / 1000) + ), + ) + + # Model + selected_aliases = random.sample(model_aliases, k=random.randint(1, 2)) + selected_tags = random.sample(model_tags, 2) + yield from create_model( + model_name=model_name, + model_group_urn=str(model_group_urn), + data_process_instance_urn=str(data_process_instance.urn), + tags=selected_tags, + version_aliases=selected_aliases, + index=i, + training_metrics=training_metrics, + hyper_params=hyper_params, + model_description=model_description, + created_at=created_at, + ) + + if orchestrator == ORCHESTRATOR_AIRFLOW: + yield from data_flow.generate_mcp() + yield from data_job.generate_mcp() + + yield from input_dataset.generate_mcp() + + +if __name__ == "__main__": + with get_default_graph() as graph: + for mcp in generate_pipeline( + "airline_forecast_pipeline_mlflow", orchestrator=ORCHESTRATOR_MLFLOW + ): + graph.emit(mcp) + for mcp in generate_pipeline( + "airline_forecast_pipeline_airflow", orchestrator=ORCHESTRATOR_AIRFLOW + ): + graph.emit(mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py new file mode 100644 index 0000000000000..7231461fea322 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py @@ -0,0 +1,19 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add Custom Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_custom_property("cluster_name", "datahubproject.acryl.io") +patch_builder.add_custom_property("retention_time", "2 years") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git 
a/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py b/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py new file mode 100644 index 0000000000000..d0b9a866fde61 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py @@ -0,0 +1,22 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import GlossaryTermAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Term for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_term(GlossaryTermAssociationClass(make_term_urn("term-to-add-id"))) +patch_builder.remove_term(make_term_urn("term-to-remove-id")) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_owner_patch.py b/metadata-ingestion/examples/library/dataset_add_owner_patch.py new file mode 100644 index 0000000000000..8d3130c09c4bb --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_owner_patch.py @@ -0,0 +1,24 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_group_urn, make_user_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import OwnerClass, OwnershipTypeClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Owners +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_owner( + OwnerClass(make_user_urn("user-to-add-id"), OwnershipTypeClass.TECHNICAL_OWNER) +) +patch_builder.remove_owner(make_group_urn("group-to-remove-id")) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_properties.py b/metadata-ingestion/examples/library/dataset_add_properties.py deleted file mode 100644 index b72aac5b82800..0000000000000 --- a/metadata-ingestion/examples/library/dataset_add_properties.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging -from typing import Union - -from datahub.configuration.kafka import KafkaProducerConnectionConfig -from datahub.emitter.kafka_emitter import DatahubKafkaEmitter, KafkaEmitterConfig -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# Get an emitter, either REST or Kafka, this example shows you both -def get_emitter() -> Union[DataHubRestEmitter, DatahubKafkaEmitter]: - USE_REST_EMITTER = True - if USE_REST_EMITTER: - gms_endpoint = "http://localhost:8080" - return DataHubRestEmitter(gms_server=gms_endpoint) - else: - kafka_server = "localhost:9092" - schema_registry_url = 
"http://localhost:8081" - return DatahubKafkaEmitter( - config=KafkaEmitterConfig( - connection=KafkaProducerConnectionConfig( - bootstrap=kafka_server, schema_registry_url=schema_registry_url - ) - ) - ) - - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - -with get_emitter() as emitter: - for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_custom_property("cluster_name", "datahubproject.acryl.io") - .add_custom_property("retention_time", "2 years") - .build() - ): - emitter.emit(patch_mcp) - - -log.info(f"Added cluster_name, retention_time properties to dataset {dataset_urn}") diff --git a/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py new file mode 100644 index 0000000000000..c1db9c91d13ec --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py @@ -0,0 +1,19 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add + Remove Custom Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_custom_property("cluster_name", "datahubproject.acryl.io") +patch_builder.remove_custom_property("retention_time") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_remove_properties.py b/metadata-ingestion/examples/library/dataset_add_remove_properties.py deleted file mode 100644 index 7109c0264f971..0000000000000 --- a/metadata-ingestion/examples/library/dataset_add_remove_properties.py +++ /dev/null @@ -1,46 +0,0 @@ -import logging -from typing import Union - -from datahub.configuration.kafka import KafkaProducerConnectionConfig -from datahub.emitter.kafka_emitter import DatahubKafkaEmitter, KafkaEmitterConfig -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# Get an emitter, either REST or Kafka, this example shows you both -def get_emitter() -> Union[DataHubRestEmitter, DatahubKafkaEmitter]: - USE_REST_EMITTER = True - if USE_REST_EMITTER: - gms_endpoint = "http://localhost:8080" - return DataHubRestEmitter(gms_server=gms_endpoint) - else: - kafka_server = "localhost:9092" - schema_registry_url = "http://localhost:8081" - return DatahubKafkaEmitter( - config=KafkaEmitterConfig( - connection=KafkaProducerConnectionConfig( - bootstrap=kafka_server, schema_registry_url=schema_registry_url - ) - ) - ) - - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - -with get_emitter() as emitter: - for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_custom_property("cluster_name", "datahubproject.acryl.io") - .remove_custom_property("retention_time") - .build() - ): - emitter.emit(patch_mcp) - - -log.info( - f"Added cluster_name property, removed retention_time property from dataset {dataset_urn}" -) diff --git 
a/metadata-ingestion/examples/library/dataset_add_structured_properties.py b/metadata-ingestion/examples/library/dataset_add_structured_properties.py deleted file mode 100644 index fc2c379340592..0000000000000 --- a/metadata-ingestion/examples/library/dataset_add_structured_properties.py +++ /dev/null @@ -1,24 +0,0 @@ -import logging - -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - -# Create rest emitter -rest_emitter = DataHubRestEmitter(gms_server="http://localhost:8080") - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - - -for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_structured_property("io.acryl.dataManagement.replicationSLA", 12) - .build() -): - rest_emitter.emit(patch_mcp) - - -log.info(f"Added cluster_name, retention_time properties to dataset {dataset_urn}") diff --git a/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py new file mode 100644 index 0000000000000..ef72ed58a4b82 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py @@ -0,0 +1,23 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add and Remove Structured Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_structured_property( + "urn:li:structuredProperty:retentionTimeInDays", 12 +) +patch_builder.remove_structured_property( + "urn:li:structuredProperty:customClassification" +) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_tag_patch.py b/metadata-ingestion/examples/library/dataset_add_tag_patch.py new file mode 100644 index 0000000000000..0bc644d6865f6 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_tag_patch.py @@ -0,0 +1,22 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import TagAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_tag(TagAssociationClass(make_tag_urn("tag-to-add-id"))) +patch_builder.remove_tag("urn:li:tag:tag-to-remove-id") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py b/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py new file mode 100644 index 
0000000000000..0b4e5e39bf627 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py @@ -0,0 +1,62 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import ( + DatasetLineageTypeClass, + FineGrainedLineageClass, + FineGrainedLineageUpstreamTypeClass, + UpstreamClass, +) +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) +upstream_to_remove_urn = make_dataset_urn( + platform="s3", name="fct_users_old", env="PROD" +) +upstream_to_add_urn = make_dataset_urn(platform="s3", name="fct_users_new", env="PROD") + +# Create Dataset Patch to Add & Remove Upstream Lineage Edges +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.remove_upstream_lineage(upstream_to_remove_urn) +patch_builder.add_upstream_lineage( + UpstreamClass(upstream_to_add_urn, DatasetLineageTypeClass.TRANSFORMED) +) + +# ...And also include schema field lineage +upstream_field_to_add_urn = make_schema_field_urn(upstream_to_add_urn, "profile_id") +downstream_field_to_add_urn = make_schema_field_urn(dataset_urn, "profile_id") + +patch_builder.add_fine_grained_upstream_lineage( + FineGrainedLineageClass( + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + [upstream_field_to_add_urn], + [downstream_field_to_add_urn], + ) +) + +upstream_field_to_remove_urn = make_schema_field_urn( + upstream_to_remove_urn, "profile_id" +) +downstream_field_to_remove_urn = make_schema_field_urn(dataset_urn, "profile_id") + +patch_builder.remove_fine_grained_upstream_lineage( + FineGrainedLineageClass( + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + [upstream_field_to_remove_urn], + [downstream_field_to_remove_urn], + ) +) + +patch_mcps = patch_builder.build() + + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py b/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py new file mode 100644 index 0000000000000..3f8da2c143c92 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py @@ -0,0 +1,26 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import GlossaryTermAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Term for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.for_field("profile_id").add_term( + GlossaryTermAssociationClass(make_term_urn("term-to-add-id")) +) +patch_builder.for_field("profile_id").remove_term( + "urn:li:glossaryTerm:term-to-remove-id" +) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + 
datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py b/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py new file mode 100644 index 0000000000000..3075cac5320ae --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py @@ -0,0 +1,24 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import TagAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Tag for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.for_field("profile_id").add_tag( + TagAssociationClass(make_tag_urn("tag-to-add-id")) +) +patch_builder.for_field("profile_id").remove_tag("urn:li:tag:tag-to-remove-id") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/recipes/mlflow_to_datahub.dhub.yaml b/metadata-ingestion/examples/recipes/mlflow_to_datahub.dhub.yaml new file mode 100644 index 0000000000000..07e9ed5d786cd --- /dev/null +++ b/metadata-ingestion/examples/recipes/mlflow_to_datahub.dhub.yaml @@ -0,0 +1,9 @@ +source: + type: mlflow + config: + tracking_uri: "http://127.0.0.1:5000" + +sink: + type: datahub-rest + config: + server: "http://localhost:8080" \ No newline at end of file diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 6334b3abbb8a0..c6994dd6d5aa6 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -741,7 +741,7 @@ "hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource", "json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource", "kafka = datahub.ingestion.source.kafka.kafka:KafkaSource", - "kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource", + "kafka-connect = datahub.ingestion.source.kafka_connect.kafka_connect:KafkaConnectSource", "ldap = datahub.ingestion.source.ldap:LDAPSource", "looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource", "lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource", diff --git a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py index d406fa36e00db..2b847cb6fe4f9 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py +++ b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py @@ -15,16 +15,21 @@ ) from datahub.metadata.schema_classes import ( AuditStampClass, + DataPlatformInstanceClass, DataProcessInstanceRunEventClass, DataProcessInstanceRunResultClass, DataProcessRunStatusClass, DataProcessTypeClass, + SubTypesClass, + ContainerClass, ) +from datahub.metadata.urns import DataPlatformInstanceUrn, DataPlatformUrn, ContainerUrn from datahub.utilities.str_enum import StrEnum from datahub.utilities.urns.data_flow_urn import DataFlowUrn from datahub.utilities.urns.data_job_urn import DataJobUrn from datahub.utilities.urns.data_process_instance_urn import DataProcessInstanceUrn from 
datahub.utilities.urns.dataset_urn import DatasetUrn +from datahub.emitter.mcp_builder import ContainerKey class DataProcessInstanceKey(DatahubKey): @@ -61,7 +66,9 @@ class DataProcessInstance: orchestrator: str cluster: Optional[str] = None type: str = DataProcessTypeClass.BATCH_SCHEDULED - template_urn: Optional[Union[DataJobUrn, DataFlowUrn, DatasetUrn]] = None + template_urn: Optional[ + Union[DataJobUrn, DataFlowUrn, DatasetUrn, ContainerUrn] + ] = None parent_instance: Optional[DataProcessInstanceUrn] = None properties: Dict[str, str] = field(default_factory=dict) url: Optional[str] = None @@ -71,6 +78,10 @@ class DataProcessInstance: _template_object: Optional[Union[DataJob, DataFlow]] = field( init=False, default=None, repr=False ) + data_platform: Optional[str] = None + data_plaform_instance: Optional[str] = None + subtype: Optional[str] = None + container_urn: Optional[str] = None def __post_init__(self): self.urn = DataProcessInstanceUrn( @@ -80,6 +91,36 @@ def __post_init__(self): id=self.id, ).guid() ) + if self.data_platform is None: + self.data_platform = self.orchestrator + + try: + # We first try to create from string assuming its an urn + self.data_platform = str( + DataPlatformUrn.create_from_string(self.data_platform) + ) + except Exception: + # If it fails, we assume its an id + self.data_platform = str(DataPlatformUrn.create_from_id(self.data_platform)) + + if self.data_plaform_instance is None and self.cluster is not None: + self.data_plaform_instance = self.cluster + + if self.data_plaform_instance is not None: + try: + # We first try to create from string assuming its an urn + self.data_plaform_instance = str( + DataPlatformInstanceUrn.create_from_string( + self.data_plaform_instance + ) + ) + except Exception: + # If it fails, we assume its an id + self.data_plaform_instance = str( + DataPlatformInstanceUrn( + platform=self.data_platform, instance=self.data_plaform_instance + ) + ) def start_event_mcp( self, start_timestamp_millis: int, attempt: Optional[int] = None @@ -269,6 +310,29 @@ def generate_mcp( ) yield mcp + assert self.data_platform + + mcp = MetadataChangeProposalWrapper( + entityUrn=str(self.urn), + aspect=DataPlatformInstanceClass( + platform=self.data_platform, instance=self.data_plaform_instance + ), + ) + yield mcp + + if self.subtype: + mcp = MetadataChangeProposalWrapper( + entityUrn=str(self.urn), aspect=SubTypesClass(typeNames=[self.subtype]) + ) + yield mcp + + if self.container_urn: + mcp = MetadataChangeProposalWrapper( + entityUrn=str(self.urn), + aspect=ContainerClass(container=self.container_urn), + ) + yield mcp + yield from self.generate_inlet_outlet_mcp(materialize_iolets=materialize_iolets) @staticmethod @@ -331,6 +395,31 @@ def from_datajob( dpi.outlets = datajob.outlets return dpi + @staticmethod + def from_container( + container_key: ContainerKey, + id: str, + ) -> "DataProcessInstance": + """ + Generates DataProcessInstance from a Container + + :param datajob: (DataJob) the datajob from generate the DataProcessInstance + :param id: (str) the id for the DataProcessInstance + :param clone_inlets: (bool) whether to clone datajob's inlets + :param clone_outlets: (bool) whether to clone datajob's outlets + :return: DataProcessInstance + """ + dpi: DataProcessInstance = DataProcessInstance( + id=id, + orchestrator=DataPlatformUrn.from_string( + container_key.platform + ).platform_name, + template_urn=None, + container_urn=container_key.as_urn(), + ) + + return dpi + @staticmethod def from_dataflow(dataflow: DataFlow, id: str) -> 
"DataProcessInstance": """ diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index 44c737f1bd13d..8e41e9fb91787 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin): default=None, description="A holder for platform -> platform_instance mappings to generate correct dataset urns", ) + + +class PlatformDetail(ConfigModel): + platform_instance: Optional[str] = Field( + default=None, + description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match " + "with platform instance name used in ingestion " + "recipe of other datahub sources.", + ) + env: str = Field( + default=DEFAULT_ENV, + description="The environment that all assets produced by DataHub platform ingestion source belong to", + ) diff --git a/metadata-ingestion/src/datahub/emitter/mcp.py b/metadata-ingestion/src/datahub/emitter/mcp.py index c6fcfad2e0aba..645c6899765ed 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp.py +++ b/metadata-ingestion/src/datahub/emitter/mcp.py @@ -14,7 +14,7 @@ SystemMetadataClass, _Aspect, ) -from datahub.utilities.urns.urn import guess_entity_type +from datahub.utilities.urns.urn import Urn, guess_entity_type if TYPE_CHECKING: from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -63,7 +63,10 @@ class MetadataChangeProposalWrapper: entityType: str = _ENTITY_TYPE_UNSET changeType: Union[str, ChangeTypeClass] = ChangeTypeClass.UPSERT - entityUrn: Union[None, str] = None + entityUrn: Union[ + None, + str, + ] = None entityKeyAspect: Union[None, _Aspect] = None auditHeader: Union[None, KafkaAuditHeaderClass] = None aspectName: Union[None, str] = None @@ -71,7 +74,11 @@ class MetadataChangeProposalWrapper: systemMetadata: Union[None, SystemMetadataClass] = None def __post_init__(self) -> None: - if self.entityUrn and self.entityType == _ENTITY_TYPE_UNSET: + if isinstance(self.entityUrn, Urn): + if self.entityType == _ENTITY_TYPE_UNSET: + self.entityType = self.entityUrn.entity_type + self.entityUrn = str(self.entityUrn) + elif self.entityUrn and self.entityType == _ENTITY_TYPE_UNSET: self.entityType = guess_entity_type(self.entityUrn) elif self.entityUrn and self.entityType: guessed_entity_type = guess_entity_type(self.entityUrn).lower() @@ -105,6 +112,7 @@ def construct_many( return [cls(entityUrn=entityUrn, aspect=aspect) for aspect in aspects if aspect] def _make_mcp_without_aspects(self) -> MetadataChangeProposalClass: + assert self.entityUrn is None or isinstance(self.entityUrn, str) return MetadataChangeProposalClass( entityType=self.entityType, entityUrn=self.entityUrn, diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index 5931873f54236..76f24bfd63d47 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -10,6 +10,7 @@ NoSuchNamespaceError, NoSuchPropertyException, NoSuchTableError, + ServerError, ) from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit from pyiceberg.table import Table @@ -145,6 +146,13 @@ def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]: self.report.report_no_listed_namespaces(len(namespaces)) tables_count = 0 for namespace in namespaces: + 
namespace_repr = ".".join(namespace) + if not self.config.namespace_pattern.allowed(namespace_repr): + LOGGER.info( + f"Namespace {namespace_repr} is not allowed by config pattern, skipping" + ) + self.report.report_dropped(f"{namespace_repr}.*") + continue try: tables = catalog.list_tables(namespace) tables_count += len(tables) @@ -181,6 +189,9 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: if not self.config.table_pattern.allowed(dataset_name): # Dataset name is rejected by pattern, report as dropped. self.report.report_dropped(dataset_name) + LOGGER.debug( + f"Skipping table {dataset_name} due to not being allowed by the config pattern" + ) return try: if not hasattr(thread_local, "local_catalog"): @@ -219,6 +230,22 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: LOGGER.warning( f"NoSuchTableError while processing table {dataset_path}, skipping it.", ) + except FileNotFoundError as e: + self.report.report_warning( + "file-not-found", + f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}", + ) + LOGGER.warning( + f"FileNotFoundError while processing table {dataset_path}, skipping it." + ) + except ServerError as e: + self.report.report_warning( + "iceberg-rest-server-error", + f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}", + ) + LOGGER.warning( + f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it." + ) except Exception as e: self.report.report_failure("general", f"Failed to create workunit: {e}") LOGGER.exception( @@ -269,7 +296,6 @@ def _create_iceberg_workunit( ] = table.current_snapshot().manifest_list dataset_properties = DatasetPropertiesClass( name=table.name()[-1], - tags=[], description=table.metadata.properties.get("comment", None), customProperties=custom_properties, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py index 98ad9e552d35c..4a7f6bf4d60c1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py @@ -68,6 +68,10 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin) default=AllowDenyPattern.allow_all(), description="Regex patterns for tables to filter in ingestion.", ) + namespace_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for namespaces to filter in ingestion.", + ) user_ownership_property: Optional[str] = Field( default="owner", description="Iceberg table property to look for a `CorpUser` owner. Can only hold a single user value. 
If property has no value, no owner information will be emitted.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py deleted file mode 100644 index 23a99ccb310e1..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py +++ /dev/null @@ -1,1468 +0,0 @@ -import logging -import re -from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Tuple - -import jpype -import jpype.imports -import requests -from pydantic.fields import Field -from sqlalchemy.engine.url import make_url - -import datahub.emitter.mce_builder as builder -import datahub.metadata.schema_classes as models -from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import ( - DatasetLineageProviderConfigBase, - PlatformInstanceConfigMixin, -) -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( - get_platform_from_sqlalchemy_uri, -) -from datahub.ingestion.source.state.stale_entity_removal_handler import ( - StaleEntityRemovalHandler, - StaleEntityRemovalSourceReport, - StatefulStaleMetadataRemovalConfig, -) -from datahub.ingestion.source.state.stateful_ingestion_base import ( - StatefulIngestionConfigBase, - StatefulIngestionSourceBase, -) - -logger = logging.getLogger(__name__) - -KAFKA = "kafka" -SOURCE = "source" -SINK = "sink" -CONNECTOR_CLASS = "connector.class" - - -class ProvidedConfig(ConfigModel): - provider: str - path_key: str - value: str - - -class GenericConnectorConfig(ConfigModel): - connector_name: str - source_dataset: str - source_platform: str - - -class KafkaConnectSourceConfig( - PlatformInstanceConfigMixin, - DatasetLineageProviderConfigBase, - StatefulIngestionConfigBase, -): - # See the Connect REST Interface for details - # https://docs.confluent.io/platform/current/connect/references/restapi.html# - connect_uri: str = Field( - default="http://localhost:8083/", description="URI to connect to." - ) - username: Optional[str] = Field(default=None, description="Kafka Connect username.") - password: Optional[str] = Field(default=None, description="Kafka Connect password.") - cluster_name: Optional[str] = Field( - default="connect-cluster", description="Cluster to ingest from." - ) - # convert lineage dataset's urns to lowercase - convert_lineage_urns_to_lowercase: bool = Field( - default=False, - description="Whether to convert the urns of ingested lineage dataset to lowercase", - ) - connector_patterns: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), - description="regex patterns for connectors to filter for ingestion.", - ) - provided_configs: Optional[List[ProvidedConfig]] = Field( - default=None, description="Provided Configurations" - ) - connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( - default=None, - description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. 
e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', - ) - platform_instance_map: Optional[Dict[str, str]] = Field( - default=None, - description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`', - ) - generic_connectors: List[GenericConnectorConfig] = Field( - default=[], - description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", - ) - - stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None - - -@dataclass -class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): - connectors_scanned: int = 0 - filtered: List[str] = field(default_factory=list) - - def report_connector_scanned(self, connector: str) -> None: - self.connectors_scanned += 1 - - def report_dropped(self, connector: str) -> None: - self.filtered.append(connector) - - -@dataclass -class KafkaConnectLineage: - """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" - - source_platform: str - target_dataset: str - target_platform: str - job_property_bag: Optional[Dict[str, str]] = None - source_dataset: Optional[str] = None - - -@dataclass -class ConnectorManifest: - """Each instance is potential DataFlow""" - - name: str - type: str - config: Dict - tasks: Dict - url: Optional[str] = None - flow_property_bag: Optional[Dict[str, str]] = None - lineages: List[KafkaConnectLineage] = field(default_factory=list) - topic_names: Iterable[str] = field(default_factory=list) - - -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - index = len(prefix) - return text[index:] - return text - - -def unquote( - string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None -) -> str: - """ - If string starts and ends with a quote, unquote it - """ - trailing_quote = trailing_quote if trailing_quote else leading_quote - if string.startswith(leading_quote) and string.endswith(trailing_quote): - string = string[1:-1] - return string - - -def get_dataset_name( - database_name: Optional[str], - source_table: str, -) -> str: - if database_name: - dataset_name = database_name + "." + source_table - else: - dataset_name = source_table - - return dataset_name - - -def get_platform_instance( - config: KafkaConnectSourceConfig, connector_name: str, platform: str -) -> Optional[str]: - instance_name = None - if ( - config.connect_to_platform_map - and config.connect_to_platform_map.get(connector_name) - and config.connect_to_platform_map[connector_name].get(platform) - ): - instance_name = config.connect_to_platform_map[connector_name][platform] - if config.platform_instance_map and config.platform_instance_map.get(platform): - logger.warning( - f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." - "Will prefer connector specific platform instance from connect_to_platform_map." 
- ) - elif config.platform_instance_map and config.platform_instance_map.get(platform): - instance_name = config.platform_instance_map[platform] - logger.info( - f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" - ) - return instance_name - - -@dataclass -class ConfluentJDBCSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" - KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] - # https://kafka.apache.org/documentation/#connect_included_transformation - KAFKA_NONTOPICROUTING_TRANSFORMS = [ - "InsertField", - "InsertField$Key", - "InsertField$Value", - "ReplaceField", - "ReplaceField$Key", - "ReplaceField$Value", - "MaskField", - "MaskField$Key", - "MaskField$Value", - "ValueToKey", - "ValueToKey$Key", - "ValueToKey$Value", - "HoistField", - "HoistField$Key", - "HoistField$Value", - "ExtractField", - "ExtractField$Key", - "ExtractField$Value", - "SetSchemaMetadata", - "SetSchemaMetadata$Key", - "SetSchemaMetadata$Value", - "Flatten", - "Flatten$Key", - "Flatten$Value", - "Cast", - "Cast$Key", - "Cast$Value", - "HeadersFrom", - "HeadersFrom$Key", - "HeadersFrom$Value", - "TimestampConverter", - "Filter", - "InsertHeader", - "DropHeaders", - ] - # https://docs.confluent.io/platform/current/connect/transforms/overview.html - CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ - "Drop", - "Drop$Key", - "Drop$Value", - "Filter", - "Filter$Key", - "Filter$Value", - "TombstoneHandler", - ] - KNOWN_NONTOPICROUTING_TRANSFORMS = ( - KAFKA_NONTOPICROUTING_TRANSFORMS - + [ - f"org.apache.kafka.connect.transforms.{t}" - for t in KAFKA_NONTOPICROUTING_TRANSFORMS - ] - + CONFLUENT_NONTOPICROUTING_TRANSFORMS - + [ - f"io.confluent.connect.transforms.{t}" - for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS - ] - ) - - @dataclass - class JdbcParser: - db_connection_url: str - source_platform: str - database_name: str - topic_prefix: str - query: str - transforms: list - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> JdbcParser: - url = remove_prefix( - str(connector_manifest.config.get("connection.url")), "jdbc:" - ) - url_instance = make_url(url) - source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) - database_name = url_instance.database - assert database_name - db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" - - topic_prefix = self.connector_manifest.config.get("topic.prefix", None) - - query = self.connector_manifest.config.get("query", None) - - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - return self.JdbcParser( - db_connection_url, - source_platform, - database_name, - topic_prefix, - query, - transforms, - ) - - def default_get_lineages( - self, - topic_prefix: str, - database_name: 
str, - source_platform: str, - topic_names: Optional[Iterable[str]] = None, - include_source_dataset: bool = True, - ) -> List[KafkaConnectLineage]: - lineages: List[KafkaConnectLineage] = [] - if not topic_names: - topic_names = self.connector_manifest.topic_names - table_name_tuples: List[Tuple] = self.get_table_names() - for topic in topic_names: - # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) - source_table: str = ( - remove_prefix(topic, topic_prefix) if topic_prefix else topic - ) - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform): - table_name_tuple: Tuple = next( - iter([t for t in table_name_tuples if t and t[-1] == source_table]), - (), - ) - if len(table_name_tuple) > 1: - source_table = f"{table_name_tuple[-2]}.{source_table}" - else: - include_source_dataset = False - self.report.warning( - "Could not find schema for table" - f"{self.connector_manifest.name} : {source_table}", - ) - dataset_name: str = get_dataset_name(database_name, source_table) - lineage = KafkaConnectLineage( - source_dataset=dataset_name if include_source_dataset else None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - return lineages - - def get_table_names(self) -> List[Tuple]: - sep: str = "." - leading_quote_char: str = '"' - trailing_quote_char: str = leading_quote_char - - table_ids: List[str] = [] - if self.connector_manifest.tasks: - table_ids = ( - ",".join( - [ - task["config"].get("tables") - for task in self.connector_manifest.tasks - ] - ) - ).split(",") - quote_method = self.connector_manifest.config.get( - "quote.sql.identifiers", "always" - ) - if ( - quote_method == "always" - and table_ids - and table_ids[0] - and table_ids[-1] - ): - leading_quote_char = table_ids[0][0] - trailing_quote_char = table_ids[-1][-1] - # This will only work for single character quotes - elif self.connector_manifest.config.get("table.whitelist"): - table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore - - # List of Tuple containing (schema, table) - tables: List[Tuple] = [ - ( - ( - unquote( - table_id.split(sep)[-2], leading_quote_char, trailing_quote_char - ) - if len(table_id.split(sep)) > 1 - else "" - ), - unquote( - table_id.split(sep)[-1], leading_quote_char, trailing_quote_char - ), - ) - for table_id in table_ids - ] - return tables - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - database_name = parser.database_name - query = parser.query - topic_prefix = parser.topic_prefix - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # Mask/Remove properties that may reveal credentials - self.connector_manifest.flow_property_bag[ - "connection.url" - ] = parser.db_connection_url - if "connection.password" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.password"] - if "connection.user" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.user"] - - logging.debug( - f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " - ) - - if not self.connector_manifest.topic_names: - self.connector_manifest.lineages 
= lineages - return - - if query: - # Lineage source_table can be extracted by parsing query - for topic in self.connector_manifest.topic_names: - # default method - as per earlier implementation - dataset_name: str = get_dataset_name(database_name, topic) - - lineage = KafkaConnectLineage( - source_dataset=None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.report.warning( - "Could not find input dataset, the connector has query configuration set", - self.connector_manifest.name, - ) - self.connector_manifest.lineages = lineages - return - - SINGLE_TRANSFORM = len(transforms) == 1 - NO_TRANSFORM = len(transforms) == 0 - UNKNOWN_TRANSFORM = any( - [ - transform["type"] - not in self.KNOWN_TOPICROUTING_TRANSFORMS - + self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - ALL_TRANSFORMS_NON_TOPICROUTING = all( - [ - transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - - if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: - self.connector_manifest.lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - ) - return - - if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: - tables = self.get_table_names() - topic_names = list(self.connector_manifest.topic_names) - - from java.util.regex import Pattern - - for table in tables: - source_table: str = table[-1] - topic = topic_prefix + source_table if topic_prefix else source_table - - transform_regex = Pattern.compile(transforms[0]["regex"]) - transform_replacement = transforms[0]["replacement"] - - matcher = transform_regex.matcher(topic) - if matcher.matches(): - topic = str(matcher.replaceFirst(transform_replacement)) - - # Additional check to confirm that the topic present - # in connector topics - - if topic in self.connector_manifest.topic_names: - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform) and len(table) > 1: - source_table = f"{table[-2]}.{table[-1]}" - - dataset_name = get_dataset_name(database_name, source_table) - - lineage = KafkaConnectLineage( - source_dataset=dataset_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - topic_names.remove(topic) - lineages.append(lineage) - - if topic_names: - lineages.extend( - self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - topic_names=topic_names, - include_source_dataset=False, - ) - ) - self.report.warning( - "Could not find input dataset for connector topics", - f"{self.connector_manifest.name} : {topic_names}", - ) - self.connector_manifest.lineages = lineages - return - else: - include_source_dataset = True - if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has unknown transform", - f"{self.connector_manifest.name} : {transforms[0]['type']}", - ) - include_source_dataset = False - if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has one or more unknown transforms", - self.connector_manifest.name, - ) - include_source_dataset = False - lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - include_source_dataset=include_source_dataset, - ) - self.connector_manifest.lineages = lineages - 
return - - -@dataclass -class MongoSourceConnector: - # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ - - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self._extract_lineages() - - @dataclass - class MongoSourceParser: - db_connection_url: Optional[str] - source_platform: str - database_name: Optional[str] - topic_prefix: Optional[str] - transforms: List[str] - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> MongoSourceParser: - parser = self.MongoSourceParser( - db_connection_url=connector_manifest.config.get("connection.uri"), - source_platform="mongodb", - database_name=connector_manifest.config.get("database"), - topic_prefix=connector_manifest.config.get("topic_prefix"), - transforms=( - connector_manifest.config["transforms"].split(",") - if "transforms" in connector_manifest.config - else [] - ), - ) - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(found.group(1), found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - - -@dataclass -class DebeziumSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - @dataclass - class DebeziumParser: - source_platform: str - server_name: Optional[str] - database_name: Optional[str] - - def get_server_name(self, connector_manifest: ConnectorManifest) -> str: - if "topic.prefix" in connector_manifest.config: - return connector_manifest.config["topic.prefix"] - else: - return connector_manifest.config.get("database.server.name", "") - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> DebeziumParser: - connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") - - if connector_class == "io.debezium.connector.mysql.MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": - parser = self.DebeziumParser( - source_platform="mongodb", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": - parser = self.DebeziumParser( - source_platform="postgres", - server_name=self.get_server_name(connector_manifest), - 
database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.oracle.OracleConnector": - parser = self.DebeziumParser( - source_platform="oracle", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": - database_name = connector_manifest.config.get( - "database.names" - ) or connector_manifest.config.get("database.dbname") - - if "," in str(database_name): - raise Exception( - f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}" - ) - - parser = self.DebeziumParser( - source_platform="mssql", - server_name=self.get_server_name(connector_manifest), - database_name=database_name, - ) - elif connector_class == "io.debezium.connector.db2.Db2Connector": - parser = self.DebeziumParser( - source_platform="db2", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.vitess.VitessConnector": - parser = self.DebeziumParser( - source_platform="vitess", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("vitess.keyspace"), - ) - else: - raise ValueError(f"Connector class '{connector_class}' is unknown.") - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - - try: - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - server_name = parser.server_name - database_name = parser.database_name - topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(database_name, found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -@dataclass -class BigQuerySinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class BQParser: - project: str - target_platform: str - sanitizeTopics: str - transforms: list - topicsToTables: Optional[str] = None - datasets: Optional[str] = None - defaultDataset: Optional[str] = None - version: str = "v1" - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> BQParser: - project = connector_manifest.config["project"] - sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - 
key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - if "defaultDataset" in connector_manifest.config: - defaultDataset = connector_manifest.config["defaultDataset"] - return self.BQParser( - project=project, - defaultDataset=defaultDataset, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - version="v2", - transforms=transforms, - ) - else: - # version 1.6.x and similar configs supported - datasets = connector_manifest.config["datasets"] - topicsToTables = connector_manifest.config.get("topicsToTables") - - return self.BQParser( - project=project, - topicsToTables=topicsToTables, - datasets=datasets, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - transforms=transforms, - ) - - def get_list(self, property: str) -> Iterable[Tuple[str, str]]: - entries = property.split(",") - for entry in entries: - key, val = entry.rsplit("=") - yield (key.strip(), val.strip()) - - def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: - topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore - from java.util.regex import Pattern - - for pattern, dataset in topicregex_dataset_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - return dataset - return None - - def sanitize_table_name(self, table_name): - table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - - return table_name - - def get_dataset_table_for_topic( - self, topic: str, parser: BQParser - ) -> Optional[str]: - if parser.version == "v2": - dataset = parser.defaultDataset - parts = topic.split(":") - if len(parts) == 2: - dataset = parts[0] - table = parts[1] - else: - table = parts[0] - else: - dataset = self.get_dataset_for_topic_v1(topic, parser) - if dataset is None: - return None - - table = topic - if parser.topicsToTables: - topicregex_table_map: Dict[str, str] = dict( - self.get_list(parser.topicsToTables) # type: ignore - ) - from java.util.regex import Pattern - - for pattern, tbl in topicregex_table_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - table = tbl - break - - if parser.sanitizeTopics: - table = self.sanitize_table_name(table) - return f"{dataset}.{table}" - - def apply_transformations( - self, topic: str, transforms: List[Dict[str, str]] - ) -> str: - for transform in transforms: - if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": - regex = transform["regex"] - replacement = transform["replacement"] - pattern = re.compile(regex) - if pattern.match(topic): - topic = pattern.sub(replacement, topic, count=1) - return topic - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - if not parser: - return lineages - target_platform = parser.target_platform - project = parser.project - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - # Mask/Remove properties that may reveal credentials - if "keyfile" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["keyfile"] - - for topic in self.connector_manifest.topic_names: - transformed_topic = self.apply_transformations(topic, transforms) - dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) - if dataset_table is None: 
- self.report.warning( - "Could not find target dataset for topic, please check your connector configuration" - f"{self.connector_manifest.name} : {transformed_topic} ", - ) - continue - target_dataset = f"{project}.{dataset_table}" - - lineages.append( - KafkaConnectLineage( - source_dataset=transformed_topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform=target_platform, - ) - ) - self.connector_manifest.lineages = lineages - return - - -@dataclass -class SnowflakeSinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class SnowflakeParser: - database_name: str - schema_name: str - topics_to_tables: Dict[str, str] - - def get_table_name_from_topic_name(self, topic_name: str) -> str: - """ - This function converts the topic name to a valid Snowflake table name using some rules. - Refer below link for more info - https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics - """ - table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - # Connector may append original topic's hash code as suffix for conflict resolution - # if generated table names for 2 topics are similar. This corner case is not handled here. - # Note that Snowflake recommends to choose topic names that follow the rules for - # Snowflake identifier names so this case is not recommended by snowflake. - return table_name - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> SnowflakeParser: - database_name = connector_manifest.config["snowflake.database.name"] - schema_name = connector_manifest.config["snowflake.schema.name"] - - # Fetch user provided topic to table map - provided_topics_to_tables: Dict[str, str] = {} - if connector_manifest.config.get("snowflake.topic2table.map"): - for each in connector_manifest.config["snowflake.topic2table.map"].split( - "," - ): - topic, table = each.split(":") - provided_topics_to_tables[topic.strip()] = table.strip() - - topics_to_tables: Dict[str, str] = {} - # Extract lineage for only those topics whose data ingestion started - for topic in connector_manifest.topic_names: - if topic in provided_topics_to_tables: - # If user provided which table to get mapped with this topic - topics_to_tables[topic] = provided_topics_to_tables[topic] - else: - # Else connector converts topic name to a valid Snowflake table name. 
- topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) - - return self.SnowflakeParser( - database_name=database_name, - schema_name=schema_name, - topics_to_tables=topics_to_tables, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # For all snowflake sink connector properties, refer below link - # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector - # remove private keys, secrets from properties - secret_properties = [ - "snowflake.private.key", - "snowflake.private.key.passphrase", - "value.converter.basic.auth.user.info", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - - for topic, table in parser.topics_to_tables.items(): - target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform="snowflake", - ) - ) - - self.connector_manifest.lineages = lineages - return - - -@dataclass -class ConfluentS3SinkConnector: - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class S3SinkParser: - target_platform: str - bucket: str - topics_dir: str - topics: Iterable[str] - - def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 - bucket = connector_manifest.config.get("s3.bucket.name") - if not bucket: - raise ValueError( - "Could not find 's3.bucket.name' in connector configuration" - ) - - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage - topics_dir = connector_manifest.config.get("topics.dir", "topics") - - return self.S3SinkParser( - target_platform="s3", - bucket=bucket, - topics_dir=topics_dir, - topics=connector_manifest.topic_names, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # remove keys, secrets from properties - secret_properties = [ - "aws.access.key.id", - "aws.secret.access.key", - "s3.sse.customer.key", - "s3.proxy.password", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - try: - parser = self._get_parser(self.connector_manifest) - - lineages: List[KafkaConnectLineage] = list() - for topic in parser.topics: - target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" - - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform="kafka", - target_dataset=target_dataset, - target_platform=parser.target_platform, - ) - ) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -def transform_connector_config( - connector_config: Dict, provided_configs: List[ProvidedConfig] -) -> None: - """This method will update provided configs in connector config values, if any""" - lookupsByProvider = {} - for pconfig in 
provided_configs: - lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value - for k, v in connector_config.items(): - for key, value in lookupsByProvider.items(): - if key in v: - connector_config[k] = connector_config[k].replace(key, value) - - -@platform_name("Kafka Connect") -@config_class(KafkaConnectSourceConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") -@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") -class KafkaConnectSource(StatefulIngestionSourceBase): - config: KafkaConnectSourceConfig - report: KafkaConnectSourceReport - platform: str = "kafka-connect" - - def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): - super().__init__(config, ctx) - self.config = config - self.report = KafkaConnectSourceReport() - self.session = requests.Session() - self.session.headers.update( - { - "Accept": "application/json", - "Content-Type": "application/json", - } - ) - - # Test the connection - if self.config.username is not None and self.config.password is not None: - logger.info( - f"Connecting to {self.config.connect_uri} with Authentication..." - ) - self.session.auth = (self.config.username, self.config.password) - - test_response = self.session.get(f"{self.config.connect_uri}/connectors") - test_response.raise_for_status() - logger.info(f"Connection to {self.config.connect_uri} is ok") - if not jpype.isJVMStarted(): - jpype.startJVM() - - @classmethod - def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: - config = KafkaConnectSourceConfig.parse_obj(config_dict) - return cls(config, ctx) - - def get_connectors_manifest(self) -> List[ConnectorManifest]: - """Get Kafka Connect connectors manifest using REST API. - Enrich with lineages metadata. 
- """ - connectors_manifest = list() - - connector_response = self.session.get( - f"{self.config.connect_uri}/connectors", - ) - - payload = connector_response.json() - - for connector_name in payload: - connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" - connector_manifest = self._get_connector_manifest( - connector_name, connector_url - ) - if ( - connector_manifest is None - or not self.config.connector_patterns.allowed(connector_manifest.name) - ): - self.report.report_dropped(connector_name) - continue - - if self.config.provided_configs: - transform_connector_config( - connector_manifest.config, self.config.provided_configs - ) - # Initialize connector lineages - connector_manifest.lineages = list() - connector_manifest.url = connector_url - - connector_manifest.topic_names = self._get_connector_topics(connector_name) - - # Populate Source Connector metadata - if connector_manifest.type == SOURCE: - connector_manifest.tasks = self._get_connector_tasks(connector_name) - - # JDBC source connector lineages - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "io.confluent.connect.jdbc.JdbcSourceConnector" - ): - connector_manifest = ConfluentJDBCSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif connector_manifest.config.get(CONNECTOR_CLASS, "").startswith( - "io.debezium.connector" - ): - connector_manifest = DebeziumSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif ( - connector_manifest.config.get(CONNECTOR_CLASS, "") - == "com.mongodb.kafka.connect.MongoSourceConnector" - ): - connector_manifest = MongoSourceConnector( - connector_manifest=connector_manifest, config=self.config - ).connector_manifest - else: - # Find the target connector object in the list, or log an error if unknown. - target_connector = None - for connector in self.config.generic_connectors: - if connector.connector_name == connector_manifest.name: - target_connector = connector - break - if not target_connector: - logger.warning( - f"Detected undefined connector {connector_manifest.name}, which is not in the customized connector list. Please refer to Kafka Connect ingestion recipe to define this customized connector." - ) - continue - - for topic in connector_manifest.topic_names: - lineage = KafkaConnectLineage( - source_dataset=target_connector.source_dataset, - source_platform=target_connector.source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - - connector_manifest.lineages.append(lineage) - - if connector_manifest.type == SINK: - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" - ): - connector_manifest = BigQuerySinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "io.confluent.connect.s3.S3SinkConnector" - ): - connector_manifest = ConfluentS3SinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "com.snowflake.kafka.connector.SnowflakeSinkConnector" - ): - connector_manifest = SnowflakeSinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - else: - self.report.report_dropped(connector_manifest.name) - logger.warning( - f"Skipping connector {connector_manifest.name}. 
Lineage for Connector not yet implemented" - ) - pass - - connectors_manifest.append(connector_manifest) - - return connectors_manifest - - def _get_connector_manifest( - self, connector_name: str, connector_url: str - ) -> Optional[ConnectorManifest]: - try: - connector_response = self.session.get(connector_url) - connector_response.raise_for_status() - except Exception as e: - self.report.warning( - "Failed to get connector details", connector_name, exc=e - ) - return None - manifest = connector_response.json() - connector_manifest = ConnectorManifest(**manifest) - return connector_manifest - - def _get_connector_tasks(self, connector_name: str) -> dict: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/tasks", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector tasks", context=connector_name, exc=e - ) - return {} - - return response.json() - - def _get_connector_topics(self, connector_name: str) -> List[str]: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/topics", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector topics", context=connector_name, exc=e - ) - return [] - - return response.json()[connector_name]["topics"] - - def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: - connector_name = connector.name - connector_type = connector.type - connector_class = connector.config.get(CONNECTOR_CLASS) - flow_property_bag = connector.flow_property_bag - # connector_url = connector.url # NOTE: this will expose connector credential when used - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - return MetadataChangeProposalWrapper( - entityUrn=flow_urn, - aspect=models.DataFlowInfoClass( - name=connector_name, - description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", - customProperties=flow_property_bag, - # externalUrl=connector_url, # NOTE: this will expose connector credential when used - ), - ).as_workunit() - - def construct_job_workunits( - self, connector: ConnectorManifest - ) -> Iterable[MetadataWorkUnit]: - connector_name = connector.name - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - lineages = connector.lineages - if lineages: - for lineage in lineages: - source_dataset = lineage.source_dataset - source_platform = lineage.source_platform - target_dataset = lineage.target_dataset - target_platform = lineage.target_platform - job_property_bag = lineage.job_property_bag - - source_platform_instance = get_platform_instance( - self.config, connector_name, source_platform - ) - target_platform_instance = get_platform_instance( - self.config, connector_name, target_platform - ) - - job_id = self.get_job_id(lineage, connector, self.config) - job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) - - inlets = ( - [ - self.make_lineage_dataset_urn( - source_platform, source_dataset, source_platform_instance - ) - ] - if source_dataset - else [] - ) - outlets = [ - self.make_lineage_dataset_urn( - target_platform, target_dataset, target_platform_instance - ) - ] - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInfoClass( - name=f"{connector_name}:{job_id}", - type="COMMAND", - 
customProperties=job_property_bag, - ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInputOutputClass( - inputDatasets=inlets, - outputDatasets=outlets, - ), - ).as_workunit() - - def get_job_id( - self, - lineage: KafkaConnectLineage, - connector: ConnectorManifest, - config: KafkaConnectSourceConfig, - ) -> str: - connector_class = connector.config.get(CONNECTOR_CLASS) - - # Note - This block is only to maintain backward compatibility of Job URN - if ( - connector_class - and connector.type == SOURCE - and ( - "JdbcSourceConnector" in connector_class - or connector_class.startswith("io.debezium.connector") - ) - and lineage.source_dataset - and config.connect_to_platform_map - and config.connect_to_platform_map.get(connector.name) - and config.connect_to_platform_map[connector.name].get( - lineage.source_platform - ) - ): - return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" - - return ( - lineage.source_dataset - if lineage.source_dataset - else f"unknown_source.{lineage.target_dataset}" - ) - - def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: - return [ - *super().get_workunit_processors(), - StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ).workunit_processor, - ] - - def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - connectors_manifest = self.get_connectors_manifest() - for connector in connectors_manifest: - name = connector.name - - yield self.construct_flow_workunit(connector) - yield from self.construct_job_workunits(connector) - self.report.report_connector_scanned(name) - - def get_report(self) -> KafkaConnectSourceReport: - return self.report - - def make_lineage_dataset_urn( - self, platform: str, name: str, platform_instance: Optional[str] - ) -> str: - if self.config.convert_lineage_urns_to_lowercase: - name = name.lower() - - return builder.make_dataset_urn_with_platform_instance( - platform, name, platform_instance, self.config.env - ) - - -# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. 
-def has_three_level_hierarchy(platform: str) -> bool: - return platform in ["postgres", "trino", "redshift", "snowflake"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py new file mode 100644 index 0000000000000..36f6a96c0d408 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py @@ -0,0 +1,202 @@ +import logging +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Optional + +from pydantic.fields import Field + +from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.source_common import ( + DatasetLineageProviderConfigBase, + PlatformInstanceConfigMixin, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalSourceReport, + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfigBase, +) + +logger = logging.getLogger(__name__) + +KAFKA = "kafka" +SOURCE = "source" +SINK = "sink" +CONNECTOR_CLASS = "connector.class" + + +class ProvidedConfig(ConfigModel): + provider: str + path_key: str + value: str + + +class GenericConnectorConfig(ConfigModel): + connector_name: str + source_dataset: str + source_platform: str + + +class KafkaConnectSourceConfig( + PlatformInstanceConfigMixin, + DatasetLineageProviderConfigBase, + StatefulIngestionConfigBase, +): + # See the Connect REST Interface for details + # https://docs.confluent.io/platform/current/connect/references/restapi.html# + connect_uri: str = Field( + default="http://localhost:8083/", description="URI to connect to." + ) + username: Optional[str] = Field(default=None, description="Kafka Connect username.") + password: Optional[str] = Field(default=None, description="Kafka Connect password.") + cluster_name: Optional[str] = Field( + default="connect-cluster", description="Cluster to ingest from." + ) + # convert lineage dataset's urns to lowercase + convert_lineage_urns_to_lowercase: bool = Field( + default=False, + description="Whether to convert the urns of ingested lineage dataset to lowercase", + ) + connector_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for connectors to filter for ingestion.", + ) + provided_configs: Optional[List[ProvidedConfig]] = Field( + default=None, description="Provided Configurations" + ) + connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( + default=None, + description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', + ) + platform_instance_map: Optional[Dict[str, str]] = Field( + default=None, + description='Platform instance mapping to use when constructing URNs. 
e.g.`platform_instance_map: { "hive": "warehouse" }`', + ) + generic_connectors: List[GenericConnectorConfig] = Field( + default=[], + description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", + ) + + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None + + +@dataclass +class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): + connectors_scanned: int = 0 + filtered: List[str] = field(default_factory=list) + + def report_connector_scanned(self, connector: str) -> None: + self.connectors_scanned += 1 + + def report_dropped(self, connector: str) -> None: + self.filtered.append(connector) + + +@dataclass +class KafkaConnectLineage: + """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" + + source_platform: str + target_dataset: str + target_platform: str + job_property_bag: Optional[Dict[str, str]] = None + source_dataset: Optional[str] = None + + +@dataclass +class ConnectorManifest: + """Each instance is potential DataFlow""" + + name: str + type: str + config: Dict + tasks: Dict + url: Optional[str] = None + flow_property_bag: Optional[Dict[str, str]] = None + lineages: List[KafkaConnectLineage] = field(default_factory=list) + topic_names: Iterable[str] = field(default_factory=list) + + +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + index = len(prefix) + return text[index:] + return text + + +def unquote( + string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None +) -> str: + """ + If string starts and ends with a quote, unquote it + """ + trailing_quote = trailing_quote if trailing_quote else leading_quote + if string.startswith(leading_quote) and string.endswith(trailing_quote): + string = string[1:-1] + return string + + +def get_dataset_name( + database_name: Optional[str], + source_table: str, +) -> str: + if database_name: + dataset_name = database_name + "." + source_table + else: + dataset_name = source_table + + return dataset_name + + +def get_platform_instance( + config: KafkaConnectSourceConfig, connector_name: str, platform: str +) -> Optional[str]: + instance_name = None + if ( + config.connect_to_platform_map + and config.connect_to_platform_map.get(connector_name) + and config.connect_to_platform_map[connector_name].get(platform) + ): + instance_name = config.connect_to_platform_map[connector_name][platform] + if config.platform_instance_map and config.platform_instance_map.get(platform): + logger.warning( + f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." + "Will prefer connector specific platform instance from connect_to_platform_map." 
+ ) + elif config.platform_instance_map and config.platform_instance_map.get(platform): + instance_name = config.platform_instance_map[platform] + logger.info( + f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" + ) + return instance_name + + +def transform_connector_config( + connector_config: Dict, provided_configs: List[ProvidedConfig] +) -> None: + """This method will update provided configs in connector config values, if any""" + lookupsByProvider = {} + for pconfig in provided_configs: + lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value + for k, v in connector_config.items(): + for key, value in lookupsByProvider.items(): + if key in v: + connector_config[k] = connector_config[k].replace(key, value) + + +# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. +def has_three_level_hierarchy(platform: str) -> bool: + return platform in ["postgres", "trino", "redshift", "snowflake"] + + +@dataclass +class BaseConnector: + connector_manifest: ConnectorManifest + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + + def extract_lineages(self) -> List[KafkaConnectLineage]: + return [] + + def extract_flow_property_bag(self) -> Optional[Dict[str, str]]: + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py new file mode 100644 index 0000000000000..fa6b614c4b52a --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py @@ -0,0 +1,367 @@ +import logging +from typing import Iterable, List, Optional, Type + +import jpype +import jpype.imports +import requests + +import datahub.emitter.mce_builder as builder +import datahub.metadata.schema_classes as models +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + SINK, + SOURCE, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + KafkaConnectSourceConfig, + KafkaConnectSourceReport, + get_platform_instance, + transform_connector_config, +) +from datahub.ingestion.source.kafka_connect.sink_connectors import ( + BIGQUERY_SINK_CONNECTOR_CLASS, + S3_SINK_CONNECTOR_CLASS, + SNOWFLAKE_SINK_CONNECTOR_CLASS, + BigQuerySinkConnector, + ConfluentS3SinkConnector, + SnowflakeSinkConnector, +) +from datahub.ingestion.source.kafka_connect.source_connectors import ( + DEBEZIUM_SOURCE_CONNECTOR_PREFIX, + JDBC_SOURCE_CONNECTOR_CLASS, + MONGO_SOURCE_CONNECTOR_CLASS, + ConfigDrivenSourceConnector, + ConfluentJDBCSourceConnector, + DebeziumSourceConnector, + MongoSourceConnector, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) + +logger = logging.getLogger(__name__) + + +@platform_name("Kafka Connect") +@config_class(KafkaConnectSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") 
+@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +class KafkaConnectSource(StatefulIngestionSourceBase): + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + platform: str = "kafka-connect" + + def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): + super().__init__(config, ctx) + self.config = config + self.report = KafkaConnectSourceReport() + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "Content-Type": "application/json", + } + ) + + # Test the connection + if self.config.username is not None and self.config.password is not None: + logger.info( + f"Connecting to {self.config.connect_uri} with Authentication..." + ) + self.session.auth = (self.config.username, self.config.password) + + test_response = self.session.get(f"{self.config.connect_uri}/connectors") + test_response.raise_for_status() + logger.info(f"Connection to {self.config.connect_uri} is ok") + if not jpype.isJVMStarted(): + jpype.startJVM() + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: + config = KafkaConnectSourceConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_connectors_manifest(self) -> Iterable[ConnectorManifest]: + """Get Kafka Connect connectors manifest using REST API. + Enrich with lineages metadata. + """ + + connector_response = self.session.get( + f"{self.config.connect_uri}/connectors", + ) + + payload = connector_response.json() + + for connector_name in payload: + connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" + connector_manifest = self._get_connector_manifest( + connector_name, connector_url + ) + if ( + connector_manifest is None + or not self.config.connector_patterns.allowed(connector_manifest.name) + ): + self.report.report_dropped(connector_name) + continue + + if self.config.provided_configs: + transform_connector_config( + connector_manifest.config, self.config.provided_configs + ) + connector_manifest.url = connector_url + connector_manifest.topic_names = self._get_connector_topics(connector_name) + connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or "" + + class_type: Type[BaseConnector] = BaseConnector + + # Populate Source Connector metadata + if connector_manifest.type == SOURCE: + connector_manifest.tasks = self._get_connector_tasks(connector_name) + + # JDBC source connector lineages + if connector_class_value == JDBC_SOURCE_CONNECTOR_CLASS: + class_type = ConfluentJDBCSourceConnector + elif connector_class_value.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX): + class_type = DebeziumSourceConnector + elif connector_class_value == MONGO_SOURCE_CONNECTOR_CLASS: + class_type = MongoSourceConnector + elif any( + [ + connector.connector_name == connector_manifest.name + for connector in self.config.generic_connectors + ] + ): + class_type = ConfigDrivenSourceConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Source Connector not supported. 
" + "Please refer to Kafka Connect docs to use `generic_connectors` config.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + continue + elif connector_manifest.type == SINK: + if connector_class_value == BIGQUERY_SINK_CONNECTOR_CLASS: + class_type = BigQuerySinkConnector + elif connector_class_value == S3_SINK_CONNECTOR_CLASS: + class_type = ConfluentS3SinkConnector + elif connector_class_value == SNOWFLAKE_SINK_CONNECTOR_CLASS: + class_type = SnowflakeSinkConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Sink Connector not supported.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + + connector_class = class_type(connector_manifest, self.config, self.report) + connector_manifest.lineages = connector_class.extract_lineages() + connector_manifest.flow_property_bag = ( + connector_class.extract_flow_property_bag() + ) + + yield connector_manifest + + def _get_connector_manifest( + self, connector_name: str, connector_url: str + ) -> Optional[ConnectorManifest]: + try: + connector_response = self.session.get(connector_url) + connector_response.raise_for_status() + except Exception as e: + self.report.warning( + "Failed to get connector details", connector_name, exc=e + ) + return None + manifest = connector_response.json() + connector_manifest = ConnectorManifest(**manifest) + return connector_manifest + + def _get_connector_tasks(self, connector_name: str) -> dict: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/tasks", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector tasks", context=connector_name, exc=e + ) + return {} + + return response.json() + + def _get_connector_topics(self, connector_name: str) -> List[str]: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/topics", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector topics", context=connector_name, exc=e + ) + return [] + + return response.json()[connector_name]["topics"] + + def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: + connector_name = connector.name + connector_type = connector.type + connector_class = connector.config.get(CONNECTOR_CLASS) + flow_property_bag = connector.flow_property_bag + # connector_url = connector.url # NOTE: this will expose connector credential when used + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + return MetadataChangeProposalWrapper( + entityUrn=flow_urn, + aspect=models.DataFlowInfoClass( + name=connector_name, + description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", + customProperties=flow_property_bag, + # externalUrl=connector_url, # NOTE: this will expose connector credential when used + ), + ).as_workunit() + + def construct_job_workunits( + self, connector: ConnectorManifest + ) -> Iterable[MetadataWorkUnit]: + connector_name = connector.name + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + lineages = connector.lineages + if lineages: + for lineage in lineages: + source_dataset = lineage.source_dataset + source_platform = lineage.source_platform + target_dataset = lineage.target_dataset + target_platform = 
lineage.target_platform + job_property_bag = lineage.job_property_bag + + source_platform_instance = get_platform_instance( + self.config, connector_name, source_platform + ) + target_platform_instance = get_platform_instance( + self.config, connector_name, target_platform + ) + + job_id = self.get_job_id(lineage, connector, self.config) + job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) + + inlets = ( + [ + self.make_lineage_dataset_urn( + source_platform, source_dataset, source_platform_instance + ) + ] + if source_dataset + else [] + ) + outlets = [ + self.make_lineage_dataset_urn( + target_platform, target_dataset, target_platform_instance + ) + ] + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInfoClass( + name=f"{connector_name}:{job_id}", + type="COMMAND", + customProperties=job_property_bag, + ), + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInputOutputClass( + inputDatasets=inlets, + outputDatasets=outlets, + ), + ).as_workunit() + + def get_job_id( + self, + lineage: KafkaConnectLineage, + connector: ConnectorManifest, + config: KafkaConnectSourceConfig, + ) -> str: + connector_class = connector.config.get(CONNECTOR_CLASS) + + # Note - This block is only to maintain backward compatibility of Job URN + if ( + connector_class + and connector.type == SOURCE + and ( + "JdbcSourceConnector" in connector_class + or connector_class.startswith("io.debezium.connector") + ) + and lineage.source_dataset + and config.connect_to_platform_map + and config.connect_to_platform_map.get(connector.name) + and config.connect_to_platform_map[connector.name].get( + lineage.source_platform + ) + ): + return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" + + return ( + lineage.source_dataset + if lineage.source_dataset + else f"unknown_source.{lineage.target_dataset}" + ) + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + for connector in self.get_connectors_manifest(): + yield self.construct_flow_workunit(connector) + yield from self.construct_job_workunits(connector) + self.report.report_connector_scanned(connector.name) + + def get_report(self) -> KafkaConnectSourceReport: + return self.report + + def make_lineage_dataset_urn( + self, platform: str, name: str, platform_instance: Optional[str] + ) -> str: + if self.config.convert_lineage_urns_to_lowercase: + name = name.lower() + + return builder.make_dataset_urn_with_platform_instance( + platform, name, platform_instance, self.config.env + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py new file mode 100644 index 0000000000000..2790460c8e601 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py @@ -0,0 +1,341 @@ +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from datahub.ingestion.source.kafka_connect.common import ( + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, +) + + +@dataclass +class ConfluentS3SinkConnector(BaseConnector): + @dataclass + class S3SinkParser: + target_platform: str + 
bucket: str + topics_dir: str + topics: Iterable[str] + + def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 + bucket = connector_manifest.config.get("s3.bucket.name") + if not bucket: + raise ValueError( + "Could not find 's3.bucket.name' in connector configuration" + ) + + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage + topics_dir = connector_manifest.config.get("topics.dir", "topics") + + return self.S3SinkParser( + target_platform="s3", + bucket=bucket, + topics_dir=topics_dir, + topics=connector_manifest.topic_names, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "aws.access.key.id", + "aws.secret.access.key", + "s3.sse.customer.key", + "s3.proxy.password", + ] + } + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + try: + parser = self._get_parser(self.connector_manifest) + + lineages: List[KafkaConnectLineage] = list() + for topic in parser.topics: + target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" + + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform="kafka", + target_dataset=target_dataset, + target_platform=parser.target_platform, + ) + ) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class SnowflakeSinkConnector(BaseConnector): + @dataclass + class SnowflakeParser: + database_name: str + schema_name: str + topics_to_tables: Dict[str, str] + + def get_table_name_from_topic_name(self, topic_name: str) -> str: + """ + This function converts the topic name to a valid Snowflake table name using some rules. + Refer below link for more info + https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics + """ + table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + # Connector may append original topic's hash code as suffix for conflict resolution + # if generated table names for 2 topics are similar. This corner case is not handled here. + # Note that Snowflake recommends to choose topic names that follow the rules for + # Snowflake identifier names so this case is not recommended by snowflake. 
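As an illustration (not part of the connector code), the Snowflake naming rules described in the comments above boil down to the following standalone sketch; the topic names are hypothetical and the helper simply mirrors the two regex steps of get_table_name_from_topic_name:

    import re

    def to_snowflake_table_name(topic_name: str) -> str:
        # Replace every character that is not alphanumeric or '_' with '_'.
        table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
        # Prefix with '_' if the result does not start with a letter or underscore.
        if re.match("^[^a-zA-Z_].*", table_name):
            table_name = "_" + table_name
        return table_name

    # Hypothetical topics and the tables they would map to:
    assert to_snowflake_table_name("web.orders-v1") == "web_orders_v1"
    assert to_snowflake_table_name("1st_clicks") == "_1st_clicks"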
+ return table_name + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> SnowflakeParser: + database_name = connector_manifest.config["snowflake.database.name"] + schema_name = connector_manifest.config["snowflake.schema.name"] + + # Fetch user provided topic to table map + provided_topics_to_tables: Dict[str, str] = {} + if connector_manifest.config.get("snowflake.topic2table.map"): + for each in connector_manifest.config["snowflake.topic2table.map"].split( + "," + ): + topic, table = each.split(":") + provided_topics_to_tables[topic.strip()] = table.strip() + + topics_to_tables: Dict[str, str] = {} + # Extract lineage for only those topics whose data ingestion started + for topic in connector_manifest.topic_names: + if topic in provided_topics_to_tables: + # If user provided which table to get mapped with this topic + topics_to_tables[topic] = provided_topics_to_tables[topic] + else: + # Else connector converts topic name to a valid Snowflake table name. + topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) + + return self.SnowflakeParser( + database_name=database_name, + schema_name=schema_name, + topics_to_tables=topics_to_tables, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # For all snowflake sink connector properties, refer below link + # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector + # remove private keys, secrets from properties + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "snowflake.private.key", + "snowflake.private.key.passphrase", + "value.converter.basic.auth.user.info", + ] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + + for topic, table in parser.topics_to_tables.items(): + target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform="snowflake", + ) + ) + + return lineages + + +@dataclass +class BigQuerySinkConnector(BaseConnector): + @dataclass + class BQParser: + project: str + target_platform: str + sanitizeTopics: str + transforms: list + topicsToTables: Optional[str] = None + datasets: Optional[str] = None + defaultDataset: Optional[str] = None + version: str = "v1" + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> BQParser: + project = connector_manifest.config["project"] + sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + transforms = [] + for name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + if "defaultDataset" in connector_manifest.config: + defaultDataset = connector_manifest.config["defaultDataset"] + return self.BQParser( + project=project, + defaultDataset=defaultDataset, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + version="v2", + transforms=transforms, + ) + else: + # version 1.6.x and similar configs supported + datasets = 
connector_manifest.config["datasets"] + topicsToTables = connector_manifest.config.get("topicsToTables") + + return self.BQParser( + project=project, + topicsToTables=topicsToTables, + datasets=datasets, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + transforms=transforms, + ) + + def get_list(self, property: str) -> Iterable[Tuple[str, str]]: + entries = property.split(",") + for entry in entries: + key, val = entry.rsplit("=") + yield (key.strip(), val.strip()) + + def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: + topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore + from java.util.regex import Pattern + + for pattern, dataset in topicregex_dataset_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + return dataset + return None + + def sanitize_table_name(self, table_name): + table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + + return table_name + + def get_dataset_table_for_topic( + self, topic: str, parser: BQParser + ) -> Optional[str]: + if parser.version == "v2": + dataset = parser.defaultDataset + parts = topic.split(":") + if len(parts) == 2: + dataset = parts[0] + table = parts[1] + else: + table = parts[0] + else: + dataset = self.get_dataset_for_topic_v1(topic, parser) + if dataset is None: + return None + + table = topic + if parser.topicsToTables: + topicregex_table_map: Dict[str, str] = dict( + self.get_list(parser.topicsToTables) # type: ignore + ) + from java.util.regex import Pattern + + for pattern, tbl in topicregex_table_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + table = tbl + break + + if parser.sanitizeTopics: + table = self.sanitize_table_name(table) + return f"{dataset}.{table}" + + def apply_transformations( + self, topic: str, transforms: List[Dict[str, str]] + ) -> str: + for transform in transforms: + if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": + regex = transform["regex"] + replacement = transform["replacement"] + pattern = re.compile(regex) + if pattern.match(topic): + topic = pattern.sub(replacement, topic, count=1) + return topic + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["keyfile"] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + if not parser: + return lineages + target_platform = parser.target_platform + project = parser.project + transforms = parser.transforms + + for topic in self.connector_manifest.topic_names: + transformed_topic = self.apply_transformations(topic, transforms) + dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) + if dataset_table is None: + self.report.warning( + "Could not find target dataset for topic, please check your connector configuration" + f"{self.connector_manifest.name} : {transformed_topic} ", + ) + continue + target_dataset = f"{project}.{dataset_table}" + + lineages.append( + KafkaConnectLineage( + source_dataset=transformed_topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform=target_platform, + ) + ) + return lineages + + 
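To show how the pieces above fit together, here is a rough standalone sketch of the v2 (defaultDataset) path: a RegexRouter transform rewrites the topic, and the rewritten topic then resolves to a project.dataset.table target. The project, regex, and topic below are hypothetical, and this simplified Python stand-in does not reproduce the connector's exact behavior.

    import re

    project = "analytics-prod"          # hypothetical "project" config
    default_dataset = "kafka_land"      # hypothetical "defaultDataset" config
    transform = {
        "type": "org.apache.kafka.connect.transforms.RegexRouter",
        "regex": r"db\.public\.(.*)",
        "replacement": "$1",
    }
    topic = "db.public.orders"

    # RegexRouter step: rename the topic (Java's "$1" backreference is "\1" in Python's re).
    pattern = re.compile(transform["regex"])
    if pattern.match(topic):
        topic = pattern.sub(transform["replacement"].replace("$1", r"\1"), topic, count=1)

    # v2 resolution: use "dataset:table" when the topic contains ':', otherwise defaultDataset.
    dataset, _, table = topic.rpartition(":")
    dataset = dataset or default_dataset
    print(f"{project}.{dataset}.{table}")  # -> analytics-prod.kafka_land.orders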
+BIGQUERY_SINK_CONNECTOR_CLASS = "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" +S3_SINK_CONNECTOR_CLASS = "io.confluent.connect.s3.S3SinkConnector" +SNOWFLAKE_SINK_CONNECTOR_CLASS = "com.snowflake.kafka.connector.SnowflakeSinkConnector" diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py new file mode 100644 index 0000000000000..7b3b6e551a0a1 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py @@ -0,0 +1,570 @@ +import logging +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from sqlalchemy.engine.url import make_url + +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + get_dataset_name, + has_three_level_hierarchy, + remove_prefix, + unquote, +) +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) + + +@dataclass +class ConfluentJDBCSourceConnector(BaseConnector): + REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" + KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] + # https://kafka.apache.org/documentation/#connect_included_transformation + KAFKA_NONTOPICROUTING_TRANSFORMS = [ + "InsertField", + "InsertField$Key", + "InsertField$Value", + "ReplaceField", + "ReplaceField$Key", + "ReplaceField$Value", + "MaskField", + "MaskField$Key", + "MaskField$Value", + "ValueToKey", + "ValueToKey$Key", + "ValueToKey$Value", + "HoistField", + "HoistField$Key", + "HoistField$Value", + "ExtractField", + "ExtractField$Key", + "ExtractField$Value", + "SetSchemaMetadata", + "SetSchemaMetadata$Key", + "SetSchemaMetadata$Value", + "Flatten", + "Flatten$Key", + "Flatten$Value", + "Cast", + "Cast$Key", + "Cast$Value", + "HeadersFrom", + "HeadersFrom$Key", + "HeadersFrom$Value", + "TimestampConverter", + "Filter", + "InsertHeader", + "DropHeaders", + ] + # https://docs.confluent.io/platform/current/connect/transforms/overview.html + CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ + "Drop", + "Drop$Key", + "Drop$Value", + "Filter", + "Filter$Key", + "Filter$Value", + "TombstoneHandler", + ] + KNOWN_NONTOPICROUTING_TRANSFORMS = ( + KAFKA_NONTOPICROUTING_TRANSFORMS + + [ + f"org.apache.kafka.connect.transforms.{t}" + for t in KAFKA_NONTOPICROUTING_TRANSFORMS + ] + + CONFLUENT_NONTOPICROUTING_TRANSFORMS + + [ + f"io.confluent.connect.transforms.{t}" + for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS + ] + ) + + @dataclass + class JdbcParser: + db_connection_url: str + source_platform: str + database_name: str + topic_prefix: str + query: str + transforms: list + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> JdbcParser: + url = remove_prefix( + str(connector_manifest.config.get("connection.url")), "jdbc:" + ) + url_instance = make_url(url) + source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) + database_name = url_instance.database + assert database_name + db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" + + topic_prefix = self.connector_manifest.config.get("topic.prefix", None) + + query = self.connector_manifest.config.get("query", None) + + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + + transforms = [] + for 
name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + return self.JdbcParser( + db_connection_url, + source_platform, + database_name, + topic_prefix, + query, + transforms, + ) + + def default_get_lineages( + self, + topic_prefix: str, + database_name: str, + source_platform: str, + topic_names: Optional[Iterable[str]] = None, + include_source_dataset: bool = True, + ) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = [] + if not topic_names: + topic_names = self.connector_manifest.topic_names + table_name_tuples: List[Tuple] = self.get_table_names() + for topic in topic_names: + # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) + source_table: str = ( + remove_prefix(topic, topic_prefix) if topic_prefix else topic + ) + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform): + table_name_tuple: Tuple = next( + iter([t for t in table_name_tuples if t and t[-1] == source_table]), + (), + ) + if len(table_name_tuple) > 1: + source_table = f"{table_name_tuple[-2]}.{source_table}" + else: + include_source_dataset = False + self.report.warning( + "Could not find schema for table" + f"{self.connector_manifest.name} : {source_table}", + ) + dataset_name: str = get_dataset_name(database_name, source_table) + lineage = KafkaConnectLineage( + source_dataset=dataset_name if include_source_dataset else None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + def get_table_names(self) -> List[Tuple]: + sep: str = "." 
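+        # Descriptive note (assumption, illustrative example only): table ids usually look like
+        # <schema><sep><table>, and the identifiers may be quoted when quote.sql.identifiers=always
+        # (e.g. a hypothetical "db"."orders"). The quote characters are inferred from the first and
+        # last ids and stripped via unquote() below.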
+ leading_quote_char: str = '"' + trailing_quote_char: str = leading_quote_char + + table_ids: List[str] = [] + if self.connector_manifest.tasks: + table_ids = ( + ",".join( + [ + task["config"].get("tables") + for task in self.connector_manifest.tasks + ] + ) + ).split(",") + quote_method = self.connector_manifest.config.get( + "quote.sql.identifiers", "always" + ) + if ( + quote_method == "always" + and table_ids + and table_ids[0] + and table_ids[-1] + ): + leading_quote_char = table_ids[0][0] + trailing_quote_char = table_ids[-1][-1] + # This will only work for single character quotes + elif self.connector_manifest.config.get("table.whitelist"): + table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore + + # List of Tuple containing (schema, table) + tables: List[Tuple] = [ + ( + ( + unquote( + table_id.split(sep)[-2], leading_quote_char, trailing_quote_char + ) + if len(table_id.split(sep)) > 1 + else "" + ), + unquote( + table_id.split(sep)[-1], leading_quote_char, trailing_quote_char + ), + ) + for table_id in table_ids + ] + return tables + + def extract_flow_property_bag(self) -> Dict[str, str]: + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["connection.password", "connection.user"] + } + + # Mask/Remove properties that may reveal credentials + flow_property_bag["connection.url"] = self.get_parser( + self.connector_manifest + ).db_connection_url + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + database_name = parser.database_name + query = parser.query + topic_prefix = parser.topic_prefix + transforms = parser.transforms + + logging.debug( + f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " + ) + + if not self.connector_manifest.topic_names: + return lineages + + if query: + # Lineage source_table can be extracted by parsing query + for topic in self.connector_manifest.topic_names: + # default method - as per earlier implementation + dataset_name: str = get_dataset_name(database_name, topic) + + lineage = KafkaConnectLineage( + source_dataset=None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + self.report.warning( + "Could not find input dataset, the connector has query configuration set", + self.connector_manifest.name, + ) + return lineages + + SINGLE_TRANSFORM = len(transforms) == 1 + NO_TRANSFORM = len(transforms) == 0 + UNKNOWN_TRANSFORM = any( + [ + transform["type"] + not in self.KNOWN_TOPICROUTING_TRANSFORMS + + self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + ALL_TRANSFORMS_NON_TOPICROUTING = all( + [ + transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + + if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: + return self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + ) + + if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: + tables = self.get_table_names() + topic_names = list(self.connector_manifest.topic_names) + + from java.util.regex import Pattern + + for table in tables: + source_table: str = table[-1] + topic = topic_prefix + source_table if topic_prefix else source_table + + transform_regex = 
Pattern.compile(transforms[0]["regex"]) + transform_replacement = transforms[0]["replacement"] + + matcher = transform_regex.matcher(topic) + if matcher.matches(): + topic = str(matcher.replaceFirst(transform_replacement)) + + # Additional check to confirm that the topic present + # in connector topics + + if topic in self.connector_manifest.topic_names: + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform) and len(table) > 1: + source_table = f"{table[-2]}.{table[-1]}" + + dataset_name = get_dataset_name(database_name, source_table) + + lineage = KafkaConnectLineage( + source_dataset=dataset_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + topic_names.remove(topic) + lineages.append(lineage) + + if topic_names: + lineages.extend( + self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + topic_names=topic_names, + include_source_dataset=False, + ) + ) + self.report.warning( + "Could not find input dataset for connector topics", + f"{self.connector_manifest.name} : {topic_names}", + ) + return lineages + else: + include_source_dataset = True + if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has unknown transform", + f"{self.connector_manifest.name} : {transforms[0]['type']}", + ) + include_source_dataset = False + if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has one or more unknown transforms", + self.connector_manifest.name, + ) + include_source_dataset = False + lineages = self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + include_source_dataset=include_source_dataset, + ) + return lineages + + +@dataclass +class MongoSourceConnector(BaseConnector): + # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ + + @dataclass + class MongoSourceParser: + db_connection_url: Optional[str] + source_platform: str + database_name: Optional[str] + topic_prefix: Optional[str] + transforms: List[str] + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> MongoSourceParser: + parser = self.MongoSourceParser( + db_connection_url=connector_manifest.config.get("connection.uri"), + source_platform="mongodb", + database_name=connector_manifest.config.get("database"), + topic_prefix=connector_manifest.config.get("topic_prefix"), + transforms=( + connector_manifest.config["transforms"].split(",") + if "transforms" in connector_manifest.config + else [] + ), + ) + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(found.group(1), found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +@dataclass +class DebeziumSourceConnector(BaseConnector): + @dataclass + class DebeziumParser: + source_platform: str + server_name: 
Optional[str] + database_name: Optional[str] + + def get_server_name(self, connector_manifest: ConnectorManifest) -> str: + if "topic.prefix" in connector_manifest.config: + return connector_manifest.config["topic.prefix"] + else: + return connector_manifest.config.get("database.server.name", "") + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> DebeziumParser: + connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") + + if connector_class == "io.debezium.connector.mysql.MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": + parser = self.DebeziumParser( + source_platform="mongodb", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": + parser = self.DebeziumParser( + source_platform="postgres", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.oracle.OracleConnector": + parser = self.DebeziumParser( + source_platform="oracle", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": + database_name = connector_manifest.config.get( + "database.names" + ) or connector_manifest.config.get("database.dbname") + + if "," in str(database_name): + raise Exception( + f"Only one database is supported for Debezium's SQL Server connector. 
Found: {database_name}" + ) + + parser = self.DebeziumParser( + source_platform="mssql", + server_name=self.get_server_name(connector_manifest), + database_name=database_name, + ) + elif connector_class == "io.debezium.connector.db2.Db2Connector": + parser = self.DebeziumParser( + source_platform="db2", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.vitess.VitessConnector": + parser = self.DebeziumParser( + source_platform="vitess", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("vitess.keyspace"), + ) + else: + raise ValueError(f"Connector class '{connector_class}' is unknown.") + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + + try: + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + server_name = parser.server_name + database_name = parser.database_name + topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(database_name, found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class ConfigDrivenSourceConnector(BaseConnector): + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages = [] + for connector in self.config.generic_connectors: + if connector.connector_name == self.connector_manifest.name: + target_connector = connector + break + for topic in self.connector_manifest.topic_names: + lineage = KafkaConnectLineage( + source_dataset=target_connector.source_dataset, + source_platform=target_connector.source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector" +DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector" +MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 57a251ef2ed14..a66962f962255 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -31,6 +31,10 @@ from pydantic.class_validators import validator import datahub.emitter.mce_builder as builder +from datahub.api.entities.platformresource.platform_resource import ( + PlatformResource, + PlatformResourceKey, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp from datahub.ingestion.api.report import Report @@ -106,7 +110,7 @@ from datahub.utilities.url_util import remove_port_from_url CORPUSER_DATAHUB = "urn:li:corpuser:datahub" - +LOOKER = "looker" logger = logging.getLogger(__name__) @@ -1411,6 +1415,7 @@ class 
LookerDashboardSourceReport(StaleEntityRemovalSourceReport): resolved_user_ids: int = 0 email_ids_missing: int = 0 # resolved users with missing email addresses + looker_user_count: int = 0 _looker_api: Optional[LookerAPI] = None query_latency: Dict[str, datetime.timedelta] = dataclasses_field( @@ -1614,9 +1619,21 @@ def get_urn_dashboard_id(self): class LookerUserRegistry: looker_api_wrapper: LookerAPI fields: str = ",".join(["id", "email", "display_name", "first_name", "last_name"]) + _user_cache: Dict[str, LookerUser] = {} - def __init__(self, looker_api: LookerAPI): + def __init__(self, looker_api: LookerAPI, report: LookerDashboardSourceReport): self.looker_api_wrapper = looker_api + self.report = report + self._initialize_user_cache() + + def _initialize_user_cache(self) -> None: + raw_users: Sequence[User] = self.looker_api_wrapper.all_users( + user_fields=self.fields + ) + + for raw_user in raw_users: + looker_user = LookerUser.create_looker_user(raw_user) + self._user_cache[str(looker_user.id)] = looker_user def get_by_id(self, id_: str) -> Optional[LookerUser]: if not id_: @@ -1624,6 +1641,9 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: logger.debug(f"Will get user {id_}") + if str(id_) in self._user_cache: + return self._user_cache.get(str(id_)) + raw_user: Optional[User] = self.looker_api_wrapper.get_user( str(id_), user_fields=self.fields ) @@ -1632,3 +1652,35 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: looker_user = LookerUser.create_looker_user(raw_user) return looker_user + + def to_platform_resource( + self, platform_instance: Optional[str] + ) -> Iterable[MetadataChangeProposalWrapper]: + try: + platform_resource_key = PlatformResourceKey( + platform=LOOKER, + resource_type="USER_ID_MAPPING", + platform_instance=platform_instance, + primary_key="", + ) + + # Extract user email mappings + user_email_cache = { + user_id: user.email + for user_id, user in self._user_cache.items() + if user.email + } + + platform_resource = PlatformResource.create( + key=platform_resource_key, + value=user_email_cache, + ) + + self.report.looker_user_count = len(user_email_cache) + yield from platform_resource.to_mcps() + + except Exception as exc: + self.report.warning( + message="Failed to generate platform resource for looker id mappings", + exc=exc, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py index ab55d4e15e5de..c3f2a110136c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py @@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel): get_look_calls: int = 0 search_looks_calls: int = 0 search_dashboards_calls: int = 0 + all_user_calls: int = 0 class LookerAPI: @@ -135,7 +136,7 @@ def get_available_permissions(self) -> Set[str]: return permissions - @lru_cache(maxsize=1000) + @lru_cache(maxsize=5000) def get_user(self, id_: str, user_fields: str) -> Optional[User]: self.client_stats.user_calls += 1 try: @@ -154,6 +155,17 @@ def get_user(self, id_: str, user_fields: str) -> Optional[User]: # User not found return None + def all_users(self, user_fields: str) -> Sequence[User]: + self.client_stats.all_user_calls += 1 + try: + return self.client.all_users( + fields=cast(str, user_fields), + transport_options=self.transport_options, + ) + except SDKError as e: + logger.warning(f"Failure was {e}") + return [] + def 
execute_query(self, write_query: WriteQuery) -> List[Dict]: logger.debug(f"Executing query {write_query}") self.client_stats.query_calls += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index cd8ccb8217257..815c5dfb1c014 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -145,7 +145,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): self.source_config: LookerDashboardSourceConfig = config self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() self.looker_api: LookerAPI = LookerAPI(self.source_config) - self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.user_registry: LookerUserRegistry = LookerUserRegistry( + self.looker_api, self.reporter + ) self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) @@ -1673,5 +1675,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield usage_mcp.as_workunit() self.reporter.report_stage_end("usage_extraction") + # Dump looker user resource mappings. + logger.info("Ingesting looker user resource mapping workunits") + self.reporter.report_stage_start("user_resource_extraction") + yield from auto_workunit( + self.user_registry.to_platform_resource( + self.source_config.platform_instance + ) + ) + def get_report(self) -> SourceReport: return self.reporter diff --git a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py index 26d160acf330c..1a934f6aab173 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py @@ -1,13 +1,15 @@ from dataclasses import dataclass -from typing import Any, Callable, Iterable, Optional, TypeVar, Union +from typing import Any, Callable, Iterable, Optional, TypeVar, Union, List +import time from mlflow import MlflowClient from mlflow.entities import Run -from mlflow.entities.model_registry import ModelVersion, RegisteredModel +from mlflow.entities.model_registry import ModelVersion, RegisteredModel, Experiment from mlflow.store.entities import PagedList from pydantic.fields import Field import datahub.emitter.mce_builder as builder +from datahub.emitter.mcp_builder import ContainerKey from datahub.configuration.source_common import EnvConfigMixin from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext @@ -29,39 +31,79 @@ TagAssociationClass, TagPropertiesClass, VersionTagClass, - _Aspect, + DataProcessInstanceRunEventClass, + DataProcessInstancePropertiesClass, + ContainerPropertiesClass, + AuditStampClass, + TimeStampClass, + DataProcessRunStatusClass, + SubTypesClass, + DataPlatformInstanceClass, + BrowsePathsV2Class, + MetadataChangeProposalClass, + MLTrainingRunPropertiesClass, + DataProcessInstanceRunResultClass, + DataProcessInstanceOutputClass, +) +from datahub.metadata.urns import DatasetUrn, DataPlatformUrn, MlModelUrn, MlModelGroupUrn, DataProcessInstanceUrn, DataPlatformInstanceUrn +from datahub.api.entities.dataprocess.dataprocess_instance import ( + DataProcessInstance, + InstanceRunResult, ) T = TypeVar("T") +class ContainerKeyWithId(ContainerKey): + id: str + + +@dataclass +class Container: + key: ContainerKeyWithId + 
subtype: str + name: Optional[str] = None + description: Optional[str] = None + + def generate_mcp( + self, + ) -> Iterable[Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]]: + container_urn = self.key.as_urn() + + container_subtype = SubTypesClass(typeNames=[self.subtype]) + + container_info = ContainerPropertiesClass( + name=self.name or self.key.id, + description=self.description, + customProperties={}, + ) + + browse_path = BrowsePathsV2Class(path=[]) + + dpi = DataPlatformInstanceClass( + platform=self.key.platform, + instance=self.key.instance, + ) + + return MetadataChangeProposalWrapper.construct_many( + entityUrn=container_urn, + aspects=[container_subtype, container_info, browse_path, dpi], + ) + + class MLflowConfig(EnvConfigMixin): tracking_uri: Optional[str] = Field( default=None, - description=( - "Tracking server URI. If not set, an MLflow default tracking_uri is used" - " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)" - ), + description="Tracking server URI. If not set, an MLflow default tracking_uri is used (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)", ) registry_uri: Optional[str] = Field( default=None, - description=( - "Registry server URI. If not set, an MLflow default registry_uri is used" - " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)" - ), + description="Registry server URI. If not set, an MLflow default registry_uri is used (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)", ) model_name_separator: str = Field( default="_", description="A string which separates model name from its version (e.g. model_1 or model-1)", ) - base_external_url: Optional[str] = Field( - default=None, - description=( - "Base URL to use when constructing external URLs to MLflow." - " If not set, tracking_uri is used if it's an HTTP URL." - " If neither is set, external URLs are not generated." - ), - ) @dataclass @@ -118,12 +160,10 @@ def get_report(self) -> SourceReport: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self._get_tags_workunits() + yield from self._get_experiment_workunits() yield from self._get_ml_model_workunits() def _get_tags_workunits(self) -> Iterable[MetadataWorkUnit]: - """ - Create tags for each Stage in MLflow Model Registry. - """ for stage_info in self.registered_model_stages_info: tag_urn = self._make_stage_tag_urn(stage_info.name) tag_properties = TagPropertiesClass( @@ -142,36 +182,205 @@ def _make_stage_tag_urn(self, stage_name: str) -> str: def _make_stage_tag_name(self, stage_name: str) -> str: return f"{self.platform}_{stage_name.lower()}" - def _create_workunit(self, urn: str, aspect: _Aspect) -> MetadataWorkUnit: - """ - Utility to create an MCP workunit. - """ + def _create_workunit(self, urn: str, aspect: Any) -> MetadataWorkUnit: return MetadataChangeProposalWrapper( entityUrn=urn, aspect=aspect, ).as_workunit() - def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: - """ - Traverse each Registered Model in Model Registry and generate a corresponding workunit. 
- """ - registered_models = self._get_mlflow_registered_models() - for registered_model in registered_models: - yield self._get_ml_group_workunit(registered_model) - model_versions = self._get_mlflow_model_versions(registered_model) - for model_version in model_versions: - run = self._get_mlflow_run(model_version) - yield self._get_ml_model_properties_workunit( - registered_model=registered_model, - model_version=model_version, - run=run, - ) - yield self._get_global_tags_workunit(model_version=model_version) + def _get_experiment_workunits(self) -> Iterable[MetadataWorkUnit]: + experiments = self._get_mlflow_experiments() + for experiment in experiments: + # Yield each workunit from the container workunits + for wu in self._get_experiment_container_workunit(experiment): + yield wu + + runs = self._get_mlflow_runs_from_experiment(experiment) + if runs: + for run in runs: + for wu in self._get_run_workunits(experiment, run): + yield wu + + def _get_experiment_custom_properties(self, experiment): + experiment_custom_props = getattr(experiment, "tags", {}) or {} + experiment_custom_props.pop("mlflow.note.content", None) + experiment_custom_props["artifacts_location"] = experiment.artifact_location + return experiment_custom_props + + def _get_experiment_container_workunit( + self, experiment: Experiment + ) -> List[MetadataWorkUnit]: + experiment_container = Container( + key=ContainerKeyWithId( + platform=str(DataPlatformUrn.create_from_id("mlflow")), + id=experiment.name + ), + subtype="ML Experiment", + name=experiment.name, + description=experiment.tags.get("mlflow.note.content"), + ) # TODO: this generates a urn as guid, should we change this to use experiment.id? + + print("experiment.key.id:", experiment.key.id) # this should be same as container key as urn + print("experiment.key.as_urn(): ", experiment.key.as_urn()) + + workunits = [mcp.as_workunit() for mcp in experiment.generate_mcp()] + return workunits + + def _get_run_custom_properties(self, run: Run): + custom_props = {} + custom_props.update(getattr(run, "tags", {}) or {}) + return custom_props + + def _get_run_metrics(self, run: Run): + return [ + MLMetricClass(name=k, value=str(v)) for k, v in run.data.metrics.items() + ] + + def _get_run_params(self, run: Run): + return [ + MLHyperParamClass(name=k, value=str(v)) for k, v in run.data.params.items() + ] + + def _convert_run_result_type( + self, status: str + ) -> DataProcessInstanceRunResultClass: + if status == "FINISHED": + return DataProcessInstanceRunResultClass( + type="SUCCESS", nativeResultType="mlflow" + ) + elif status == "FAILED": + return DataProcessInstanceRunResultClass( + type="FAILURE", nativeResultType="mlflow" + ) + else: + return DataProcessInstanceRunResultClass( + type="SKIPPED", nativeResultType="mlflow" + ) + + def _get_run_workunits( + self, experiment: Experiment, run: Run + ) -> List[MetadataWorkUnit]: + experiment_key = ContainerKeyWithId( + platform=str(DataPlatformUrn.create_from_id("mlflow")), id=experiment.name + ) + + data_process_instance = DataProcessInstance.from_container( + container_key=experiment_key, id=run.info.run_name + ) # TODO: this generates a urn as guid, should we change this to use run.info.run_id? 
+        workunits = []
+
+        run_custom_props = self._get_run_custom_properties(run)
+        created_time = run.info.start_time or int(time.time() * 1000)
+        created_actor = (
+            f"urn:li:platformResource:{run.info.user_id}" if run.info.user_id else None
+        )
+
+        workunits.append(
+            MetadataChangeProposalWrapper(
+                entityUrn=str(data_process_instance.urn),
+                aspect=DataProcessInstancePropertiesClass(
+                    name=run.info.run_name or run.info.run_id,
+                    created=AuditStampClass(
+                        time=created_time,
+                        actor=created_actor,
+                    ),
+                    externalUrl=self._make_external_url_from_run(experiment, run),
+                    customProperties=run_custom_props,
+                ),
+            ).as_workunit()
+        )
+
+        # get model version produced by this run, if any
+        model_versions = self.get_mlflow_model_versions_from_run(run.info.run_id)
+        if model_versions:
+            model_version_urn = self._make_ml_model_urn(model_versions[0])
+            workunits.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=str(data_process_instance.urn),
+                    aspect=DataProcessInstanceOutputClass(
+                        outputs=[model_version_urn]
+                    ),
+                ).as_workunit()
+            )
+
+        metrics = self._get_run_metrics(run)
+        hyperparams = self._get_run_params(run)
+        workunits.append(
+            MetadataChangeProposalWrapper(
+                entityUrn=str(data_process_instance.urn),
+                aspect=MLTrainingRunPropertiesClass(
+                    hyperParams=hyperparams,
+                    trainingMetrics=metrics,
+                    outputUrls=[run.info.artifact_uri],
+                    id=run.info.run_id,
+                ),
+            ).as_workunit()
+        )
+
+        result = (
+            run.info.status
+        )  # TODO: this should be SUCCESS, SKIPPED, FAILURE, UP_FOR_RETRY
+
+        if run.info.end_time:
+            duration_millis = run.info.end_time - run.info.start_time
+            workunits.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=str(data_process_instance.urn),
+                    aspect=DataProcessInstanceRunEventClass(
+                        status=DataProcessRunStatusClass.COMPLETE,
+                        timestampMillis=run.info.end_time,
+                        result=DataProcessInstanceRunResultClass(
+                            type=self._convert_run_result_type(result).type,
+                            nativeResultType="mlflow",
+                        ),
+                        durationMillis=duration_millis,
+                    ),
+                ).as_workunit()
+            )
+
+        workunits.append(
+            MetadataChangeProposalWrapper(
+                entityUrn=str(data_process_instance.urn),
+                aspect=DataPlatformInstanceClass(
+                    platform=str(DataPlatformUrn.create_from_id("mlflow"))
+                ),
+            ).as_workunit()
+        )
+
+        workunits.append(
+            MetadataChangeProposalWrapper(
+                entityUrn=str(data_process_instance.urn),
+                aspect=SubTypesClass(typeNames=["ML Training Run"]),
+            ).as_workunit()
+        )
+
+        return workunits

     def _get_mlflow_registered_models(self) -> Iterable[RegisteredModel]:
-        """
-        Get all Registered Models in MLflow Model Registry.
- """ registered_models: Iterable[ RegisteredModel ] = self._traverse_mlflow_search_func( @@ -179,34 +388,57 @@ def _get_mlflow_registered_models(self) -> Iterable[RegisteredModel]: ) return registered_models + def _get_mlflow_experiments(self) -> Iterable[Experiment]: + experiments: Iterable[Experiment] = self._traverse_mlflow_search_func( + search_func=self.client.search_experiments, + ) + return experiments + + def _get_mlflow_runs_from_experiment(self, experiment: Experiment) -> List[Run]: + runs: List[Run] = self._traverse_mlflow_search_func( + search_func=self.client.search_runs, + experiment_ids=[experiment.experiment_id], + ) + return runs + @staticmethod def _traverse_mlflow_search_func( search_func: Callable[..., PagedList[T]], **kwargs: Any, ) -> Iterable[T]: - """ - Utility to traverse an MLflow search_* functions which return PagedList. - """ next_page_token = None while True: paged_list = search_func(page_token=next_page_token, **kwargs) - yield from paged_list.to_list() + yield from paged_list next_page_token = paged_list.token if not next_page_token: return + def _get_latest_version(self, registered_model: RegisteredModel) -> Optional[str]: + return ( + str(registered_model.latest_versions[0].version) + if registered_model.latest_versions + else None + ) + def _get_ml_group_workunit( self, registered_model: RegisteredModel, ) -> MetadataWorkUnit: - """ - Generate an MLModelGroup workunit for an MLflow Registered Model. - """ ml_model_group_urn = self._make_ml_model_group_urn(registered_model) ml_model_group_properties = MLModelGroupPropertiesClass( customProperties=registered_model.tags, description=registered_model.description, - createdAt=registered_model.creation_timestamp, + created=TimeStampClass( + time=registered_model.creation_timestamp, actor=None + ), + lastModified=TimeStampClass( + time=registered_model.last_updated_timestamp, + actor=None, + ), + version=VersionTagClass( + versionTag=self._get_latest_version(registered_model) + ), ) wu = self._create_workunit( urn=ml_model_group_urn, @@ -226,9 +458,6 @@ def _get_mlflow_model_versions( self, registered_model: RegisteredModel, ) -> Iterable[ModelVersion]: - """ - Get all Model Versions for each Registered Model. - """ filter_string = f"name = '{registered_model.name}'" model_versions: Iterable[ModelVersion] = self._traverse_mlflow_search_func( search_func=self.client.search_model_versions, @@ -236,51 +465,78 @@ def _get_mlflow_model_versions( ) return model_versions + def get_mlflow_model_versions_from_run(self, run_id): + filter_string = f"run_id = '{run_id}'" + + model_versions: Iterable[ModelVersion] = self._traverse_mlflow_search_func( + search_func=self.client.search_model_versions, + filter_string=filter_string, + ) + + return list(model_versions) + def _get_mlflow_run(self, model_version: ModelVersion) -> Union[None, Run]: - """ - Get a Run associated with a Model Version. Some MVs may exist without Run. 
- """ if model_version.run_id: run = self.client.get_run(model_version.run_id) return run else: return None + def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: + registered_models = self._get_mlflow_registered_models() + for registered_model in registered_models: + yield self._get_ml_group_workunit(registered_model) + model_versions = self._get_mlflow_model_versions(registered_model) + for model_version in model_versions: + run = self._get_mlflow_run(model_version) + yield self._get_ml_model_properties_workunit( + registered_model=registered_model, + model_version=model_version, + run=run, + ) + yield self._get_global_tags_workunit(model_version=model_version) + def _get_ml_model_properties_workunit( self, registered_model: RegisteredModel, model_version: ModelVersion, run: Union[None, Run], ) -> MetadataWorkUnit: - """ - Generate an MLModel workunit for an MLflow Model Version. - Every Model Version is a DataHub MLModel entity associated with an MLModelGroup corresponding to a Registered Model. - If a model was registered without an associated Run then hyperparams and metrics are not available. - """ ml_model_group_urn = self._make_ml_model_group_urn(registered_model) ml_model_urn = self._make_ml_model_urn(model_version) + if run: - hyperparams = [ - MLHyperParamClass(name=k, value=str(v)) - for k, v in run.data.params.items() - ] - training_metrics = [ - MLMetricClass(name=k, value=str(v)) for k, v in run.data.metrics.items() - ] + # Use the same metrics and hyperparams from the run + hyperparams = self._get_run_params(run) + training_metrics = self._get_run_metrics(run) else: hyperparams = None training_metrics = None + + created_time = model_version.creation_timestamp + created_actor = ( + f"urn:li:platformResource:{model_version.user_id}" + if model_version.user_id + else None + ) + ml_model_properties = MLModelPropertiesClass( customProperties=model_version.tags, - externalUrl=self._make_external_url(model_version), + lastModified=TimeStampClass( + time=model_version.last_updated_timestamp, + actor=None, + ), + externalUrl=self._make_external_url_from_model_version(model_version), description=model_version.description, - date=model_version.creation_timestamp, + created=TimeStampClass( + time=created_time, + actor=created_actor, + ), version=VersionTagClass(versionTag=str(model_version.version)), hyperParams=hyperparams, trainingMetrics=training_metrics, - # mlflow tags are dicts, but datahub tags are lists. currently use only keys from mlflow tags tags=list(model_version.tags.keys()), - groups=[ml_model_group_urn], + groups=[str(ml_model_group_urn)], ) wu = self._create_workunit(urn=ml_model_urn, aspect=ml_model_properties) return wu @@ -293,24 +549,21 @@ def _make_ml_model_urn(self, model_version: ModelVersion) -> str: ) return urn - def _get_base_external_url_from_tracking_uri(self) -> Optional[str]: - if isinstance( - self.client.tracking_uri, str - ) and self.client.tracking_uri.startswith("http"): - return self.client.tracking_uri + def _make_external_url_from_model_version( + self, model_version: ModelVersion + ) -> Union[None, str]: + base_uri = self.client.tracking_uri + if base_uri.startswith("http"): + return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}" else: return None - def _make_external_url(self, model_version: ModelVersion) -> Optional[str]: - """ - Generate URL for a Model Version to MLflow UI. 
- """ - base_uri = ( - self.config.base_external_url - or self._get_base_external_url_from_tracking_uri() - ) - if base_uri: - return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}" + def _make_external_url_from_run( + self, experiment: Experiment, run: Run + ) -> Union[None, str]: + base_uri = self.client.tracking_uri + if base_uri.startswith("http"): + return f"{base_uri.rstrip('/')}/#/experiments/{experiment.experiment_id}/runs/{run.info.run_id}" else: return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index f7458c4eb4d5b..b49d40a0c7eb6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,7 +9,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]: return dict_ -class PlatformDetail(ConfigModel): - platform_instance: Optional[str] = pydantic.Field( - default=None, - description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match " - "with platform instance name used in ingestion " - "recipe of other datahub sources.", - ) - env: str = pydantic.Field( - default=builder.DEFAULT_ENV, - description="The environment that all assets produced by DataHub platform ingestion source belong to", - ) - - class DataBricksPlatformDetail(PlatformDetail): """ metastore is an additional field used in Databricks connector to generate the dataset urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index baaa8d5b85ae1..6d51e853a2fb0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -2,8 +2,8 @@ from abc import ABC, abstractmethod from typing import Union +from datahub.configuration.source_common import PlatformDetail from datahub.ingestion.source.powerbi.config import ( - PlatformDetail, PowerBiDashboardSourceConfig, PowerBIPlatformDetail, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py index ffaed79f4e42a..63520bd731de8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py @@ -5,13 +5,13 @@ from lark import Tree +from datahub.configuration.source_common import PlatformDetail from datahub.emitter import mce_builder as builder from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( Constant, DataBricksPlatformDetail, DataPlatformPair, - PlatformDetail, 
PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, PowerBIPlatformDetail, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index c3a7912c40e8e..e5883dd0349a3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -540,6 +540,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, schema_resolver=schema_resolver, discovered_tables=discovered_datasets, + graph=self.ctx.graph, ) # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index 5107a4e38f64d..d3941e7add0fd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -1,11 +1,17 @@ from dataclasses import dataclass, field from typing import Dict, List, Optional, Union -from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn +from datahub.emitter.mce_builder import ( + make_data_flow_urn, + make_data_job_urn, + make_data_platform_urn, + make_dataplatform_instance_urn, +) from datahub.metadata.schema_classes import ( DataFlowInfoClass, DataJobInfoClass, DataJobInputOutputClass, + DataPlatformInstanceClass, ) @@ -204,6 +210,18 @@ def as_datajob_info_aspect(self) -> DataJobInfoClass: status=self.status, ) + @property + def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: + if self.entity.flow.platform_instance: + return DataPlatformInstanceClass( + platform=make_data_platform_urn(self.entity.flow.orchestrator), + instance=make_dataplatform_instance_urn( + platform=self.entity.flow.orchestrator, + instance=self.entity.flow.platform_instance, + ), + ) + return None + @dataclass class MSSQLDataFlow: @@ -238,3 +256,14 @@ def as_dataflow_info_aspect(self) -> DataFlowInfoClass: customProperties=self.flow_properties, externalUrl=self.external_url, ) + + @property + def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: + if self.entity.platform_instance: + return DataPlatformInstanceClass( + platform=make_data_platform_urn(self.entity.orchestrator), + instance=make_dataplatform_instance_urn( + self.entity.orchestrator, self.entity.platform_instance + ), + ) + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 414c1faaa1661..9d8b67041998c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -639,6 +639,13 @@ def construct_job_workunits( aspect=data_job.as_datajob_info_aspect, ).as_workunit() + data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect + if data_platform_instance_aspect: + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_platform_instance_aspect, + ).as_workunit() + if include_lineage: yield MetadataChangeProposalWrapper( entityUrn=data_job.urn, @@ -654,6 +661,13 @@ def construct_flow_workunits( entityUrn=data_flow.urn, aspect=data_flow.as_dataflow_info_aspect, ).as_workunit() + + 
data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect + if data_platform_instance_aspect: + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_platform_instance_aspect, + ).as_workunit() # TODO: Add SubType when it appear def get_inspectors(self) -> Iterable[Inspector]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 6cc2220d90fd9..fadcb8ff8f396 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -645,7 +645,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None: # the site-role might be different on another site logged_in_user: UserInfo = UserInfo.from_server(server=server) - if not logged_in_user.is_site_administrator_explorer(): + if not logged_in_user.has_site_administrator_explorer_privileges(): report.warning( title=title, message=message, @@ -896,10 +896,9 @@ def dataset_browse_prefix(self) -> str: return f"/{self.config.env.lower()}{self.no_env_browse_prefix}" def _re_authenticate(self): - tableau_auth: Union[ - TableauAuth, PersonalAccessTokenAuth - ] = self.config.get_tableau_auth(self.site_id) - self.server.auth.sign_in(tableau_auth) + # Sign-in again may not be enough because Tableau sometimes caches invalid sessions + # so we need to recreate the Tableau Server object + self.server = self.config.make_tableau_client(self.site_id) @property def site_content_url(self) -> Optional[str]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py index ea0878143ef35..d69312f803021 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py @@ -82,4 +82,6 @@ SITE = "Site" IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql" SITE_PERMISSION = "sitePermission" -SITE_ROLE = "SiteAdministratorExplorer" +ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer" +ROLE_SITE_ADMIN_CREATOR = "SiteAdministratorCreator" +ROLE_SERVER_ADMIN = "ServerAdministrator" diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py index f309622d12b91..482140a227511 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py @@ -11,8 +11,12 @@ class UserInfo: site_role: str site_id: str - def is_site_administrator_explorer(self): - return self.site_role == c.SITE_ROLE + def has_site_administrator_explorer_privileges(self): + return self.site_role in [ + c.ROLE_SITE_ADMIN_EXPLORER, + c.ROLE_SITE_ADMIN_CREATOR, + c.ROLE_SERVER_ADMIN, + ] @staticmethod def from_server(server: Server) -> "UserInfo": diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py index 4a703faf6091b..4ec0e5ef01d3c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py @@ -28,7 +28,7 @@ def check_user_role( try: # TODO: Add check for `Enable Derived Permissions` - if not 
logged_in_user.is_site_administrator_explorer(): + if not logged_in_user.has_site_administrator_explorer_privileges(): capability_dict[c.SITE_PERMISSION] = CapabilityReport( capable=False, failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.", diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 79ea98d1c7f54..f81eb291e89e1 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -490,7 +490,7 @@ def __init__( self._exit_stack.push(self._query_usage_counts) # Tool Extractor - self._tool_meta_extractor = ToolMetaExtractor() + self._tool_meta_extractor = ToolMetaExtractor.create(graph) self.report.tool_meta_report = self._tool_meta_extractor.report def close(self) -> None: diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index 0d85002776e5e..5af9d9d4f0fff 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -1,3 +1,4 @@ +import contextlib import json import logging from dataclasses import dataclass, field @@ -5,8 +6,15 @@ from typing_extensions import Protocol +from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + PlatformResource, + PlatformResourceSearchFields, +) from datahub.ingestion.api.report import Report +from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn +from datahub.utilities.search_utils import LogicalOperator from datahub.utilities.stats_collections import int_top_k_dict UrnStr = str @@ -31,6 +39,7 @@ def _get_last_line(query: str) -> str: @dataclass class ToolMetaExtractorReport(Report): num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict) + failures: List[str] = field(default_factory=list) class ToolMetaExtractor: @@ -42,14 +51,81 @@ class ToolMetaExtractor: by warehouse query logs. 
""" - def __init__(self) -> None: - self.report = ToolMetaExtractorReport() + def __init__( + self, + report: ToolMetaExtractorReport, + looker_user_mapping: Optional[Dict[str, str]] = None, + ) -> None: + self.report = report self.known_tool_extractors: List[Tuple[str, Callable[[QueryLog], bool]]] = [ ( "mode", self._extract_mode_query, - ) + ), + ( + "looker", + self._extract_looker_query, + ), ] + # maps user id (as string) to email address + self.looker_user_mapping = looker_user_mapping + + @classmethod + def create( + cls, + graph: Optional[DataHubGraph] = None, + ) -> "ToolMetaExtractor": + report = ToolMetaExtractorReport() + looker_user_mapping = None + if graph: + try: + looker_user_mapping = cls.extract_looker_user_mapping_from_graph( + graph, report + ) + except Exception as e: + report.failures.append( + f"Unexpected error during Looker user metadata extraction: {str(e)}" + ) + + return cls(report, looker_user_mapping) + + @classmethod + def extract_looker_user_mapping_from_graph( + cls, graph: DataHubGraph, report: ToolMetaExtractorReport + ) -> Optional[Dict[str, str]]: + looker_user_mapping = None + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PLATFORM, "looker") + .add_field_match( + PlatformResourceSearchFields.RESOURCE_TYPE, + "USER_ID_MAPPING", + ) + .end() + ) + platform_resources = list( + PlatformResource.search_by_filters(query=query, graph_client=graph) + ) + + if len(platform_resources) > 1: + report.failures.append( + "Looker user metadata extraction failed. Found more than one looker user id mappings." + ) + else: + platform_resource = platform_resources[0] + + if ( + platform_resource + and platform_resource.resource_info + and platform_resource.resource_info.value + ): + with contextlib.suppress(ValueError, AssertionError): + value = platform_resource.resource_info.value.as_raw_json() + if value: + looker_user_mapping = value + + return looker_user_mapping def _extract_mode_query(self, entry: QueryLog) -> bool: """ @@ -78,14 +154,49 @@ def _extract_mode_query(self, entry: QueryLog) -> bool: return True + def _extract_looker_query(self, entry: QueryLog) -> bool: + """ + Returns: + bool: whether QueryLog entry is that of looker and looker user info + is extracted into entry. 
+ """ + if not self.looker_user_mapping: + return False + + last_line = _get_last_line(entry.query_text) + + if not (last_line.startswith("--") and "Looker Query Context" in last_line): + return False + + start_quote_idx = last_line.index("'") + end_quote_idx = last_line.rindex("'") + if start_quote_idx == -1 or end_quote_idx == -1: + return False + + looker_json_raw = last_line[start_quote_idx + 1 : end_quote_idx] + looker_json = json.loads(looker_json_raw) + + user_id = str(looker_json["user_id"]) + email = self.looker_user_mapping.get(user_id) + if not email: + return False + + original_user = entry.user + + entry.user = email_to_user_urn(email) + entry.extra_info = entry.extra_info or {} + entry.extra_info["user_via"] = original_user + + return True + def extract_bi_metadata(self, entry: QueryLog) -> bool: for tool, meta_extractor in self.known_tool_extractors: try: if meta_extractor(entry): self.report.num_queries_meta_extracted[tool] += 1 return True - except Exception: - logger.debug("Tool metadata extraction failed with error : {e}") + except Exception as e: + logger.debug(f"Tool metadata extraction failed with error : {e}") return False diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index a9c445b5986ef..6ae772c134cb3 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -842,6 +842,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index af9c62a2a4180..d7620980a9ced 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -497,6 +497,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": 
"urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b89bc356b48fd..13963af55bfe5 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 810fefd8f6cb8..f11d060102851 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": 
"platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 3d78397f54a23..f6e39dd5286cd 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -828,6 +828,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index 5a540e61e768d..203bed843155c 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": 
"urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -708,6 +723,21 @@ "/Folders/Personal" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-2@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -1108,12 +1138,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/data" ] } }, @@ -1126,12 +1156,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "sales_model", + "model": "data", "looker.explore.label": "My Explore View", - "looker.explore.name": "sales_explore", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1153,7 +1183,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1208,7 +1238,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1227,12 +1257,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1244,12 +1274,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1261,7 +1291,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1271,8 +1301,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1287,12 
+1317,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/order_model" ] } }, @@ -1305,12 +1335,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "data", + "model": "order_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "my_view", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1332,7 +1362,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1387,7 +1417,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1406,12 +1436,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1423,12 +1453,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1440,7 +1470,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1450,8 +1480,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } @@ -1466,12 +1496,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/sales_model" ] } }, @@ -1484,12 +1514,12 @@ 
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "order_model", + "model": "sales_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "order_explore", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1511,7 +1541,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1566,7 +1596,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1585,12 +1615,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ -1602,12 +1632,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1619,7 +1649,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1629,8 +1659,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1705,6 +1735,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { 
+ "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index 9ac95b8482a47..87af50f95ed6b 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -793,6 +793,60 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:looker,ap-south-1)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 3a2c6359ea63c..b990ce7c67dab 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -759,6 +759,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 
1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 007eee348aeaf..391192b3d16f3 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -513,6 +513,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 859b9163d7aad..4909a6af73a22 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -1185,6 +1200,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + 
"pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 8256c984afb27..ddeb5428b1d72 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -762,6 +762,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", @@ -814,8 +870,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -831,8 +887,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -865,8 +921,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)", + 
"entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 0b3530f9c2462..594983c8fb0f2 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -678,6 +678,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 8bbf14709ff9f..a39de8384efb2 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -83,6 +83,7 @@ def test_looker_ingest(pytestconfig, tmp_path, mock_time): with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) + mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" @@ -319,6 +320,7 @@ def setup_mock_look(mocked_client): mocked_client.all_looks.return_value = [ Look( id="1", + user_id="1", title="Outer Look", description="I am not part of any Dashboard", query_id="1", @@ -327,6 +329,7 @@ def setup_mock_look(mocked_client): Look( id="2", title="Personal Look", + user_id="2", description="I am not part of any Dashboard and in personal folder", query_id="2", folder=FolderBase( @@ -561,6 +564,20 @@ def get_user( mocked_client.user.side_effect = get_user +def setup_mock_all_user(mocked_client): + def all_users( + fields: Optional[str] = None, + transport_options: Optional[transport.TransportOptions] = None, + ) -> List[User]: + return [ + User(id="1", email="test-1@looker.com"), + User(id="2", email="test-2@looker.com"), + User(id="3", email="test-3@looker.com"), + ] + + mocked_client.all_users.side_effect = all_users + + def side_effect_query_inline( result_format: str, body: WriteQuery, 
transport_options: Optional[TransportOptions] ) -> str: @@ -714,6 +731,7 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" @@ -946,6 +964,8 @@ def ingest_independent_looks( mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) setup_mock_explore(mocked_client) + setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) setup_mock_look(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index b67ebfb206883..b36188405e7e1 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -1,13 +1,14 @@ [ { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData" }, @@ -23,7 +24,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -39,12 +40,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -55,7 +57,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -73,12 +75,17 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { - "path": [] + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + ] } }, "systemMetadata": { @@ -89,7 +96,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -105,19 +112,36 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": 
"urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-19 12:34:45.843000", + "date_modified": "2024-12-19 12:34:46.017000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -138,7 +162,24 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -156,12 +197,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -172,13 +213,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_accessadmin" @@ -195,7 +237,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -211,12 +253,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -227,7 +270,7 @@ }, { "entityType": "container", - "entityUrn": 
"urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -245,15 +288,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -266,12 +313,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -282,13 +329,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_backupoperator" @@ -305,7 +353,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -321,12 +369,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -337,7 +386,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -355,15 +404,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -376,12 +429,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": 
"urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -392,13 +445,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_datareader" @@ -415,7 +469,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -431,12 +485,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -447,7 +502,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -465,15 +520,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -486,12 +545,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -502,13 +561,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_datawriter" @@ -525,7 +585,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -541,12 +601,13 @@ }, { "entityType": "container", - "entityUrn": 
"urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -557,7 +618,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -575,15 +636,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -596,12 +661,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -612,13 +677,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_ddladmin" @@ -635,7 +701,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -651,12 +717,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -667,7 +734,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -685,15 +752,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -706,12 +777,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -722,13 +793,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_denydatareader" @@ -745,7 +817,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -761,12 +833,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -777,7 +850,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -795,15 +868,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -816,12 +893,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -832,13 +909,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", 
"env": "PROD", "database": "DemoData", "schema": "db_denydatawriter" @@ -855,7 +933,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -871,12 +949,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -887,7 +966,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -905,15 +984,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -926,12 +1009,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -942,13 +1025,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_owner" @@ -965,7 +1049,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -981,12 +1065,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -997,7 +1082,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1015,15 +1100,19 @@ 
}, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1036,12 +1125,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -1052,13 +1141,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_securityadmin" @@ -1075,7 +1165,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1091,12 +1181,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -1107,7 +1198,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1125,15 +1216,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1146,12 +1241,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": 
"urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -1162,13 +1257,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "dbo" @@ -1185,7 +1281,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1201,12 +1297,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -1217,7 +1314,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1235,15 +1332,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1256,12 +1357,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1" + "container": "urn:li:container:92899b29bb814fdeb1186eb99139073f" } }, "systemMetadata": { @@ -1273,7 +1374,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1345,7 +1446,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" 
+ } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1363,19 +1481,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" }, { - "id": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", - "urn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1" + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + }, + { + "id": "urn:li:container:92899b29bb814fdeb1186eb99139073f", + "urn": "urn:li:container:92899b29bb814fdeb1186eb99139073f" } ] } @@ -1388,12 +1510,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -1404,13 +1526,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "Foo" @@ -1427,7 +1550,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1443,12 +1566,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -1459,7 +1583,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1477,15 +1601,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1498,12 +1626,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1515,7 +1643,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1587,7 +1715,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1605,19 +1750,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + }, + { + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -1630,12 +1779,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1647,7 +1796,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1720,7 +1869,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1738,19 +1904,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -1763,12 +1933,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1780,7 +1950,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1877,7 +2047,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1895,19 +2082,23 @@ }, { "entityType": 
"dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -1920,12 +2111,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1937,7 +2128,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -2012,12 +2203,12 @@ { "name": "FK_TempSales_SalesReason", "foreignFields": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD),ID)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD),ID)" ], "sourceFields": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD),TempID)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD),TempID)" ], - "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)" + "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)" } ] } @@ -2033,7 +2224,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2051,19 +2259,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -2076,12 +2288,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -2093,7 +2305,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -2103,8 +2315,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2192,7 +2404,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2210,7 +2439,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -2228,19 +2457,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": 
"urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + }, + { + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -2253,7 +2486,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -2269,9 +2502,26 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -2282,8 +2532,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-19 12:34:45.660000", + "date_modified": "2024-12-19 12:34:45.660000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2300,7 +2550,24 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -2310,8 +2577,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE 
Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-05 16:44:43.803000", - "date_modified": "2024-12-05 16:44:43.803000" + "date_created": "2024-12-19 12:34:45.667000", + "date_modified": "2024-12-19 12:34:45.667000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2326,14 +2593,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -2344,13 +2628,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "guest" @@ -2367,7 +2652,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2383,12 +2668,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -2399,7 +2685,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2417,15 +2703,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": 
"urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -2438,12 +2728,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -2454,13 +2744,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "INFORMATION_SCHEMA" @@ -2477,7 +2768,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2493,12 +2784,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -2509,7 +2801,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2527,15 +2819,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -2548,12 +2844,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -2564,13 +2860,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "sys" @@ -2587,7 +2884,7 @@ }, { "entityType": "container", - "entityUrn": 
"urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2603,12 +2900,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -2619,7 +2917,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2637,15 +2935,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -2658,7 +2960,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -2669,7 +2971,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.demodata.foo.persons,PROD)", "type": "VIEW" } ] @@ -2683,7 +2985,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2699,7 +3001,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2715,7 +3017,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2731,7 +3033,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2747,7 +3049,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data 
Backup,PROD),Weekly Demo Data Backup)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml index 40bef3ff104a3..e003ec39cd528 100644 --- a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml +++ b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml @@ -7,6 +7,7 @@ source: password: test!Password database: DemoData host_port: localhost:21433 + platform_instance: my-instance # use_odbc: True # uri_args: # driver: "ODBC Driver 17 for SQL Server" diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py index 6f590b5307146..f6566f007f5e6 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py @@ -1,11 +1,14 @@ from datahub.configuration.datetimes import parse_absolute_time from datahub.metadata.urns import CorpUserUrn from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery -from datahub.sql_parsing.tool_meta_extractor import ToolMetaExtractor +from datahub.sql_parsing.tool_meta_extractor import ( + ToolMetaExtractor, + ToolMetaExtractorReport, +) def test_extract_mode_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -30,8 +33,42 @@ def test_extract_mode_metadata() -> None: assert extractor.report.num_queries_meta_extracted["mode"] == 1 +def test_extract_looker_metadata() -> None: + extractor = ToolMetaExtractor( + report=ToolMetaExtractorReport(), looker_user_mapping={"7": "john.doe@xyz.com"} + ) + looker_query = """\ +SELECT + all_entities_extended_sibling."ENTITY" AS "all_entities_extended_sibling.entity_type", + COUNT(DISTINCT ( all_entities_extended_sibling."URN" )) AS "all_entities_extended_sibling.distinct_count" +FROM "PUBLIC"."ALL_ENTITIES" + AS all_entities_extended_sibling +GROUP BY + 1 +ORDER BY + 1 +FETCH NEXT 50 ROWS ONLY +-- Looker Query Context '{"user_id":7,"history_slug":"264797031bc403cf382cbefbe3700849","instance_slug":"32654f2ffadf10b1949d4009e52fc6a4"}' +""" + + entry = PreparsedQuery( + query_id=None, + query_text=looker_query, + upstreams=[], + downstream=None, + column_lineage=None, + column_usage=None, + inferred_schema=None, + user=CorpUserUrn("mode"), + timestamp=parse_absolute_time("2021-08-01T01:02:03Z"), + ) + assert extractor.extract_bi_metadata(entry) + assert entry.user == CorpUserUrn("john.doe") + assert extractor.report.num_queries_meta_extracted["looker"] == 1 + + def test_extract_no_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -53,3 +90,4 @@ def test_extract_no_metadata() -> None: assert not extractor.extract_bi_metadata(entry) assert extractor.report.num_queries_meta_extracted["mode"] == 0 + assert extractor.report.num_queries_meta_extracted["looker"] == 0 diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py 
b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py index 85c86f8d205d9..5631ad2c69f94 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -37,7 +37,11 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Sou ), ) - with mock.patch("snowflake.connector.connect"): + with mock.patch( + "datahub.sql_parsing.sql_parsing_aggregator.ToolMetaExtractor.create", + ) as mock_checkpoint, mock.patch("snowflake.connector.connect"): + mock_checkpoint.return_value = mock.MagicMock() + yield SnowflakeV2Source(ctx=ctx, config=config) diff --git a/metadata-ingestion/tests/unit/test_iceberg.py b/metadata-ingestion/tests/unit/test_iceberg.py index b8a136586a2bf..3afa26b35dfe9 100644 --- a/metadata-ingestion/tests/unit/test_iceberg.py +++ b/metadata-ingestion/tests/unit/test_iceberg.py @@ -10,6 +10,8 @@ NoSuchIcebergTableError, NoSuchNamespaceError, NoSuchPropertyException, + NoSuchTableError, + ServerError, ) from pyiceberg.io.pyarrow import PyArrowFileIO from pyiceberg.partitioning import PartitionSpec @@ -39,6 +41,7 @@ UUIDType, ) +from datahub.configuration.common import AllowDenyPattern from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.iceberg.iceberg import ( @@ -62,12 +65,12 @@ ) -def with_iceberg_source(processing_threads: int = 1) -> IcebergSource: +def with_iceberg_source(processing_threads: int = 1, **kwargs: Any) -> IcebergSource: catalog = {"test": {"type": "rest"}} return IcebergSource( ctx=PipelineContext(run_id="iceberg-source-test"), config=IcebergSourceConfig( - catalog=catalog, processing_threads=processing_threads + catalog=catalog, processing_threads=processing_threads, **kwargs ), ) @@ -542,11 +545,11 @@ def __init__(self, tables: Dict[str, Dict[str, Callable[[], Table]]]): """ self.tables = tables - def list_namespaces(self) -> Iterable[str]: - return [*self.tables.keys()] + def list_namespaces(self) -> Iterable[Tuple[str]]: + return [*[(key,) for key in self.tables.keys()]] def list_tables(self, namespace: str) -> Iterable[Tuple[str, str]]: - return [(namespace, table) for table in self.tables[namespace].keys()] + return [(namespace[0], table) for table in self.tables[namespace[0]].keys()] def load_table(self, dataset_path: Tuple[str, str]) -> Table: return self.tables[dataset_path[0]][dataset_path[1]]() @@ -554,15 +557,15 @@ def load_table(self, dataset_path: Tuple[str, str]) -> Table: class MockCatalogExceptionListingTables(MockCatalog): def list_tables(self, namespace: str) -> Iterable[Tuple[str, str]]: - if namespace == "no_such_namespace": + if namespace == ("no_such_namespace",): raise NoSuchNamespaceError() - if namespace == "generic_exception": + if namespace == ("generic_exception",): raise Exception() return super().list_tables(namespace) class MockCatalogExceptionListingNamespaces(MockCatalog): - def list_namespaces(self) -> Iterable[str]: + def list_namespaces(self) -> Iterable[Tuple[str]]: raise Exception() @@ -814,15 +817,157 @@ def test_proper_run_with_multiple_namespaces() -> None: ) +def test_filtering() -> None: + source = with_iceberg_source( + processing_threads=1, + table_pattern=AllowDenyPattern(deny=[".*abcd.*"]), + namespace_pattern=AllowDenyPattern(allow=["namespace1"]), + ) + mock_catalog = MockCatalog( + { + "namespace1": { + "table_xyz": lambda: 
Table( + identifier=("namespace1", "table_xyz"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/table_xyz", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/table_xyz", + io=PyArrowFileIO(), + catalog=None, + ), + "JKLtable": lambda: Table( + identifier=("namespace1", "JKLtable"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/JKLtable", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/JKLtable", + io=PyArrowFileIO(), + catalog=None, + ), + "table_abcd": lambda: Table( + identifier=("namespace1", "table_abcd"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/table_abcd", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/table_abcd", + io=PyArrowFileIO(), + catalog=None, + ), + "aaabcd": lambda: Table( + identifier=("namespace1", "aaabcd"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/aaabcd", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/aaabcd", + io=PyArrowFileIO(), + catalog=None, + ), + }, + "namespace2": { + "foo": lambda: Table( + identifier=("namespace2", "foo"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace2/foo", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace2/foo", + io=PyArrowFileIO(), + catalog=None, + ), + "bar": lambda: Table( + identifier=("namespace2", "bar"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace2/bar", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace2/bar", + io=PyArrowFileIO(), + catalog=None, + ), + }, + "namespace3": { + "sales": lambda: Table( + identifier=("namespace3", "sales"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace3/sales", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace3/sales", + io=PyArrowFileIO(), + catalog=None, + ), + "products": lambda: Table( + identifier=("namespace2", "bar"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace3/products", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace3/products", + io=PyArrowFileIO(), + catalog=None, + ), + }, + } + ) + with patch( + "datahub.ingestion.source.iceberg.iceberg.IcebergSourceConfig.get_catalog" + ) as get_catalog: + get_catalog.return_value = mock_catalog + wu: List[MetadataWorkUnit] = [*source.get_workunits_internal()] + assert len(wu) == 2 + urns = [] + for unit in wu: + assert isinstance(unit.metadata, MetadataChangeEvent) + assert isinstance(unit.metadata.proposedSnapshot, DatasetSnapshotClass) + urns.append(unit.metadata.proposedSnapshot.urn) + TestCase().assertCountEqual( + urns, + [ + "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace1.table_xyz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace1.JKLtable,PROD)", + ], + ) + assert source.report.tables_scanned == 2 + + def test_handle_expected_exceptions() -> None: source = 
with_iceberg_source(processing_threads=3) def _raise_no_such_property_exception(): raise NoSuchPropertyException() - def _raise_no_such_table_exception(): + def _raise_no_such_iceberg_table_exception(): raise NoSuchIcebergTableError() + def _raise_file_not_found_error(): + raise FileNotFoundError() + + def _raise_no_such_table_exception(): + raise NoSuchTableError() + + def _raise_server_error(): + raise ServerError() + mock_catalog = MockCatalog( { "namespaceA": { @@ -876,6 +1021,9 @@ def _raise_no_such_table_exception(): ), "table5": _raise_no_such_property_exception, "table6": _raise_no_such_table_exception, + "table7": _raise_file_not_found_error, + "table8": _raise_no_such_iceberg_table_exception, + "table9": _raise_server_error, } } ) @@ -899,7 +1047,7 @@ def _raise_no_such_table_exception(): "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceA.table4,PROD)", ], ) - assert source.report.warnings.total_elements == 2 + assert source.report.warnings.total_elements == 5 assert source.report.failures.total_elements == 0 assert source.report.tables_scanned == 4 diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl index f33c41e63efed..fe782dbe01ca9 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl @@ -15,7 +15,7 @@ record DataProcessInstanceOutput { @Relationship = { "/*": { "name": "Produces", - "entityTypes": [ "dataset" ] + "entityTypes": [ "dataset", "mlModel" ] } } @Searchable = { diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index c63cb1a97c017..5c6bfaecf1ef4 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -52,4 +52,4 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc } created: AuditStamp -} +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl index b54e430038082..3cc87633c3e07 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl @@ -4,6 +4,7 @@ import com.linkedin.common.Urn import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties +import com.linkedin.common.TimeStamp /** * Properties associated with an ML Model Group @@ -13,6 +14,17 @@ import com.linkedin.common.CustomProperties } record MLModelGroupProperties includes CustomProperties { + /** + * Display name of the MLModelGroup + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + /** * Documentation of the MLModelGroup */ @@ -25,10 +37,21 @@ record MLModelGroupProperties includes CustomProperties { /** * Date when the MLModelGroup was developed */ + @deprecated createdAt: optional Time + /** + * Time and Actor who created the MLModelGroup + */ + created: 
optional TimeStamp + + /** + * Date when the MLModelGroup was last modified + */ + lastModified: optional TimeStamp + /** * Version of the MLModelGroup */ version: optional VersionTag -} +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl index 621a3e1747b50..2a047316ee238 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl @@ -6,6 +6,7 @@ import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties import com.linkedin.common.ExternalReference +import com.linkedin.common.TimeStamp /** * Properties associated with a ML Model @@ -15,6 +16,18 @@ import com.linkedin.common.ExternalReference } record MLModelProperties includes CustomProperties, ExternalReference { + /** + * Display name of the MLModel + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + + /** * Documentation of the MLModel */ @@ -27,8 +40,19 @@ record MLModelProperties includes CustomProperties, ExternalReference { /** * Date when the MLModel was developed */ + @deprecated date: optional Time + /** + * Audit stamp containing who created this and when + */ + created: optional TimeStamp + + /** + * Date when the MLModel was last modified + */ + lastModified: optional TimeStamp + /** * Version of the MLModel */ @@ -129,4 +153,4 @@ record MLModelProperties includes CustomProperties, ExternalReference { } } groups: optional array[Urn] -} +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl new file mode 100644 index 0000000000000..f8b8eeafe908b --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl @@ -0,0 +1,36 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.AuditStamp +import com.linkedin.common.CustomProperties +import com.linkedin.common.ExternalReference +import com.linkedin.common.Urn +import com.linkedin.common.JobFlowUrn +import com.linkedin.common.DataJobUrn +/** + * The inputs and outputs of this training run + */ +@Aspect = { + "name": "mlTrainingRunProperties", +} +record MLTrainingRunProperties includes CustomProperties, ExternalReference { + + /** + * Run Id of the ML Training Run + */ + id: optional string + + /** + * List of URLs for the Outputs of the ML Training Run + */ + outputUrls: optional array[string] + + /** + * Hyperparameters of the ML Training Run + */ + hyperParams: optional array[MLHyperParam] + + /** + * Metrics of the ML Training Run + */ + trainingMetrics: optional array[MLMetric] +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl index 2f36eda9141ab..1a1dbea4359fb 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl @@ -9,9 +9,13 @@ enum PlatformResourceType { /** * e.g. a Slack member resource, Looker user resource, etc. 
*/ - USER_INFO, + USER_INFO, /** * e.g. a Slack channel */ CONVERSATION + /** + * e.g. Looker mapping of all user ids + */ + USER_ID_MAPPING } diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 1c3eb5b574e20..4fe170ced69f3 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -116,6 +116,10 @@ entities: - dataProcessInstanceRunEvent - status - testResults + - dataPlatformInstance + - subTypes + - container + - mlTrainingRunProperties - name: chart category: core keyAspect: chartKey diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 827789130d8bb..01c3ddd134f4c 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -3829,6 +3829,17 @@ "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", "doc" : "Documentation of the MLModel", @@ -3841,6 +3852,17 @@ "name" : "date", "type" : "com.linkedin.common.Time", "doc" : "Date when the MLModel was developed", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index b549cef0af84b..2eaeadf9b00f7 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -3987,6 +3987,17 @@ "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", "doc" : "Documentation of the MLModel", @@ -3999,6 +4010,17 @@ "name" : "date", "type" : "com.linkedin.common.Time", "doc" : "Date when the MLModel was developed", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", @@ -4984,6 +5006,17 @@ "doc" : "Properties associated 
with an ML Model Group", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", "doc" : "Documentation of the MLModelGroup", @@ -4996,6 +5029,17 @@ "name" : "createdAt", "type" : "com.linkedin.common.Time", "doc" : "Date when the MLModelGroup was developed", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified", "optional" : true }, { "name" : "version", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index c8be9d063eaea..b85fd9ebc6908 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -3553,6 +3553,17 @@ "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", "doc" : "Documentation of the MLModel", @@ -3565,6 +3576,17 @@ "name" : "date", "type" : "com.linkedin.common.Time", "doc" : "Date when the MLModel was developed", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index 8c7595c5e505d..851a902ddd6be 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -3547,6 +3547,17 @@ "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", "doc" : "Documentation of the MLModel", @@ -3559,6 +3570,17 @@ "name" : "date", "type" : "com.linkedin.common.Time", "doc" : "Date when the MLModel was developed", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" 
: "Audit stamp containing who created this and when", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 75e5c9a559076..99005bec3e284 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -3981,6 +3981,17 @@ "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", "doc" : "Documentation of the MLModel", @@ -3993,6 +4004,17 @@ "name" : "date", "type" : "com.linkedin.common.Time", "doc" : "Date when the MLModel was developed", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", @@ -4978,6 +5000,17 @@ "doc" : "Properties associated with an ML Model Group", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", "doc" : "Documentation of the MLModelGroup", @@ -4990,6 +5023,17 @@ "name" : "createdAt", "type" : "com.linkedin.common.Time", "doc" : "Date when the MLModelGroup was developed", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified", "optional" : true }, { "name" : "version", diff --git a/python-build/.gitignore b/python-build/.gitignore new file mode 100644 index 0000000000000..d2de6dec25809 --- /dev/null +++ b/python-build/.gitignore @@ -0,0 +1,3 @@ + +/wheels +/site diff --git a/python-build/build.gradle b/python-build/build.gradle new file mode 100644 index 0000000000000..e90bffd46828c --- /dev/null +++ b/python-build/build.gradle @@ -0,0 +1,27 @@ +plugins { + id 'base' +} + +ext { + python_executable = 'python3' +} + +task checkPythonVersion(type: Exec) { + commandLine python_executable, '-c', + 'import sys; sys.version_info >= (3, 8), f"Python version {sys.version_info} is too old"' +} + +task buildWheels(type: Exec, dependsOn: [ + checkPythonVersion, + ':metadata-ingestion:buildWheel', + ':metadata-ingestion-modules:airflow-plugin:buildWheel', + ':metadata-ingestion-modules:dagster-plugin:buildWheel', + 
':metadata-ingestion-modules:prefect-plugin:buildWheel', + ':metadata-ingestion-modules:gx-plugin:buildWheel', +]) { + commandLine python_executable, "copy_wheels.py" +} + +task buildSite(type: Exec, dependsOn: [buildWheels]) { + commandLine python_executable, "build_site.py" +} diff --git a/python-build/build_site.py new file mode 100644 index 0000000000000..73941eca9968c --- /dev/null +++ b/python-build/build_site.py @@ -0,0 +1,150 @@ +import contextlib +import json +import os +import pathlib +import shutil +import subprocess +from datetime import datetime, timezone + +PYTHON_BUILD_DIR = pathlib.Path(__file__).parent +WHEEL_DIR = PYTHON_BUILD_DIR / "wheels" +SITE_OUTPUT_DIR = PYTHON_BUILD_DIR / "site" + +shutil.rmtree(SITE_OUTPUT_DIR, ignore_errors=True) +SITE_OUTPUT_DIR.mkdir(parents=True) + +SITE_ARTIFACT_WHEEL_DIR = SITE_OUTPUT_DIR / "artifacts" / "wheels" +SITE_ARTIFACT_WHEEL_DIR.mkdir(parents=True) +for wheel_file in WHEEL_DIR.glob("*"): + shutil.copy(wheel_file, SITE_ARTIFACT_WHEEL_DIR) + + +def package_name(wheel_file: pathlib.Path) -> str: + return wheel_file.name.split("-")[0].replace("_", "-") + + +# Get some extra context about the build +ts = datetime.now(timezone.utc).isoformat() +context_info: dict = { + "timestamp": ts, +} + +# Get branch info. +with contextlib.suppress(Exception): + if branch_info := os.getenv("GITHUB_HEAD_REF"): + pass + else: + branch_info = subprocess.check_output( + ["git", "branch", "--show-current"], text=True + ) + context_info["branch"] = branch_info.strip() + +# Get commit info. +with contextlib.suppress(Exception): + commit_info = subprocess.check_output( + ["git", "log", "-1", "--pretty=%H%n%B"], text=True + ) + commit_hash, commit_msg = commit_info.strip().split("\n", 1) + context_info["commit"] = { + "hash": commit_hash, + "message": commit_msg.strip(), + } + +# Get PR info. +with contextlib.suppress(Exception): + pr_info = "unknown" + if github_ref := os.getenv("GITHUB_REF"): + # e.g. GITHUB_REF=refs/pull/12157/merge + parts = github_ref.split("/") + if parts[1] == "pull": + pull_number = parts[2] + pr_info = json.loads( + subprocess.check_output( + ["gh", "pr", "view", pull_number, "--json", "title,number,url"], + text=True, + ) + ) + else: + # The `gh` CLI might be able to figure it out. + pr_info = json.loads( + subprocess.check_output( + ["gh", "pr", "view", "--json", "title,number,url"], text=True + ) + ) + context_info["pr"] = pr_info + + +newline = "\n" +(SITE_OUTPUT_DIR / "index.html").write_text( + f"""
+<html>
+<head>
+  <title>DataHub Python Builds</title>
+</head>
+<body>
+  <h1>DataHub Python Builds</h1>
+  <p>
+    These prebuilt wheel files can be used to install our Python packages as of a specific commit.
+  </p>
+
+  <h2>Build context</h2>
+  <p>
+    Built at {ts}.
+  </p>
+  <pre>{json.dumps(context_info, indent=2)}</pre>
+
+  <h2>Usage</h2>
+  <p>
+    Current base URL: unknown
+  </p>
+
+  <table>
+    <thead>
+      <tr>
+        <th>Package</th>
+        <th>Size</th>
+        <th>Install command</th>
+      </tr>
+    </thead>
+    <tbody>
+      {
+        newline.join(
+            f'''
+            <tr>
+              <td>{package_name(wheel_file)}</td>
+              <td>{wheel_file.stat().st_size / 1024 / 1024:.3f} MB</td>
+              <td>uv pip install '{package_name(wheel_file)} @ <base-url>/artifacts/wheels/{wheel_file.name}'</td>
+            </tr>
+            '''
+            for wheel_file in sorted(WHEEL_DIR.glob("*.whl"))
+        )
+      }
+    </tbody>
+  </table>
+
+</body>
+</html>
+"""
+)
+
+print("DataHub Python wheel site built in", SITE_OUTPUT_DIR)
diff --git a/python-build/copy_wheels.py new file mode 100644 index 0000000000000..b66662cbfe991 --- /dev/null +++ b/python-build/copy_wheels.py @@ -0,0 +1,27 @@ +import pathlib +import shutil + +PYTHON_BUILD_DIR = pathlib.Path(__file__).parent +ROOT_DIR = PYTHON_BUILD_DIR.parent +WHEEL_OUTPUT_DIR = PYTHON_BUILD_DIR / "wheels" + +# These should line up with the build.gradle file. +wheel_dirs = [ + ROOT_DIR / "metadata-ingestion/dist", + ROOT_DIR / "metadata-ingestion-modules/airflow-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/dagster-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/prefect-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/gx-plugin/dist", +] + +# Delete and recreate the output directory. +if WHEEL_OUTPUT_DIR.exists(): + shutil.rmtree(WHEEL_OUTPUT_DIR) +WHEEL_OUTPUT_DIR.mkdir(parents=True) + +# Copy things over. +for wheel_dir in wheel_dirs: + for wheel_file in wheel_dir.glob("*"): + shutil.copy(wheel_file, WHEEL_OUTPUT_DIR) + +print("Copied wheels to", WHEEL_OUTPUT_DIR) diff --git a/settings.gradle index 8756df31c1ac6..b0c2c707d566c --- a/settings.gradle +++ b/settings.gradle @@ -64,6 +64,7 @@ include 'metadata-ingestion-modules:airflow-plugin' include 'metadata-ingestion-modules:gx-plugin' include 'metadata-ingestion-modules:dagster-plugin' include 'metadata-ingestion-modules:prefect-plugin' +include 'python-build' include 'smoke-test' include 'metadata-auth:auth-api' include 'metadata-service:schema-registry-api'