Merge branch 'datahub-project:master' into master
hsheth2 authored Jan 7, 2025
2 parents d3b4a5f + 03e3f46 commit ddb52f4
Showing 30 changed files with 421 additions and 151 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build-and-test.yml
@@ -113,7 +113,7 @@ jobs:
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
run: |
./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (build)
@@ -152,7 +152,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Upload
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: Event File
path: ${{ github.event_path }}
2 changes: 1 addition & 1 deletion .github/workflows/close-stale-issues.yml
@@ -10,7 +10,7 @@ jobs:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v6
- uses: actions/stale@v9
with:
ascending: true
operations-per-run: 100
6 changes: 3 additions & 3 deletions .github/workflows/contributor-open-pr-comment.yml
@@ -17,12 +17,12 @@ jobs:
- name: Get and Format Username (PR only)
if: github.event_name == 'pull_request'
run: |
formatted_username=$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')
echo "FORMATTED_USERNAME=$formatted_username" >> $GITHUB_ENV
formatted_username="$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')"
echo "FORMATTED_USERNAME=${formatted_username}" >> "$GITHUB_ENV"
- name: Create Comment (PR only)
if: github.event_name == 'pull_request'
uses: actions/github-script@v6
uses: actions/github-script@v7
with:
script: |
if (context.payload.pull_request) {
6 changes: 3 additions & 3 deletions .github/workflows/docker-unified.yml
@@ -1252,19 +1252,19 @@ jobs:
TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}"
source .github/scripts/docker_logs.sh
- name: Upload logs
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: failure()
with:
name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }}
path: "docker_logs/*.log"
retention-days: 5
- name: Upload screenshots
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: failure()
with:
name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }}
path: smoke-test/tests/cypress/cypress/screenshots/
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }}
4 changes: 2 additions & 2 deletions .github/workflows/metadata-io.yml
@@ -70,7 +70,7 @@ jobs:
- name: Gradle build (and test)
run: |
./gradlew :metadata-io:test
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (metadata-io)
@@ -95,7 +95,7 @@
runs-on: ubuntu-latest
steps:
- name: Upload
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: Event File
path: ${{ github.event_path }}
4 changes: 2 additions & 2 deletions .github/workflows/spark-smoke-test.yml
@@ -72,14 +72,14 @@ jobs:
docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true
docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true
- name: Upload logs
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: failure()
with:
name: docker logs
path: |
"**/build/container-logs/*.log"
"*.log"
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (smoke tests)
13 changes: 10 additions & 3 deletions docs/businessattributes.md
@@ -1,5 +1,10 @@
import FeatureAvailability from '@site/src/components/FeatureAvailability';

# Business Attributes

<FeatureAvailability ossOnly />

>**Note:** This is a <b>BETA</b> feature.
## What are Business Attributes
A Business Attribute, as its name implies, is an attribute with a business focus. It embodies the traits or properties of an entity within a business framework. This attribute is a crucial piece of data for a business, utilised to define or control the entity throughout the organisation. If a business process or concept is depicted as a comprehensive logical model, then each Business Attribute can be considered as an individual component within that model. While business names and descriptions are generally managed through glossary terms, Business Attributes encompass additional characteristics such as data quality rules/assertions, data privacy markers, data usage protocols, standard tags, and supplementary documentation, alongside Names and Descriptions.
@@ -70,9 +75,11 @@ Description inherited from business attribute is greyed out to differentiate bet
</p>

### Enable Business Attributes Feature
By default, business attribute is disabled. To enable Business Attributes feature, set the following configuration in [application.yaml](../metadata-service/configuration/src/main/resources/application.yaml)

businessAttributeEntityEnabled : true
By default, the Business Attributes feature is disabled. To enable it, export the following environment variable
(this may be done via `extraEnvs` for the GMS deployment):
```shell
BUSINESS_ATTRIBUTE_ENTITY_ENABLED=true
```
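
For a Helm-based deployment, a minimal sketch of how this might be wired through `extraEnvs` in the GMS values (the `datahub-gms.extraEnvs` structure is an assumption based on the standard DataHub Helm chart, not something shown on this page):

```yaml
# Sketch only: assumed values.yaml structure for the DataHub Helm chart.
datahub-gms:
  extraEnvs:
    - name: BUSINESS_ATTRIBUTE_ENTITY_ENABLED
      value: "true"
```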

### What updates are planned for the Business Attributes feature?

1 change: 1 addition & 0 deletions metadata-ingestion-modules/airflow-plugin/setup.py
@@ -119,6 +119,7 @@ def get_long_description():
"pendulum<3.0",
"Flask-Session<0.6.0",
"connexion<3.0",
"marshmallow<3.24.0",
},
}

4 changes: 2 additions & 2 deletions metadata-ingestion/docs/dev_guides/classification.md
@@ -7,10 +7,10 @@ The classification feature enables sources to be configured to automatically pre
Note that a `.` is used to denote nested fields in the YAML recipe.

| Field | Required | Type | Description | Default |
| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |------------------------------------------------------------|
| enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False |
| sample_size | | int | Number of sample values used for classification. | 100 |
| max_workers | | int | Number of worker processes to use for classification. Set to 1 to disable. | Number of cpu cores or 4 |
| max_workers | | int | Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable. | 1 |
| info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. |
| classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedence. | [{'type': 'datahub', 'config': None}] |
| table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
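
As an illustration, a rough sketch of how these options might appear in an ingestion recipe (the enclosing source type and connection details are placeholders for illustration; the classification field names follow the table above):

```yaml
# Sketch only: the source type and connection details are placeholders.
source:
  type: snowflake
  config:
    # ... connection details ...
    classification:
      enabled: true
      sample_size: 100
      max_workers: 1        # values above 1 might lead to a deadlock
      classifiers:
        - type: datahub
```
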
30 changes: 23 additions & 7 deletions metadata-ingestion/docs/sources/tableau/tableau_pre.md
@@ -3,9 +3,24 @@
In order to ingest metadata from Tableau, you will need:

- Tableau Server Version 2021.1.10 and above. It may also work for older versions.
- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if its not already enabled.
- Tableau Credentials (Username/Password or [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens))
- The user or token must have **Site Administrator Explorer** permissions.
- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if it's not already enabled. This is always enabled for Tableau Cloud.

### Authentication

DataHub supports two authentication methods:

1. Username/Password
2. [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens)

Either way, the user/token must have the **Site Administrator Explorer** site role.

:::info

We need the `Site Administrator Explorer` site role in order to get complete metadata from Tableau.

With any lower role, the Tableau Metadata API returns missing/partial metadata. This particularly affects data source fields and definitions, which impacts our ability to extract columns and generate column lineage. As such, other site roles like `Viewer` are insufficient with the current Tableau Metadata API.

:::
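
For reference, a rough sketch of how token-based authentication might look in an ingestion recipe (the field names `connect_uri`, `site`, `token_name`, and `token_value` are assumptions for illustration and should be checked against the Tableau source configuration reference):

```yaml
# Sketch only: field names are assumed, not taken from this page.
source:
  type: tableau
  config:
    connect_uri: https://tableau.example.com
    site: my_site
    token_name: my_pat_name            # PAT owned by a Site Administrator Explorer user
    token_value: "${TABLEAU_PAT_VALUE}"
```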

### Ingestion through UI

@@ -46,8 +61,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce

| Source Concept | DataHub Concept | Notes |
| --------------------------- | ------------------------------------------------------------- | --------------------------------- |
| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) |
| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` |
| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) |
| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` |
| Embedded DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Embedded Data Source"` |
| Published DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Published Data Source"` |
| Custom SQL Table | [Dataset](../../metamodel/entities/dataset.md) | SubTypes `"View"`, `"Custom SQL"` |
@@ -75,14 +90,15 @@ Lineage is emitted as received from Tableau's metadata API for

### Troubleshooting

### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project?
#### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project?

This may happen when the Tableau API returns a `NODE_LIMIT_EXCEEDED` error in response to a metadata query and returns partial results with the message "Showing partial results. , The request exceeded the ‘n’ node limit. Use pagination, additional filtering, or both in the query to adjust results." To resolve this, consider

- reducing the page size using the `page_size` config param in the DataHub recipe (defaults to 10), as shown in the sketch after this list.
- increasing the Tableau [metadata query node limit](https://help.tableau.com/current/server/en-us/cli_configuration-set_tsm.htm#metadata_nodelimit) configuration to a higher value.
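
A minimal sketch of the first option, assuming the rest of the Tableau recipe is already in place:

```yaml
# Sketch only: lowers page_size from its default of 10.
source:
  type: tableau
  config:
    # ... connection/auth details ...
    page_size: 5
```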

### `PERMISSIONS_MODE_SWITCHED` error in ingestion report
#### `PERMISSIONS_MODE_SWITCHED` error in ingestion report

This error occurs if the Tableau site is using external assets. For more detail, refer to the Tableau documentation [Manage Permissions for External Assets](https://help.tableau.com/current/online/en-us/dm_perms_assets.htm).

Follow the steps below to enable the derived permissions:
2 changes: 1 addition & 1 deletion metadata-ingestion/setup.py
@@ -461,7 +461,7 @@
"mssql-odbc": sql_common | mssql_common | {"pyodbc"},
"mysql": mysql,
# mariadb should have same dependency as mysql
"mariadb": sql_common | {"pymysql>=1.0.2"},
"mariadb": sql_common | mysql,
"okta": {"okta~=1.7.0", "nest-asyncio"},
"oracle": sql_common | {"oracledb"},
"postgres": sql_common | postgres_common,
23 changes: 15 additions & 8 deletions metadata-ingestion/src/datahub/emitter/rest_emitter.py
@@ -13,6 +13,7 @@
from datahub import nice_version_name
from datahub.cli import config_utils
from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
from datahub.cli.env_utils import get_boolean_env_variable
from datahub.configuration.common import ConfigurationError, OperationalError
from datahub.emitter.generic_emitter import Emitter
from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -46,6 +47,8 @@
os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
)

_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)

# The limit is 16mb. We will use a max of 15mb to have some space
# for overhead like request headers.
# This applies to pretty much all calls to GMS.
@@ -291,7 +294,8 @@ def emit_mcps(
mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
async_flag: Optional[bool] = None,
) -> int:
logger.debug("Attempting to emit batch mcps")
if _DATAHUB_EMITTER_TRACE:
logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
for mcp in mcps:
ensure_has_system_metadata(mcp)
@@ -304,22 +308,25 @@ def emit_mcps(
current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
for mcp_obj in mcp_objs:
mcp_obj_size = len(json.dumps(mcp_obj))
logger.debug(
f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
)
if _DATAHUB_EMITTER_TRACE:
logger.debug(
f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
)

if (
mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
):
logger.debug("Decided to create new chunk")
if _DATAHUB_EMITTER_TRACE:
logger.debug("Decided to create new chunk")
mcp_obj_chunks.append([])
current_chunk_size = 0
mcp_obj_chunks[-1].append(mcp_obj)
current_chunk_size += mcp_obj_size
logger.debug(
f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
)
if len(mcp_obj_chunks) > 0:
logger.debug(
f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
)

for mcp_obj_chunk in mcp_obj_chunks:
# TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -1,23 +1,25 @@
import json
import logging
from typing import Iterable, List
from typing import TYPE_CHECKING, Iterable, List

from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
from datahub.emitter.serialization_helper import pre_json_transform
from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import (
DatasetProfileClass,
SchemaFieldClass,
SchemaMetadataClass,
)

if TYPE_CHECKING:
from datahub.ingestion.api.source import SourceReport

logger = logging.getLogger(__name__)


class EnsureAspectSizeProcessor:
def __init__(
self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
):
self.report = report
self.payload_constraint = payload_constraint
4 changes: 4 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/api/source.py
@@ -31,6 +31,9 @@
from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
auto_patch_last_modified,
)
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
EnsureAspectSizeProcessor,
)
from datahub.ingestion.api.closeable import Closeable
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
from datahub.ingestion.api.report import Report
@@ -450,6 +453,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
browse_path_processor,
partial(auto_workunit_reporter, self.get_report()),
auto_patch_last_modified,
EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
]

@staticmethod
Expand Down
@@ -1,4 +1,3 @@
import os
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
)

max_workers: int = Field(
default=(os.cpu_count() or 4),
description="Number of worker processes to use for classification. Set to 1 to disable.",
default=1,
description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
)

table_pattern: AllowDenyPattern = Field(