Merge branch 'datahub-project:master' into master
hsheth2 authored Jan 7, 2025
2 parents d3b4a5f + 03e3f46 commit ddb52f4
Showing 30 changed files with 421 additions and 151 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build-and-test.yml
@@ -113,7 +113,7 @@ jobs:
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
run: |
./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (build)
@@ -152,7 +152,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Upload
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: Event File
path: ${{ github.event_path }}
2 changes: 1 addition & 1 deletion .github/workflows/close-stale-issues.yml
@@ -10,7 +10,7 @@ jobs:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v6
- uses: actions/stale@v9
with:
ascending: true
operations-per-run: 100
6 changes: 3 additions & 3 deletions .github/workflows/contributor-open-pr-comment.yml
@@ -17,12 +17,12 @@ jobs:
- name: Get and Format Username (PR only)
if: github.event_name == 'pull_request'
run: |
formatted_username=$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')
echo "FORMATTED_USERNAME=$formatted_username" >> $GITHUB_ENV
formatted_username="$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')"
echo "FORMATTED_USERNAME=${formatted_username}" >> "$GITHUB_ENV"
- name: Create Comment (PR only)
if: github.event_name == 'pull_request'
uses: actions/github-script@v6
uses: actions/github-script@v7
with:
script: |
if (context.payload.pull_request) {
6 changes: 3 additions & 3 deletions .github/workflows/docker-unified.yml
@@ -1252,19 +1252,19 @@ jobs:
TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}"
source .github/scripts/docker_logs.sh
- name: Upload logs
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: failure()
with:
name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }}
path: "docker_logs/*.log"
retention-days: 5
- name: Upload screenshots
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: failure()
with:
name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }}
path: smoke-test/tests/cypress/cypress/screenshots/
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }}
4 changes: 2 additions & 2 deletions .github/workflows/metadata-io.yml
@@ -70,7 +70,7 @@ jobs:
- name: Gradle build (and test)
run: |
./gradlew :metadata-io:test
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (metadata-io)
@@ -95,7 +95,7 @@
runs-on: ubuntu-latest
steps:
- name: Upload
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: Event File
path: ${{ github.event_path }}
4 changes: 2 additions & 2 deletions .github/workflows/spark-smoke-test.yml
@@ -72,14 +72,14 @@ jobs:
docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true
docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true
- name: Upload logs
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: failure()
with:
name: docker logs
path: |
"**/build/container-logs/*.log"
"*.log"
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (smoke tests)
13 changes: 10 additions & 3 deletions docs/businessattributes.md
@@ -1,5 +1,10 @@
import FeatureAvailability from '@site/src/components/FeatureAvailability';

# Business Attributes

<FeatureAvailability ossOnly />

>**Note:** This is a <b>BETA</b> feature.
## What are Business Attributes
A Business Attribute, as its name implies, is an attribute with a business focus. It embodies the traits or properties of an entity within a business framework. This attribute is a crucial piece of data for a business, utilised to define or control the entity throughout the organisation. If a business process or concept is depicted as a comprehensive logical model, then each Business Attribute can be considered as an individual component within that model. While business names and descriptions are generally managed through glossary terms, Business Attributes encompass additional characteristics such as data quality rules/assertions, data privacy markers, data usage protocols, standard tags, and supplementary documentation, alongside Names and Descriptions.
@@ -70,9 +75,11 @@ Description inherited from business attribute is greyed out to differentiate bet
</p>

### Enable Business Attributes Feature
By default, business attribute is disabled. To enable Business Attributes feature, set the following configuration in [application.yaml](../metadata-service/configuration/src/main/resources/application.yaml)

businessAttributeEntityEnabled : true
By default, the Business Attributes feature is disabled. To enable it, export the following environment variable
(this may be done via `extraEnvs` for the GMS deployment):
```shell
BUSINESS_ATTRIBUTE_ENTITY_ENABLED=true
```
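
For a Helm-based deployment, a minimal sketch of how this might be wired through `extraEnvs` in the GMS values (the `datahub-gms.extraEnvs` structure is an assumption based on the standard DataHub Helm chart, not something shown on this page):

```yaml
# Sketch only: assumed values.yaml structure for the DataHub Helm chart.
datahub-gms:
  extraEnvs:
    - name: BUSINESS_ATTRIBUTE_ENTITY_ENABLED
      value: "true"
```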

### What updates are planned for the Business Attributes feature?

1 change: 1 addition & 0 deletions metadata-ingestion-modules/airflow-plugin/setup.py
@@ -119,6 +119,7 @@ def get_long_description():
"pendulum<3.0",
"Flask-Session<0.6.0",
"connexion<3.0",
"marshmallow<3.24.0",
},
}

4 changes: 2 additions & 2 deletions metadata-ingestion/docs/dev_guides/classification.md
@@ -7,10 +7,10 @@ The classification feature enables sources to be configured to automatically pre
Note that a `.` is used to denote nested fields in the YAML recipe.

| Field | Required | Type | Description | Default |
| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |------------------------------------------------------------|
| enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False |
| sample_size | | int | Number of sample values used for classification. | 100 |
| max_workers | | int | Number of worker processes to use for classification. Set to 1 to disable. | Number of cpu cores or 4 |
| max_workers | | int | Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable. | 1 |
| info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. |
| classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedence. | [{'type': 'datahub', 'config': None}] |
| table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
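
As an illustration, a rough sketch of how these options might appear in an ingestion recipe (the enclosing source type and connection details are placeholders for illustration; the classification field names follow the table above):

```yaml
# Sketch only: the source type and connection details are placeholders.
source:
  type: snowflake
  config:
    # ... connection details ...
    classification:
      enabled: true
      sample_size: 100
      max_workers: 1        # values above 1 might lead to a deadlock
      classifiers:
        - type: datahub
```
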
30 changes: 23 additions & 7 deletions metadata-ingestion/docs/sources/tableau/tableau_pre.md
@@ -3,9 +3,24 @@
In order to ingest metadata from Tableau, you will need:

- Tableau Server Version 2021.1.10 and above. It may also work for older versions.
- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if its not already enabled.
- Tableau Credentials (Username/Password or [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens))
- The user or token must have **Site Administrator Explorer** permissions.
- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if it's not already enabled. This is always enabled for Tableau Cloud.

### Authentication

DataHub supports two authentication methods:

1. Username/Password
2. [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens)

Either way, the user/token must have the **Site Administrator Explorer** site role.

:::info

We need the `Site Administrator Explorer` site role in order to get complete metadata from Tableau.

With any lower role, the Tableau Metadata API returns missing/partial metadata. This particularly affects data source fields and definitions, which impacts our ability to extract columns and generate column lineage. As such, other site roles like `Viewer` are insufficient with the current Tableau Metadata API.

:::
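
For reference, a rough sketch of how token-based authentication might look in an ingestion recipe (the field names `connect_uri`, `site`, `token_name`, and `token_value` are assumptions for illustration and should be checked against the Tableau source configuration reference):

```yaml
# Sketch only: field names are assumed, not taken from this page.
source:
  type: tableau
  config:
    connect_uri: https://tableau.example.com
    site: my_site
    token_name: my_pat_name            # PAT owned by a Site Administrator Explorer user
    token_value: "${TABLEAU_PAT_VALUE}"
```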

### Ingestion through UI

@@ -46,8 +61,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce

| Source Concept | DataHub Concept | Notes |
| --------------------------- | ------------------------------------------------------------- | --------------------------------- |
| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) |
| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` |
| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) |
| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` |
| Embedded DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Embedded Data Source"` |
| Published DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Published Data Source"` |
| Custom SQL Table | [Dataset](../../metamodel/entities/dataset.md) | SubTypes `"View"`, `"Custom SQL"` |
@@ -75,14 +90,15 @@ Lineage is emitted as received from Tableau's metadata API for

### Troubleshooting

### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project?
#### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project?

This may happen when the Tableau API returns a `NODE_LIMIT_EXCEEDED` error in response to a metadata query and returns partial results with the message "Showing partial results. , The request exceeded the ‘n’ node limit. Use pagination, additional filtering, or both in the query to adjust results." To resolve this, consider

- reducing the page size using the `page_size` config param in the DataHub recipe (defaults to 10), as shown in the sketch after this list.
- increasing the Tableau [metadata query node limit](https://help.tableau.com/current/server/en-us/cli_configuration-set_tsm.htm#metadata_nodelimit) configuration to a higher value.
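
A minimal sketch of the first option, assuming the rest of the Tableau recipe is already in place:

```yaml
# Sketch only: lowers page_size from its default of 10.
source:
  type: tableau
  config:
    # ... connection/auth details ...
    page_size: 5
```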

### `PERMISSIONS_MODE_SWITCHED` error in ingestion report
#### `PERMISSIONS_MODE_SWITCHED` error in ingestion report

This error occurs if the Tableau site is using external assets. For more detail, refer to the Tableau documentation [Manage Permissions for External Assets](https://help.tableau.com/current/online/en-us/dm_perms_assets.htm).

Follow the steps below to enable the derived permissions:
2 changes: 1 addition & 1 deletion metadata-ingestion/setup.py
@@ -461,7 +461,7 @@
"mssql-odbc": sql_common | mssql_common | {"pyodbc"},
"mysql": mysql,
# mariadb should have same dependency as mysql
"mariadb": sql_common | {"pymysql>=1.0.2"},
"mariadb": sql_common | mysql,
"okta": {"okta~=1.7.0", "nest-asyncio"},
"oracle": sql_common | {"oracledb"},
"postgres": sql_common | postgres_common,
23 changes: 15 additions & 8 deletions metadata-ingestion/src/datahub/emitter/rest_emitter.py
@@ -13,6 +13,7 @@
from datahub import nice_version_name
from datahub.cli import config_utils
from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
from datahub.cli.env_utils import get_boolean_env_variable
from datahub.configuration.common import ConfigurationError, OperationalError
from datahub.emitter.generic_emitter import Emitter
from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -46,6 +47,8 @@
os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
)

_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)

# The limit is 16mb. We will use a max of 15mb to have some space
# for overhead like request headers.
# This applies to pretty much all calls to GMS.
@@ -291,7 +294,8 @@ def emit_mcps(
mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
async_flag: Optional[bool] = None,
) -> int:
logger.debug("Attempting to emit batch mcps")
if _DATAHUB_EMITTER_TRACE:
logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
for mcp in mcps:
ensure_has_system_metadata(mcp)
@@ -304,22 +308,25 @@ def emit_mcps(
current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
for mcp_obj in mcp_objs:
mcp_obj_size = len(json.dumps(mcp_obj))
logger.debug(
f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
)
if _DATAHUB_EMITTER_TRACE:
logger.debug(
f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
)

if (
mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
):
logger.debug("Decided to create new chunk")
if _DATAHUB_EMITTER_TRACE:
logger.debug("Decided to create new chunk")
mcp_obj_chunks.append([])
current_chunk_size = 0
mcp_obj_chunks[-1].append(mcp_obj)
current_chunk_size += mcp_obj_size
logger.debug(
f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
)
if len(mcp_obj_chunks) > 0:
logger.debug(
f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
)

for mcp_obj_chunk in mcp_obj_chunks:
# TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -1,23 +1,25 @@
import json
import logging
from typing import Iterable, List
from typing import TYPE_CHECKING, Iterable, List

from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
from datahub.emitter.serialization_helper import pre_json_transform
from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import (
DatasetProfileClass,
SchemaFieldClass,
SchemaMetadataClass,
)

if TYPE_CHECKING:
from datahub.ingestion.api.source import SourceReport

logger = logging.getLogger(__name__)


class EnsureAspectSizeProcessor:
def __init__(
self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
):
self.report = report
self.payload_constraint = payload_constraint
4 changes: 4 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/api/source.py
@@ -31,6 +31,9 @@
from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
auto_patch_last_modified,
)
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
EnsureAspectSizeProcessor,
)
from datahub.ingestion.api.closeable import Closeable
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
from datahub.ingestion.api.report import Report
@@ -450,6 +453,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
browse_path_processor,
partial(auto_workunit_reporter, self.get_report()),
auto_patch_last_modified,
EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
]

@staticmethod
Expand Down
@@ -1,4 +1,3 @@
import os
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
)

max_workers: int = Field(
default=(os.cpu_count() or 4),
description="Number of worker processes to use for classification. Set to 1 to disable.",
default=1,
description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
)

table_pattern: AllowDenyPattern = Field(