Skip to content

Commit

Permalink
Merge branch 'datahub-project:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
treff7es authored Oct 17, 2024
2 parents d6a99af + cf3634a commit 14dd3c0
Show file tree
Hide file tree
Showing 96 changed files with 6,871 additions and 8,663 deletions.
9 changes: 4 additions & 5 deletions .github/workflows/build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,16 @@ jobs:
timezoneLinux: ${{ matrix.timezone }}
- name: Check out the repo
uses: acryldata/sane-checkout-action@v3
- uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: pip
- name: Set up JDK 17
uses: actions/setup-java@v4
with:
distribution: "zulu"
java-version: 17
- uses: gradle/actions/setup-gradle@v3
- uses: actions/setup-python@v5
if: ${{ needs.setup.outputs.ingestion_change == 'true' }}
with:
python-version: "3.10"
cache: pip
- name: Gradle build (and test) for NOT metadata ingestion
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
run: |
Expand Down
3 changes: 3 additions & 0 deletions docker/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ dockerCompose {
isRequiredBy(tasks.named('quickstartDebug'))
composeAdditionalArgs = ['--profile', 'debug']

if (System.getenv().containsKey("DATAHUB_VERSION")) {
environment.put 'DATAHUB_VERSION', System.getenv("DATAHUB_VERSION")
}
environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally

useComposeFiles = ['profiles/docker-compose.yml']
Expand Down
2 changes: 1 addition & 1 deletion docker/profiles/docker-compose.frontend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ x-datahub-frontend-service: &datahub-frontend-service

x-datahub-frontend-service-dev: &datahub-frontend-service-dev
<<: *datahub-frontend-service
image: ${DATAHUB_FRONTEND_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-frontend-react}:debug
image: ${DATAHUB_FRONTEND_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-frontend-react}:${DATAHUB_VERSION:-debug}
ports:
- ${DATAHUB_MAPPED_FRONTEND_DEBUG_PORT:-5002}:5002
- ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002
Expand Down
8 changes: 4 additions & 4 deletions docker/profiles/docker-compose.gms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ x-datahub-system-update-service: &datahub-system-update-service

x-datahub-system-update-service-dev: &datahub-system-update-service-dev
<<: *datahub-system-update-service
image: ${DATAHUB_UPGRADE_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-upgrade}:debug
image: ${DATAHUB_UPGRADE_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-upgrade}:${DATAHUB_VERSION:-debug}
ports:
- ${DATAHUB_MAPPED_UPGRADE_DEBUG_PORT:-5003}:5003
environment: &datahub-system-update-dev-env
Expand Down Expand Up @@ -115,7 +115,7 @@ x-datahub-gms-service: &datahub-gms-service

x-datahub-gms-service-dev: &datahub-gms-service-dev
<<: *datahub-gms-service
image: ${DATAHUB_GMS_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-gms}:debug
image: ${DATAHUB_GMS_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-gms}:${DATAHUB_VERSION:-debug}
ports:
- ${DATAHUB_MAPPED_GMS_DEBUG_PORT:-5001}:5001
- ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
Expand Down Expand Up @@ -159,7 +159,7 @@ x-datahub-mae-consumer-service: &datahub-mae-consumer-service

x-datahub-mae-consumer-service-dev: &datahub-mae-consumer-service-dev
<<: *datahub-mae-consumer-service
image: ${DATAHUB_MAE_CONSUMER_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mae-consumer}:debug
image: ${DATAHUB_MAE_CONSUMER_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mae-consumer}:${DATAHUB_VERSION:-debug}
environment:
<<: [*datahub-dev-telemetry-env, *datahub-mae-consumer-env]
volumes:
Expand All @@ -185,7 +185,7 @@ x-datahub-mce-consumer-service: &datahub-mce-consumer-service

x-datahub-mce-consumer-service-dev: &datahub-mce-consumer-service-dev
<<: *datahub-mce-consumer-service
image: ${DATAHUB_MCE_CONSUMER_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mce-consumer}:debug
image: ${DATAHUB_MCE_CONSUMER_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mce-consumer}:${DATAHUB_VERSION:-debug}
environment:
<<: [*datahub-dev-telemetry-env, *datahub-mce-consumer-env]
volumes:
Expand Down
10 changes: 5 additions & 5 deletions docker/profiles/docker-compose.prerequisites.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ services:
mysql-setup-dev:
<<: *mysql-setup
profiles: *mysql-profiles-dev
image: ${DATAHUB_MYSQL_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mysql-setup}:debug
image: ${DATAHUB_MYSQL_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mysql-setup}:${DATAHUB_VERSION:-debug}
postgres:
profiles: *postgres-profiles
hostname: postgres
Expand Down Expand Up @@ -166,7 +166,7 @@ services:
postgres-setup-dev:
<<: *postgres-setup
profiles: *postgres-profiles-dev
image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-postgres-setup}:debug
image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-postgres-setup}:${DATAHUB_VERSION:-debug}
cassandra:
profiles: *cassandra-profiles
hostname: cassandra
Expand Down Expand Up @@ -272,7 +272,7 @@ services:
environment:
<<: *kafka-setup-env
DATAHUB_PRECREATE_TOPICS: ${DATAHUB_PRECREATE_TOPICS:-true}
image: ${DATAHUB_KAFKA_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-kafka-setup}:debug
image: ${DATAHUB_KAFKA_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-kafka-setup}:${DATAHUB_VERSION:-debug}
elasticsearch:
profiles: *elasticsearch-profiles
hostname: search
Expand All @@ -296,7 +296,7 @@ services:
volumes:
- esdata:/usr/share/elasticsearch/data
elasticsearch-setup-dev: &elasticsearch-setup-dev
image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-elasticsearch-setup}:debug
image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-debug}
profiles: *elasticsearch-profiles
hostname: elasticsearch-setup
env_file: elasticsearch-setup/env/docker.env
Expand Down Expand Up @@ -347,7 +347,7 @@ services:
<<: *opensearch-setup
profiles: *opensearch-profiles-dev
hostname: opensearch-setup-dev
image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-elasticsearch-setup}:debug
image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-debug}
environment:
<<: *search-datastore-environment
USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-true}
Expand Down
19 changes: 18 additions & 1 deletion docs/how/updating-datahub.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,31 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

## Next

- #11560 - The PowerBI ingestion source configuration option include_workspace_name_in_dataset_urn determines whether the workspace name is included in the PowerBI dataset's URN.<br/> PowerBI allows identical names for semantic models and their tables across workspaces, so a semantic model may be overwritten during multi-workspace ingestion.<br/>
Entity urn with `include_workspace_name_in_dataset_urn: false`
```
urn:li:dataset:(urn:li:dataPlatform:powerbi,[<PlatformInstance>.]<SemanticModelName>.<TableName>,<ENV>)
```

Entity urn with `include_workspace_name_in_dataset_urn: true`
```
urn:li:dataset:(urn:li:dataPlatform:powerbi,[<PlatformInstance>.]<WorkspaceName>.<SemanticModelName>.<TableName>,<ENV>)
```

The config `include_workspace_name_in_dataset_urn` defaults to `false` for backward compatibility. However, we recommend enabling this flag after performing the necessary cleanup.
If stateful ingestion is enabled, running ingestion with the latest CLI version will handle the cleanup automatically. Otherwise, we recommend soft deleting all PowerBI data via the DataHub CLI:
`datahub delete --platform powerbi --soft` and then re-ingesting with the latest CLI version, ensuring the `include_workspace_name_in_dataset_urn` configuration is set to `true`.

### Breaking Changes

- #11486 - Deprecated Criterion filters using `value`. Use `values` instead. This also deprecates the ability to use comma delimited string to represent multiple values using `value`.
- #11484 - Metadata service authentication enabled by default
- #11484 - Rest API authorization enabled by default
- #10472 - `SANDBOX` added as a FabricType. No rollbacks allowed once metadata with this fabric type is added without manual cleanups in databases.
- #11619 - schema field/column paths can no longer be empty strings
- #11619 - schema field/column paths can no longer be duplicated within the schema
- #11570 - The `DatahubClientConfig`'s server field no longer defaults to `http://localhost:8080`. Be sure to explicitly set this.
- #11570 - If a `datahub_api` is explicitly passed to a stateful ingestion config provider, it will be used. We previously ignored it if the pipeline context also had a graph object.

### Potential Downtime

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,6 @@ class Constant:
# Default config constants
DEFAULT_DATAHUB_REST_URL = "http://localhost:8080"

# Environment variable contants
DATAHUB_REST_URL = "DATAHUB_REST_URL"
DATAHUB_ENV = "DATAHUB_ENV"
DATAHUB_PLATFORM_INSTANCE = "DATAHUB_PLATFORM_INSTANCE"
DAGSTER_UI_URL = "DAGSTER_UI_URL"

# Datahub inputs/outputs constant
DATAHUB_INPUTS = "datahub.inputs"
DATAHUB_OUTPUTS = "datahub.outputs"
Expand Down Expand Up @@ -154,7 +148,6 @@ class DatasetLineage(NamedTuple):

class DatahubDagsterSourceConfig(DatasetSourceConfigMixin):
datahub_client_config: DatahubClientConfig = pydantic.Field(
default=DatahubClientConfig(),
description="Datahub client config",
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import traceback
import warnings
from collections import defaultdict
from types import ModuleType
from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Union
Expand Down Expand Up @@ -38,7 +39,7 @@
from dagster._core.events import DagsterEventType, HandledOutputData, LoadedInputData
from dagster._core.execution.stats import RunStepKeyStatsSnapshot
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import DataHubGraph
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import SubTypesClass
from datahub.sql_parsing.sqlglot_lineage import (
SqlParsingResult,
Expand All @@ -47,6 +48,7 @@
from datahub.utilities.urns.dataset_urn import DatasetUrn

from datahub_dagster_plugin.client.dagster_generator import (
Constant,
DagsterEnvironment,
DagsterGenerator,
DatahubDagsterSourceConfig,
Expand Down Expand Up @@ -182,7 +184,17 @@ def __init__(
if config:
self.config = config
else:
self.config = DatahubDagsterSourceConfig()
# This is a temporary warning for backwards compatibility. Eventually, we'll remove this
# branch and make the config required.
warnings.warn(
"Using the default DataHub client config is deprecated. Pass in a config object explicitly.",
stacklevel=2,
)
self.config = DatahubDagsterSourceConfig(
datahub_client_config=DatahubClientConfig(
server=Constant.DEFAULT_DATAHUB_REST_URL
)
)
self.graph = DataHubGraph(
self.config.datahub_client_config,
)
Expand Down
15 changes: 9 additions & 6 deletions metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,13 @@
"Authlib",
}

superset_common = {
"requests",
"sqlalchemy",
"great_expectations",
"greenlet",
}

# Note: for all of these, framework_common will be added.
plugins: Dict[str, Set[str]] = {
# Sink plugins.
Expand Down Expand Up @@ -462,12 +469,8 @@
"sqlalchemy": sql_common,
"sql-queries": usage_common | sqlglot_lib,
"slack": slack,
"superset": {
"requests",
"sqlalchemy",
"great_expectations",
"greenlet",
},
"superset": superset_common,
"preset": superset_common,
# FIXME: I don't think tableau uses sqllineage anymore so we should be able
# to remove that dependency.
"tableau": {"tableauserverclient>=0.24.0"} | sqllineage_lib | sqlglot_lib,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ class DataProduct(ConfigModel):
@pydantic.validator("assets", each_item=True)
def assets_must_be_urns(cls, v: str) -> str:
try:
Urn.create_from_string(v)
Urn.from_string(v)
except Exception as e:
raise ValueError(f"asset {v} is not an urn: {e}") from e

Expand Down
35 changes: 29 additions & 6 deletions metadata-ingestion/src/datahub/cli/check_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,13 @@ def sql_format(sql: str, platform: str) -> None:
@click.option(
"--sql",
type=str,
required=True,
help="The SQL query to parse",
)
@click.option(
"--sql-file",
type=click.Path(exists=True, dir_okay=False, readable=True),
help="The SQL file to parse",
)
@click.option(
"--platform",
type=str,
Expand Down Expand Up @@ -218,25 +222,44 @@ def sql_format(sql: str, platform: str) -> None:
type=str,
help="The default schema to use for unqualified table names",
)
@click.option(
"--online/--offline",
type=bool,
is_flag=True,
default=True,
help="Run in offline mode and disable schema-aware parsing.",
)
@telemetry.with_telemetry()
def sql_lineage(
sql: str,
sql: Optional[str],
sql_file: Optional[str],
platform: str,
default_db: Optional[str],
default_schema: Optional[str],
platform_instance: Optional[str],
env: str,
online: bool,
) -> None:
"""Parse the lineage of a SQL query.
This performs schema-aware parsing in order to generate column-level lineage.
If the relevant tables are not in DataHub, this will be less accurate.
In online mode (the default), we perform schema-aware parsing in order to generate column-level lineage.
If offline mode is enabled or if the relevant tables are not in DataHub, this will be less accurate.
"""

graph = get_default_graph()
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

if sql is None:
if sql_file is None:
raise click.UsageError("Either --sql or --sql-file must be provided")
sql = pathlib.Path(sql_file).read_text()

graph = None
if online:
graph = get_default_graph()

lineage = graph.parse_sql_lineage(
lineage = create_lineage_sql_parsed_result(
sql,
graph=graph,
platform=platform,
platform_instance=platform_instance,
env=env,
Expand Down
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/ingestion/graph/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class DatahubClientConfig(ConfigModel):

# TODO: Having a default for the server doesn't make a ton of sense. This should be handled
# by callers / the CLI, but the actual client should not have any magic.
server: str = "http://localhost:8080"
server: str
token: Optional[str] = None
timeout_sec: Optional[int] = None
retry_status_codes: Optional[List[int]] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,6 @@
# We can't use close as it is not called if the ingestion is not successful
def cleanup(config: BigQueryV2Config) -> None:
if config._credentials_path is not None:
logger.debug(
f"Deleting temporary credential file at {config._credentials_path}"
)
os.unlink(config._credentials_path)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def platform_resource_key(self) -> PlatformResourceKey:
return PlatformResourceKey(
platform="bigquery",
resource_type="BigQueryLabelInfo",
platform_instance=self.project,
platform_instance=None,
primary_key=self.label.primary_key(),
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,6 @@ class LookMLSourceConfig(
description="List of regex patterns for LookML views to include in the extraction.",
)
parse_table_names_from_sql: bool = Field(True, description="See note below.")
sql_parser: str = Field(
"datahub.utilities.sql_parser.DefaultSQLParser", description="See note below."
)
api: Optional[LookerAPIConfig]
project_name: Optional[str] = Field(
None,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -331,8 +331,8 @@ class PowerBiDashboardSourceConfig(
)
workspace_id_as_urn_part: bool = pydantic.Field(
default=False,
description="Highly recommend changing this to True, as you can have the same workspace name"
"To maintain backward compatability, this is set to False which uses workspace name",
description="It is recommended to set this to True only if you have legacy workspaces based on Office 365 groups, as those workspaces can have identical names."
"To maintain backward compatibility, this is set to False which uses workspace name",
)
# Enable/Disable extracting ownership information of Dashboard
extract_ownership: bool = pydantic.Field(
Expand Down Expand Up @@ -466,6 +466,13 @@ class PowerBiDashboardSourceConfig(
" Note: This field works in conjunction with 'workspace_id_pattern'. Both must be matched for a workspace to be processed.",
)

include_workspace_name_in_dataset_urn: bool = pydantic.Field(
default=False,
description="It is recommended to set this to true, as it helps prevent the overwriting of datasets."
"Read section #11560 at https://datahubproject.io/docs/how/updating-datahub/ before enabling this option."
"To maintain backward compatibility, this is set to False.",
)

@root_validator(skip_on_failure=True)
def validate_extract_column_level_lineage(cls, values: Dict) -> Dict:
flags = [
Expand Down
Loading

0 comments on commit 14dd3c0

Please sign in to comment.