From 23690320779e34fcb694df807f5dbce32395ee47 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 1 Aug 2024 10:05:49 -0700 Subject: [PATCH 1/4] docs(airflow): update min version for plugin v2 (#11065) --- docs/lineage/airflow.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 9d838ef8a4404..2d7707637e2d1 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -17,7 +17,7 @@ There's two actively supported implementations of the plugin, with different Air | Approach | Airflow Version | Notes | | --------- | --------------- | --------------------------------------------------------------------------- | -| Plugin v2 | 2.3+ | Recommended. Requires Python 3.8+ | +| Plugin v2 | 2.3.4+ | Recommended. Requires Python 3.8+ | | Plugin v1 | 2.1+ | No automatic lineage extraction; may not extract lineage if the task fails. | If you're using Airflow older than 2.1, it's possible to use the v1 plugin with older versions of `acryl-datahub-airflow-plugin`. See the [compatibility section](#compatibility) for more details. @@ -66,7 +66,7 @@ enabled = True # default ``` | Name | Default value | Description | -|----------------------------|----------------------|------------------------------------------------------------------------------------------| +| -------------------------- | -------------------- | ---------------------------------------------------------------------------------------- | | enabled | true | If the plugin should be enabled. | | conn_id | datahub_rest_default | The name of the datahub rest connection. 
| | cluster | prod | name of the airflow cluster, this is equivalent to the `env` of the instance | @@ -132,7 +132,7 @@ conn_id = datahub_rest_default # or datahub_kafka_default ``` | Name | Default value | Description | -|----------------------------|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| -------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | enabled | true | If the plugin should be enabled. | | conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. | | cluster | prod | name of the airflow cluster | @@ -240,6 +240,7 @@ See this [example PR](https://github.com/datahub-project/datahub/pull/10452) whi There might be a case where the DAGs are removed from the Airflow but the corresponding pipelines and tasks are still there in the Datahub, let's call such pipelines ans tasks, `obsolete pipelines and tasks` Following are the steps to cleanup them from the datahub: + - create a DAG named `Datahub_Cleanup`, i.e. 
```python @@ -263,8 +264,8 @@ with DAG( ) ``` -- ingest this DAG, and it will remove all the obsolete pipelines and tasks from the Datahub based on the `cluster` value set in the `airflow.cfg` +- ingest this DAG, and it will remove all the obsolete pipelines and tasks from the Datahub based on the `cluster` value set in the `airflow.cfg` ## Get all dataJobs associated with a dataFlow @@ -274,12 +275,7 @@ If you are looking to find all tasks (aka DataJobs) that belong to a specific pi query { dataFlow(urn: "urn:li:dataFlow:(airflow,db_etl,prod)") { childJobs: relationships( - input: { - types: ["IsPartOf"], - direction: INCOMING, - start: 0, - count: 100 - } + input: { types: ["IsPartOf"], direction: INCOMING, start: 0, count: 100 } ) { total relationships { From d5eda0de7e76df5c7503b6a3a57a578c271235ac Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Thu, 1 Aug 2024 22:56:47 +0530 Subject: [PATCH 2/4] doc(ingestion/tableau): doc update for derived permission (#11054) Co-authored-by: Pedro Silva Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Co-authored-by: Harshal Sheth --- docs/quick-ingestion-guides/tableau/setup.md | 8 ++++++++ metadata-ingestion/docs/sources/tableau/tableau_pre.md | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/docs/quick-ingestion-guides/tableau/setup.md b/docs/quick-ingestion-guides/tableau/setup.md index b6ccaf2a9cc9e..81767215d5bcd 100644 --- a/docs/quick-ingestion-guides/tableau/setup.md +++ b/docs/quick-ingestion-guides/tableau/setup.md @@ -51,6 +51,14 @@ In order to configure ingestion from Tableau, you'll first have to enable Tablea - Open a command prompt as an admin on the initial node (*where TSM is installed*) in the cluster - Run the command: `tsm maintenance metadata-services enable` +3. **Enable Derived Permissions:** This step is required only when the site is using external assets. 
For more detail, refer to the Tableau documentation [Manage Permissions for External Assets](https://help.tableau.com/current/online/en-us/dm_perms_assets.htm). + + Follow the steps below to enable the derived permissions: + + - Sign in to Tableau Cloud or Tableau Server as an admin. + - From the left navigation pane, click Settings. + - On the General tab, under Automatic Access to Metadata about Databases and Tables, select the `Automatically grant authorized users access to metadata about databases and tables` check box. + ## Next Steps diff --git a/metadata-ingestion/docs/sources/tableau/tableau_pre.md b/metadata-ingestion/docs/sources/tableau/tableau_pre.md index 5e323da6746d2..aeb67f85b241b 100644 --- a/metadata-ingestion/docs/sources/tableau/tableau_pre.md +++ b/metadata-ingestion/docs/sources/tableau/tableau_pre.md @@ -81,3 +81,12 @@ This may happen when the Tableau API returns NODE_LIMIT_EXCEEDED error in respon - reducing the page size using the `page_size` config param in datahub recipe (Defaults to 10). - increasing tableau configuration [metadata query node limit](https://help.tableau.com/current/server/en-us/cli_configuration-set_tsm.htm#metadata_nodelimit) to higher value. + +### `PERMISSIONS_MODE_SWITCHED` error in ingestion report +This error occurs if the Tableau site is using external assets. For more detail, refer to the Tableau documentation [Manage Permissions for External Assets](https://help.tableau.com/current/online/en-us/dm_perms_assets.htm). + +Follow the steps below to enable the derived permissions: + +1. Sign in to Tableau Cloud or Tableau Server as an admin. +2. From the left navigation pane, click Settings. +3. On the General tab, under Automatic Access to Metadata about Databases and Tables, select the `Automatically grant authorized users access to metadata about databases and tables` check box. 
From f78b6c08fbe606410271a26e359b165dced217cd Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 2 Aug 2024 04:57:54 -0700 Subject: [PATCH 3/4] fix(py): remove dep on types-pkg_resources (#11076) --- metadata-ingestion-modules/airflow-plugin/setup.py | 2 +- metadata-ingestion-modules/dagster-plugin/setup.py | 4 ++-- metadata-ingestion/setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 6d5aa74b1d96f..2401b169cd660 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -53,7 +53,7 @@ def get_long_description(): mypy_stubs = { "types-dataclasses", "sqlalchemy-stubs", - "types-pkg_resources", + "types-setuptools", "types-six", "types-python-dateutil", "types-requests", diff --git a/metadata-ingestion-modules/dagster-plugin/setup.py b/metadata-ingestion-modules/dagster-plugin/setup.py index 60b960e653eb2..8a2a1d76d345b 100644 --- a/metadata-ingestion-modules/dagster-plugin/setup.py +++ b/metadata-ingestion-modules/dagster-plugin/setup.py @@ -26,14 +26,14 @@ def get_long_description(): "dagit >= 1.3.3", *rest_common, # Ignoring the dependency below because it causes issues with the vercel built wheel install - #f"acryl-datahub[datahub-rest]{_self_pin}", + # f"acryl-datahub[datahub-rest]{_self_pin}", "acryl-datahub[datahub-rest]", } mypy_stubs = { "types-dataclasses", "sqlalchemy-stubs", - "types-pkg_resources", + "types-setuptools", "types-six", "types-python-dateutil", "types-requests", diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 20a43a94f6bda..445600b8abd48 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -491,7 +491,7 @@ mypy_stubs = { "types-dataclasses", - "types-pkg_resources", + "types-setuptools", "types-six", "types-python-dateutil", # We need to avoid 2.31.0.5 and 2.31.0.4 due to From 
f2e461eb633c0bdd6196f94320ca09cb06fd360b Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Fri, 2 Aug 2024 21:01:46 +0530 Subject: [PATCH 4/4] feat(ingest/mode): add option to exclude restricted (#11081) --- .../src/datahub/ingestion/source/mode.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index 4b4822bcb98ca..3da7f98e93008 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -135,9 +135,14 @@ class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase): connect_uri: str = Field( default="https://app.mode.com", description="Mode host URL." ) - token: str = Field(description="Mode user token.") + token: str = Field( + description="When creating workspace API key this is the 'Key ID'." + ) password: pydantic.SecretStr = Field( - description="Mode password for authentication." + description="When creating workspace API key this is the 'Secret'." + ) + exclude_restricted: bool = Field( + default=False, description="Exclude restricted collections" ) workspace: str = Field( @@ -522,6 +527,16 @@ def _get_space_name_and_tokens(self) -> dict: for s in spaces: logger.debug(f"Space: {s.get('name')}") space_name = s.get("name", "") + # Using both restricted and default_access_level because + # there is a current bug with restricted returning False every time + # which has been reported to the Mode team + if self.config.exclude_restricted and ( + s.get("restricted") or s.get("default_access_level") == "restricted" + ): + logging.debug( + f"Skipping space {space_name} due to exclude restricted" + ) + continue if not self.config.space_pattern.allowed(space_name): self.report.report_dropped_space(space_name) logging.debug(f"Skipping space {space_name} due to space pattern")