diff --git a/src/exposurescrawler/crawlers/tableau.py b/src/exposurescrawler/crawlers/tableau.py
index 72a8c04..e95f316 100644
--- a/src/exposurescrawler/crawlers/tableau.py
+++ b/src/exposurescrawler/crawlers/tableau.py
@@ -66,6 +66,36 @@ def _parse_tables_from_sql(workbooks_sqls: WorkbookModelsMapping, models) -> Wor
     return output
 
 
+def retrieve_all_workbook_owner_map(tableau_client: TableauRestClient) -> dict:
+    """
+    Fetch every workbook through the Tableau REST client and index them by id.
+
+    :param tableau_client: Tableau REST client
+    :return: a dictionary mapping each workbook id to its workbook item
+    """
+    logger().info('⚙️ Retrieving all workbooks (batch)')
+
+    all_workbooks = tableau_client.retrieve_all_workbooks()
+    logger().info(f'✅ Fetched {len(all_workbooks)} workbooks')
+
+    return {workbook.id: workbook for workbook in all_workbooks}
+
+
+def retrieve_all_user_id_map(tableau_client: TableauRestClient) -> dict:
+    """
+    Fetch every user through the Tableau REST client and index them by id.
+
+    :param tableau_client: Tableau REST client
+    :return: a dictionary mapping each user id to its user item
+    """
+    logger().info('⚙️ Retrieving all users (batch)')
+
+    all_users = tableau_client.retrieve_all_users()
+    logger().info(f'✅ Fetched {len(all_users)} users')
+
+    return {user.id: user for user in all_users}
+
+
 def tableau_crawler(
     manifest_path: str,
     dbt_package_name: str,
@@ -123,11 +153,16 @@ def tableau_crawler(
     logger().info('')
     logger().info('🌏 Retrieving workbooks and authors metadata from the Tableau REST API')
 
+    # Fetch all workbooks and users up front with the batch endpoints and index
+    # them by id, instead of issuing one REST call per workbook and per owner.
+    workbook_owner_map = retrieve_all_workbook_owner_map(tableau_client)
+    user_userid_map = retrieve_all_user_id_map(tableau_client)
+
     # For every workbook and the models found, create exposures and add
     # to the manifest (in-memory)
     for workbook_reference, found in workbooks_models.items():
-        workbook = tableau_client.retrieve_workbook(workbook_reference.id)
-        owner = tableau_client.retrieve_user(workbook.owner_id)
+        workbook = workbook_owner_map[workbook_reference.id]
+        owner = user_userid_map[workbook.owner_id]
 
         if _should_ignore_workbook(workbook, tableau_projects_to_ignore):
             logger().debug(
diff --git a/src/exposurescrawler/tableau/rest_client.py b/src/exposurescrawler/tableau/rest_client.py
index 53b624c..74328e8 100644
--- a/src/exposurescrawler/tableau/rest_client.py
+++ b/src/exposurescrawler/tableau/rest_client.py
@@ -30,3 +30,17 @@ def run_metadata_api(self, query: str):
             response = self.server.metadata.query(query)
 
         return response['data']
+
+    def retrieve_all_workbooks(self):
+        """Return every workbook as a list, following pagination via ``TSC.Pager``."""
+        with self.server.auth.sign_in(self.tableau_auth):
+            all_workbooks = list(TSC.Pager(self.server.workbooks))
+
+        return all_workbooks
+
+    def retrieve_all_users(self):
+        """Return every user as a list, following pagination via ``TSC.Pager``."""
+        with self.server.auth.sign_in(self.tableau_auth):
+            all_users = list(TSC.Pager(self.server.users))
+
+        return all_users
diff --git a/tests/_integration/test_tableau_crawler.py b/tests/_integration/test_tableau_crawler.py
index 40dc596..b346d3a 100644
--- a/tests/_integration/test_tableau_crawler.py
+++ b/tests/_integration/test_tableau_crawler.py
@@ -150,12 +150,22 @@ def mock_tableau_rest_api():
     UserDetailsMock = namedtuple('UserDetailsMock', ['id', 'fullname', 'name'])
 
     workbook_details = {
-        'customers-workbook-luid': WorkbookDetailsMock(id='aaa', name='Customers workbook'),
-        'company-kpis-workbook-luid': WorkbookDetailsMock(id='bbb', name='Company KPIs workbook'),
+        'customers-workbook-luid': WorkbookDetailsMock(
+            id='customers-workbook-luid', name='Customers workbook', owner_id='user-id'
+        ),
+        'company-kpis-workbook-luid': WorkbookDetailsMock(
+            id='company-kpis-workbook-luid', name='Company KPIs workbook', owner_id='user-id'
+        ),
         'orders-workbook-luid': WorkbookDetailsMock(
-            id='ccc', name='Orders workbook', tags=['certified']
+            id='orders-workbook-luid',
+            name='Orders workbook',
+            tags=['certified'],
+            owner_id='user-id',
         ),
     }
+    user_details = {
+        'user-id': UserDetailsMock('user-id', 'John Doe', 'john.doe@example.com'),
+    }
 
     def _get_workbook_details(workbook_id):
         return workbook_details[workbook_id]
@@ -163,9 +173,9 @@ def _get_workbook_details(workbook_id):
     with patch('exposurescrawler.crawlers.tableau.TableauRestClient', autospec=True) as mock:
         instance = mock.return_value
         instance.retrieve_workbook.side_effect = _get_workbook_details
-        instance.retrieve_user.return_value = UserDetailsMock(
-            'user-id', 'John Doe', 'john.doe@example.com'
-        )
+        instance.retrieve_user.return_value = user_details['user-id']
+        instance.retrieve_all_users.return_value = list(user_details.values())
+        instance.retrieve_all_workbooks.return_value = list(workbook_details.values())
         yield
 
 
@@ -183,10 +193,10 @@ def test_tableau_crawler(manifest_path):
         tableau_crawler(manifest_path, 'jaffle_shop', [], True)
 
     final_manifest = mock.call_args.args[0].data
-    exposure = final_manifest['exposures']['exposure.jaffle_shop.tableau_orders_workbook_ccc']
+    exposure = final_manifest['exposures']['exposure.jaffle_shop.tableau_orders_workbook_ord']
 
     assert len(final_manifest['exposures']) == 3
-    assert exposure['name'] == 'tableau_orders_workbook_ccc'
+    assert exposure['name'] == 'tableau_orders_workbook_ord'
     assert 'Workbook description' in exposure['description']
     assert 'https://my-tableau-server.com/path/to/workbook' in exposure['description']
     assert exposure['type'] == 'Dashboard'