Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(tableau-exposer-crawler): using batch API to fetch data #24

Merged
merged 3 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 44 additions & 16 deletions src/exposurescrawler/crawlers/tableau.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _should_ignore_workbook(workbook, projects_to_ignore: Collection[str]) -> bo
# by workbooks under projects without a name.
if not workbook.project_name:
return True

return workbook.project_name in projects_to_ignore


Expand All @@ -41,7 +41,6 @@ def _parse_tables_from_sql(workbooks_sqls: WorkbookModelsMapping, models) -> Wor
logger().info('⚙️ Parsing SQL: looking for references to models')

output: WorkbookModelsMapping = {}

for workbook_reference, custom_sqls in workbooks_sqls.items():
# a list of dbt model represented as their original dicts from the manifest
all_found: List[dict] = []
Expand All @@ -66,11 +65,38 @@ def _parse_tables_from_sql(workbooks_sqls: WorkbookModelsMapping, models) -> Wor
return output


def retrieve_all_workbook_owner_map(tableau_client: TableauRestClient):
    """
    Fetch all workbooks via ``tableau_client.retrieve_all_workbooks()`` and
    index them by their ``id`` attribute.

    :param tableau_client: Tableau rest client
    :return: a dictionary mapping each workbook ``id`` to the workbook object
    """
    logger().info('⚙️ Retrieving all workbooks (batch)')

    workbooks = tableau_client.retrieve_all_workbooks()
    logger().info(f'✅ Fetched {len(workbooks)} workbooks')

    # Should two entries share an id, the one that appears later wins.
    return {item.id: item for item in workbooks}


def retrieve_all_user_id_map(tableau_client: TableauRestClient):
    """
    Fetch all users via ``tableau_client.retrieve_all_users()`` and index
    them by their ``id`` attribute.

    :param tableau_client: Tableau rest client
    :return: a dictionary mapping each user ``id`` to the user object
    """
    logger().info('⚙️ Retrieving all users (batch)')

    users = tableau_client.retrieve_all_users()
    logger().info(f'⚙️ Fetched {len(users)} users')

    # Should two entries share an id, the one that appears later wins.
    return {item.id: item for item in users}


def tableau_crawler(
manifest_path: str,
dbt_package_name: str,
tableau_projects_to_ignore: Collection[str],
verbose: bool,
manifest_path: str,
dbt_package_name: str,
tableau_projects_to_ignore: Collection[str],
verbose: bool,
) -> None:
# Enable verbose logging
if verbose:
Expand Down Expand Up @@ -107,7 +133,7 @@ def tableau_crawler(
workbooks_models: WorkbookModelsMapping = {}

for workbook_reference, found in itertools.chain(
workbooks_custom_sql_models.items(), workbooks_native_sql_models.items()
workbooks_custom_sql_models.items(), workbooks_native_sql_models.items()
):
workbooks_models.setdefault(workbook_reference, []).extend(found)

Expand All @@ -123,12 +149,15 @@ def tableau_crawler(
logger().info('')
logger().info('🌏 Retrieving workbooks and authors metadata from the Tableau REST API')

# Fetch all workbooks and users up front via the Tableau batch API and keep them in dictionaries keyed by id.
workbook_owner_map = retrieve_all_workbook_owner_map(tableau_client)
user_userid_map = retrieve_all_user_id_map(tableau_client)

# For every workbook and the models found, create exposures and add
# to the manifest (in-memory)
for workbook_reference, found in workbooks_models.items():
workbook = tableau_client.retrieve_workbook(workbook_reference.id)
owner = tableau_client.retrieve_user(workbook.owner_id)

workbook = workbook_owner_map[workbook_reference.id]
owner = user_userid_map[workbook.owner_id]
if _should_ignore_workbook(workbook, tableau_projects_to_ignore):
logger().debug(
f'⏩ Skipping workbook: {workbook.name} ({workbook.project_name} is ignored)'
Expand All @@ -137,7 +166,6 @@ def tableau_crawler(

exposure = DbtExposure.from_tableau_workbook(dbt_package_name, workbook, owner, found)
manifest.add_exposure(exposure, found)

# Persist the modified manifest
logger().info('')
logger().info(f'💾 Writing results to file: {manifest_path}')
Expand All @@ -156,7 +184,7 @@ def tableau_crawler(
required=True,
metavar='PROJECT_NAME',
help='The name of the dbt pacakge where the exposures should be added. If in doubt, check the '
'name of your dbt project on dbt_project.yml',
'name of your dbt project on dbt_project.yml',
)
@click.option(
'--tableau-ignore-projects',
Expand All @@ -166,10 +194,10 @@ def tableau_crawler(
)
@click.option('-v', '--verbose', is_flag=True, default=False, help='Enable verbose logging')
def tableau_crawler_command(
manifest_path: str,
dbt_package_name: str,
tableau_projects_to_ignore: Collection[str],
verbose: bool,
manifest_path: str,
dbt_package_name: str,
tableau_projects_to_ignore: Collection[str],
verbose: bool,
):
tableau_crawler(manifest_path, dbt_package_name, tableau_projects_to_ignore, verbose)

Expand Down
12 changes: 12 additions & 0 deletions src/exposurescrawler/tableau/rest_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,15 @@ def run_metadata_api(self, query: str):
response = self.server.metadata.query(query)

return response['data']

def retrieve_all_workbooks(self):
    """Return a list of every item yielded by paging over ``self.server.workbooks``.

    The pager is fully consumed inside the ``sign_in`` context, so the
    returned list can be used after the context has exited.
    """
    server = self.server
    with server.auth.sign_in(self.tableau_auth):
        # Materialise the pager while the signed-in context is still open.
        workbooks = [*TSC.Pager(server.workbooks)]

    return workbooks

def retrieve_all_users(self):
    """Return a list of every item yielded by paging over ``self.server.users``.

    The pager is fully consumed inside the ``sign_in`` context, so the
    returned list can be used after the context has exited.
    """
    server = self.server
    with server.auth.sign_in(self.tableau_auth):
        # Materialise the pager while the signed-in context is still open.
        users = [*TSC.Pager(server.users)]

    return users
27 changes: 18 additions & 9 deletions tests/_integration/test_tableau_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def mock_graphql_custom_sql_result():
},
{
'query': "select * from sample_db.public.customers left join "
"sample_db.public.orders on customers.id = orders.customer_id",
"sample_db.public.orders on customers.id = orders.customer_id",
'name': 'Custom SQL Query',
'isEmbedded': None,
'database': {'name': 'SAMPLE_DB', 'connectionType': 'snowflake'},
Expand Down Expand Up @@ -150,22 +150,31 @@ def mock_tableau_rest_api():
UserDetailsMock = namedtuple('UserDetailsMock', ['id', 'fullname', 'name'])

workbook_details = {
'customers-workbook-luid': WorkbookDetailsMock(id='aaa', name='Customers workbook'),
'company-kpis-workbook-luid': WorkbookDetailsMock(id='bbb', name='Company KPIs workbook'),
'customers-workbook-luid': WorkbookDetailsMock(id='customers-workbook-luid', name='Customers workbook',
owner_id='user-id'),
'company-kpis-workbook-luid': WorkbookDetailsMock(id='company-kpis-workbook-luid', name='Company KPIs workbook',
owner_id='user-id'),
'orders-workbook-luid': WorkbookDetailsMock(
id='ccc', name='Orders workbook', tags=['certified']
id='orders-workbook-luid', name='Orders workbook', tags=['certified'], owner_id='user-id'
),
}
user_details = {
'user-id': UserDetailsMock('user-id', 'John Doe', '[email protected]')
}

def _get_workbook_details(workbook_id):
return workbook_details[workbook_id]

with patch('exposurescrawler.crawlers.tableau.TableauRestClient', autospec=True) as mock:
instance = mock.return_value
instance.retrieve_workbook.side_effect = _get_workbook_details
instance.retrieve_user.return_value = UserDetailsMock(
'user-id', 'John Doe', '[email protected]'
)
instance.retrieve_user.return_value = user_details['user-id']
instance.retrieve_all_users.return_value = [user_details['user-id']]
instance.retrieve_all_workbooks.return_value = [
workbook_details['customers-workbook-luid'],
workbook_details['company-kpis-workbook-luid'],
workbook_details['orders-workbook-luid']
]
yield


Expand All @@ -183,10 +192,10 @@ def test_tableau_crawler(manifest_path):
tableau_crawler(manifest_path, 'jaffle_shop', [], True)

final_manifest = mock.call_args.args[0].data
exposure = final_manifest['exposures']['exposure.jaffle_shop.tableau_orders_workbook_ccc']
exposure = final_manifest['exposures']['exposure.jaffle_shop.tableau_orders_workbook_ord']

assert len(final_manifest['exposures']) == 3
assert exposure['name'] == 'tableau_orders_workbook_ccc'
assert exposure['name'] == 'tableau_orders_workbook_ord'
assert 'Workbook description' in exposure['description']
assert 'https://my-tableau-server.com/path/to/workbook' in exposure['description']
assert exposure['type'] == 'Dashboard'
Expand Down
Loading