Skip to content

Commit

Permalink
fix(ingest/tableau): project_path_pattern use in _is_denied_project (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
sid-acryl authored Dec 11, 2024
1 parent f1ef4f8 commit d062411
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 19 deletions.
6 changes: 3 additions & 3 deletions metadata-ingestion/src/datahub/configuration/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,15 @@ def allow_all(cls) -> "AllowDenyPattern":
return AllowDenyPattern()

def allowed(self, string: str) -> bool:
if self._denied(string):
if self.denied(string):
return False

return any(
re.match(allow_pattern, string, self.regex_flags)
for allow_pattern in self.allow
)

def _denied(self, string: str) -> bool:
def denied(self, string: str) -> bool:
for deny_pattern in self.deny:
if re.match(deny_pattern, string, self.regex_flags):
return True
Expand All @@ -290,7 +290,7 @@ def get_allowed_list(self) -> List[str]:
raise ValueError(
"allow list must be fully specified to get list of allowed strings"
)
return [a for a in self.allow if not self._denied(a)]
return [a for a in self.allow if not self.denied(a)]

def __eq__(self, other): # type: ignore
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
Expand Down
51 changes: 35 additions & 16 deletions metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ class TableauConfig(

project_path_separator: str = Field(
default="/",
description="The separator used for the project_pattern field between project names. By default, we use a slash. "
description="The separator used for the project_path_pattern field between project names. By default, we use a slash. "
"You can change this if your Tableau projects contain slashes in their names, and you'd like to filter by project.",
)

Expand Down Expand Up @@ -959,19 +959,36 @@ def _is_allowed_project(self, project: TableauProject) -> bool:
return is_allowed

def _is_denied_project(self, project: TableauProject) -> bool:
# Either project name or project path should exist in deny
for deny_pattern in self.config.project_pattern.deny:
# Either name or project path is denied
if re.match(
deny_pattern, project.name, self.config.project_pattern.regex_flags
) or re.match(
deny_pattern,
self._get_project_path(project),
self.config.project_pattern.regex_flags,
):
return True
logger.info(f"project({project.name}) is not denied as per project_pattern")
return False
"""
Why use an explicit denial check instead of the `AllowDenyPattern.allowed` method?

Consider a scenario where a Tableau site contains four projects: A, B, C, and D, with the following hierarchical relationship:

- **A**
- **B** (Child of A)
- **C** (Child of A)
- **D**

In this setup:

- `project_pattern` is configured with `allow: ["A"]` and `deny: ["B"]`.
- `extract_project_hierarchy` is set to `True`.

The goal is to extract assets from project A and its children while explicitly denying the child project B.

If we rely solely on the `project_pattern.allowed()` method, project C's assets will not be ingested.
This happens because project C is not explicitly included in the `allow` list, nor is it part of the `deny` list.
However, since `extract_project_hierarchy` is enabled, project C should ideally be included in the ingestion process unless explicitly denied.

To address this, the function explicitly checks the deny regex to ensure that project C’s assets are ingested if it is not specifically denied in the deny list. This approach ensures that the hierarchy is respected while adhering to the configured allow/deny rules.
"""

# Either project_pattern or project_path_pattern is set in a recipe
# TableauConfig.projects_backward_compatibility ensures that at least one of these properties is configured.

return self.config.project_pattern.denied(
project.name
) or self.config.project_path_pattern.denied(self._get_project_path(project))

def _init_tableau_project_registry(self, all_project_map: dict) -> None:
list_of_skip_projects: List[TableauProject] = []
Expand Down Expand Up @@ -999,9 +1016,11 @@ def _init_tableau_project_registry(self, all_project_map: dict) -> None:
for project in list_of_skip_projects:
if (
project.parent_id in projects_to_ingest
and self._is_denied_project(project) is False
and not self._is_denied_project(project)
):
logger.debug(f"Project {project.name} is added in project registry")
logger.debug(
f"Project {project.name} is added in project registry as it's a child project and not explicitly denied in `deny` list"
)
projects_to_ingest[project.id] = project

# We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.
Expand Down

0 comments on commit d062411

Please sign in to comment.