diff --git a/datahub-web-react/src/app/entity/shared/tabs/Properties/__tests__/useStructuredProperties.test.ts b/datahub-web-react/src/app/entity/shared/tabs/Properties/__tests__/useStructuredProperties.test.ts new file mode 100644 index 0000000000000..ff7c6e51a04a0 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/tabs/Properties/__tests__/useStructuredProperties.test.ts @@ -0,0 +1,87 @@ +import { identifyAndAddParentRows } from '../useStructuredProperties'; + +describe('identifyAndAddParentRows', () => { + it('should not return parent rows when there are none', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'test1' }, + { displayName: 'test2', qualifiedName: 'test2' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]); + }); + + it('should not return parent rows when another row starts with the same letters but is a different token', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one' }, + { displayName: 'test2', qualifiedName: 'testingAgain.two' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]); + }); + + it('should return parent rows properly', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one' }, + { displayName: 'test2', qualifiedName: 'testing.two' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', childrenCount: 3 }, + ]); + }); + + it('should return parent rows properly with multiple layers of nesting', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.a.2' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.b' }, + { displayName: 'test1', qualifiedName: 'testing.one.three' }, + { displayName: 'test2', qualifiedName: 'testing.two.c.d' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + { displayName: 'test3', qualifiedName: 'testParent' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 }, + { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 }, + { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 }, + { + displayName: 'testing.one.two.a', + qualifiedName: 'testing.one.two.a', + isParentRow: true, + childrenCount: 2, + }, + ]); + }); + + it('should return parent rows properly with multiple layers of nesting regardless of order', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' }, + { displayName: 'test3', qualifiedName: 'testParent' }, + { displayName: 'test1', qualifiedName: 'testing.one.three' }, + { displayName: 'test2', qualifiedName: 'testing.two.c.d' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.b' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.a.2' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 }, + { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 }, + { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 }, + { + displayName: 'testing.one.two.a', + qualifiedName: 'testing.one.two.a', + isParentRow: true, + childrenCount: 2, + }, + ]); + }); + + it('should return parent rows properly with simpler layers of nesting', () => { + const propertyRows = [ + { displayName: 'test2', qualifiedName: 'testing.two.c.d' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + { displayName: 'test3', qualifiedName: 'testParent' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 2 }, + ]); + }); +}); diff --git a/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx b/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx index 18ee6bb18da3d..60d0aac30eb4c 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx @@ -122,10 +122,10 @@ export function identifyAndAddParentRows(rows?: Array): Array name.startsWith(token)).length; + const currentCount = qualifiedNames.filter((name) => name.startsWith(`${token}.`)).length; - // If we're at the beginning of the path and there is no nesting, break - if (index === 0 && currentCount === 1) { + // If there's only one child, don't nest it + if (currentCount === 1) { break; } diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py index d237cd9ddd306..e7e9bfd43adca 100644 --- a/metadata-ingestion/src/datahub/configuration/git.py +++ b/metadata-ingestion/src/datahub/configuration/git.py @@ -24,7 +24,11 @@ class GitReference(ConfigModel): "main", description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.", ) - + url_subdir: Optional[str] = Field( + default=None, + description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. " + "Only affects URL generation, not git operations.", + ) url_template: Optional[str] = Field( None, description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required." @@ -68,6 +72,8 @@ def infer_url_template(cls, url_template: Optional[str], values: dict) -> str: def get_url_for_file_path(self, file_path: str) -> str: assert self.url_template + if self.url_subdir: + file_path = f"{self.url_subdir}/{file_path}" return self.url_template.format( repo_url=self.repo, branch=self.branch, file_path=file_path ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 93d84d8b246e5..c769c6705ac3f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -414,9 +414,13 @@ def _process_upstream_lineage_row( except Exception as e: self.report.num_upstream_lineage_edge_parsing_failed += 1 upstream_tables = db_row.get("UPSTREAM_TABLES") + downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME") self.structured_reporter.warning( "Failed to parse lineage edge", - context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}", + # Tricky: sometimes the full row data is too large, and so the context + # message gets truncated. By pulling out the upstreams and downstream + # list, we can at least get the important fields if truncation does occur. + context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}", exc=e, ) return None diff --git a/metadata-ingestion/tests/integration/git/test_git_clone.py b/metadata-ingestion/tests/integration/git/test_git_clone.py index 60cf20fefcbdd..01e075930998a 100644 --- a/metadata-ingestion/tests/integration/git/test_git_clone.py +++ b/metadata-ingestion/tests/integration/git/test_git_clone.py @@ -1,4 +1,5 @@ import os +import pathlib import pytest from pydantic import SecretStr @@ -12,7 +13,7 @@ LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY") -def test_base_url_guessing(): +def test_base_url_guessing() -> None: # Basic GitHub repo. config = GitInfo(repo="https://github.com/datahub-project/datahub", branch="master") assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git" @@ -70,7 +71,7 @@ def test_base_url_guessing(): ) -def test_github_branch(): +def test_github_branch() -> None: config = GitInfo( repo="owner/repo", ) @@ -83,11 +84,37 @@ def test_github_branch(): assert config.branch_for_clone == "main" +def test_url_subdir() -> None: + git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://github.com/org/repo/blob/main/dbt/model.sql" + ) + + git_ref = GitReference(repo="https://gitlab.com/org/repo", url_subdir="dbt") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://gitlab.com/org/repo/-/blob/main/dbt/model.sql" + ) + + git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://github.com/org/repo/blob/main/model.sql" + ) + + git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt/models") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://github.com/org/repo/blob/main/dbt/models/model.sql" + ) + + def test_sanitize_repo_url() -> None: assert_doctest(datahub.ingestion.source.git.git_import) -def test_git_clone_public(tmp_path): +def test_git_clone_public(tmp_path: pathlib.Path) -> None: git_clone = GitClone(str(tmp_path)) checkout_dir = git_clone.clone( ssh_key=None, @@ -107,7 +134,7 @@ def test_git_clone_public(tmp_path): LOOKML_TEST_SSH_KEY is None, reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured", ) -def test_git_clone_private(tmp_path): +def test_git_clone_private(tmp_path: pathlib.Path) -> None: git_clone = GitClone(str(tmp_path)) secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None