Merge branch 'datahub-project:master' into master
anshbansal authored Dec 18, 2024
2 parents 368f43c + 8264376 commit 0c17eec
Showing 5 changed files with 133 additions and 9 deletions.
@@ -0,0 +1,87 @@
import { identifyAndAddParentRows } from '../useStructuredProperties';

describe('identifyAndAddParentRows', () => {
    it('should not return parent rows when there are none', () => {
        const propertyRows = [
            { displayName: 'test1', qualifiedName: 'test1' },
            { displayName: 'test2', qualifiedName: 'test2' },
        ];
        expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]);
    });

    it('should not return parent rows when another row starts with the same letters but is a different token', () => {
        const propertyRows = [
            { displayName: 'test1', qualifiedName: 'testing.one' },
            { displayName: 'test2', qualifiedName: 'testingAgain.two' },
        ];
        expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]);
    });

    it('should return parent rows properly', () => {
        const propertyRows = [
            { displayName: 'test1', qualifiedName: 'testing.one' },
            { displayName: 'test2', qualifiedName: 'testing.two' },
            { displayName: 'test3', qualifiedName: 'testing.three' },
        ];
        expect(identifyAndAddParentRows(propertyRows)).toMatchObject([
            { displayName: 'testing', qualifiedName: 'testing', childrenCount: 3 },
        ]);
    });

    it('should return parent rows properly with multiple layers of nesting', () => {
        const propertyRows = [
            { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' },
            { displayName: 'test1', qualifiedName: 'testing.one.two.a.2' },
            { displayName: 'test1', qualifiedName: 'testing.one.two.b' },
            { displayName: 'test1', qualifiedName: 'testing.one.three' },
            { displayName: 'test2', qualifiedName: 'testing.two.c.d' },
            { displayName: 'test3', qualifiedName: 'testing.three' },
            { displayName: 'test3', qualifiedName: 'testParent' },
        ];
        expect(identifyAndAddParentRows(propertyRows)).toMatchObject([
            { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 },
            { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 },
            { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 },
            {
                displayName: 'testing.one.two.a',
                qualifiedName: 'testing.one.two.a',
                isParentRow: true,
                childrenCount: 2,
            },
        ]);
    });

    it('should return parent rows properly with multiple layers of nesting regardless of order', () => {
        const propertyRows = [
            { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' },
            { displayName: 'test3', qualifiedName: 'testParent' },
            { displayName: 'test1', qualifiedName: 'testing.one.three' },
            { displayName: 'test2', qualifiedName: 'testing.two.c.d' },
            { displayName: 'test1', qualifiedName: 'testing.one.two.b' },
            { displayName: 'test3', qualifiedName: 'testing.three' },
            { displayName: 'test1', qualifiedName: 'testing.one.two.a.2' },
        ];
        expect(identifyAndAddParentRows(propertyRows)).toMatchObject([
            { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 },
            { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 },
            { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 },
            {
                displayName: 'testing.one.two.a',
                qualifiedName: 'testing.one.two.a',
                isParentRow: true,
                childrenCount: 2,
            },
        ]);
    });

    it('should return parent rows properly with simpler layers of nesting', () => {
        const propertyRows = [
            { displayName: 'test2', qualifiedName: 'testing.two.c.d' },
            { displayName: 'test3', qualifiedName: 'testing.three' },
            { displayName: 'test3', qualifiedName: 'testParent' },
        ];
        expect(identifyAndAddParentRows(propertyRows)).toMatchObject([
            { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 2 },
        ]);
    });
});
@@ -122,10 +122,10 @@ export function identifyAndAddParentRows(rows?: Array<PropertyRow>): Array<Prope
    // that would tell us to nest. If the count is not equal, we should nest the child properties.
    for (let index = 0; index < substrings.length; index++) {
        const token = substrings[index];
-       const currentCount = qualifiedNames.filter((name) => name.startsWith(token)).length;
+       const currentCount = qualifiedNames.filter((name) => name.startsWith(`${token}.`)).length;

-       // If we're at the beginning of the path and there is no nesting, break
-       if (index === 0 && currentCount === 1) {
+       // If there's only one child, don't nest it
+       if (currentCount === 1) {
            break;
        }
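Note: the 'testingAgain' test added earlier in this commit exercises exactly the case this change guards against: a sibling namespace that shares a prefix with another token. A minimal Python sketch of the counting difference, illustrative only (the real logic is the TypeScript change above):

# Hypothetical illustration of the prefix-collision fix; names are taken from the test data above.
qualified_names = ["testing.one", "testingAgain.two"]
token = "testing"

# Old check: a plain prefix match also counts the unrelated "testingAgain.two" row.
old_count = len([name for name in qualified_names if name.startswith(token)])

# New check: only names under the "testing." namespace count as children.
new_count = len([name for name in qualified_names if name.startswith(f"{token}.")])

assert old_count == 2  # would wrongly suggest a "testing" parent row
assert new_count == 1  # a single real child, so no parent row is created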

8 changes: 7 additions & 1 deletion metadata-ingestion/src/datahub/configuration/git.py
@@ -24,7 +24,11 @@ class GitReference(ConfigModel):
"main",
description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
)

url_subdir: Optional[str] = Field(
default=None,
description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. "
"Only affects URL generation, not git operations.",
)
url_template: Optional[str] = Field(
None,
description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required."
@@ -68,6 +72,8 @@ def infer_url_template(cls, url_template: Optional[str], values: dict) -> str:

    def get_url_for_file_path(self, file_path: str) -> str:
        assert self.url_template
+       if self.url_subdir:
+           file_path = f"{self.url_subdir}/{file_path}"
        return self.url_template.format(
            repo_url=self.repo, branch=self.branch, file_path=file_path
        )
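As a usage note, here is a small self-contained sketch of how the new url_subdir feeds into URL generation. The '{repo_url}/blob/{branch}/{file_path}' template below is an assumption inferred from the URLs asserted in the new tests further down, not copied from the source:

from typing import Optional

# Assumed GitHub-style template, matching the expected URLs in test_url_subdir below.
GITHUB_URL_TEMPLATE = "{repo_url}/blob/{branch}/{file_path}"

def url_for(repo: str, branch: str, file_path: str, url_subdir: Optional[str] = None) -> str:
    # Mirrors get_url_for_file_path: the subdir is prepended to the file path only when set,
    # and it affects only the generated URL, not any git operation.
    if url_subdir:
        file_path = f"{url_subdir}/{file_path}"
    return GITHUB_URL_TEMPLATE.format(repo_url=repo, branch=branch, file_path=file_path)

print(url_for("https://github.com/org/repo", "main", "model.sql", url_subdir="dbt"))
# https://github.com/org/repo/blob/main/dbt/model.sql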
@@ -414,9 +414,13 @@ def _process_upstream_lineage_row(
        except Exception as e:
            self.report.num_upstream_lineage_edge_parsing_failed += 1
            upstream_tables = db_row.get("UPSTREAM_TABLES")
+           downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
            self.structured_reporter.warning(
                "Failed to parse lineage edge",
-               context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}",
+               # Tricky: sometimes the full row data is too large, and so the context
+               # message gets truncated. By pulling out the upstreams and downstream
+               # list, we can at least get the important fields if truncation does occur.
+               context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
                exc=e,
            )
            return None
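The new inline comment explains the motivation; here is a small, hypothetical illustration of why leading with the parsed-out fields helps when a reporting layer truncates long context strings (the 500-character cap is invented for the example and is not DataHub's actual limit):

MAX_CONTEXT_LEN = 500  # made-up cap for illustration

upstream_tables = '["db.schema.a", "db.schema.b"]'
downstream_table = "db.schema.target"
db_row = {
    "UPSTREAM_TABLES": upstream_tables,
    "DOWNSTREAM_TABLE_NAME": downstream_table,
    "QUERIES": "x" * 10_000,  # oversized field that would dominate the message
}

context = f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}"
truncated = context[:MAX_CONTEXT_LEN]

# The important fields appear before the full row dump, so they survive truncation.
assert "db.schema.target" in truncated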
35 changes: 31 additions & 4 deletions metadata-ingestion/tests/integration/git/test_git_clone.py
@@ -1,4 +1,5 @@
import os
+import pathlib

import pytest
from pydantic import SecretStr
@@ -12,7 +13,7 @@
LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")


-def test_base_url_guessing():
+def test_base_url_guessing() -> None:
    # Basic GitHub repo.
    config = GitInfo(repo="https://github.com/datahub-project/datahub", branch="master")
    assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"
@@ -70,7 +71,7 @@ def test_base_url_guessing():
    )


-def test_github_branch():
+def test_github_branch() -> None:
    config = GitInfo(
        repo="owner/repo",
    )
@@ -83,11 +84,37 @@ def test_github_branch():
    assert config.branch_for_clone == "main"


+def test_url_subdir() -> None:
+    git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt")
+    assert (
+        git_ref.get_url_for_file_path("model.sql")
+        == "https://github.com/org/repo/blob/main/dbt/model.sql"
+    )
+
+    git_ref = GitReference(repo="https://gitlab.com/org/repo", url_subdir="dbt")
+    assert (
+        git_ref.get_url_for_file_path("model.sql")
+        == "https://gitlab.com/org/repo/-/blob/main/dbt/model.sql"
+    )
+
+    git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="")
+    assert (
+        git_ref.get_url_for_file_path("model.sql")
+        == "https://github.com/org/repo/blob/main/model.sql"
+    )
+
+    git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt/models")
+    assert (
+        git_ref.get_url_for_file_path("model.sql")
+        == "https://github.com/org/repo/blob/main/dbt/models/model.sql"
+    )


def test_sanitize_repo_url() -> None:
    assert_doctest(datahub.ingestion.source.git.git_import)


-def test_git_clone_public(tmp_path):
+def test_git_clone_public(tmp_path: pathlib.Path) -> None:
    git_clone = GitClone(str(tmp_path))
    checkout_dir = git_clone.clone(
        ssh_key=None,
@@ -107,7 +134,7 @@ def test_git_clone_public(tmp_path):
    LOOKML_TEST_SSH_KEY is None,
    reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured",
)
-def test_git_clone_private(tmp_path):
+def test_git_clone_private(tmp_path: pathlib.Path) -> None:
    git_clone = GitClone(str(tmp_path))
    secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None
