Skip to content

Commit

Permalink
feat(ingest): add urn validation test files (datahub-project#12036)
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored and sleeperdeep committed Dec 17, 2024
1 parent 26e0596 commit da85dc0
Show file tree
Hide file tree
Showing 12 changed files with 115 additions and 136 deletions.
40 changes: 40 additions & 0 deletions metadata-ingestion/tests/unit/urns/invalid_urns.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Basic URN format tests
urn:li:abc
urn:li:abc:
urn:li:abc:()
urn:li:abc:(abc,)
urn:li:corpuser:abc)

# Reserved characters
urn:li:corpuser:foo␟bar
urn:li:tag:a,b,c

# CorpUser URN tests
urn:li:corpuser:(part1,part2)

# Dataset URN tests
urn:li:dataset:(urn:li:user:abc,dataset,prod)
urn:li:dataset:(urn:li:user:abc,dataset)
urn:li:dataset:(urn:li:user:abc,dataset,invalidEnv)

# DataFlow URN tests
urn:li:dataFlow:(airflow,flow_id)

# DataJob URN tests
urn:li:dataJob:(urn:li:user:abc,job_id)
urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod))

# Domain URN tests
urn:li:domain:(part1,part2)

# Tag URN tests
urn:li:tag:(part1,part2)

# Notebook URN tests
urn:li:notebook:(part1,part2,part3)

# CorpGroup URN tests
urn:li:corpGroup:(part1,part2)

# DataProcessInstance URN tests
urn:li:dataProcessInstance:(part1,part2)
10 changes: 0 additions & 10 deletions metadata-ingestion/tests/unit/urns/test_corp_group_urn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pytest

from datahub.utilities.urns.corp_group_urn import CorpGroupUrn
from datahub.utilities.urns.error import InvalidUrnError


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
Expand All @@ -17,12 +16,3 @@ def test_parse_urn(self) -> None:
assert str(corp_group_urn) == corp_group_urn_str
assert corp_group_urn == CorpGroupUrn(name="abc")
assert corp_group_urn == CorpGroupUrn.create_from_id("abc")

def test_invalid_urn(self) -> None:
    """CorpGroupUrn parsing must raise InvalidUrnError for each malformed input."""
    rejected_inputs = (
        # Entity type is not corpGroup.
        "urn:li:abc:(urn:li:dataPlatform:abc,def,prod)",
        # Tuple-style id where a single name is expected.
        "urn:li:corpGroup:(part1,part2)",
    )
    for urn_str in rejected_inputs:
        with self.assertRaises(InvalidUrnError):
            CorpGroupUrn.create_from_string(urn_str)
10 changes: 0 additions & 10 deletions metadata-ingestion/tests/unit/urns/test_corpuser_urn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pytest

from datahub.utilities.urns.corpuser_urn import CorpuserUrn
from datahub.utilities.urns.error import InvalidUrnError


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
Expand All @@ -17,12 +16,3 @@ def test_parse_urn(self) -> None:
assert str(corpuser_urn) == corpuser_urn_str
assert corpuser_urn == CorpuserUrn("abc")
assert corpuser_urn == CorpuserUrn.create_from_id("abc")

def test_invalid_urn(self) -> None:
    """CorpuserUrn parsing must raise InvalidUrnError for each malformed input."""
    rejected_inputs = (
        # Entity type is not corpuser.
        "urn:li:abc:(urn:li:dataPlatform:abc,def,prod)",
        # Tuple-style id where a single username is expected.
        "urn:li:corpuser:(part1,part2)",
    )
    for urn_str in rejected_inputs:
        with self.assertRaises(InvalidUrnError):
            CorpuserUrn.create_from_string(urn_str)
8 changes: 0 additions & 8 deletions metadata-ingestion/tests/unit/urns/test_data_flow_urn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pytest

from datahub.utilities.urns.data_flow_urn import DataFlowUrn
from datahub.utilities.urns.error import InvalidUrnError


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
Expand All @@ -16,10 +15,3 @@ def test_parse_urn(self) -> None:
assert data_flow_urn.get_env() == "prod"
assert data_flow_urn.__str__() == "urn:li:dataFlow:(airflow,def,prod)"
assert data_flow_urn == DataFlowUrn("airflow", "def", "prod")

def test_invalid_urn(self) -> None:
    """DataFlowUrn parsing must raise InvalidUrnError for each malformed input."""
    rejected_inputs = (
        # Entity type is not dataFlow.
        "urn:li:abc:(airflow,def,prod)",
        # Only two id parts are supplied.
        "urn:li:dataFlow:(airflow,flow_id)",
    )
    for urn_str in rejected_inputs:
        with self.assertRaises(InvalidUrnError):
            DataFlowUrn.create_from_string(urn_str)
15 changes: 0 additions & 15 deletions metadata-ingestion/tests/unit/urns/test_data_job_urn.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from datahub.utilities.urns.data_flow_urn import DataFlowUrn
from datahub.utilities.urns.data_job_urn import DataJobUrn
from datahub.utilities.urns.error import InvalidUrnError


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
Expand All @@ -22,17 +21,3 @@ def test_parse_urn(self) -> None:
assert data_job_urn == DataJobUrn(
"urn:li:dataFlow:(airflow,flow_id,prod)", "job_id"
)

def test_invalid_urn(self) -> None:
    """DataJobUrn parsing must raise InvalidUrnError for each malformed input."""
    rejected_inputs = (
        # Entity type is not dataJob.
        "urn:li:abc:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)",
        # First id part is not a dataFlow urn.
        "urn:li:dataJob:(urn:li:user:abc,job_id)",
        # Job id part is missing.
        "urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod))",
    )
    for urn_str in rejected_inputs:
        with self.assertRaises(InvalidUrnError):
            DataJobUrn.create_from_string(urn_str)
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pytest

from datahub.utilities.urns.data_process_instance_urn import DataProcessInstanceUrn
from datahub.utilities.urns.error import InvalidUrnError


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
Expand All @@ -20,12 +19,3 @@ def test_parse_urn(self) -> None:
assert dataprocessinstance_urn == DataProcessInstanceUrn("abc")
assert dataprocessinstance_urn == DataProcessInstanceUrn.create_from_id("abc")
assert "abc" == dataprocessinstance_urn.get_dataprocessinstance_id()

def test_invalid_urn(self) -> None:
    """DataProcessInstanceUrn parsing must raise InvalidUrnError for each malformed input."""
    rejected_inputs = (
        # Entity type is not dataProcessInstance.
        "urn:li:abc:dataProcessInstance",
        # Tuple-style id where a single id is expected.
        "urn:li:dataProcessInstance:(part1,part2)",
    )
    for urn_str in rejected_inputs:
        with self.assertRaises(InvalidUrnError):
            DataProcessInstanceUrn.create_from_string(urn_str)
20 changes: 0 additions & 20 deletions metadata-ingestion/tests/unit/urns/test_dataset_urn.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from datahub.utilities.urns.data_platform_urn import DataPlatformUrn
from datahub.utilities.urns.dataset_urn import DatasetUrn
from datahub.utilities.urns.error import InvalidUrnError


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
Expand All @@ -20,22 +19,3 @@ def test_parse_urn(self) -> None:
assert dataset_urn.get_env() == "PROD"
assert dataset_urn.__str__() == dataset_urn_str
assert dataset_urn == DatasetUrn("urn:li:dataPlatform:abc", "def", "prod")

def test_invalid_urn(self) -> None:
    """DatasetUrn parsing must raise InvalidUrnError for each malformed input."""
    rejected_inputs = (
        # Entity type is not dataset.
        "urn:li:abc:(urn:li:dataPlatform:abc,def,prod)",
        # First id part is not a dataPlatform urn.
        "urn:li:dataset:(urn:li:user:abc,dataset,prod)",
        # Only two id parts are supplied.
        "urn:li:dataset:(urn:li:user:abc,dataset)",
        # Third id part is "invalidEnv".
        "urn:li:dataset:(urn:li:user:abc,dataset,invalidEnv)",
    )
    for urn_str in rejected_inputs:
        with self.assertRaises(InvalidUrnError):
            DatasetUrn.create_from_string(urn_str)
8 changes: 0 additions & 8 deletions metadata-ingestion/tests/unit/urns/test_domain_urn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pytest

from datahub.utilities.urns.domain_urn import DomainUrn
from datahub.utilities.urns.error import InvalidUrnError


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
Expand All @@ -17,10 +16,3 @@ def test_parse_urn(self) -> None:
assert str(domain_urn) == domain_urn_str
assert domain_urn == DomainUrn("abc")
assert domain_urn == DomainUrn.create_from_id("abc")

def test_invalid_urn(self) -> None:
    """DomainUrn parsing must raise InvalidUrnError for each malformed input."""
    rejected_inputs = (
        # Entity type is not domain.
        "urn:li:abc:domain",
        # Tuple-style id where a single id is expected.
        "urn:li:domain:(part1,part2)",
    )
    for urn_str in rejected_inputs:
        with self.assertRaises(InvalidUrnError):
            DomainUrn.create_from_string(urn_str)
10 changes: 0 additions & 10 deletions metadata-ingestion/tests/unit/urns/test_notebook_urn.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import pytest

from datahub.utilities.urns.error import InvalidUrnError
from datahub.utilities.urns.notebook_urn import NotebookUrn


Expand All @@ -16,12 +15,3 @@ def test_parse_urn(self) -> None:
assert str(notebook_urn) == notebook_urn_str

assert notebook_urn == NotebookUrn("querybook", "123")

def test_invalid_urn(self) -> None:
    """NotebookUrn parsing must raise InvalidUrnError for each malformed input."""
    rejected_inputs = (
        # Entity type is not notebook.
        "urn:li:abc:(urn:li:dataPlatform:abc,def,prod)",
        # Three id parts are supplied.
        "urn:li:notebook:(part1,part2,part3)",
    )
    for urn_str in rejected_inputs:
        with self.assertRaises(InvalidUrnError):
            NotebookUrn.create_from_string(urn_str)
8 changes: 0 additions & 8 deletions metadata-ingestion/tests/unit/urns/test_tag_urn.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import pytest

from datahub.utilities.urns.error import InvalidUrnError
from datahub.utilities.urns.tag_urn import TagUrn


Expand All @@ -17,10 +16,3 @@ def test_parse_urn(self) -> None:
assert str(tag_urn) == tag_urn_str
assert tag_urn == TagUrn("abc")
assert tag_urn == TagUrn.create_from_id("abc")

def test_invalid_urn(self) -> None:
    """TagUrn parsing must raise InvalidUrnError for each malformed input."""
    rejected_inputs = (
        # Entity type is not tag.
        "urn:li:abc:tag_id",
        # Tuple-style id where a single name is expected.
        "urn:li:tag:(part1,part2)",
    )
    for urn_str in rejected_inputs:
        with self.assertRaises(InvalidUrnError):
            TagUrn.create_from_string(urn_str)
88 changes: 51 additions & 37 deletions metadata-ingestion/tests/unit/urns/test_urn.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import logging
import pathlib
from typing import List

import pytest

from datahub.metadata.urns import (
CorpUserUrn,
DashboardUrn,
DataPlatformUrn,
DatasetUrn,
Urn,
)
from datahub.metadata.urns import CorpUserUrn, DatasetUrn, Urn
from datahub.utilities.urns.error import InvalidUrnError

pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")

_CURRENT_DIR = pathlib.Path(__file__).parent
logger = logging.getLogger(__name__)


def test_parse_urn() -> None:
simple_urn_str = "urn:li:dataPlatform:abc"
Expand Down Expand Up @@ -40,38 +41,12 @@ def test_url_encode_urn() -> None:
)


def test_invalid_urn() -> None:
    """Generic Urn parsing must raise InvalidUrnError for each structurally broken string."""
    malformed = (
        "urn:li:abc",
        "urn:li:abc:",
        "urn:li:abc:()",
        "urn:li:abc:(abc,)",
        "urn:li:corpuser:abc)",
    )
    for urn_str in malformed:
        with pytest.raises(InvalidUrnError):
            Urn.from_string(urn_str)


def test_urn_colon() -> None:
# Colon characters are valid in urns, and should not mess up parsing.

urn = Urn.from_string(
"urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)"
)
assert isinstance(urn, DashboardUrn)

assert DataPlatformUrn.from_string("urn:li:dataPlatform:abc:def")
assert DatasetUrn.from_string(
"urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,PROD)"
)
assert Urn.from_string("urn:li:corpuser:foo:[email protected]")
# There's a bunch of other, simpler tests for special characters in the valid_urns test.

# This test ensures that the type dispatch and fields work fine here.
# I'm not sure why you'd ever want this, but technically it's a valid urn.

urn = Urn.from_string("urn:li:corpuser::")
assert isinstance(urn, CorpUserUrn)
assert urn.username == ":"
Expand All @@ -85,9 +60,48 @@ def test_urn_coercion() -> None:
assert urn == Urn.from_string(urn.urn())


def test_urn_type_dispatch() -> None:
def test_urn_type_dispatch_1() -> None:
    """Check that Urn.from_string returns a DatasetUrn and DatasetUrn rejects a corpuser urn."""
    parsed = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)")
    assert isinstance(parsed, DatasetUrn)

    # A typed parser must refuse an urn whose entity type differs from its own.
    expect_type_mismatch = pytest.raises(
        InvalidUrnError, match="Passed an urn of type corpuser"
    )
    with expect_type_mismatch:
        DatasetUrn.from_string("urn:li:corpuser:foo")


def test_urn_type_dispatch_2() -> None:
    """Check that a nested dataJob urn round-trips and CorpUserUrn rejects it."""
    data_job_urn_str = "urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)"

    # Parsing then re-serializing must give back the exact input string.
    round_tripped = Urn.from_string(data_job_urn_str).urn()
    assert round_tripped == data_job_urn_str

    expect_type_mismatch = pytest.raises(
        InvalidUrnError, match="Passed an urn of type dataJob"
    )
    with expect_type_mismatch:
        CorpUserUrn.from_string(data_job_urn_str)


def _load_urns(file_name: pathlib.Path) -> List[str]:
urns = [
line.strip()
for line in file_name.read_text().splitlines()
if line.strip() and not line.startswith("#")
]
assert len(urns) > 0, f"No urns found in {file_name}"
return urns


def test_valid_urns() -> None:
    """Every urn listed in valid_urns.txt must parse and serialize back unchanged."""
    for valid_urn in _load_urns(_CURRENT_DIR / "valid_urns.txt"):
        logger.info(f"Testing valid URN: {valid_urn}")
        # Round-trip: the string form of the parsed urn must equal the input.
        round_tripped = Urn.from_string(valid_urn).urn()
        assert round_tripped == valid_urn


def test_invalid_urns() -> None:
    """Every urn listed in invalid_urns.txt must be rejected with InvalidUrnError."""
    fixture_path = _CURRENT_DIR / "invalid_urns.txt"

    for invalid_urn in _load_urns(fixture_path):
        with pytest.raises(InvalidUrnError):
            # Logged so a failure can be traced to the offending fixture line.
            logger.info(f"Testing invalid URN: {invalid_urn}")
            Urn.from_string(invalid_urn)
24 changes: 24 additions & 0 deletions metadata-ingestion/tests/unit/urns/valid_urns.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Unknown entity types become generic urns
urn:li:abc:foo
urn:li:abc:(foo,bar)
urn:li:abc:(urn:li:dataPlatform:abc,def,prod)

# A bunch of pretty normal urns
urn:li:corpuser:foo
urn:li:corpGroup:bar
urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,prod)
urn:li:dataFlow:(airflow,def,prod)
urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)
urn:li:tag:abc
urn:li:chart:(looker,chart_name)
urn:li:dashboard:(looker,dashboard_name)
urn:li:dataProcessInstance:abc
urn:li:domain:abc
urn:li:notebook:(querybook,123)

# Urns with colons and other special characters
urn:li:tag:dbt:bar
urn:li:tag::
urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)
urn:li:dataPlatform:abc:def
urn:li:corpuser:foo:bar@example.com

0 comments on commit da85dc0

Please sign in to comment.