diff --git a/.github/workflows/pr-labeler.yml b/.github/workflows/pr-labeler.yml index 1ae3edae7aa90b..7da20ece44f6d6 100644 --- a/.github/workflows/pr-labeler.yml +++ b/.github/workflows/pr-labeler.yml @@ -45,7 +45,8 @@ jobs: "Salman-Apptware", "mayurinehate", "noggi", - "skrydal" + "skrydal", + "kevinkarchacryl" ]'), github.actor ) diff --git a/build.gradle b/build.gradle index 6e6dadb7ebfa34..9ee756d41e11ef 100644 --- a/build.gradle +++ b/build.gradle @@ -56,7 +56,7 @@ buildscript { ext.hazelcastVersion = '5.3.6' ext.ebeanVersion = '15.5.2' ext.googleJavaFormatVersion = '1.18.1' - ext.openLineageVersion = '1.19.0' + ext.openLineageVersion = '1.24.2' ext.logbackClassicJava8 = '1.2.12' ext.docker_registry = 'acryldata' diff --git a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java index ef5833f607efdb..113aeeb36551f0 100644 --- a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java +++ b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java @@ -130,8 +130,6 @@ public Object perform( CallContext ctx = ctxResult.getFirst(); Result result = (Result) ctxResult.getSecond(); - setContextRedirectUrl(ctx); - // Handle OIDC authentication errors. if (OidcResponseErrorHandler.isError(ctx)) { return OidcResponseErrorHandler.handleError(ctx); @@ -192,6 +190,9 @@ private Pair superPerform( } } + // Set the redirect url from cookie before creating action + setContextRedirectUrl(ctx); + action = this.redirectToOriginallyRequestedUrl(ctx, defaultUrl); } } catch (RuntimeException var20) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/service/AnalyticsUtil.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/service/AnalyticsUtil.java index a17745948eb823..88ac29b72dee83 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/service/AnalyticsUtil.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/service/AnalyticsUtil.java @@ -1,19 +1,14 @@ package com.linkedin.datahub.graphql.analytics.service; +import static com.linkedin.metadata.Constants.CORP_USER_EDITABLE_INFO_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_ENTITY_NAME; import static com.linkedin.metadata.Constants.CORP_USER_INFO_ASPECT_NAME; import com.google.common.collect.ImmutableSet; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.dashboard.DashboardInfo; -import com.linkedin.datahub.graphql.generated.BarSegment; -import com.linkedin.datahub.graphql.generated.Cell; -import com.linkedin.datahub.graphql.generated.Entity; -import com.linkedin.datahub.graphql.generated.EntityProfileParams; -import com.linkedin.datahub.graphql.generated.LinkParams; -import com.linkedin.datahub.graphql.generated.NamedBar; -import com.linkedin.datahub.graphql.generated.Row; -import com.linkedin.datahub.graphql.generated.SearchParams; +import com.linkedin.datahub.graphql.generated.*; import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; import com.linkedin.dataplatform.DataPlatformInfo; import com.linkedin.dataset.DatasetProperties; @@ -22,6 +17,7 @@ import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.client.EntityClient; import com.linkedin.glossary.GlossaryTermInfo; +import com.linkedin.identity.CorpUserEditableInfo; import com.linkedin.identity.CorpUserInfo; import com.linkedin.metadata.Constants; import com.linkedin.metadata.key.GlossaryTermKey; @@ -35,6 +31,7 @@ import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; @@ -169,36 +166,79 @@ public static void convertToUserInfoRows( final Map gmsResponseByUser = entityClient.batchGetV2( opContext, - CORP_USER_INFO_ASPECT_NAME, + CORP_USER_ENTITY_NAME, userUrns, - ImmutableSet.of(CORP_USER_INFO_ASPECT_NAME)); - final Map urnToCorpUserInfo = + ImmutableSet.of(CORP_USER_INFO_ASPECT_NAME, CORP_USER_EDITABLE_INFO_ASPECT_NAME)); + final Stream> entityStream = gmsResponseByUser.entrySet().stream() .filter( entry -> entry.getValue() != null - && entry.getValue().getAspects().containsKey(CORP_USER_INFO_ASPECT_NAME)) - .collect( - Collectors.toMap( - Map.Entry::getKey, - entry -> + && (entry.getValue().getAspects().containsKey(CORP_USER_INFO_ASPECT_NAME) + || entry + .getValue() + .getAspects() + .containsKey(CORP_USER_EDITABLE_INFO_ASPECT_NAME))); + final Map> urnToCorpUserInfo = + entityStream.collect( + Collectors.toMap( + Map.Entry::getKey, + entry -> { + CorpUserInfo userInfo = null; + CorpUserEditableInfo editableInfo = null; + try { + userInfo = new CorpUserInfo( entry .getValue() .getAspects() .get(CORP_USER_INFO_ASPECT_NAME) .getValue() - .data()))); + .data()); + } catch (Exception e) { + // nothing to do + } + try { + + editableInfo = + new CorpUserEditableInfo( + entry + .getValue() + .getAspects() + .get(CORP_USER_EDITABLE_INFO_ASPECT_NAME) + .getValue() + .data()); + } catch (Exception e) { + // nothing to do + } + + return Pair.of(userInfo, editableInfo); + })); // Populate a row with the user link, title, and email. rows.forEach( row -> { Urn urn = UrnUtils.getUrn(row.getCells().get(0).getValue()); EntityResponse response = gmsResponseByUser.get(urn); String maybeDisplayName = response != null ? getUserName(response).orElse(null) : null; - String maybeEmail = - urnToCorpUserInfo.containsKey(urn) ? urnToCorpUserInfo.get(urn).getEmail() : null; - String maybeTitle = - urnToCorpUserInfo.containsKey(urn) ? urnToCorpUserInfo.get(urn).getTitle() : null; + String maybeEmail = null; + String maybeTitle = null; + if (urnToCorpUserInfo.containsKey(urn)) { + Pair pair = urnToCorpUserInfo.get(urn); + if (pair.getLeft() != null) { + CorpUserInfo userInfo = pair.getLeft(); + maybeEmail = userInfo.getEmail(); + maybeTitle = userInfo.getTitle(); + } + if (pair.getRight() != null) { + CorpUserEditableInfo userInfo = pair.getRight(); + if (maybeEmail == null) { + maybeEmail = userInfo.getEmail(); + } + if (maybeTitle == null) { + maybeTitle = userInfo.getTitle(); + } + } + } if (maybeDisplayName != null) { row.getCells().get(0).setValue(maybeDisplayName); } diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/AnalyticsUtilTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/AnalyticsUtilTest.java new file mode 100644 index 00000000000000..ab1140d2380315 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/AnalyticsUtilTest.java @@ -0,0 +1,108 @@ +package com.linkedin.datahub.graphql.utils; + +import static org.mockito.ArgumentMatchers.anySet; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; + +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.DataMap; +import com.linkedin.datahub.graphql.analytics.service.AnalyticsUtil; +import com.linkedin.datahub.graphql.generated.Cell; +import com.linkedin.datahub.graphql.generated.Row; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.identity.CorpUserEditableInfo; +import com.linkedin.identity.CorpUserInfo; +import com.linkedin.metadata.Constants; +import io.datahubproject.metadata.context.OperationContext; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class AnalyticsUtilTest { + + @Mock private OperationContext mockOpContext; + + @Mock private EntityClient mockEntityClient; + + final String TEST_CORP_USER_INFO_TEST_USER = "Corp User"; + final String TEST_CORP_USER_EDITABLE_INFO_TEST_TITLE = "Editable Info Title"; + final String TEST_CORP_USER_EDITABLE_INFO_TEST_EMAIL = "Editable Info Email"; + + @BeforeMethod + public void setUp() { + MockitoAnnotations.openMocks(this); + } + + @Test + public void testConvertToUserInfoRows() throws Exception { + List rows = new ArrayList<>(); + rows.add(new Row(null, Arrays.asList(new Cell("urn:li:corpuser:testuser", null, null)))); + + // create a CorpUserInfo with only display name set + CorpUserInfo corpUserInfo = new CorpUserInfo(); + corpUserInfo.setActive(true); + corpUserInfo.setDisplayName(TEST_CORP_USER_INFO_TEST_USER); + + // create an editableInfo with the email and title set + CorpUserEditableInfo corpUserEditableInfo = new CorpUserEditableInfo(); + corpUserEditableInfo.setEmail(TEST_CORP_USER_EDITABLE_INFO_TEST_EMAIL); // Overriding email + corpUserEditableInfo.setTitle(TEST_CORP_USER_EDITABLE_INFO_TEST_TITLE); // Overriding title + + DataMap corpUserInfoDataMap = new DataMap(); + corpUserInfoDataMap.put("name", Constants.CORP_USER_INFO_ASPECT_NAME); + corpUserInfoDataMap.put("type", "VERSIONED"); + corpUserInfoDataMap.put("value", corpUserInfo.data()); + + DataMap corpUserEditableInfoDataMap = new DataMap(); + corpUserEditableInfoDataMap.put("name", Constants.CORP_USER_EDITABLE_INFO_ASPECT_NAME); + corpUserEditableInfoDataMap.put("type", "VERSIONED"); + corpUserEditableInfoDataMap.put("value", corpUserEditableInfo.data()); + + EnvelopedAspect corpUserInfoEnvelopedAspect = new EnvelopedAspect(corpUserInfoDataMap); + EnvelopedAspect corpUserEditableInfoEnvelopedAspect = + new EnvelopedAspect(corpUserEditableInfoDataMap); + + EnvelopedAspectMap aspectMap = new EnvelopedAspectMap(); + aspectMap.put(Constants.CORP_USER_INFO_ASPECT_NAME, corpUserInfoEnvelopedAspect); + aspectMap.put( + Constants.CORP_USER_EDITABLE_INFO_ASPECT_NAME, corpUserEditableInfoEnvelopedAspect); + + EntityResponse entityResponse = new EntityResponse(); + entityResponse.setAspects(aspectMap); + + Map entityResponseMap = new HashMap<>(); + Urn userUrn = UrnUtils.getUrn("urn:li:corpuser:testuser"); + entityResponseMap.put(userUrn, entityResponse); + + // method of the entity client we need to mock to retrieve the response map + when(mockEntityClient.batchGetV2( + eq(mockOpContext), eq(Constants.CORP_USER_ENTITY_NAME), anySet(), anySet())) + .thenReturn(entityResponseMap); + + // function we are testing + AnalyticsUtil.convertToUserInfoRows(mockOpContext, mockEntityClient, rows); + + Row updatedRow = rows.get(0); + List updatedCells = updatedRow.getCells(); + + // asserting that the display user is from CorpUserInfo and email, title are from EditableInfo + assertEquals(updatedCells.get(0).getValue(), TEST_CORP_USER_INFO_TEST_USER); + assertEquals( + updatedCells.get(1).getValue(), + TEST_CORP_USER_EDITABLE_INFO_TEST_TITLE); // Overriding title + assertEquals( + updatedCells.get(2).getValue(), + TEST_CORP_USER_EDITABLE_INFO_TEST_EMAIL); // Overriding email + } +} diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java index 6c70aee88675c5..cd7947ce3c11aa 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java @@ -1,13 +1,12 @@ package com.linkedin.datahub.upgrade.system; -import static com.linkedin.metadata.Constants.DATA_HUB_UPGRADE_RESULT_ASPECT_NAME; - import com.linkedin.common.urn.Urn; import com.linkedin.datahub.upgrade.UpgradeContext; import com.linkedin.datahub.upgrade.UpgradeStep; import com.linkedin.datahub.upgrade.UpgradeStepResult; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; import com.linkedin.events.metadata.ChangeType; +import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.boot.BootstrapStep; import com.linkedin.metadata.entity.AspectDao; import com.linkedin.metadata.entity.EntityService; @@ -16,10 +15,13 @@ import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.metadata.utils.AuditStampUtils; +import com.linkedin.upgrade.DataHubUpgradeResult; import com.linkedin.upgrade.DataHubUpgradeState; import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; import java.util.List; +import java.util.Map; +import java.util.Optional; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.function.Function; @@ -33,6 +35,8 @@ */ @Slf4j public abstract class AbstractMCLStep implements UpgradeStep { + public static final String LAST_URN_KEY = "lastUrn"; + private final OperationContext opContext; private final EntityService entityService; private final AspectDao aspectDao; @@ -70,10 +74,30 @@ protected Urn getUpgradeIdUrn() { @Override public Function executable() { return (context) -> { + // Resume state + Optional prevResult = + context.upgrade().getUpgradeResult(opContext, getUpgradeIdUrn(), entityService); + String resumeUrn = + prevResult + .filter( + result -> + DataHubUpgradeState.IN_PROGRESS.equals(result.getState()) + && result.getResult() != null + && result.getResult().containsKey(LAST_URN_KEY)) + .map(result -> result.getResult().get(LAST_URN_KEY)) + .orElse(null); + if (resumeUrn != null) { + log.info("{}: Resuming from URN: {}", getUpgradeIdUrn(), resumeUrn); + } // re-using for configuring the sql scan RestoreIndicesArgs args = - new RestoreIndicesArgs().aspectName(getAspectName()).batchSize(batchSize).limit(limit); + new RestoreIndicesArgs() + .aspectName(getAspectName()) + .batchSize(batchSize) + .lastUrn(resumeUrn) + .urnBasedPagination(resumeUrn != null) + .limit(limit); if (getUrnLike() != null) { args = args.urnLike(getUrnLike()); @@ -86,40 +110,62 @@ public Function executable() { batch -> { log.info("Processing batch({}) of size {}.", getAspectName(), batchSize); - List, Boolean>> futures; - + List, SystemAspect>> futures; futures = EntityUtils.toSystemAspectFromEbeanAspects( opContext.getRetrieverContext().get(), batch.collect(Collectors.toList())) .stream() .map( - systemAspect -> - entityService.alwaysProduceMCLAsync( - opContext, - systemAspect.getUrn(), - systemAspect.getUrn().getEntityType(), - getAspectName(), - systemAspect.getAspectSpec(), - null, - systemAspect.getRecordTemplate(), - null, - systemAspect - .getSystemMetadata() - .setRunId(id()) - .setLastObserved(System.currentTimeMillis()), - AuditStampUtils.createDefaultAuditStamp(), - ChangeType.UPSERT)) - .collect(Collectors.toList()); - - futures.forEach( - f -> { - try { - f.getFirst().get(); - } catch (InterruptedException | ExecutionException e) { - throw new RuntimeException(e); - } - }); + systemAspect -> { + Pair, Boolean> future = + entityService.alwaysProduceMCLAsync( + opContext, + systemAspect.getUrn(), + systemAspect.getUrn().getEntityType(), + getAspectName(), + systemAspect.getAspectSpec(), + null, + systemAspect.getRecordTemplate(), + null, + systemAspect + .getSystemMetadata() + .setRunId(id()) + .setLastObserved(System.currentTimeMillis()), + AuditStampUtils.createDefaultAuditStamp(), + ChangeType.UPSERT); + return Pair., SystemAspect>of( + future.getFirst(), systemAspect); + }) + .toList(); + + SystemAspect lastAspect = + futures.stream() + .map( + f -> { + try { + f.getFirst().get(); + return f.getSecond(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }) + .reduce((a, b) -> b) + .orElse(null); + + // record progress + if (lastAspect != null) { + log.info( + "{}: Saving state. Last urn:{}", getUpgradeIdUrn(), lastAspect.getUrn()); + context + .upgrade() + .setUpgradeResult( + opContext, + getUpgradeIdUrn(), + entityService, + DataHubUpgradeState.IN_PROGRESS, + Map.of(LAST_URN_KEY, lastAspect.getUrn().toString())); + } if (batchDelayMs > 0) { log.info("Sleeping for {} ms", batchDelayMs); @@ -142,12 +188,23 @@ public Function executable() { @Override /** Returns whether the upgrade should be skipped. */ public boolean skip(UpgradeContext context) { - boolean previouslyRun = - entityService.exists( - opContext, getUpgradeIdUrn(), DATA_HUB_UPGRADE_RESULT_ASPECT_NAME, true); - if (previouslyRun) { - log.info("{} was already run. Skipping.", id()); + Optional prevResult = + context.upgrade().getUpgradeResult(opContext, getUpgradeIdUrn(), entityService); + + boolean previousRunFinal = + prevResult + .filter( + result -> + DataHubUpgradeState.SUCCEEDED.equals(result.getState()) + || DataHubUpgradeState.ABORTED.equals(result.getState())) + .isPresent(); + + if (previousRunFinal) { + log.info( + "{} was already run. State: {} Skipping.", + id(), + prevResult.map(DataHubUpgradeResult::getState)); } - return previouslyRun; + return previousRunFinal; } } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java index eece83f4ab713e..55bc8edbf6a768 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java @@ -1,5 +1,6 @@ package com.linkedin.datahub.upgrade.system.schemafield; +import static com.linkedin.datahub.upgrade.system.AbstractMCLStep.LAST_URN_KEY; import static com.linkedin.metadata.Constants.APP_SOURCE; import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; import static com.linkedin.metadata.Constants.SCHEMA_METADATA_ASPECT_NAME; @@ -61,7 +62,6 @@ */ @Slf4j public class GenerateSchemaFieldsFromSchemaMetadataStep implements UpgradeStep { - private static final String LAST_URN_KEY = "lastUrn"; private static final List REQUIRED_ASPECTS = List.of(SCHEMA_METADATA_ASPECT_NAME, STATUS_ASPECT_NAME); diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java index f340e688ad7f77..21bc6b725cba2b 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java @@ -1,14 +1,18 @@ package com.linkedin.datahub.upgrade; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import static org.testng.AssertJUnit.assertNotNull; +import com.linkedin.data.template.StringMap; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeManager; import com.linkedin.datahub.upgrade.system.SystemUpdateNonBlocking; import com.linkedin.datahub.upgrade.system.bootstrapmcps.BootstrapMCPStep; @@ -20,17 +24,30 @@ import com.linkedin.metadata.entity.AspectDao; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.EntityServiceImpl; +import com.linkedin.metadata.entity.ebean.EbeanAspectV2; +import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.mxe.Topics; +import com.linkedin.upgrade.DataHubUpgradeResult; +import com.linkedin.upgrade.DataHubUpgradeState; +import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; +import java.sql.Timestamp; +import java.util.Arrays; import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.inject.Named; +import org.mockito.ArgumentCaptor; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @ActiveProfiles("test") @@ -63,7 +80,12 @@ public class DatahubUpgradeNonBlockingTest extends AbstractTestNGSpringContextTe @Autowired private EntityServiceImpl entityService; - @Autowired private OperationContext opContext; + private OperationContext opContext; + + @BeforeClass + public void init() { + opContext = TestOperationContexts.systemContextNoValidate(); + } @Test public void testSystemUpdateNonBlockingInit() { @@ -81,10 +103,13 @@ public void testSystemUpdateNonBlockingInit() { } @Test - public void testReindexDataJobViaNodesCLLPaging() { + public void testReindexDataJobViaNodesCLLPagingArgs() { EntityService mockService = mock(EntityService.class); AspectDao mockAspectDao = mock(AspectDao.class); + PartitionedStream mockStream = mock(PartitionedStream.class); + when(mockStream.partition(anyInt())).thenReturn(Stream.empty()); + when(mockAspectDao.streamAspectBatches(any(RestoreIndicesArgs.class))).thenReturn(mockStream); ReindexDataJobViaNodesCLL cllUpgrade = new ReindexDataJobViaNodesCLL(opContext, mockService, mockAspectDao, true, 10, 0, 0); @@ -102,9 +127,79 @@ public void testReindexDataJobViaNodesCLLPaging() { .batchSize(10) .limit(0) .aspectName("dataJobInputOutput") + .urnBasedPagination(false) + .lastUrn(null) .urnLike("urn:li:dataJob:%"))); } + @Test + public void testReindexDataJobViaNodesCLLResumePaging() throws Exception { + // Mock services + EntityService mockService = mock(EntityService.class); + AspectDao mockAspectDao = mock(AspectDao.class); + + // Create test data + EbeanAspectV2 aspect1 = createMockEbeanAspect("urn:li:dataJob:job1", "dataJobInputOutput"); + EbeanAspectV2 aspect2 = createMockEbeanAspect("urn:li:dataJob:job2", "dataJobInputOutput"); + EbeanAspectV2 aspect3 = createMockEbeanAspect("urn:li:dataJob:job3", "dataJobInputOutput"); + List initialBatch = Arrays.asList(aspect1, aspect2); + List resumeBatch = Arrays.asList(aspect3); + + // Mock the stream for first batch + PartitionedStream initialStream = mock(PartitionedStream.class); + when(initialStream.partition(anyInt())).thenReturn(Stream.of(initialBatch.stream())); + + // Mock the stream for second batch + PartitionedStream resumeStream = mock(PartitionedStream.class); + when(resumeStream.partition(anyInt())).thenReturn(Stream.of(resumeBatch.stream())); + + // Setup the AspectDao using Answer to handle null safely + when(mockAspectDao.streamAspectBatches(any(RestoreIndicesArgs.class))) + .thenAnswer( + invocation -> { + RestoreIndicesArgs args = invocation.getArgument(0); + if (args.lastUrn() == null) { + return initialStream; + } else if ("urn:li:dataJob:job2".equals(args.lastUrn())) { + return resumeStream; + } + return mock(PartitionedStream.class); + }); + + // Mock successful MCL production + when(mockService.alwaysProduceMCLAsync( + any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) + .thenReturn(Pair.of(CompletableFuture.completedFuture(null), true)); + + // Create the upgrade + ReindexDataJobViaNodesCLL cllUpgrade = + new ReindexDataJobViaNodesCLL(opContext, mockService, mockAspectDao, true, 2, 0, 0); + + // Initial Run + cllUpgrade.steps().get(0).executable().apply(createMockInitialUpgrade()); + + // Resumed + cllUpgrade.steps().get(0).executable().apply(createMockResumeUpgrade()); + + // Use ArgumentCaptor to verify the calls + ArgumentCaptor argsCaptor = + ArgumentCaptor.forClass(RestoreIndicesArgs.class); + verify(mockAspectDao, times(2)).streamAspectBatches(argsCaptor.capture()); + + List capturedArgs = argsCaptor.getAllValues(); + + // Verify both the initial and resume calls were made with correct arguments + assertEquals(capturedArgs.get(0).lastUrn(), null); + assertEquals(capturedArgs.get(0).urnBasedPagination(), false); + assertEquals(capturedArgs.get(1).lastUrn(), "urn:li:dataJob:job2"); + assertEquals(capturedArgs.get(1).urnBasedPagination(), true); + + // Verify MCL production was called for each aspect + verify(mockService, times(3)) + .alwaysProduceMCLAsync( + any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any()); + } + @Test public void testNonBlockingBootstrapMCP() { List mcpTemplate = @@ -123,4 +218,54 @@ public void testNonBlockingBootstrapMCP() { .map(update -> update.getMcpTemplate().getName()) .collect(Collectors.toSet()))); } + + private UpgradeContext createMockInitialUpgrade() { + // Mock the Upgrade instance + Upgrade mockUpgrade = mock(Upgrade.class); + + // Configure the mock upgrade to return no previous result + when(mockUpgrade.getUpgradeResult(any(), any(), any())).thenReturn(Optional.empty()); + + UpgradeContext mockInitialContext = mock(UpgradeContext.class); + when(mockInitialContext.opContext()).thenReturn(opContext); + when(mockInitialContext.upgrade()).thenReturn(mockUpgrade); + when(mockInitialContext.report()).thenReturn(mock(UpgradeReport.class)); + + return mockInitialContext; + } + + private UpgradeContext createMockResumeUpgrade() { + // Mock the Upgrade instance + Upgrade mockUpgrade = mock(Upgrade.class); + DataHubUpgradeResult mockPrevResult = mock(DataHubUpgradeResult.class); + + // Configure the mock previous result + when(mockPrevResult.getState()).thenReturn(DataHubUpgradeState.IN_PROGRESS); + when(mockPrevResult.getResult()) + .thenReturn(new StringMap(Map.of("lastUrn", "urn:li:dataJob:job2"))); + + // Configure the mock upgrade to return our previous result + when(mockUpgrade.getUpgradeResult(any(), any(), any())).thenReturn(Optional.of(mockPrevResult)); + + UpgradeContext mockResumeContext = mock(UpgradeContext.class); + when(mockResumeContext.opContext()).thenReturn(opContext); + when(mockResumeContext.upgrade()).thenReturn(mockUpgrade); + when(mockResumeContext.report()).thenReturn(mock(UpgradeReport.class)); + + return mockResumeContext; + } + + private static EbeanAspectV2 createMockEbeanAspect(String urn, String aspectName) { + Timestamp now = new Timestamp(System.currentTimeMillis()); + return new EbeanAspectV2( + urn, + aspectName, + 0L, + "{}", // metadata + now, // createdOn + "urn:li:corpuser:testUser", // createdBy + null, // createdFor + null // systemMetadata + ); + } } diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java index 81d883d8ce36b7..5b7b8756f11fb1 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java @@ -19,17 +19,17 @@ @Import(value = {SystemAuthenticationFactory.class}) public class UpgradeCliApplicationTestConfiguration { - @MockBean private UpgradeCli upgradeCli; + @MockBean public UpgradeCli upgradeCli; - @MockBean private Database ebeanServer; + @MockBean public Database ebeanServer; - @MockBean private SearchService searchService; + @MockBean public SearchService searchService; - @MockBean private GraphService graphService; + @MockBean public GraphService graphService; - @MockBean private EntityRegistry entityRegistry; + @MockBean public EntityRegistry entityRegistry; - @MockBean ConfigEntityRegistry configEntityRegistry; + @MockBean public ConfigEntityRegistry configEntityRegistry; @MockBean public EntityIndexBuilders entityIndexBuilders; diff --git a/datahub-web-react/src/app/entity/group/GroupMembers.tsx b/datahub-web-react/src/app/entity/group/GroupMembers.tsx index 147c3f8030d0e0..28e81b438d4cb4 100644 --- a/datahub-web-react/src/app/entity/group/GroupMembers.tsx +++ b/datahub-web-react/src/app/entity/group/GroupMembers.tsx @@ -137,12 +137,13 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM }, 3000); }; - const onRemoveMember = (memberUrn: string) => { + const onRemoveMember = (memberEntity: CorpUser) => { + const memberName = entityRegistry.getDisplayName(EntityType.CorpUser, memberEntity); Modal.confirm({ title: `Confirm Group Member Removal`, - content: `Are you sure you want to remove this user from the group?`, + content: `Are you sure you want to remove ${memberName} user from the group?`, onOk() { - removeGroupMember(memberUrn); + removeGroupMember(memberEntity?.urn); }, onCancel() {}, okText: 'Yes', @@ -155,7 +156,7 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM const total = relationships?.total || 0; const groupMembers = relationships?.relationships?.map((rel) => rel.entity as CorpUser) || []; - const getItems = (urnID: string): MenuProps['items'] => { + const getItems = (userEntity: CorpUser): MenuProps['items'] => { return [ { key: 'make', @@ -169,7 +170,7 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM { key: 'remove', disabled: isExternalGroup, - onClick: () => onRemoveMember(urnID), + onClick: () => onRemoveMember(userEntity), label: ( Remove from Group @@ -210,7 +211,7 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM - + diff --git a/datahub-web-react/src/app/entity/shared/siblingUtils.ts b/datahub-web-react/src/app/entity/shared/siblingUtils.ts index 2f50dc99df191b..aa9e4bcb5e46e1 100644 --- a/datahub-web-react/src/app/entity/shared/siblingUtils.ts +++ b/datahub-web-react/src/app/entity/shared/siblingUtils.ts @@ -5,6 +5,7 @@ import * as QueryString from 'query-string'; import { Dataset, Entity, Maybe, SiblingProperties } from '../../../types.generated'; import { GenericEntityProperties } from './types'; import { useIsShowSeparateSiblingsEnabled } from '../../useAppConfig'; +import { downgradeV2FieldPath } from '../dataset/profile/schema/utils/utils'; export function stripSiblingsFromEntity(entity: any) { return { @@ -55,16 +56,30 @@ const combineMerge = (target, source, options) => { return destination; }; -function convertObjectKeysToLowercase(object: Record) { - return Object.fromEntries(Object.entries(object).map(([key, value]) => [key.toLowerCase(), value])); +// this function is responsible for normalizing object keys to make sure merging on key matches keys appropriately +function normalizeObjectKeys(object: Record, isSchemaField = false) { + return Object.fromEntries( + Object.entries(object).map(([key, value]) => { + let normalizedKey = key.toLowerCase(); + if (isSchemaField) { + normalizedKey = downgradeV2FieldPath(normalizedKey) || normalizedKey; + } + return [normalizedKey, value]; + }), + ); } // use when you want to merge an array of objects by key in the object as opposed to by index of array -const mergeArrayOfObjectsByKey = (destinationArray: any[], sourceArray: any[], key: string) => { - const destination = convertObjectKeysToLowercase(keyBy(destinationArray, key)); - const source = convertObjectKeysToLowercase(keyBy(sourceArray, key)); +const mergeArrayOfObjectsByKey = (destinationArray: any[], sourceArray: any[], key: string, isSchemaField = false) => { + const destination = normalizeObjectKeys(keyBy(destinationArray, key), isSchemaField); + const source = normalizeObjectKeys(keyBy(sourceArray, key), isSchemaField); - return values(merge(destination, source)); + return values( + merge(destination, source, { + arrayMerge: combineMerge, + customMerge, + }), + ); }; const mergeTags = (destinationArray, sourceArray, _options) => { @@ -88,7 +103,7 @@ const mergeOwners = (destinationArray, sourceArray, _options) => { }; const mergeFields = (destinationArray, sourceArray, _options) => { - return mergeArrayOfObjectsByKey(destinationArray, sourceArray, 'fieldPath'); + return mergeArrayOfObjectsByKey(destinationArray, sourceArray, 'fieldPath', true); }; function getArrayMergeFunction(key) { @@ -112,7 +127,7 @@ function getArrayMergeFunction(key) { } } -const customMerge = (isPrimary, key) => { +function customMerge(isPrimary, key) { if (key === 'upstream' || key === 'downstream') { return (_secondary, primary) => primary; } @@ -145,7 +160,7 @@ const customMerge = (isPrimary, key) => { customMerge: customMerge.bind({}, isPrimary), }); }; -}; +} export const getEntitySiblingData = (baseEntity: T): Maybe => { if (!baseEntity) { diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx index fe2a8c51f9377b..8ee0f637094d6d 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx @@ -50,26 +50,26 @@ export const Editor = forwardRef((props: EditorProps, ref) => { const { manager, state, getContext } = useRemirror({ extensions: () => [ new BlockquoteExtension(), - new BoldExtension(), - new BulletListExtension(), + new BoldExtension({}), + new BulletListExtension({}), new CodeBlockExtension({ syntaxTheme: 'base16_ateliersulphurpool_light' }), new CodeExtension(), - new DataHubMentionsExtension(), - new DropCursorExtension(), + new DataHubMentionsExtension({}), + new DropCursorExtension({}), new HardBreakExtension(), - new HeadingExtension(), - new HistoryExtension(), - new HorizontalRuleExtension(), + new HeadingExtension({}), + new HistoryExtension({}), + new HorizontalRuleExtension({}), new ImageExtension({ enableResizing: !readOnly }), new ItalicExtension(), new LinkExtension({ autoLink: true, defaultTarget: '_blank' }), - new ListItemExtension(), + new ListItemExtension({}), new MarkdownExtension({ htmlSanitizer: DOMPurify.sanitize, htmlToMarkdown, markdownToHtml }), new OrderedListExtension(), new UnderlineExtension(), new StrikeExtension(), new TableExtension({ resizable: false }), - ...(readOnly ? [] : [new HistoryExtension()]), + ...(readOnly ? [] : [new HistoryExtension({})]), ], content, stringHandler: 'markdown', diff --git a/docker/profiles/docker-compose.gms.yml b/docker/profiles/docker-compose.gms.yml index 147bbd35ff6460..824c8024b05d63 100644 --- a/docker/profiles/docker-compose.gms.yml +++ b/docker/profiles/docker-compose.gms.yml @@ -40,7 +40,6 @@ x-kafka-env: &kafka-env # KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 SCHEMA_REGISTRY_TYPE: INTERNAL KAFKA_SCHEMAREGISTRY_URL: http://datahub-gms:8080/schema-registry/api/ - SPRING_KAFKA_CONSUMER_AUTO_OFFSET_RESET: ${SPRING_KAFKA_CONSUMER_AUTO_OFFSET_RESET:-earliest} x-datahub-quickstart-telemetry-env: &datahub-quickstart-telemetry-env DATAHUB_SERVER_TYPE: ${DATAHUB_SERVER_TYPE:-quickstart} diff --git a/docs-website/filterTagIndexes.json b/docs-website/filterTagIndexes.json index 2309593b2c3b9f..b269f23cccd667 100644 --- a/docs-website/filterTagIndexes.json +++ b/docs-website/filterTagIndexes.json @@ -231,6 +231,17 @@ "Features": "Stateful Ingestion, UI Ingestion, Status Aspect" } }, + { + "Path": "https://hudi.apache.org/docs/syncing_datahub/", + "imgPath": "img/logos/platforms/hudi.png", + "Title": "Apache Hudi", + "Description": "Apache Hudi is an open-source data lake framework that provides ACID transactions, efficient upserts, time travel queries, and incremental data processing for large-scale datasets.", + "tags": { + "Platform Type": "Datastore", + "Connection Type": "Pull", + "Features": "" + } + }, { "Path": "docs/generated/ingestion/sources/iceberg", "imgPath": "img/logos/platforms/iceberg.png", diff --git a/docs-website/static/img/logos/platforms/hudi.png b/docs-website/static/img/logos/platforms/hudi.png new file mode 100644 index 00000000000000..c5e79bcc86ce34 Binary files /dev/null and b/docs-website/static/img/logos/platforms/hudi.png differ diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock index 4e457abd50af7a..9c82b27c3b61f3 100644 --- a/docs-website/yarn.lock +++ b/docs-website/yarn.lock @@ -1827,7 +1827,7 @@ "@docusaurus/theme-search-algolia" "2.4.3" "@docusaurus/types" "2.4.3" -"@docusaurus/react-loadable@5.5.2", "react-loadable@npm:@docusaurus/react-loadable@5.5.2": +"@docusaurus/react-loadable@5.5.2": version "5.5.2" resolved "https://registry.yarnpkg.com/@docusaurus/react-loadable/-/react-loadable-5.5.2.tgz#81aae0db81ecafbdaee3651f12804580868fa6ce" integrity sha512-A3dYjdBGuy0IGT+wyLIGIKLRE+sAk1iNk0f1HjNDysO7u8lhL4N3VEm+FAubmJbAztn94F7MxBTPmnixbiyFdQ== @@ -4757,9 +4757,9 @@ cross-fetch@^3.1.5: node-fetch "^2.6.12" cross-spawn@^7.0.3: - version "7.0.3" - resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6" - integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w== + version "7.0.6" + resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f" + integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA== dependencies: path-key "^3.1.0" shebang-command "^2.0.0" @@ -9713,6 +9713,14 @@ react-loadable-ssr-addon-v5-slorber@^1.0.1: dependencies: "@babel/runtime" "^7.10.3" +"react-loadable@npm:@docusaurus/react-loadable@5.5.2": + version "5.5.2" + resolved "https://registry.yarnpkg.com/@docusaurus/react-loadable/-/react-loadable-5.5.2.tgz#81aae0db81ecafbdaee3651f12804580868fa6ce" + integrity sha512-A3dYjdBGuy0IGT+wyLIGIKLRE+sAk1iNk0f1HjNDysO7u8lhL4N3VEm+FAubmJbAztn94F7MxBTPmnixbiyFdQ== + dependencies: + "@types/react" "*" + prop-types "^15.6.2" + react-markdown@^8.0.6: version "8.0.7" resolved "https://registry.yarnpkg.com/react-markdown/-/react-markdown-8.0.7.tgz#c8dbd1b9ba5f1c5e7e5f2a44de465a3caafdf89b" diff --git a/docs/api/restli/restli-overview.md b/docs/api/restli/restli-overview.md index 22b913d9a25df4..3e9ab00b522670 100644 --- a/docs/api/restli/restli-overview.md +++ b/docs/api/restli/restli-overview.md @@ -1156,7 +1156,7 @@ curl -X POST 'http://localhost:8080/entities?action=search' \ "and": [ { "field": "title", - "value": "Baz Chart 1", + "values": ["Baz Chart 1"], "condition": "EQUAL" } ] @@ -1261,7 +1261,7 @@ curl -X POST 'http://localhost:8080/entities?action=autocomplete' \ "and": [ { "field": "tool", - "value": "looker", + "values": ["looker"], "condition": "EQUAL" } ] diff --git a/docs/api/tutorials/structured-properties.md b/docs/api/tutorials/structured-properties.md index 9b18aa922290b4..b606ce9a8e2455 100644 --- a/docs/api/tutorials/structured-properties.md +++ b/docs/api/tutorials/structured-properties.md @@ -66,7 +66,7 @@ mutation createStructuredProperty { qualifiedName:"retentionTime", displayName: "Retention Time", description: "Retention Time is used to figure out how long to retain records in a dataset", - valueType: "urn:li:dataType:number", + valueType: "urn:li:dataType:datahub.number", allowedValues: [ {numberValue: 30, description: "30 days, usually reserved for datasets that are ephemeral and contain pii"}, {numberValue: 90, description:"description: Use this for datasets that drive monthly reporting but contain pii"}, diff --git a/docs/deploy/aws.md b/docs/deploy/aws.md index 49b0ea1d69ae19..c625ba26a3865a 100644 --- a/docs/deploy/aws.md +++ b/docs/deploy/aws.md @@ -53,7 +53,18 @@ ip-192-168-64-56.us-west-2.compute.internal Ready 3h v1.18.9-ek ip-192-168-8-126.us-west-2.compute.internal Ready 3h v1.18.9-eks-d1db3c ``` -Once your cluster is running, make sure to install the EBS CSI driver, Core DNS, and VPC CNI plugin for Kubernetes. [add-ons](https://docs.aws.amazon.com/eks/latest/userguide/eks-add-ons.html) +### Install EBS CSI driver, Core DNS, and VPC CNI plugin for Kubernetes + +Once your cluster is running, make sure to install the EBS CSI driver, Core DNS, and VPC CNI plugin for Kubernetes. [add-ons](https://docs.aws.amazon.com/eks/latest/userguide/eks-add-ons.html). By default Core DNS and VPC CNI plugins are installed. You need to manually install the EBS CSI driver. It show look this in your console when you are done. + +![Screenshot 2024-11-15 at 4 42 09 PM](https://github.com/user-attachments/assets/5a9a2af0-e804-4896-85bb-dc5834208719) + +### Add the AmazonEBSCSIDriverPolicy role to the EKS node group + +Next is to add the AmazonEBSCSIDriverPolicy role to the EKS node group. You will from the EKS Node group by going to the Compute tab in your EKS cluster and clicking on the IAM entry for the EKS node group. Add the AmazonEBSCSIDriverPolicy policy. + +![Screenshot 2024-11-15 at 4 42 29 PM](https://github.com/user-attachments/assets/8971c8d6-8543-408b-9a07-814aacb2532d) +![Screenshot 2024-11-15 at 4 42 46 PM](https://github.com/user-attachments/assets/397f9131-5f13-4d9f-a664-9921d9bbf44e) ## Setup DataHub using Helm diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md index 59b7a23b5e5836..19cb04e9f56039 100644 --- a/docs/managed-datahub/release-notes/v_0_3_7.md +++ b/docs/managed-datahub/release-notes/v_0_3_7.md @@ -11,6 +11,11 @@ Recommended CLI/SDK If you are using an older CLI/SDK version, then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, GitHub Actions, Airflow, in Python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade, as we keep on pushing fixes in the CLI, and it helps us support you better. +## Known Issues + +### v0.3.7.3 + * Search page fails to render when filters are applied with a query which returns zero results. + ## Release Changelog --- diff --git a/entity-registry/build.gradle b/entity-registry/build.gradle index 2dedea1f16d99c..22e5b601d39db2 100644 --- a/entity-registry/build.gradle +++ b/entity-registry/build.gradle @@ -25,6 +25,8 @@ dependencies { because("previous versions are vulnerable to CVE-2022-25857") } } + api project(path: ':li-utils') + api project(path: ':li-utils', configuration: "dataTemplate") dataModel project(':li-utils') annotationProcessor externalDependency.lombok diff --git a/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java b/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java index 89f0cd8fbc9791..0a2400badfc627 100644 --- a/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java +++ b/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java @@ -27,28 +27,6 @@ public static DatasetUrn toDatasetUrn( new DataPlatformUrn(platformName), datasetName, FabricType.valueOf(origin.toUpperCase())); } - /** - * Convert fabric String to FabricType - * - * @param fabric PROD, CORP, EI, DEV, LIT, PRIME - * @return FabricType - */ - @Nonnull - public static FabricType toFabricType(@Nonnull String fabric) { - switch (fabric.toUpperCase()) { - case "PROD": - return FabricType.PROD; - case "CORP": - return FabricType.CORP; - case "EI": - return FabricType.EI; - case "DEV": - return FabricType.DEV; - default: - throw new IllegalArgumentException("Unsupported Fabric Type: " + fabric); - } - } - public static Urn getUrn(String urnStr) { try { return Urn.createFromString(urnStr); diff --git a/metadata-ingestion/docs/sources/kafka/kafka.md b/metadata-ingestion/docs/sources/kafka/kafka.md index f9b50c13f02778..525f985be4662a 100644 --- a/metadata-ingestion/docs/sources/kafka/kafka.md +++ b/metadata-ingestion/docs/sources/kafka/kafka.md @@ -102,7 +102,29 @@ source: connection: bootstrap: "broker:9092" schema_registry_url: http://localhost:8081 +``` + +### OAuth Callback +The OAuth callback function can be set up using `config.connection.consumer_config.oauth_cb`. + +You need to specify a Python function reference in the format <python-module>:<function-name>. + +For example, in the configuration `oauth:create_token`, `create_token` is a function defined in `oauth.py`, and `oauth.py` must be accessible in the PYTHONPATH. +```YAML +source: + type: "kafka" + config: + # Set the custom schema registry implementation class + schema_registry_class: "datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry" + # Coordinates + connection: + bootstrap: "broker:9092" + schema_registry_url: http://localhost:8081 + consumer_config: + security.protocol: "SASL_PLAINTEXT" + sasl.mechanism: "OAUTHBEARER" + oauth_cb: "oauth:create_token" # sink configs ``` diff --git a/metadata-ingestion/docs/sources/sac/sac_pre.md b/metadata-ingestion/docs/sources/sac/sac_pre.md index c62cd81fa27534..624eb61f716f92 100644 --- a/metadata-ingestion/docs/sources/sac/sac_pre.md +++ b/metadata-ingestion/docs/sources/sac/sac_pre.md @@ -4,6 +4,7 @@ - Purpose: API Access - Access: + - Story Listing - Data Import Service - Authorization Grant: Client Credentials diff --git a/metadata-ingestion/examples/cli_usage/gen_schemas.py b/metadata-ingestion/examples/cli_usage/gen_schemas.py index 2fd4683347a3ba..80b2c6712977ad 100644 --- a/metadata-ingestion/examples/cli_usage/gen_schemas.py +++ b/metadata-ingestion/examples/cli_usage/gen_schemas.py @@ -28,7 +28,6 @@ class CorpGroupFile(BaseModel): with open("user/user.dhub.yaml_schema.json", "w") as fp: - fp.write(json.dumps(CorpUserFile.schema(), indent=4)) with open("group/group.dhub.yaml_schema.json", "w") as fp: diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index e2dd5151439923..e5792da32fb5d7 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -769,7 +769,7 @@ def generate( import importlib from typing import TYPE_CHECKING -from datahub._codegen.aspect import _Aspect +from datahub._codegen.aspect import _Aspect as _Aspect from datahub.utilities.docs_build import IS_SPHINX_BUILD from datahub.utilities._custom_package_loader import get_custom_models_package @@ -802,7 +802,7 @@ def generate( from datahub.utilities.docs_build import IS_SPHINX_BUILD from datahub.utilities._custom_package_loader import get_custom_urns_package -from datahub.utilities.urns._urn_base import Urn # noqa: F401 +from datahub.utilities.urns._urn_base import Urn as Urn # noqa: F401 _custom_package_path = get_custom_urns_package() diff --git a/metadata-ingestion/scripts/modeldocgen.py b/metadata-ingestion/scripts/modeldocgen.py index ee5f06cb801baa..998947e5caa954 100644 --- a/metadata-ingestion/scripts/modeldocgen.py +++ b/metadata-ingestion/scripts/modeldocgen.py @@ -14,7 +14,11 @@ import click from datahub.configuration.common import ConfigEnum, PermissiveConfigModel -from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn +from datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataset_urn, + make_schema_field_urn, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.ingestion.api.common import PipelineContext, RecordEnvelope @@ -442,10 +446,10 @@ def strip_types(field_path: str) -> str: name=relnship_name, foreignDataset=foreign_dataset_urn, foreignFields=[ - f"urn:li:schemaField:({foreign_dataset_urn}, urn)" + make_schema_field_urn(foreign_dataset_urn, "urn") ], sourceFields=[ - f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})" + make_schema_field_urn(source_dataset_urn, f_field.fieldPath) ], ) foreign_keys.append(fkey) diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index c095420e4e3f30..057779bc87c622 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -31,7 +31,7 @@ exclude = __pycache__ per-file-ignores = # imported but unused - __init__.py: F401 + __init__.py: F401, I250 ban-relative-imports = true [mypy] @@ -53,6 +53,14 @@ disallow_untyped_defs = no # try to be a bit more strict in certain areas of the codebase [mypy-datahub.*] ignore_missing_imports = no +implicit_reexport = no +[mypy-datahub.metadata.*] +# TODO: Remove this once all the code has been updated. +implicit_reexport = yes +[mypy-datahub.ingestion.*] +# TODO: Remove this once all the code has been updated. +implicit_reexport = yes + [mypy-datahub_provider.*] ignore_missing_imports = no [mypy-tests.*] diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 3152d0290ec227..8ae112c0ab0b2b 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -245,7 +245,7 @@ # Instead, we put the fix in our PyHive fork, so no thrift pin is needed. } -microsoft_common = {"msal>=1.22.0"} +microsoft_common = {"msal>=1.24.0"} iceberg_common = { # Iceberg Python SDK @@ -591,22 +591,26 @@ "memray", } -base_dev_requirements = { - *base_requirements, - *framework_common, - *mypy_stubs, - *s3_base, +lint_requirements = { # This is pinned only to avoid spurious errors in CI. # We should make an effort to keep it up to date. - "black==22.12.0", - "coverage>=5.1", - "faker>=18.4.0", + "black==23.3.0", "flake8>=6.0.0", "flake8-tidy-imports>=4.3.0", "flake8-bugbear==23.3.12", "isort>=5.7.0", "mypy==1.10.1", +} + +base_dev_requirements = { + *base_requirements, + *framework_common, + *mypy_stubs, + *s3_base, + *lint_requirements, *test_api_requirements, + "coverage>=5.1", + "faker>=18.4.0", "pytest-asyncio>=0.16.0", "pytest-cov>=2.8.1", "pytest-random-order~=1.1.0", @@ -737,8 +741,8 @@ "hive = datahub.ingestion.source.sql.hive:HiveSource", "hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource", "json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource", - "kafka = datahub.ingestion.source.kafka:KafkaSource", - "kafka-connect = datahub.ingestion.source.kafka_connect:KafkaConnectSource", + "kafka = datahub.ingestion.source.kafka.kafka:KafkaSource", + "kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource", "ldap = datahub.ingestion.source.ldap:LDAPSource", "looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource", "lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource", @@ -931,6 +935,7 @@ ), "cloud": ["acryl-datahub-cloud"], "dev": list(dev_requirements), + "lint": list(lint_requirements), "testing-utils": list(test_api_requirements), # To import `datahub.testing` "integration-tests": list(full_test_dev_requirements), "debug": list(debug_requirements), diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py index 27317826264b85..0b04bfa4025a1b 100644 --- a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py +++ b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py @@ -12,3 +12,10 @@ ) requests_logger.setLevel(logging.WARNING) + +__all__ = [ + "AssertionCircuitBreaker", + "AssertionCircuitBreakerConfig", + "OperationCircuitBreaker", + "OperationCircuitBreakerConfig", +] diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py b/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py index a3c54046faf681..7c1180536a90fb 100644 --- a/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py +++ b/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py @@ -6,7 +6,7 @@ from gql.transport.requests import RequestsHTTPTransport from pydantic import Field -from datahub.configuration import ConfigModel +from datahub.configuration.common import ConfigModel logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py b/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py index 6d85a1569cb63d..3a073005968222 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py @@ -1,2 +1,5 @@ from datahub.api.entities.datajob.dataflow import DataFlow from datahub.api.entities.datajob.datajob import DataJob + +# TODO: Remove this and start importing directly from the inner files. +__all__ = ["DataFlow", "DataJob"] diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py index f2436d56d5aca1..e169c07445e969 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py @@ -3,7 +3,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, cast import datahub.emitter.mce_builder as builder -from datahub.configuration.source_common import ALL_ENV_TYPES from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -114,7 +113,7 @@ def generate_tags_aspect(self) -> List[GlobalTagsClass]: def _get_env(self) -> Optional[str]: env: Optional[str] = None - if self.env and self.env.upper() in ALL_ENV_TYPES: + if self.env and self.env.upper() in builder.ALL_ENV_TYPES: env = self.env.upper() else: logger.debug( diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py index 0f5d18c20e055b..4958a68caa95fe 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py @@ -3,7 +3,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Set import datahub.emitter.mce_builder as builder -from datahub.configuration.source_common import ALL_ENV_TYPES from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -109,7 +108,7 @@ def generate_mcp( self, materialize_iolets: bool = True ) -> Iterable[MetadataChangeProposalWrapper]: env: Optional[str] = None - if self.flow_urn.cluster.upper() in ALL_ENV_TYPES: + if self.flow_urn.cluster.upper() in builder.ALL_ENV_TYPES: env = self.flow_urn.cluster.upper() else: logger.debug( diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index b48c655015d825..56e02e4329055a 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -190,7 +190,6 @@ def create(file: str, graph: Optional[DataHubGraph] = None) -> None: @classmethod def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": - with StructuredPropertiesConfig.use_graph(graph): structured_property: Optional[ StructuredPropertyDefinitionClass diff --git a/metadata-ingestion/src/datahub/api/graphql/__init__.py b/metadata-ingestion/src/datahub/api/graphql/__init__.py index e8c8d22bbb93df..d818b19092fcbe 100644 --- a/metadata-ingestion/src/datahub/api/graphql/__init__.py +++ b/metadata-ingestion/src/datahub/api/graphql/__init__.py @@ -1,2 +1,4 @@ from datahub.api.graphql.assertion import Assertion from datahub.api.graphql.operation import Operation + +__all__ = ["Assertion", "Operation"] diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index 39ed1b2bfea087..fbe07b64f0e154 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -268,7 +268,9 @@ def sql_lineage( ) logger.debug("Sql parsing debug info: %s", lineage.debug_info) - if lineage.debug_info.error: + if lineage.debug_info.table_error: + raise lineage.debug_info.table_error + elif lineage.debug_info.error: logger.debug("Sql parsing error details", exc_info=lineage.debug_info.error) click.echo(lineage.json(indent=4)) diff --git a/metadata-ingestion/src/datahub/cli/put_cli.py b/metadata-ingestion/src/datahub/cli/put_cli.py index 989b1a6d02fd01..0a40a9f4ccf92d 100644 --- a/metadata-ingestion/src/datahub/cli/put_cli.py +++ b/metadata-ingestion/src/datahub/cli/put_cli.py @@ -6,11 +6,12 @@ from datahub.cli.cli_utils import post_entity from datahub.configuration.config_loader import load_config_file -from datahub.emitter.mcp import MetadataChangeProposalWrapper, SystemMetadataClass +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.graph.client import get_default_graph from datahub.metadata.schema_classes import ( DataPlatformInfoClass as DataPlatformInfo, PlatformTypeClass, + SystemMetadataClass, ) from datahub.telemetry import telemetry from datahub.upgrade import upgrade diff --git a/metadata-ingestion/src/datahub/configuration/__init__.py b/metadata-ingestion/src/datahub/configuration/__init__.py index 008d788072d0a5..21979829a4453d 100644 --- a/metadata-ingestion/src/datahub/configuration/__init__.py +++ b/metadata-ingestion/src/datahub/configuration/__init__.py @@ -1,5 +1,4 @@ from datahub.configuration.common import ( - ConfigModel, - ConfigurationMechanism, - DynamicTypedConfig, + ConfigModel as ConfigModel, + DynamicTypedConfig as DynamicTypedConfig, ) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 0ce7127b440534..4fdf564162410c 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -21,7 +21,7 @@ from pydantic.fields import Field from typing_extensions import Protocol -from datahub.configuration._config_enum import ConfigEnum +from datahub.configuration._config_enum import ConfigEnum as ConfigEnum # noqa: I250 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.utilities.dedup_list import deduplicate_list diff --git a/metadata-ingestion/src/datahub/configuration/json_loader.py b/metadata-ingestion/src/datahub/configuration/json_loader.py index 35667eb5951fc7..6ecb741be528d1 100644 --- a/metadata-ingestion/src/datahub/configuration/json_loader.py +++ b/metadata-ingestion/src/datahub/configuration/json_loader.py @@ -1,7 +1,7 @@ import json from typing import IO -from datahub.configuration import ConfigurationMechanism +from datahub.configuration.common import ConfigurationMechanism class JsonConfigurationMechanism(ConfigurationMechanism): diff --git a/metadata-ingestion/src/datahub/configuration/kafka.py b/metadata-ingestion/src/datahub/configuration/kafka.py index 07e2f759bb3ff6..b8d9ff994a51ab 100644 --- a/metadata-ingestion/src/datahub/configuration/kafka.py +++ b/metadata-ingestion/src/datahub/configuration/kafka.py @@ -1,6 +1,7 @@ from pydantic import Field, validator -from datahub.configuration.common import ConfigModel +from datahub.configuration.common import ConfigModel, ConfigurationError +from datahub.configuration.kafka_consumer_config import CallableConsumerConfig from datahub.configuration.validate_host_port import validate_host_port @@ -36,6 +37,16 @@ class KafkaConsumerConnectionConfig(_KafkaConnectionConfig): description="Extra consumer config serialized as JSON. These options will be passed into Kafka's DeserializingConsumer. See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md .", ) + @validator("consumer_config") + @classmethod + def resolve_callback(cls, value: dict) -> dict: + if CallableConsumerConfig.is_callable_config(value): + try: + value = CallableConsumerConfig(value).callable_config() + except Exception as e: + raise ConfigurationError(e) + return value + class KafkaProducerConnectionConfig(_KafkaConnectionConfig): """Configuration class for holding connectivity information for Kafka producers""" diff --git a/metadata-ingestion/src/datahub/configuration/kafka_consumer_config.py b/metadata-ingestion/src/datahub/configuration/kafka_consumer_config.py new file mode 100644 index 00000000000000..d3ff5998d3e790 --- /dev/null +++ b/metadata-ingestion/src/datahub/configuration/kafka_consumer_config.py @@ -0,0 +1,35 @@ +import logging +from typing import Any, Dict, Optional + +from datahub.ingestion.api.registry import import_path + +logger = logging.getLogger(__name__) + + +class CallableConsumerConfig: + CALLBACK_ATTRIBUTE: str = "oauth_cb" + + def __init__(self, config: Dict[str, Any]): + self._config = config + + self._resolve_oauth_callback() + + def callable_config(self) -> Dict[str, Any]: + return self._config + + @staticmethod + def is_callable_config(config: Dict[str, Any]) -> bool: + return CallableConsumerConfig.CALLBACK_ATTRIBUTE in config + + def get_call_back_attribute(self) -> Optional[str]: + return self._config.get(CallableConsumerConfig.CALLBACK_ATTRIBUTE) + + def _resolve_oauth_callback(self) -> None: + if not self.get_call_back_attribute(): + return + + call_back = self.get_call_back_attribute() + + assert call_back # to silent lint + # Set the callback + self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back) diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index ad12447532335f..44c737f1bd13d4 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -1,14 +1,10 @@ -from typing import Dict, Optional, Set +from typing import Dict, Optional from pydantic import validator from pydantic.fields import Field from datahub.configuration.common import ConfigModel -from datahub.emitter.enum_helpers import get_enum_options -from datahub.metadata.schema_classes import FabricTypeClass - -DEFAULT_ENV = FabricTypeClass.PROD -ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass)) +from datahub.emitter.mce_builder import ALL_ENV_TYPES, DEFAULT_ENV class PlatformInstanceConfigMixin(ConfigModel): diff --git a/metadata-ingestion/src/datahub/configuration/yaml.py b/metadata-ingestion/src/datahub/configuration/yaml.py index 1f1172836f7448..c069845e1de119 100644 --- a/metadata-ingestion/src/datahub/configuration/yaml.py +++ b/metadata-ingestion/src/datahub/configuration/yaml.py @@ -2,7 +2,7 @@ import yaml -from datahub.configuration import ConfigurationMechanism +from datahub.configuration.common import ConfigurationMechanism class YamlConfigurationMechanism(ConfigurationMechanism): diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 63b03db7f5b604..69946c575908b5 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -13,6 +13,7 @@ Any, List, Optional, + Set, Tuple, Type, TypeVar, @@ -24,7 +25,6 @@ import typing_inspect from avrogen.dict_wrapper import DictWrapper -from datahub.configuration.source_common import DEFAULT_ENV from datahub.emitter.enum_helpers import get_enum_options from datahub.metadata.schema_classes import ( AssertionKeyClass, @@ -35,6 +35,7 @@ DatasetKeyClass, DatasetLineageTypeClass, DatasetSnapshotClass, + FabricTypeClass, GlobalTagsClass, GlossaryTermAssociationClass, GlossaryTermsClass as GlossaryTerms, @@ -56,6 +57,9 @@ logger = logging.getLogger(__name__) Aspect = TypeVar("Aspect", bound=AspectAbstract) +DEFAULT_ENV = FabricTypeClass.PROD +ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass)) + DEFAULT_FLOW_CLUSTER = "prod" UNKNOWN_USER = "urn:li:corpuser:unknown" DATASET_URN_TO_LOWER: bool = ( diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index d088380d5d38c4..85968f050a3716 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -13,13 +13,10 @@ generate_access_token, make_shim_command, ) -from datahub.cli.config_utils import ( - DATAHUB_CONFIG_PATH, - get_boolean_env_variable, - write_gms_config, -) +from datahub.cli.config_utils import DATAHUB_CONFIG_PATH, write_gms_config from datahub.cli.delete_cli import delete from datahub.cli.docker_cli import docker +from datahub.cli.env_utils import get_boolean_env_variable from datahub.cli.exists_cli import exists from datahub.cli.get_cli import get from datahub.cli.ingest_cli import ingest diff --git a/metadata-ingestion/src/datahub/ingestion/api/decorators.py b/metadata-ingestion/src/datahub/ingestion/api/decorators.py index b390ffb9dd0362..d32c0b85ceef4c 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/decorators.py +++ b/metadata-ingestion/src/datahub/ingestion/api/decorators.py @@ -3,7 +3,10 @@ from typing import Callable, Dict, Optional, Type from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.source import Source, SourceCapability +from datahub.ingestion.api.source import ( # noqa: I250 + Source, + SourceCapability as SourceCapability, +) def config_class(config_cls: Type) -> Callable[[Type], Type]: diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index bcf077154343c8..88d1fcc52e2196 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -23,7 +23,7 @@ RecordTypeClass, SchemaFieldClass as SchemaField, SchemaFieldDataTypeClass, - SchemaMetadataClass as SchemaMetadata, + SchemaMetadataClass, StringTypeClass, UnionTypeClass, ) @@ -665,13 +665,13 @@ def get_schema_metadata( name: str, json_schema: Dict[Any, Any], raw_schema_string: Optional[str] = None, -) -> SchemaMetadata: +) -> SchemaMetadataClass: json_schema_as_string = raw_schema_string or json.dumps(json_schema) md5_hash: str = md5(json_schema_as_string.encode()).hexdigest() schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(json_schema)) - schema_metadata = SchemaMetadata( + schema_metadata = SchemaMetadataClass( schemaName=name, platform=f"urn:li:dataPlatform:{platform}", version=0, diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py index f62bb184252d98..e947aff384871d 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py @@ -32,7 +32,7 @@ OneofDescriptor, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, @@ -41,8 +41,8 @@ MapTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, UnionTypeClass, ) diff --git a/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py b/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py index a135b7b6ce8375..9c34c4f83b0a93 100644 --- a/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py +++ b/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py @@ -32,7 +32,6 @@ def __str__(self): class S3ListIterator(Iterator): - MAX_KEYS = 1000 def __init__( diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py index 1d381acbf3dbe9..98c43079a3bc15 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py @@ -33,7 +33,6 @@ @dataclass class ClassificationReportMixin: - num_tables_fetch_sample_values_failed: int = 0 num_tables_classification_attempted: int = 0 @@ -112,7 +111,6 @@ def classify_schema_fields( schema_metadata: SchemaMetadata, sample_data: Union[Dict[str, list], Callable[[], Dict[str, list]]], ) -> None: - if not isinstance(sample_data, Dict): try: # TODO: In future, sample_data fetcher can be lazily called if classification diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index c90ac93eee2cc2..759aebcfd46b0a 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -33,7 +33,9 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.emitter.serialization_helper import post_json_transform -from datahub.ingestion.graph.config import DatahubClientConfig +from datahub.ingestion.graph.config import ( # noqa: I250; TODO: Remove this alias + DatahubClientConfig as DatahubClientConfig, +) from datahub.ingestion.graph.connections import ( connections_gql, get_id_from_connection_urn, diff --git a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py index 33bfb63feb3fd7..5961a553a14943 100644 --- a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py @@ -11,9 +11,8 @@ redact_raw_config, ) from datahub.emitter.aspect import JSON_CONTENT_TYPE -from datahub.emitter.mce_builder import datahub_guid +from datahub.emitter.mce_builder import datahub_guid, make_data_platform_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.mcp_builder import make_data_platform_urn from datahub.ingestion.api.common import PipelineContext, RecordEnvelope from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener from datahub.ingestion.api.sink import NoopWriteCallback, Sink diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 2e56d5866efa89..7c3a42c3e08931 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -428,6 +428,7 @@ def create( def _time_to_print(self) -> bool: self.num_intermediate_workunits += 1 current_time = int(time.time()) + # TODO: Replace with ProgressTimer. if current_time - self.last_time_printed > _REPORT_PRINT_INTERVAL_SECONDS: # we print self.num_intermediate_workunits = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index ad293c702a5205..4af41921c9fa3c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -374,7 +374,6 @@ class BigQueryV2Config( StatefulProfilingConfigMixin, ClassificationSourceConfigMixin, ): - include_schema_metadata: bool = Field( default=True, description="Whether to ingest the BigQuery schema, i.e. projects, schemas, tables, and views.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 136395bcf786ab..e5168c20cf0705 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -356,7 +356,6 @@ def _process_project( project_id ) except Exception as e: - if self.config.project_ids and "not enabled BigQuery." in str(e): action_mesage = ( "The project has not enabled BigQuery API. " @@ -417,7 +416,6 @@ def _process_project_datasets( bigquery_project: BigqueryProject, db_tables: Dict[str, List[BigqueryTable]], ) -> Iterable[MetadataWorkUnit]: - db_views: Dict[str, List[BigqueryView]] = {} db_snapshots: Dict[str, List[BigqueryTableSnapshot]] = {} project_id = bigquery_project.id @@ -1144,7 +1142,6 @@ def gen_schema_metadata( columns: List[BigqueryColumn], dataset_name: BigqueryTableIdentifier, ) -> MetadataWorkUnit: - foreign_keys: List[ForeignKeyConstraint] = [] # Foreign keys only make sense for tables if isinstance(table, BigqueryTable): diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index b542992a7924a0..321b1b6207fabf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -934,7 +934,6 @@ def gen_lineage_workunits_for_external_table( ddl: Optional[str], graph: Optional[DataHubGraph] = None, ) -> Iterable[MetadataWorkUnit]: - if not ddl: return @@ -972,7 +971,6 @@ def get_lineage_for_external_table( source_uris: List[str], graph: Optional[DataHubGraph] = None, ) -> Optional[UpstreamLineageClass]: - upstreams_list: List[UpstreamClass] = [] fine_grained_lineages: List[FineGrainedLineageClass] = [] gcs_urns: Set[str] = set() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index 497947abe4ef9a..08c9beaa73c53b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -2,7 +2,7 @@ import logging import pathlib import tempfile -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from typing import Collection, Dict, Iterable, List, Optional, TypedDict from google.cloud.bigquery import Client @@ -49,6 +49,7 @@ FileBackedDict, FileBackedList, ) +from datahub.utilities.progress_timer import ProgressTimer from datahub.utilities.time import datetime_to_ts_millis logger = logging.getLogger(__name__) @@ -270,27 +271,25 @@ def get_workunits_internal( # Preprocessing stage that deduplicates the queries using query hash per usage bucket # Note: FileBackedDict is an ordered dictionary, so the order of execution of # queries is inherently maintained - queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] - queries_deduped = self.deduplicate_queries(queries) + queries_deduped: FileBackedDict[ + Dict[int, ObservedQuery] + ] = self.deduplicate_queries(queries) self.report.num_unique_queries = len(queries_deduped) logger.info(f"Found {self.report.num_unique_queries} unique queries") with self.report.audit_log_load_timer, queries_deduped: - last_log_time = datetime.now() - last_report_time = datetime.now() + log_timer = ProgressTimer(timedelta(minutes=1)) + report_timer = ProgressTimer(timedelta(minutes=5)) + for i, (_, query_instances) in enumerate(queries_deduped.items()): for query in query_instances.values(): - now = datetime.now() - if (now - last_log_time).total_seconds() >= 60: + if log_timer.should_report(): logger.info( f"Added {i} deduplicated query log entries to SQL aggregator" ) - last_log_time = now - if (now - last_report_time).total_seconds() >= 300: - if self.report.sql_aggregator: - logger.info(self.report.sql_aggregator.as_string()) - last_report_time = now + if report_timer.should_report() and self.report.sql_aggregator: + logger.info(self.report.sql_aggregator.as_string()) self.aggregator.add(query) @@ -304,7 +303,6 @@ def get_workunits_internal( def deduplicate_queries( self, queries: FileBackedList[ObservedQuery] ) -> FileBackedDict[Dict[int, ObservedQuery]]: - # This fingerprint based deduplication is done here to reduce performance hit due to # repetitive sql parsing while adding observed query to aggregator that would otherwise # parse same query multiple times. In future, aggregator may absorb this deduplication. @@ -342,7 +340,6 @@ def deduplicate_queries( return queries_deduped def fetch_query_log(self, project: BigqueryProject) -> Iterable[ObservedQuery]: - # Multi-regions from https://cloud.google.com/bigquery/docs/locations#supported_locations regions = self.config.region_qualifiers @@ -355,7 +352,6 @@ def fetch_query_log(self, project: BigqueryProject) -> Iterable[ObservedQuery]: def fetch_region_query_log( self, project: BigqueryProject, region: str ) -> Iterable[ObservedQuery]: - # Each region needs to be a different query query_log_query = _build_enriched_query_log_query( project_id=project.id, @@ -452,7 +448,6 @@ def _build_enriched_query_log_query( start_time: datetime, end_time: datetime, ) -> str: - audit_start_time = start_time.strftime(BQ_DATETIME_FORMAT) audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py index 6a5236563f48db..dcdccc08ce0483 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py @@ -332,7 +332,6 @@ def _extract_columns_from_table( def _extract_views_from_keyspace( self, keyspace_name: str ) -> Iterable[MetadataWorkUnit]: - views: List[CassandraView] = self.cassandra_api.get_views(keyspace_name) for view in views: view_name: str = view.view_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py index 09ce8b5b05203c..2b75d0dca53cb7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py @@ -16,8 +16,10 @@ from datahub.ingestion.extractor import protobuf_util, schema_util from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator from datahub.ingestion.extractor.protobuf_util import ProtobufSchema -from datahub.ingestion.source.kafka import KafkaSourceConfig, KafkaSourceReport -from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase +from datahub.ingestion.source.kafka.kafka import KafkaSourceConfig, KafkaSourceReport +from datahub.ingestion.source.kafka.kafka_schema_registry_base import ( + KafkaSchemaRegistryBase, +) from datahub.metadata.com.linkedin.pegasus2avro.schema import ( KafkaSchema, SchemaField, @@ -371,7 +373,6 @@ def _get_schema_fields( def _get_schema_metadata( self, topic: str, platform_urn: str, is_subject: bool ) -> Optional[SchemaMetadata]: - # Process the value schema schema, fields = self._get_schema_and_fields( topic=topic, diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py index 5f88cf0234947a..ede7d3c3c56959 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py @@ -7,7 +7,6 @@ class PathSpecsConfigMixin(ConfigModel): - path_specs: List[PathSpec] = Field( description="List of PathSpec. See [below](#path-spec) the details about PathSpec" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py index de212ca9a67716..63cea45f75864b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -107,7 +107,6 @@ def _get_database_workunits( logger.info(f"Fetching database aspects starting from {from_createdon}") mcps = reader.get_aspects(from_createdon, self.report.stop_time) for i, (mcp, createdon) in enumerate(mcps): - if not self.urn_pattern.allowed(str(mcp.entityUrn)): continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index b5d0ed42e651ea..4598ae388b827d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -117,9 +117,8 @@ ViewPropertiesClass, ) from datahub.metadata.urns import DatasetUrn -from datahub.sql_parsing.schema_resolver import SchemaResolver +from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver from datahub.sql_parsing.sqlglot_lineage import ( - SchemaInfo, SqlParsingDebugInfo, SqlParsingResult, infer_output_schema, diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py index 81a54d1327d05a..d2b4a576953daf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py @@ -6,9 +6,8 @@ from pydantic import Field from typing_extensions import Literal -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.source_common import ( - ConfigModel, EnvConfigMixin, PlatformInstanceConfigMixin, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py index db83dde7cf6131..7b9ccb52acbef4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py @@ -566,7 +566,6 @@ def get_all_tables_and_columns(self, containers: Deque) -> List[Dict]: return tables def validate_schema_format(self, schema): - if "." in schema: schema_path = self.get( url=f"/catalog/{self.get_dataset_id(schema=schema, dataset='')}" @@ -687,7 +686,6 @@ def traverse_path(location_id: str, entity_path: List[str]) -> List: response.get("entityType") == DremioEntityContainerType.FOLDER.value.lower() ): - containers.append( { "id": location_id, diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py index 9d6f65b95554e7..d966d575c03320 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py @@ -121,7 +121,6 @@ class DremioSourceConfig( EnvConfigMixin, PlatformInstanceConfigMixin, ): - domain: Optional[str] = Field( default=None, description="Domain for all source objects.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py index cd6ba441b5c93b..5b96845ec04961 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py @@ -198,7 +198,6 @@ def _build_source_map(self) -> Dict[str, Dict]: source_platform_name = source_name for mapping in self.config.source_mappings or []: - if re.search(mapping.source_name, source_type, re.IGNORECASE): source_platform_name = mapping.source_name.lower() diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py index acda656526ef53..cb3f0dd9cf29f4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py @@ -52,24 +52,22 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, + DataPlatformInstanceClass, + DatasetPropertiesClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, SchemalessClass, - SchemaMetadata, + SchemaMetadataClass, StringTypeClass, UnionTypeClass, ) -from datahub.metadata.schema_classes import ( - DataPlatformInstanceClass, - DatasetPropertiesClass, -) from datahub.utilities.registries.domain_registry import DomainRegistry MAX_ITEMS_TO_RETRIEVE = 100 @@ -235,7 +233,6 @@ def _process_table( table_name: str, dataset_name: str, ) -> Iterable[MetadataWorkUnit]: - logger.debug(f"Processing table: {dataset_name}") table_info = dynamodb_client.describe_table(TableName=table_name)["Table"] account_id = table_info["TableArn"].split(":")[4] @@ -448,7 +445,7 @@ def construct_schema_metadata( dataset_properties: DatasetPropertiesClass, schema: Dict[Tuple[str, ...], SchemaDescription], primary_key_dict: Dict[str, str], - ) -> SchemaMetadata: + ) -> SchemaMetadataClass: """ " To construct the schema metadata, it will first sort the schema by the occurrence of attribute names in descending order and truncate the schema by MAX_SCHEMA_SIZE, and then start to construct the @@ -502,7 +499,7 @@ def construct_schema_metadata( canonical_schema.append(field) # create schema metadata object for table - schema_metadata = SchemaMetadata( + schema_metadata = SchemaMetadataClass( schemaName=table_name, platform=f"urn:li:dataPlatform:{self.platform}", version=0, diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index e40e284d6e0a42..86826ae7bedc09 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -12,8 +12,9 @@ ConfigModel, ConfigurationWarning, ) -from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.emitter.mce_builder import DEFAULT_ENV from datahub.ingestion.api.report import Report from datahub.ingestion.source.bigquery_v2.bigquery_config import ( BigQueryConnectionConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py index 80f7b7a9f4480c..130f2c9c2e12fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py @@ -401,7 +401,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: total_runs=job.get("entity").get("runs").get("total"), ) if datajob_entity.total_runs > 0: - self.delete_dpi_from_datajobs(datajob_entity) + try: + self.delete_dpi_from_datajobs(datajob_entity) + except Exception as e: + logger.error(f"While trying to delete {datajob_entity} got {e}") if ( datajob_entity.total_runs == 0 and self.config.delete_empty_data_jobs diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index 570df4e99ab13d..3baf858e44cdc8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -69,7 +69,6 @@ def __init__( report: DatahubExecutionRequestCleanupReport, config: Optional[DatahubExecutionRequestCleanupConfig] = None, ) -> None: - self.graph = graph self.report = report self.instance_id = int(time.time()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 8b2443a589b8dc..c20506e36a844f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -95,7 +95,6 @@ class GEProfilingBaseConfig(ConfigModel): class GEProfilingConfig(GEProfilingBaseConfig): - report_dropped_profiles: bool = Field( default=False, description="Whether to report datasets or dataset columns which were not profiled. Set to `True` for debugging purposes.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py similarity index 97% rename from metadata-ingestion/src/datahub/ingestion/source/kafka.py rename to metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py index a757250a0d6c85..06d929774240ba 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py @@ -18,6 +18,7 @@ from datahub.configuration.common import AllowDenyPattern from datahub.configuration.kafka import KafkaConsumerConnectionConfig +from datahub.configuration.kafka_consumer_config import CallableConsumerConfig from datahub.configuration.source_common import ( DatasetSourceConfigMixin, LowerCaseDatasetUrnConfigMixin, @@ -49,7 +50,9 @@ ) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import DatasetSubTypes -from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase +from datahub.ingestion.source.kafka.kafka_schema_registry_base import ( + KafkaSchemaRegistryBase, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -143,7 +146,7 @@ class KafkaSourceConfig( def get_kafka_consumer( connection: KafkaConsumerConnectionConfig, ) -> confluent_kafka.Consumer: - return confluent_kafka.Consumer( + consumer = confluent_kafka.Consumer( { "group.id": "test", "bootstrap.servers": connection.bootstrap, @@ -151,6 +154,13 @@ def get_kafka_consumer( } ) + if CallableConsumerConfig.is_callable_config(connection.consumer_config): + # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed + # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration + consumer.poll(timeout=30) + + return consumer + @dataclass class KafkaSourceReport(StaleEntityRemovalSourceReport): diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py similarity index 100% rename from metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py rename to metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_schema_registry_base.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_schema_registry_base.py similarity index 100% rename from metadata-ingestion/src/datahub/ingestion/source/kafka_schema_registry_base.py rename to metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_schema_registry_base.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 3d1683100474e8..57a251ef2ed14f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -48,7 +48,7 @@ from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, + BASE_PROJECT_NAME, LookMLSourceReport, ) from datahub.ingestion.source.looker.str_functions import remove_suffix @@ -307,7 +307,6 @@ def view_fields_from_dict( type_cls: ViewFieldType, populate_sql_logic_in_descriptions: bool, ) -> "ViewField": - is_primary_key = field_dict.get("primary_key", "no") == "yes" name = field_dict["name"] @@ -370,7 +369,7 @@ def _form_field_name( assert view_name # for lint false positive project_include: ProjectInclude = ProjectInclude( - project=view_project_map.get(view_name, _BASE_PROJECT_NAME), + project=view_project_map.get(view_name, BASE_PROJECT_NAME), include=view_name, ) @@ -385,7 +384,7 @@ def _form_field_name( view_urn = LookerViewId( project_name=( project_include.project - if project_include.project != _BASE_PROJECT_NAME + if project_include.project != BASE_PROJECT_NAME else explore_project_name ), model_name=model_name, @@ -929,7 +928,6 @@ def from_api( # noqa: C901 reporter: SourceReport, source_config: LookerDashboardSourceConfig, ) -> Optional["LookerExplore"]: # noqa: C901 - try: explore = client.lookml_model_explore(model, explore_name) views: Set[str] = set() @@ -987,13 +985,11 @@ def from_api( # noqa: C901 field_name_vs_raw_explore_field: Dict = {} if explore.fields is not None: - if explore.fields.dimensions is not None: for dim_field in explore.fields.dimensions: if dim_field.name is None: continue else: - field_name_vs_raw_explore_field[dim_field.name] = dim_field view_fields.append( @@ -1034,7 +1030,6 @@ def from_api( # noqa: C901 if measure_field.name is None: continue else: - field_name_vs_raw_explore_field[ measure_field.name ] = measure_field @@ -1113,7 +1108,7 @@ def from_api( # noqa: C901 fields=view_fields, upstream_views=list( ProjectInclude( - project=view_project_map.get(view_name, _BASE_PROJECT_NAME), + project=view_project_map.get(view_name, BASE_PROJECT_NAME), include=view_name, ) for view_name in views @@ -1239,7 +1234,7 @@ def _to_metadata_events( # noqa: C901 view_urn = LookerViewId( project_name=( view_ref.project - if view_ref.project != _BASE_PROJECT_NAME + if view_ref.project != BASE_PROJECT_NAME else self.project_name ), model_name=self.model_name, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py index 7e23079156b625..327c9ebf99bd20 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py @@ -9,8 +9,8 @@ load_and_preprocess_file, ) from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, - _EXPLORE_FILE_EXTENSION, + BASE_PROJECT_NAME, + EXPLORE_FILE_EXTENSION, LookMLSourceConfig, LookMLSourceReport, ) @@ -69,7 +69,7 @@ def from_looker_dict( explore_files = [ x.include for x in resolved_includes - if x.include.endswith(_EXPLORE_FILE_EXTENSION) + if x.include.endswith(EXPLORE_FILE_EXTENSION) ] for included_file in explore_files: try: @@ -152,9 +152,9 @@ def resolve_includes( # As such, we try to handle it but are as defensive as possible. non_base_project_name = project_name - if project_name == _BASE_PROJECT_NAME and root_project_name is not None: + if project_name == BASE_PROJECT_NAME and root_project_name is not None: non_base_project_name = root_project_name - if non_base_project_name != _BASE_PROJECT_NAME and inc.startswith( + if non_base_project_name != BASE_PROJECT_NAME and inc.startswith( f"/{non_base_project_name}/" ): # This might be a local include. Let's make sure that '/{project_name}' doesn't diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py index f894c96debc54a..9fac0b52fde0dd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py @@ -9,8 +9,8 @@ load_and_preprocess_file, ) from datahub.ingestion.source.looker.lookml_config import ( - _EXPLORE_FILE_EXTENSION, - _VIEW_FILE_EXTENSION, + EXPLORE_FILE_EXTENSION, + VIEW_FILE_EXTENSION, LookMLSourceConfig, LookMLSourceReport, ) @@ -42,7 +42,7 @@ def _load_viewfile( ) -> Optional[LookerViewFile]: # always fully resolve paths to simplify de-dup path = str(pathlib.Path(path).resolve()) - allowed_extensions = [_VIEW_FILE_EXTENSION, _EXPLORE_FILE_EXTENSION] + allowed_extensions = [VIEW_FILE_EXTENSION, EXPLORE_FILE_EXTENSION] matched_any_extension = [ match for match in [path.endswith(x) for x in allowed_extensions] if match ] diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index e42ac7b61c1777..cd8ccb8217257c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -604,7 +604,6 @@ def _get_folder_browse_path_v2_entries( def _create_platform_instance_aspect( self, ) -> DataPlatformInstance: - assert ( self.source_config.platform_name ), "Platform name is not set in the configuration." @@ -999,7 +998,6 @@ def _gen_folder_key(self, folder_id: str) -> LookerFolderKey: def _make_dashboard_and_chart_mces( self, looker_dashboard: LookerDashboard ) -> Iterable[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]: - # Step 1: Emit metadata for each Chart inside the Dashboard. chart_events = [] for element in looker_dashboard.dashboard_elements: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py index 1e60c08fe00c2b..6d49d57e077435 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py @@ -55,7 +55,6 @@ def _create_new_liquid_variables_with_default( current_dict: dict = new_dict for key in keys[:-1]: - if key not in current_dict: current_dict[key] = {} @@ -392,7 +391,6 @@ def process_lookml_template_language( source_config: LookMLSourceConfig, view_lkml_file_dict: dict, ) -> None: - if "views" not in view_lkml_file_dict: return @@ -425,7 +423,6 @@ def load_and_preprocess_file( path: Union[str, pathlib.Path], source_config: LookMLSourceConfig, ) -> dict: - parsed = load_lkml(path) process_lookml_template_language( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py index 6a623e1e97b5dc..ef7d64e4f42d43 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py @@ -14,7 +14,7 @@ from looker_sdk.sdk.api40.models import Dashboard, LookWithQuery -from datahub.emitter.mce_builder import Aspect, AspectAbstract +from datahub.emitter.mce_builder import Aspect from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.source.looker import looker_common from datahub.ingestion.source.looker.looker_common import ( @@ -40,6 +40,7 @@ DashboardUsageStatisticsClass, DashboardUserUsageCountsClass, TimeWindowSizeClass, + _Aspect as AspectAbstract, ) logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py index aa45bb72d1f462..562c7863b31343 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py @@ -6,7 +6,7 @@ from datahub.ingestion.source.looker.looker_dataclasses import LookerModel from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, + BASE_PROJECT_NAME, NAME, LookMLSourceReport, ) @@ -103,7 +103,7 @@ def get_looker_view_id( current_project_name: str = ( include.project - if include.project != _BASE_PROJECT_NAME + if include.project != BASE_PROJECT_NAME else self.project_name ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py index ce4a242027e11a..80be566cdcd468 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py @@ -320,7 +320,6 @@ def get_including_extends( self, field: str, ) -> Optional[Any]: - # According to Looker's inheritance rules, we need to merge the fields(i.e. dimensions, measures and # dimension_groups) from both the child and parent. if field in [DIMENSIONS, DIMENSION_GROUPS, MEASURES]: @@ -345,7 +344,6 @@ def _get_sql_table_name_field(self) -> Optional[str]: return self.get_including_extends(field="sql_table_name") def _is_dot_sql_table_name_present(self) -> bool: - sql_table_name: Optional[str] = self._get_sql_table_name_field() if sql_table_name is None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py index da837da1613864..7ffb895349ed29 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py @@ -33,17 +33,11 @@ NAME: str = "name" -_BASE_PROJECT_NAME = "__BASE" +BASE_PROJECT_NAME = "__BASE" -_EXPLORE_FILE_EXTENSION = ".explore.lkml" - -_VIEW_FILE_EXTENSION = ".view.lkml" - -_MODEL_FILE_EXTENSION = ".model.lkml" - -VIEW_LANGUAGE_LOOKML: str = "lookml" - -VIEW_LANGUAGE_SQL: str = "sql" +EXPLORE_FILE_EXTENSION = ".explore.lkml" +VIEW_FILE_EXTENSION = ".view.lkml" +MODEL_FILE_EXTENSION = ".model.lkml" DERIVED_VIEW_SUFFIX = r".sql_table_name" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py index 892ed79754a1c2..6933d9d69394bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py @@ -5,7 +5,7 @@ from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition from datahub.ingestion.source.looker.looker_dataclasses import LookerModel -from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewFileLoader +from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.lookml_config import ( NAME, LookMLSourceConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index d258570ec384f7..c7d3724472d3c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -57,10 +57,8 @@ LookerViewContext, ) from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, - _MODEL_FILE_EXTENSION, - VIEW_LANGUAGE_LOOKML, - VIEW_LANGUAGE_SQL, + BASE_PROJECT_NAME, + MODEL_FILE_EXTENSION, LookerConnectionDefinition, LookMLSourceConfig, LookMLSourceReport, @@ -98,6 +96,9 @@ ) from datahub.sql_parsing.sqlglot_lineage import ColumnRef +VIEW_LANGUAGE_LOOKML: str = "lookml" +VIEW_LANGUAGE_SQL: str = "sql" + logger = logging.getLogger(__name__) @@ -143,7 +144,6 @@ def from_looker_dict( extract_col_level_lineage: bool = False, populate_sql_logic_in_descriptions: bool = False, ) -> Optional["LookerView"]: - view_name = view_context.name() logger.debug(f"Handling view {view_name} in model {model_name}") @@ -319,7 +319,7 @@ def _load_model(self, path: str) -> LookerModel: looker_model = LookerModel.from_looker_dict( parsed, - _BASE_PROJECT_NAME, + BASE_PROJECT_NAME, self.source_config.project_name, self.base_projects_folder, path, @@ -417,7 +417,6 @@ def _get_custom_properties(self, looker_view: LookerView) -> DatasetPropertiesCl def _build_dataset_mcps( self, looker_view: LookerView ) -> List[MetadataChangeProposalWrapper]: - view_urn = looker_view.id.get_urn(self.source_config) subTypeEvent = MetadataChangeProposalWrapper( @@ -501,7 +500,6 @@ def get_project_name(self, model_name: str) -> str: def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]: manifest_file = folder / "manifest.lkml" if manifest_file.exists(): - manifest_dict = load_and_preprocess_file( path=manifest_file, source_config=self.source_config ) @@ -544,7 +542,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.source_config.base_folder = checkout_dir.resolve() self.base_projects_folder[ - _BASE_PROJECT_NAME + BASE_PROJECT_NAME ] = self.source_config.base_folder visited_projects: Set[str] = set() @@ -576,7 +574,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.base_projects_folder[project] = p_ref self._recursively_check_manifests( - tmp_dir, _BASE_PROJECT_NAME, visited_projects + tmp_dir, BASE_PROJECT_NAME, visited_projects ) yield from self.get_internal_workunits() @@ -607,7 +605,7 @@ def _recursively_check_manifests( return # Special case handling if the root project has a name in the manifest file. - if project_name == _BASE_PROJECT_NAME and manifest.project_name: + if project_name == BASE_PROJECT_NAME and manifest.project_name: if ( self.source_config.project_name is not None and manifest.project_name != self.source_config.project_name @@ -696,7 +694,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 # The ** means "this directory and all subdirectories", and hence should # include all the files we want. model_files = sorted( - self.source_config.base_folder.glob(f"**/*{_MODEL_FILE_EXTENSION}") + self.source_config.base_folder.glob(f"**/*{MODEL_FILE_EXTENSION}") ) model_suffix_len = len(".model") @@ -832,7 +830,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 current_project_name: str = ( include.project - if include.project != _BASE_PROJECT_NAME + if include.project != BASE_PROJECT_NAME else project_name ) @@ -841,7 +839,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 base_folder_path: str = str( self.base_projects_folder.get( current_project_name, - self.base_projects_folder[_BASE_PROJECT_NAME], + self.base_projects_folder[BASE_PROJECT_NAME], ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py index 057dbca4281849..8cec6f2607774e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py @@ -12,6 +12,7 @@ ViewField, ViewFieldType, ) +from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewIdCache from datahub.ingestion.source.looker.lookml_concept_context import ( LookerFieldContext, @@ -20,7 +21,6 @@ from datahub.ingestion.source.looker.lookml_config import ( DERIVED_VIEW_SUFFIX, NAME, - LookerConnectionDefinition, LookMLSourceConfig, LookMLSourceReport, ) @@ -72,7 +72,6 @@ def resolve_derived_view_urn_of_col_ref( base_folder_path: str, config: LookMLSourceConfig, ) -> List[ColumnRef]: - new_column_refs: List[ColumnRef] = [] for col_ref in column_refs: if is_derived_view(col_ref.table.lower()): @@ -641,7 +640,6 @@ def create_view_upstream( ctx: PipelineContext, reporter: LookMLSourceReport, ) -> AbstractViewUpstream: - if view_context.is_regular_case(): return RegularViewUpstream( view_context=view_context, @@ -666,7 +664,6 @@ def create_view_upstream( view_context.is_sql_based_derived_view_without_fields_case(), ] ): - return DerivedQueryUpstreamSource( view_context=view_context, config=config, diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py index 08ed7677c7ab4c..9f96f837eb9b3a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py @@ -210,7 +210,6 @@ def _get_lineage_mcp( # extract the old lineage and save it for the new mcp if preserve_upstream: - client = get_default_graph() old_upstream_lineage = get_aspects_for_entity( diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index c87b025f13b55d..bbc4897d227bac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -50,25 +50,23 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, + DataPlatformInstanceClass, + DatasetPropertiesClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, SchemalessClass, - SchemaMetadata, + SchemaMetadataClass as SchemaMetadata, StringTypeClass, TimeTypeClass, UnionTypeClass, ) -from datahub.metadata.schema_classes import ( - DataPlatformInstanceClass, - DatasetPropertiesClass, -) from datahub.metadata.urns import DatasetUrn logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index 7072ebf6473df1..f55d7a883edefe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -464,7 +464,6 @@ def report_dropped(self, ent_name: str) -> None: @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.LINEAGE_COARSE, "Supported. See docs for limitations") class NifiSource(Source): - config: NifiSourceConfig report: NifiSourceReport diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 7c8487727c9eee..91fa2e96be2cce 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,7 +9,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -240,7 +240,7 @@ class PlatformDetail(ConfigModel): "recipe of other datahub sources.", ) env: str = pydantic.Field( - default=DEFAULT_ENV, + default=builder.DEFAULT_ENV, description="The environment that all assets produced by DataHub platform ingestion source belong to", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 61b1164825257e..8ffd54613eb380 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -72,10 +72,24 @@ def get_tables(native_query: str) -> List[str]: def remove_drop_statement(query: str) -> str: # Certain PowerBI M-Queries contain a combination of DROP and SELECT statements within SQL, causing SQLParser to fail on these queries. # Therefore, these occurrences are being removed. - # Regular expression to match patterns like "DROP TABLE IF EXISTS #;" - pattern = r"DROP TABLE IF EXISTS #\w+;?" - return re.sub(pattern, "", query) + patterns = [ + # Regular expression to match patterns like: + # "DROP TABLE IF EXISTS #;" + # "DROP TABLE IF EXISTS #, , ...;" + # "DROP TABLE IF EXISTS #, , ...\n" + r"DROP\s+TABLE\s+IF\s+EXISTS\s+(?:#?\w+(?:,\s*#?\w+)*)[;\n]", + ] + + new_query = query + + for pattern in patterns: + new_query = re.sub(pattern, "", new_query, flags=re.IGNORECASE) + + # Remove extra spaces caused by consecutive replacements + new_query = re.sub(r"\s+", " ", new_query).strip() + + return new_query def parse_custom_sql( @@ -87,7 +101,6 @@ def parse_custom_sql( env: str, platform_instance: Optional[str], ) -> Optional["SqlParsingResult"]: - logger.debug("Using sqlglot_lineage to parse custom sql") logger.debug(f"Processing native query using DataHub Sql Parser = {query}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 15524137c0a85e..97698a3d0d56c1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -66,7 +66,6 @@ def get_upstream_tables( config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, ) -> List[resolver.Lineage]: - if table.expression is None: logger.debug(f"There is no M-Query expression in table {table.full_name}") return [] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 9eafde2f75ecdd..a40e67d08da5b2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -65,7 +65,6 @@ def urn_creator( server: str, qualified_table_name: str, ) -> str: - platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( PowerBIPlatformDetail( data_platform_pair=data_platform_pair, @@ -83,6 +82,16 @@ def urn_creator( ) +def get_next_item(items: List[str], item: str) -> Optional[str]: + if item in items: + try: + index = items.index(item) + return items[index + 1] + except IndexError: + logger.debug(f'item:"{item}", not found in item-list: {items}') + return None + + class AbstractDataPlatformTableCreator(ABC): """ Base class to share common functionalities among different dataplatform for M-Query parsing. @@ -169,7 +178,6 @@ def create_reference_table( arg_list: Tree, table_detail: Dict[str, str], ) -> Optional[ReferencedTable]: - arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( tree_function.token_values(arg_list) @@ -209,7 +217,6 @@ def create_reference_table( def parse_custom_sql( self, query: str, server: str, database: Optional[str], schema: Optional[str] ) -> Lineage: - dataplatform_tables: List[DataPlatformTable] = [] platform_detail: PlatformDetail = ( @@ -367,7 +374,6 @@ def get_argument_list(invoke_expression: Tree) -> Optional[Tree]: return argument_list def take_first_argument(self, expression: Tree) -> Optional[Tree]: - # function is not data-access function, lets process function argument first_arg_tree: Optional[Tree] = tree_function.first_arg_list_func(expression) @@ -675,7 +681,7 @@ def two_level_access_pattern( data_access_func_detail.arg_list ) if server is None or db_name is None: - return Lineage.empty() # Return empty list + return Lineage.empty() # Return an empty list schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor @@ -775,39 +781,44 @@ def create_urn_using_old_parser( def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail ) -> Lineage: - arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( tree_function.token_values(data_access_func_detail.arg_list) ), ) - if len(arguments) == 2: - # It is a regular case of MS-SQL - logger.debug("Handling with regular case") - return self.two_level_access_pattern(data_access_func_detail) - - if len(arguments) >= 4 and arguments[2] != "Query": - logger.debug("Unsupported case is found. Second index is not the Query") - return Lineage.empty() + server, database = self.get_db_detail_from_argument( + data_access_func_detail.arg_list + ) + if server is None or database is None: + return Lineage.empty() # Return an empty list + + assert server + assert database # to silent the lint + + query: Optional[str] = get_next_item(arguments, "Query") + if query: + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return Lineage( + upstreams=self.create_urn_using_old_parser( + query=query, + db_name=database, + server=server, + ), + column_lineage=[], + ) - if self.config.enable_advance_lineage_sql_construct is False: - # Use previous parser to generate URN to keep backward compatibility - return Lineage( - upstreams=self.create_urn_using_old_parser( - query=arguments[3], - db_name=arguments[1], - server=arguments[0], - ), - column_lineage=[], + return self.parse_custom_sql( + query=query, + database=database, + server=server, + schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, ) - return self.parse_custom_sql( - query=arguments[3], - database=arguments[1], - server=arguments[0], - schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, - ) + # It is a regular case of MS-SQL + logger.debug("Handling with regular case") + return self.two_level_access_pattern(data_access_func_detail) class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -881,7 +892,6 @@ def form_qualified_table_name( table_reference: ReferencedTable, data_platform_pair: DataPlatformPair, ) -> str: - platform_detail: PlatformDetail = ( self.platform_instance_resolver.get_platform_instance( PowerBIPlatformDetail( @@ -1154,27 +1164,19 @@ def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name ): return None - try: - if "Database" in data_access_tokens: - index = data_access_tokens.index("Database") - if data_access_tokens[index + 1] != Constant.M_QUERY_NULL: - # Database name is explicitly set in argument - return data_access_tokens[index + 1] - - if "Name" in data_access_tokens: - index = data_access_tokens.index("Name") - # Next element is value of the Name. It is a database name - return data_access_tokens[index + 1] - - if "Catalog" in data_access_tokens: - index = data_access_tokens.index("Catalog") - # Next element is value of the Catalog. In Databricks Catalog can also be used in place of a database. - return data_access_tokens[index + 1] - except IndexError as e: - logger.debug("Database name is not available", exc_info=e) + database: Optional[str] = get_next_item(data_access_tokens, "Database") - return None + if ( + database and database != Constant.M_QUERY_NULL + ): # database name is explicitly set + return database + + return get_next_item( # database name is set in Name argument + data_access_tokens, "Name" + ) or get_next_item( # If both above arguments are not available, then try Catalog + data_access_tokens, "Catalog" + ) def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index a59d58519d6bfe..e1301edef10b84 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -439,7 +439,6 @@ def get_app( self, app_id: str, ) -> Optional[App]: - raw_app: Optional[Dict] = self._get_app( app_id=app_id, ) @@ -1062,7 +1061,6 @@ def _get_app( self, app_id: str, ) -> Optional[Dict]: - app_endpoint = self.API_ENDPOINTS[Constant.GET_WORKSPACE_APP].format( POWERBI_ADMIN_BASE_URL=DataResolverBase.ADMIN_BASE_URL, APP_ID=app_id, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py index b49f1f09fa966e..5ae333430a78bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py @@ -40,7 +40,6 @@ def form_full_table_name( dataset_name: str, table_name: str, ) -> str: - full_table_name: str = "{}.{}".format( dataset_name.replace(" ", "_"), table_name.replace(" ", "_") ) @@ -596,7 +595,6 @@ def _fill_metadata_from_scan_result( return workspaces def _fill_independent_datasets(self, workspace: Workspace) -> None: - reachable_datasets: List[str] = [] # Find out reachable datasets for dashboard in workspace.dashboards: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py index 8854f9ff48348d..2a247d0c63957a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py @@ -126,7 +126,6 @@ def log_http_error(e: BaseException, message: str) -> Any: def get_response_dict(response: requests.Response, error_message: str) -> dict: - result_dict: dict = {} try: response.raise_for_status() diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index 5fd63e7f93f92a..581e32d29dceaf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -41,7 +41,7 @@ ) from datahub.utilities.lossy_collections import LossyDict, LossyList from datahub.utilities.perf_timer import PerfTimer -from datahub.utilities.sql_parser import SQLParser +from datahub.utilities.sql_parser_base import SQLParser from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index b18b526ef30fce..71a20890d35e88 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -9,6 +9,8 @@ # We use 290 instead instead of the standard 320, because escape characters can add to the length. _QUERY_SEQUENCE_LIMIT = 290 +_MAX_COPY_ENTRIES_PER_TABLE = 20 + class RedshiftCommonQuery: CREATE_TEMP_TABLE_CLAUSE = "create temp table" @@ -293,28 +295,37 @@ def alter_table_rename_query( def list_copy_commands_sql( db_name: str, start_time: datetime, end_time: datetime ) -> str: - return """ - select - distinct - "schema" as target_schema, - "table" as target_table, - c.file_name as filename - from - SYS_QUERY_DETAIL as si - join SYS_LOAD_DETAIL as c on - si.query_id = c.query_id - join SVV_TABLE_INFO sti on - sti.table_id = si.table_id - where - database = '{db_name}' - and si.start_time >= '{start_time}' - and si.start_time < '{end_time}' - order by target_schema, target_table, si.start_time asc - """.format( + return """\ +SELECT DISTINCT + target_schema, + target_table, + filename +FROM ( + SELECT + sti."schema" AS target_schema, + sti."table" AS target_table, + c.file_name AS filename, + ROW_NUMBER() OVER ( + PARTITION BY sti."schema", sti."table" + ORDER BY si.start_time DESC + ) AS rn + FROM + SYS_QUERY_DETAIL AS si + JOIN SYS_LOAD_DETAIL AS c ON si.query_id = c.query_id + JOIN SVV_TABLE_INFO sti ON sti.table_id = si.table_id + WHERE + sti.database = '{db_name}' + AND si.start_time >= '{start_time}' + AND si.start_time < '{end_time}' +) subquery +WHERE rn <= {_MAX_COPY_ENTRIES_PER_TABLE} +ORDER BY target_schema, target_table, filename +""".format( # We need the original database name for filtering db_name=db_name, start_time=start_time.strftime(redshift_datetime_format), end_time=end_time.strftime(redshift_datetime_format), + _MAX_COPY_ENTRIES_PER_TABLE=_MAX_COPY_ENTRIES_PER_TABLE, ) @staticmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 76030cea984946..4bc4c1451c262f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -436,7 +436,6 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit def _extract_metadata( self, connection: redshift_connector.Connection, database: str ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: - yield from self.gen_database_container( database=database, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index e8c70260ebc7ce..1863663f98bb24 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -804,7 +804,6 @@ def get_dir_to_process( protocol: str, min: bool = False, ) -> List[str]: - # if len(path_spec.include.split("/")) == len(f"{protocol}{bucket_name}/{folder}".split("/")): # return [f"{protocol}{bucket_name}/{folder}"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py index de0904107b9bbe..b75f15c0ce770e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py @@ -13,12 +13,9 @@ from urllib3.util.retry import Retry from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.source_common import ( - DEFAULT_ENV, - DatasetSourceConfigMixin, - EnvConfigMixin, -) +from datahub.configuration.source_common import DatasetSourceConfigMixin, EnvConfigMixin from datahub.emitter.mce_builder import ( + DEFAULT_ENV, dataset_urn_to_key, make_dashboard_urn, make_data_platform_urn, @@ -404,7 +401,6 @@ def get_model_workunits( columns = self.get_import_data_model_columns(model_id=model.model_id) for column in columns: - schema_field = SchemaFieldClass( fieldPath=column.name, type=self.get_schema_field_data_type(column), diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py index 54f7dfb5b903c7..ab7b887cba1d80 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py @@ -3,15 +3,15 @@ from tableschema import Table from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, DateTypeClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, TimeTypeClass, UnionTypeClass, diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py index 1f2c73a2522d04..1659aaf6fa2020 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py @@ -7,14 +7,14 @@ from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase from datahub.ingestion.source.schema_inference.object import construct_schema -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, UnionTypeClass, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py index 1f3f2e0a1e8a83..efc605e0df8cab 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py @@ -4,7 +4,7 @@ import pyarrow.parquet from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, @@ -12,8 +12,8 @@ NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, TimeTypeClass, UnionTypeClass, diff --git a/metadata-ingestion/src/datahub/configuration/oauth.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_config.py similarity index 100% rename from metadata-ingestion/src/datahub/configuration/oauth.py rename to metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_config.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_generator.py similarity index 97% rename from metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py rename to metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_generator.py index 7231c6ef6b1df5..a2dc0118b39782 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_generator.py @@ -8,7 +8,7 @@ from OpenSSL.crypto import FILETYPE_PEM, load_certificate from pydantic.types import SecretStr -from datahub.configuration.oauth import OAuthIdentityProvider +from datahub.ingestion.source.snowflake.oauth_config import OAuthIdentityProvider logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index a9f454cfd3cdb3..397606400d389c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -16,14 +16,17 @@ from datahub.configuration.common import ConfigModel, ConfigurationError, MetaError from datahub.configuration.connection_resolver import auto_connection_resolver -from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.source.snowflake.constants import ( CLIENT_PREFETCH_THREADS, CLIENT_SESSION_KEEP_ALIVE, ) -from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator +from datahub.ingestion.source.snowflake.oauth_config import ( + OAuthConfiguration, + OAuthIdentityProvider, +) +from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri from datahub.utilities.config_clean import ( remove_protocol, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 6f9c9259b27844..e065e2f34bc66d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -31,14 +31,16 @@ ) from datahub.metadata.schema_classes import DatasetLineageTypeClass, UpstreamClass from datahub.sql_parsing.sql_parsing_aggregator import ( - ColumnLineageInfo, - ColumnRef, KnownLineageMapping, KnownQueryLineageInfo, SqlParsingAggregator, UrnStr, ) -from datahub.sql_parsing.sqlglot_lineage import DownstreamColumnRef +from datahub.sql_parsing.sqlglot_lineage import ( + ColumnLineageInfo, + ColumnRef, + DownstreamColumnRef, +) from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.time import ts_millis_to_datetime @@ -234,7 +236,6 @@ def populate_known_query_lineage( def get_known_query_lineage( self, query: Query, dataset_name: str, db_row: UpstreamLineageEdge ) -> Optional[KnownQueryLineageInfo]: - if not db_row.UPSTREAM_TABLES: return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index dd7f73268fdc4f..538841018067e2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -48,11 +48,9 @@ SnowflakeQueriesExtractor, SnowflakeQueriesExtractorConfig, ) +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_schema import ( - SnowflakeDataDictionary, - SnowflakeQuery, -) +from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDataDictionary from datahub.ingestion.source.snowflake.snowflake_schema_gen import ( SnowflakeSchemaGenerator, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py b/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py index 5356cee7f6ea30..76b72d8e37f74b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py @@ -28,7 +28,6 @@ class CockroachDBConfig(PostgresConfig): @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class CockroachDBSource(PostgresSource): - config: CockroachDBConfig def __init__(self, config: CockroachDBConfig, ctx: PipelineContext): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index 21e7fad3343314..5107a4e38f64de 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -101,6 +101,7 @@ class StoredProcedure: flow: Union[MSSQLJob, MSSQLProceduresContainer] type: str = "STORED_PROCEDURE" source: str = "mssql" + code: Optional[str] = None @property def full_type(self) -> str: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index c19b22a8622ca2..9ab9c76c30417f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -24,6 +24,8 @@ platform_name, support_status, ) +from datahub.ingestion.api.source import StructuredLogLevel +from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql.mssql.job_models import ( JobStep, @@ -36,6 +38,9 @@ ProcedureParameter, StoredProcedure, ) +from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import ( + generate_procedure_lineage, +) from datahub.ingestion.source.sql.sql_common import ( SQLAlchemySource, SqlWorkUnit, @@ -51,6 +56,7 @@ StringTypeClass, UnionTypeClass, ) +from datahub.utilities.file_backed_collections import FileBackedList logger: logging.Logger = logging.getLogger(__name__) @@ -99,6 +105,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig): default=False, description="Enable to convert the SQL Server assets urns to lowercase", ) + include_lineage: bool = Field( + default=True, + description="Enable lineage extraction for stored procedures", + ) @pydantic.validator("uri_args") def passwords_match(cls, v, values, **kwargs): @@ -161,6 +171,7 @@ def __init__(self, config: SQLServerConfig, ctx: PipelineContext): self.current_database = None self.table_descriptions: Dict[str, str] = {} self.column_descriptions: Dict[str, str] = {} + self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList() if self.config.include_descriptions: for inspector in self.get_inspectors(): db_name: str = self.get_db_name(inspector) @@ -374,7 +385,7 @@ def loop_jobs( def loop_job_steps( self, job: MSSQLJob, job_steps: Dict[str, Any] ) -> Iterable[MetadataWorkUnit]: - for step_id, step_data in job_steps.items(): + for _step_id, step_data in job_steps.items(): step = JobStep( job_name=job.formatted_name, step_name=step_data["step_name"], @@ -412,37 +423,44 @@ def loop_stored_procedures( # noqa: C901 if procedures: yield from self.construct_flow_workunits(data_flow=data_flow) for procedure in procedures: - upstream = self._get_procedure_upstream(conn, procedure) - downstream = self._get_procedure_downstream(conn, procedure) - data_job = MSSQLDataJob( - entity=procedure, - ) - # TODO: because of this upstream and downstream are more dependencies, - # can't be used as DataJobInputOutput. - # Should be reorganized into lineage. - data_job.add_property("procedure_depends_on", str(upstream.as_property)) - data_job.add_property( - "depending_on_procedure", str(downstream.as_property) - ) - procedure_definition, procedure_code = self._get_procedure_code( - conn, procedure - ) - if procedure_definition: - data_job.add_property("definition", procedure_definition) - if sql_config.include_stored_procedures_code and procedure_code: - data_job.add_property("code", procedure_code) - procedure_inputs = self._get_procedure_inputs(conn, procedure) - properties = self._get_procedure_properties(conn, procedure) - data_job.add_property( - "input parameters", str([param.name for param in procedure_inputs]) - ) - for param in procedure_inputs: - data_job.add_property( - f"parameter {param.name}", str(param.properties) - ) - for property_name, property_value in properties.items(): - data_job.add_property(property_name, str(property_value)) - yield from self.construct_job_workunits(data_job) + yield from self._process_stored_procedure(conn, procedure) + + def _process_stored_procedure( + self, conn: Connection, procedure: StoredProcedure + ) -> Iterable[MetadataWorkUnit]: + upstream = self._get_procedure_upstream(conn, procedure) + downstream = self._get_procedure_downstream(conn, procedure) + data_job = MSSQLDataJob( + entity=procedure, + ) + # TODO: because of this upstream and downstream are more dependencies, + # can't be used as DataJobInputOutput. + # Should be reorganized into lineage. + data_job.add_property("procedure_depends_on", str(upstream.as_property)) + data_job.add_property("depending_on_procedure", str(downstream.as_property)) + procedure_definition, procedure_code = self._get_procedure_code(conn, procedure) + procedure.code = procedure_code + if procedure_definition: + data_job.add_property("definition", procedure_definition) + if procedure_code and self.config.include_stored_procedures_code: + data_job.add_property("code", procedure_code) + procedure_inputs = self._get_procedure_inputs(conn, procedure) + properties = self._get_procedure_properties(conn, procedure) + data_job.add_property( + "input parameters", str([param.name for param in procedure_inputs]) + ) + for param in procedure_inputs: + data_job.add_property(f"parameter {param.name}", str(param.properties)) + for property_name, property_value in properties.items(): + data_job.add_property(property_name, str(property_value)) + if self.config.include_lineage: + # These will be used to construct lineage + self.stored_procedures.append(procedure) + yield from self.construct_job_workunits( + data_job, + # For stored procedure lineage is ingested later + include_lineage=False, + ) @staticmethod def _get_procedure_downstream( @@ -546,8 +564,8 @@ def _get_procedure_code( code_list.append(row["Text"]) if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip(): code_slice_index = index - definition = "\n".join(code_list[:code_slice_index]) - code = "\n".join(code_list[code_slice_index:]) + definition = "".join(code_list[:code_slice_index]) + code = "".join(code_list[code_slice_index:]) except ResourceClosedError: logger.warning( "Connection was closed from procedure '%s'", @@ -602,16 +620,18 @@ def _get_stored_procedures( def construct_job_workunits( self, data_job: MSSQLDataJob, + include_lineage: bool = True, ) -> Iterable[MetadataWorkUnit]: yield MetadataChangeProposalWrapper( entityUrn=data_job.urn, aspect=data_job.as_datajob_info_aspect, ).as_workunit() - yield MetadataChangeProposalWrapper( - entityUrn=data_job.urn, - aspect=data_job.as_datajob_input_output_aspect, - ).as_workunit() + if include_lineage: + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_input_output_aspect, + ).as_workunit() # TODO: Add SubType when it appear def construct_flow_workunits( @@ -664,3 +684,58 @@ def get_identifier( if self.config.convert_urns_to_lowercase else qualified_table_name ) + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + yield from super().get_workunits_internal() + + # This is done at the end so that we will have access to tables + # from all databases in schema_resolver and discovered_tables + for procedure in self.stored_procedures: + with self.report.report_exc( + message="Failed to parse stored procedure lineage", + context=procedure.full_name, + level=StructuredLogLevel.WARN, + ): + yield from auto_workunit( + generate_procedure_lineage( + schema_resolver=self.schema_resolver, + procedure=procedure, + procedure_job_urn=MSSQLDataJob(entity=procedure).urn, + is_temp_table=self.is_temp_table, + ) + ) + + def is_temp_table(self, name: str) -> bool: + try: + parts = name.split(".") + table_name = parts[-1] + schema_name = parts[-2] + db_name = parts[-3] + + if table_name.startswith("#"): + return True + + # This is also a temp table if + # 1. this name would be allowed by the dataset patterns, and + # 2. we have a list of discovered tables, and + # 3. it's not in the discovered tables list + if ( + self.config.database_pattern.allowed(db_name) + and self.config.schema_pattern.allowed(schema_name) + and self.config.table_pattern.allowed(name) + and self.standardize_identifier_case(name) + not in self.discovered_datasets + ): + logger.debug(f"inferred as temp table {name}") + return True + + except Exception: + logger.warning(f"Error parsing table name {name} ") + return False + + def standardize_identifier_case(self, table_ref_str: str) -> str: + return ( + table_ref_str.lower() + if self.config.convert_urns_to_lowercase + else table_ref_str + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py new file mode 100644 index 00000000000000..b979a270a55282 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py @@ -0,0 +1,84 @@ +import logging +from typing import Callable, Iterable, Optional + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure +from datahub.metadata.schema_classes import DataJobInputOutputClass +from datahub.sql_parsing.datajob import to_datajob_input_output +from datahub.sql_parsing.schema_resolver import SchemaResolver +from datahub.sql_parsing.split_statements import split_statements +from datahub.sql_parsing.sql_parsing_aggregator import ( + ObservedQuery, + SqlParsingAggregator, +) + +logger = logging.getLogger(__name__) + + +def parse_procedure_code( + *, + schema_resolver: SchemaResolver, + default_db: Optional[str], + default_schema: Optional[str], + code: str, + is_temp_table: Callable[[str], bool], + raise_: bool = False, +) -> Optional[DataJobInputOutputClass]: + aggregator = SqlParsingAggregator( + platform=schema_resolver.platform, + env=schema_resolver.env, + schema_resolver=schema_resolver, + generate_lineage=True, + generate_queries=False, + generate_usage_statistics=False, + generate_operations=False, + generate_query_subject_fields=False, + generate_query_usage_statistics=False, + is_temp_table=is_temp_table, + ) + for query in split_statements(code): + # TODO: We should take into account `USE x` statements. + aggregator.add_observed_query( + observed=ObservedQuery( + default_db=default_db, + default_schema=default_schema, + query=query, + ) + ) + if aggregator.report.num_observed_queries_failed and raise_: + logger.info(aggregator.report.as_string()) + raise ValueError( + f"Failed to parse {aggregator.report.num_observed_queries_failed} queries." + ) + + mcps = list(aggregator.gen_metadata()) + return to_datajob_input_output( + mcps=mcps, + ignore_extra_mcps=True, + ) + + +# Is procedure handling generic enough to be added to SqlParsingAggregator? +def generate_procedure_lineage( + *, + schema_resolver: SchemaResolver, + procedure: StoredProcedure, + procedure_job_urn: str, + is_temp_table: Callable[[str], bool] = lambda _: False, + raise_: bool = False, +) -> Iterable[MetadataChangeProposalWrapper]: + if procedure.code: + datajob_input_output = parse_procedure_code( + schema_resolver=schema_resolver, + default_db=procedure.db, + default_schema=procedure.schema, + code=procedure.code, + is_temp_table=is_temp_table, + raise_=raise_, + ) + + if datajob_input_output: + yield MetadataChangeProposalWrapper( + entityUrn=procedure_job_urn, + aspect=datajob_input_output, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index bce3b29130ec9e..766b704d6ffafe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -178,7 +178,6 @@ def get_table_names(self, schema: Optional[str] = None) -> List[str]: ] def get_view_names(self, schema: Optional[str] = None) -> List[str]: - schema = self._inspector_instance.dialect.denormalize_name( schema or self.default_schema_name ) @@ -200,7 +199,6 @@ def get_view_names(self, schema: Optional[str] = None) -> List[str]: def get_columns( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> List[dict]: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -344,7 +342,6 @@ def get_columns( return columns def get_table_comment(self, table_name: str, schema: Optional[str] = None) -> Dict: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -416,7 +413,6 @@ def _get_constraint_data( def get_pk_constraint( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> Dict: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -458,7 +454,6 @@ def get_pk_constraint( def get_foreign_keys( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> List: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -540,7 +535,6 @@ def fkey_rec(): def get_view_definition( self, view_name: str, schema: Optional[str] = None ) -> Union[str, None]: - denormalized_view_name = self._inspector_instance.dialect.denormalize_name( view_name ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 238fd88f1c9509..ae6116326da33c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -32,6 +32,7 @@ make_data_platform_urn, make_dataplatform_instance_urn, make_dataset_urn_with_platform_instance, + make_schema_field_urn, make_tag_urn, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -391,6 +392,7 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str) platform_instance=self.config.platform_instance, env=self.config.env, ) + self.discovered_datasets: Set[str] = set() self._view_definition_cache: MutableMapping[str, str] if self.config.use_file_backed_cache: self._view_definition_cache = FileBackedDict[str]() @@ -669,7 +671,7 @@ def get_foreign_key_metadata( ) source_fields = [ - f"urn:li:schemaField:({dataset_urn},{f})" + make_schema_field_urn(dataset_urn, f) for f in fk_dict["constrained_columns"] ] foreign_dataset = make_dataset_urn_with_platform_instance( @@ -679,7 +681,7 @@ def get_foreign_key_metadata( env=self.config.env, ) foreign_fields = [ - f"urn:li:schemaField:({foreign_dataset},{f})" + make_schema_field_urn(foreign_dataset, f) for f in fk_dict["referred_columns"] ] @@ -830,8 +832,9 @@ def _process_table( self._classify(dataset_name, schema, table, data_reader, schema_metadata) dataset_snapshot.aspects.append(schema_metadata) - if self.config.include_view_lineage: + if self._save_schema_to_resolver(): self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata) + self.discovered_datasets.add(dataset_name) db_name = self.get_db_name(inspector) yield from self.add_table_to_schema_container( @@ -1125,8 +1128,9 @@ def _process_view( columns, canonical_schema=schema_fields, ) - if self.config.include_view_lineage: + if self._save_schema_to_resolver(): self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata) + self.discovered_datasets.add(dataset_name) description, properties, _ = self.get_table_properties(inspector, schema, view) try: view_definition = inspector.get_view_definition(view, schema) @@ -1189,6 +1193,11 @@ def _process_view( domain_registry=self.domain_registry, ) + def _save_schema_to_resolver(self): + return self.config.include_view_lineage or ( + hasattr(self.config, "include_lineage") and self.config.include_lineage + ) + def _run_sql_parser( self, view_identifier: str, query: str, schema_resolver: SchemaResolver ) -> Optional[SqlParsingResult]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py b/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py index 318395d4e66b2a..2b10ca1fa57ed8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py @@ -146,7 +146,11 @@ def urn_count(self) -> int: def compute_percent_entities_changed( new_entities: List[str], old_entities: List[str] ) -> float: - (overlap_count, old_count, _,) = _get_entity_overlap_and_cardinalities( + ( + overlap_count, + old_count, + _, + ) = _get_entity_overlap_and_cardinalities( new_entities=new_entities, old_entities=old_entities ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index c1d899f11f2e1d..0eafdb4ad23ba0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -2117,7 +2117,6 @@ def parse_custom_sql( def _enrich_database_tables_with_parsed_schemas( self, parsing_result: SqlParsingResult ) -> None: - in_tables_schemas: Dict[ str, Set[str] ] = transform_parsing_result_to_in_tables_schemas(parsing_result) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py index ce224bde003fd3..bb1c297513de10 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py @@ -105,7 +105,6 @@ class SimpleAddDatasetDataProduct(AddDatasetDataProduct): """Transformer that adds a specified dataproduct entity for provided dataset as its asset.""" def __init__(self, config: SimpleDatasetDataProductConfig, ctx: PipelineContext): - generic_config = AddDatasetDataProductConfig( get_data_product_to_add=lambda dataset_urn: config.dataset_to_data_product_urns.get( dataset_urn diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py index ef6ef43fa2d7f3..c60f4dca28882d 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py @@ -67,7 +67,6 @@ def transform_aspect( def handle_end_of_stream( self, ) -> List[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]: - mcps: List[ Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass] ] = [] diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py index 245a3aa3d9db15..212e018dd64fb7 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py @@ -105,7 +105,6 @@ def convert_tag_as_per_mapping(self, tag: str) -> str: def handle_end_of_stream( self, ) -> Sequence[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]: - return self.owner_mcps def transform_aspect( diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py b/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py index 57af10d1040c8a..f6847f234aefe6 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py @@ -103,7 +103,6 @@ def create( def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: - in_container_properties_aspect: ContainerPropertiesClass = cast( ContainerPropertiesClass, aspect ) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py index 338f191c0829df..7e6125079f16e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py @@ -84,7 +84,6 @@ def get_tags_from_schema_metadata( def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: - in_glossary_terms: Optional[GlossaryTermsClass] = cast( Optional[GlossaryTermsClass], aspect ) diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py index 5b079129e0a9c5..facc7d107d1ba7 100644 --- a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py @@ -72,7 +72,6 @@ def _(self, assertion: FixedIntervalFreshnessAssertion) -> str: @metric_sql.register def _(self, assertion: RowCountTotalVolumeAssertion) -> str: - # Can not use information schema here due to error - # Data metric function body cannot refer to the non-deterministic function 'CURRENT_DATABASE_MAIN_METASTORE_ID'. diff --git a/metadata-ingestion/src/datahub/specific/dashboard.py b/metadata-ingestion/src/datahub/specific/dashboard.py index 8228dbc011db2f..f57df15914369c 100644 --- a/metadata-ingestion/src/datahub/specific/dashboard.py +++ b/metadata-ingestion/src/datahub/specific/dashboard.py @@ -433,7 +433,6 @@ def set_description(self, description: str) -> "DashboardPatchBuilder": def add_custom_properties( self, custom_properties: Optional[Dict[str, str]] = None ) -> "DashboardPatchBuilder": - if custom_properties: for key, value in custom_properties.items(): self.custom_properties_patch_helper.add_property(key, value) diff --git a/metadata-ingestion/src/datahub/specific/dataset.py b/metadata-ingestion/src/datahub/specific/dataset.py index 9dd2616078f08d..b171dc4cc2939f 100644 --- a/metadata-ingestion/src/datahub/specific/dataset.py +++ b/metadata-ingestion/src/datahub/specific/dataset.py @@ -13,7 +13,7 @@ KafkaAuditHeaderClass, OwnerClass as Owner, OwnershipTypeClass, - SchemaMetadataClass as SchemaMetadata, + SchemaMetadataClass, SystemMetadataClass, TagAssociationClass as Tag, UpstreamClass as Upstream, @@ -40,7 +40,7 @@ def __init__( self.aspect_name = ( EditableSchemaMetadata.ASPECT_NAME if editable - else SchemaMetadata.ASPECT_NAME + else SchemaMetadataClass.ASPECT_NAME ) self.aspect_field = "editableSchemaFieldInfo" if editable else "schemaFieldInfo" diff --git a/metadata-ingestion/src/datahub/sql_parsing/_models.py b/metadata-ingestion/src/datahub/sql_parsing/_models.py index d92d178b81cf4b..d586e7d6d9045b 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/_models.py +++ b/metadata-ingestion/src/datahub/sql_parsing/_models.py @@ -42,6 +42,8 @@ def __lt__(self, other: "_FrozenModel") -> bool: class _TableName(_FrozenModel): + # TODO: Move this into the schema_resolver.py file. + database: Optional[str] = None db_schema: Optional[str] = None table: str diff --git a/metadata-ingestion/src/datahub/sql_parsing/datajob.py b/metadata-ingestion/src/datahub/sql_parsing/datajob.py new file mode 100644 index 00000000000000..215b207c3dcf51 --- /dev/null +++ b/metadata-ingestion/src/datahub/sql_parsing/datajob.py @@ -0,0 +1,50 @@ +import logging +from typing import Iterable, List, Optional + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + DataJobInputOutputClass, + FineGrainedLineageClass, + UpstreamLineageClass, +) + +logger = logging.getLogger(__name__) + + +def to_datajob_input_output( + *, mcps: Iterable[MetadataChangeProposalWrapper], ignore_extra_mcps: bool = True +) -> Optional[DataJobInputOutputClass]: + inputDatasets: List[str] = [] + outputDatasets: List[str] = [] + fineGrainedLineages: List[FineGrainedLineageClass] = [] + for mcp in mcps: + # TODO: Represent simple write operations without lineage as outputDatasets. + + upstream_lineage = mcp.as_workunit().get_aspect_of_type(UpstreamLineageClass) + if upstream_lineage is not None: + if mcp.entityUrn and mcp.entityUrn not in outputDatasets: + outputDatasets.append(mcp.entityUrn) + + for upstream in upstream_lineage.upstreams: + if upstream.dataset not in inputDatasets: + inputDatasets.append(upstream.dataset) + + if upstream_lineage.fineGrainedLineages: + for fineGrainedLineage in upstream_lineage.fineGrainedLineages: + fineGrainedLineages.append(fineGrainedLineage) + + elif ignore_extra_mcps: + pass + else: + raise ValueError( + f"Expected an upstreamLineage aspect, got {mcp.aspectName} for {mcp.entityUrn}" + ) + + if not inputDatasets and not outputDatasets: + return None + + return DataJobInputOutputClass( + inputDatasets=inputDatasets, + outputDatasets=outputDatasets, + fineGrainedLineages=fineGrainedLineages, + ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/query_types.py b/metadata-ingestion/src/datahub/sql_parsing/query_types.py index 2acad19418c113..802fb3e993f428 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/query_types.py +++ b/metadata-ingestion/src/datahub/sql_parsing/query_types.py @@ -14,7 +14,16 @@ def _is_temp_table(table: sqlglot.exp.Table, dialect: sqlglot.Dialect) -> bool: identifier: sqlglot.exp.Identifier = table.this return identifier.args.get("temporary") or ( - is_dialect_instance(dialect, "redshift") and identifier.name.startswith("#") + # These dialects use # as a prefix for temp tables. + is_dialect_instance( + dialect, + [ + "redshift", + "mssql", + # sybase is another one, but we don't support that dialect yet. + ], + ) + and identifier.name.startswith("#") ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index e7b0527d30d978..e3f2fbc786b437 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -13,7 +13,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.metadata.schema_classes import SchemaFieldClass, SchemaMetadataClass from datahub.metadata.urns import DataPlatformUrn -from datahub.sql_parsing._models import _TableName +from datahub.sql_parsing._models import _TableName as _TableName # noqa: I250 from datahub.sql_parsing.sql_parsing_common import PLATFORMS_WITH_CASE_SENSITIVE_TABLES from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path diff --git a/metadata-ingestion/src/datahub/sql_parsing/split_statements.py b/metadata-ingestion/src/datahub/sql_parsing/split_statements.py new file mode 100644 index 00000000000000..42dda4e62158b0 --- /dev/null +++ b/metadata-ingestion/src/datahub/sql_parsing/split_statements.py @@ -0,0 +1,163 @@ +import re +from enum import Enum +from typing import Generator, List, Tuple + +CONTROL_FLOW_KEYWORDS = [ + "GO", + r"BEGIN\w+TRY", + r"BEGIN\w+CATCH", + "BEGIN", + r"END\w+TRY", + r"END\w+CATCH", + "END", +] + +# There's an exception to this rule, which is when the statement +# is preceeded by a CTE. +FORCE_NEW_STATEMENT_KEYWORDS = [ + # SELECT is used inside queries as well, so we can't include it here. + "INSERT", + "UPDATE", + "DELETE", + "MERGE", +] + + +class ParserState(Enum): + NORMAL = 1 + STRING = 2 + COMMENT = 3 + MULTILINE_COMMENT = 4 + + +def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool: + """ + Check if a keyword exists at the given position using regex word boundaries. + """ + if pos + len(keyword) > len(sql): + return False + + # If we're not at a word boundary, we can't generate a keyword. + if pos > 0 and not ( + bool(re.match(r"\w\W", sql[pos - 1 : pos + 1])) + or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1])) + ): + return False + + pattern = rf"^{re.escape(keyword)}\b" + match = re.match(pattern, sql[pos:], re.IGNORECASE) + return bool(match) + + +def _look_ahead_for_keywords( + sql: str, pos: int, keywords: List[str] +) -> Tuple[bool, str, int]: + """ + Look ahead for SQL keywords at the current position. + """ + + for keyword in keywords: + if _is_keyword_at_position(sql, pos, keyword): + return True, keyword, len(keyword) + return False, "", 0 + + +def split_statements(sql: str) -> Generator[str, None, None]: + """ + Split T-SQL code into individual statements, handling various SQL constructs. + """ + if not sql or not sql.strip(): + return + + current_statement: List[str] = [] + state = ParserState.NORMAL + i = 0 + + def yield_if_complete() -> Generator[str, None, None]: + statement = "".join(current_statement).strip() + if statement: + yield statement + current_statement.clear() + + prev_real_char = "\0" # the most recent non-whitespace, non-comment character + while i < len(sql): + c = sql[i] + next_char = sql[i + 1] if i < len(sql) - 1 else "\0" + + if state == ParserState.NORMAL: + if c == "'": + state = ParserState.STRING + current_statement.append(c) + prev_real_char = c + elif c == "-" and next_char == "-": + state = ParserState.COMMENT + current_statement.append(c) + current_statement.append(next_char) + i += 1 + elif c == "/" and next_char == "*": + state = ParserState.MULTILINE_COMMENT + current_statement.append(c) + current_statement.append(next_char) + i += 1 + else: + most_recent_real_char = prev_real_char + if not c.isspace(): + prev_real_char = c + + is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords( + sql, i, keywords=CONTROL_FLOW_KEYWORDS + ) + if is_control_keyword: + # Yield current statement if any + yield from yield_if_complete() + # Yield keyword as its own statement + yield keyword + i += keyword_len + continue + + ( + is_force_new_statement_keyword, + keyword, + keyword_len, + ) = _look_ahead_for_keywords( + sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS + ) + if ( + is_force_new_statement_keyword and most_recent_real_char != ")" + ): # usually we'd have a close paren that closes a CTE + # Force termination of current statement + yield from yield_if_complete() + + current_statement.append(keyword) + i += keyword_len + continue + + elif c == ";": + yield from yield_if_complete() + else: + current_statement.append(c) + + elif state == ParserState.STRING: + current_statement.append(c) + if c == "'" and next_char == "'": + current_statement.append(next_char) + i += 1 + elif c == "'": + state = ParserState.NORMAL + + elif state == ParserState.COMMENT: + current_statement.append(c) + if c == "\n": + state = ParserState.NORMAL + + elif state == ParserState.MULTILINE_COMMENT: + current_statement.append(c) + if c == "*" and next_char == "/": + current_statement.append(next_char) + i += 1 + state = ParserState.NORMAL + + i += 1 + + # Handle the last statement + yield from yield_if_complete() diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index e8a0369597d53a..44f0d7be7aad9a 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -762,7 +762,6 @@ def add_observed_query( This assumes that queries come in order of increasing timestamps. """ - self.report.num_observed_queries += 1 # All queries with no session ID are assumed to be part of the same session. @@ -831,7 +830,6 @@ def add_preparsed_query( session_has_temp_tables: bool = True, _is_internal: bool = False, ) -> None: - # Adding tool specific metadata extraction here allows it # to work for both ObservedQuery and PreparsedQuery as # add_preparsed_query it used within add_observed_query. diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_common.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_common.py index a1b850b56677a4..ec7dbc8251b200 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_common.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_common.py @@ -21,6 +21,9 @@ # See more below: # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/acreldb/n0ejgx4895bofnn14rlguktfx5r3.htm "teradata", + # For SQL server, the default collation rules mean that all identifiers (schema, table, column names) + # are case preserving but case insensitive. + "mssql", } DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = { # In some dialects, column identifiers are effectively case insensitive @@ -28,6 +31,9 @@ # automatically lowercase unquoted identifiers. "snowflake", } +assert DIALECTS_WITH_DEFAULT_UPPERCASE_COLS.issubset( + DIALECTS_WITH_CASE_INSENSITIVE_COLS +) class QueryType(enum.Enum): diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index b635f8cb47b6d2..9adb792a4be518 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -5,7 +5,7 @@ import logging import traceback from collections import defaultdict -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, Union import pydantic.dataclasses import sqlglot @@ -873,6 +873,49 @@ def _translate_internal_column_lineage( ) +_StrOrNone = TypeVar("_StrOrNone", str, Optional[str]) + + +def _normalize_db_or_schema( + db_or_schema: _StrOrNone, + dialect: sqlglot.Dialect, +) -> _StrOrNone: + if db_or_schema is None: + return None + + # In snowflake, table identifiers must be uppercased to match sqlglot's behavior. + if is_dialect_instance(dialect, "snowflake"): + return db_or_schema.upper() + + # In mssql, table identifiers must be lowercased. + elif is_dialect_instance(dialect, "mssql"): + return db_or_schema.lower() + + return db_or_schema + + +def _simplify_select_into(statement: sqlglot.exp.Expression) -> sqlglot.exp.Expression: + """ + Check if the expression is a SELECT INTO statement. If so, converts it into a CTAS. + Other expressions are returned as-is. + """ + + if not (isinstance(statement, sqlglot.exp.Select) and statement.args.get("into")): + return statement + + # Convert from SELECT INTO + # to CREATE TABLE AS SELECT + into_expr: sqlglot.exp.Into = statement.args["into"].pop() + into_table = into_expr.this + + create = sqlglot.exp.Create( + this=into_table, + kind="TABLE", + expression=statement, + ) + return create + + def _sqlglot_lineage_inner( sql: sqlglot.exp.ExpOrStr, schema_resolver: SchemaResolverInterface, @@ -880,18 +923,14 @@ def _sqlglot_lineage_inner( default_schema: Optional[str] = None, default_dialect: Optional[str] = None, ) -> SqlParsingResult: - if not default_dialect: dialect = get_dialect(schema_resolver.platform) else: dialect = get_dialect(default_dialect) - if is_dialect_instance(dialect, "snowflake"): - # in snowflake, table identifiers must be uppercased to match sqlglot's behavior. - if default_db: - default_db = default_db.upper() - if default_schema: - default_schema = default_schema.upper() + default_db = _normalize_db_or_schema(default_db, dialect) + default_schema = _normalize_db_or_schema(default_schema, dialect) + if is_dialect_instance(dialect, "redshift") and not default_schema: # On Redshift, there's no "USE SCHEMA " command. The default schema # is public, and "current schema" is the one at the front of the search path. @@ -904,12 +943,23 @@ def _sqlglot_lineage_inner( logger.debug("Parsing lineage from sql statement: %s", sql) statement = parse_statement(sql, dialect=dialect) + if isinstance(statement, sqlglot.exp.Command): + # For unsupported syntax, sqlglot will usually fallback to parsing as a Command. + # This is effectively a parsing error, and we won't get any lineage from it. + # See https://github.com/tobymao/sqlglot/commit/3a13fdf4e597a2f0a3f9fc126a129183fe98262f + # and https://github.com/tobymao/sqlglot/pull/2874 + raise UnsupportedStatementTypeError( + f"Got unsupported syntax for statement: {sql}" + ) + original_statement, statement = statement, statement.copy() # logger.debug( # "Formatted sql statement: %s", # original_statement.sql(pretty=True, dialect=dialect), # ) + statement = _simplify_select_into(statement) + # Make sure the tables are resolved with the default db / schema. # This only works for Unionable statements. For other types of statements, # we have to do it manually afterwards, but that's slightly lower accuracy diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py index 5413161d0b7a3d..bd98557e08aacc 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py @@ -61,7 +61,7 @@ def is_dialect_instance( else: platforms = list(platforms) - dialects = [sqlglot.Dialect.get_or_raise(platform) for platform in platforms] + dialects = [get_dialect(platform) for platform in platforms] if any(isinstance(dialect, dialect_class.__class__) for dialect_class in dialects): return True diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index cdd35c23e30885..0d85002776e5e2 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -79,7 +79,6 @@ def _extract_mode_query(self, entry: QueryLog) -> bool: return True def extract_bi_metadata(self, entry: QueryLog) -> bool: - for tool, meta_extractor in self.known_tool_extractors: try: if meta_extractor(entry): diff --git a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py index 72b5f6c5e26e4b..13be45ec1be28d 100644 --- a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py +++ b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py @@ -6,12 +6,8 @@ import deepdiff from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.sql_parsing.schema_resolver import SchemaResolver -from datahub.sql_parsing.sqlglot_lineage import ( - SchemaInfo, - SqlParsingResult, - sqlglot_lineage, -) +from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver +from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, sqlglot_lineage logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/testing/mcp_diff.py b/metadata-ingestion/src/datahub/testing/mcp_diff.py index 95b8e83c7a64a5..5e669a718e9ad3 100644 --- a/metadata-ingestion/src/datahub/testing/mcp_diff.py +++ b/metadata-ingestion/src/datahub/testing/mcp_diff.py @@ -206,7 +206,7 @@ def apply_delta(self, golden: List[Dict[str, Any]]) -> None: """ aspect_diffs = [v for d in self.aspect_changes.values() for v in d.values()] for aspect_diff in aspect_diffs: - for (_, old, new) in aspect_diff.aspects_changed.keys(): + for _, old, new in aspect_diff.aspects_changed.keys(): golden[old.delta_info.idx] = new.delta_info.original indices_to_remove = set() diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index b67f9c1a9ffa15..b0f5022446de15 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -228,6 +228,12 @@ def __post_init__(self) -> None: else: self._conn = ConnectionWrapper() + if sqlite3.sqlite_version_info < (3, 24, 0): + # We use the ON CONFLICT clause to implement UPSERTs with sqlite. + # This was added in 3.24.0 from 2018-06-04. + # See https://www.sqlite.org/lang_conflict.html + raise RuntimeError("SQLite version 3.24.0 or later is required") + # We keep a small cache in memory to avoid having to serialize/deserialize # data from the database too often. We use an OrderedDict to build # a poor-man's LRU cache. diff --git a/metadata-ingestion/src/datahub/utilities/is_pytest.py b/metadata-ingestion/src/datahub/utilities/is_pytest.py index 68bb1b285a50e9..572b4bf5356220 100644 --- a/metadata-ingestion/src/datahub/utilities/is_pytest.py +++ b/metadata-ingestion/src/datahub/utilities/is_pytest.py @@ -1,5 +1,6 @@ +import os import sys def is_pytest_running() -> bool: - return "pytest" in sys.modules + return "pytest" in sys.modules and os.environ.get("DATAHUB_TEST_MODE") == "1" diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 4ea42d568da635..17023c7b388e76 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -43,7 +43,6 @@ def _make_owner_category_list( owner_category_urn: Optional[str], owner_ids: List[str], ) -> List[Dict]: - return [ { "urn": mce_builder.make_owner_urn(owner_id, owner_type), @@ -285,7 +284,6 @@ def convert_to_aspects(self, operation_map: Dict[str, list]) -> Dict[str, Any]: aspect_map[Constants.ADD_TAG_OPERATION] = tag_aspect if Constants.ADD_OWNER_OPERATION in operation_map: - owner_aspect = OwnershipClass( owners=[ OwnerClass( diff --git a/metadata-ingestion/src/datahub/utilities/progress_timer.py b/metadata-ingestion/src/datahub/utilities/progress_timer.py new file mode 100644 index 00000000000000..eac62cddb55f2c --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/progress_timer.py @@ -0,0 +1,34 @@ +from datetime import datetime, timedelta, timezone + + +class ProgressTimer: + def __init__(self, report_every: timedelta, report_0: bool = False): + """A helper for reporting progress at a given time interval. + + Should be used for long-running processes that iterate over a large number of items, + but each iteration is fast. + + Args: + report_every: The time interval between progress reports. + report_0: Whether to report progress on the first iteration. + """ + + self._report_every = report_every + + if report_0: + # Use the earliest possible time to force reporting on the first iteration. + self._last_report_time = datetime.min.replace(tzinfo=timezone.utc) + else: + self._last_report_time = self._now() + + def _now(self) -> datetime: + return datetime.now(timezone.utc) + + def should_report(self) -> bool: + current_time = self._now() + + should_report = (self._last_report_time + self._report_every) <= current_time + if should_report: + self._last_report_time = current_time + + return should_report diff --git a/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py b/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py index 4d328ad31c6c4a..ab8987a7d2e8b2 100644 --- a/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py +++ b/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py @@ -19,7 +19,6 @@ def process( args_list: Iterable[Tuple[Any, ...]], max_workers: int, ) -> Generator[T, None, None]: - out_q: queue.Queue[T] = queue.Queue() def _worker_wrapper( diff --git a/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py b/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py index 37c10769259459..577f90215a6353 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import CorpGroupUrn # noqa: F401 +from datahub.metadata.urns import CorpGroupUrn + +__all__ = ["CorpGroupUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py b/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py index 5f9ecf65951b95..8acb86be00f6c8 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import CorpUserUrn as CorpuserUrn # noqa: F401 +from datahub.metadata.urns import CorpUserUrn as CorpuserUrn + +__all__ = ["CorpuserUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py index 5b2b45927c339e..3508ae5c4a3490 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataFlowUrn # noqa: F401 +from datahub.metadata.urns import DataFlowUrn + +__all__ = ["DataFlowUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py index 53e3419ee7ecb2..d003b6c6ad7a88 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataJobUrn # noqa: F401 +from datahub.metadata.urns import DataJobUrn + +__all__ = ["DataJobUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py index 9d37e38f256e7f..51e013e715d4fd 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataPlatformUrn # noqa: F401 +from datahub.metadata.urns import DataPlatformUrn + +__all__ = ["DataPlatformUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py index df6ba797d069c1..22e6b36c5f7aec 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataProcessInstanceUrn # noqa: F401 +from datahub.metadata.urns import DataProcessInstanceUrn + +__all__ = ["DataProcessInstanceUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py b/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py index 6078ffefc03d85..1652e170599958 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DatasetUrn # noqa: F401 +from datahub.metadata.urns import DatasetUrn + +__all__ = ["DatasetUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py b/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py index 442a6b27729bba..242a3d8228320d 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DomainUrn # noqa: F401 +from datahub.metadata.urns import DomainUrn + +__all__ = ["DomainUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py b/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py index 60a4f5396aa468..f9b861d7f08524 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import NotebookUrn # noqa: F401 +from datahub.metadata.urns import NotebookUrn + +__all__ = ["NotebookUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py b/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py index 5bd36a0656d99e..6774978c7a76d9 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py @@ -1,4 +1,6 @@ -from datahub.metadata.urns import StructuredPropertyUrn # noqa: F401 +from datahub.metadata.urns import StructuredPropertyUrn + +__all__ = ["StructuredPropertyUrn", "make_structured_property_urn"] def make_structured_property_urn(structured_property_id: str) -> str: diff --git a/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py b/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py index 0ac632ee40a015..f66d56a745a961 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import TagUrn # noqa: F401 +from datahub.metadata.urns import TagUrn + +__all__ = ["TagUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn.py b/metadata-ingestion/src/datahub/utilities/urns/urn.py index 2e5cebfd0e8f55..2ded2d4d9b32c0 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn.py @@ -1,4 +1,6 @@ -from datahub.metadata.urns import Urn # noqa: F401 +from datahub.metadata.urns import Urn + +__all__ = ["Urn", "guess_entity_type"] def guess_entity_type(urn: str) -> str: diff --git a/metadata-ingestion/tests/conftest.py b/metadata-ingestion/tests/conftest.py index db025e7f806c06..4685faabfcb285 100644 --- a/metadata-ingestion/tests/conftest.py +++ b/metadata-ingestion/tests/conftest.py @@ -7,6 +7,7 @@ import pytest os.environ["DATAHUB_SUPPRESS_LOGGING_MANAGER"] = "1" +os.environ["DATAHUB_TEST_MODE"] = "1" # Enable debug logging. logging.getLogger().setLevel(logging.DEBUG) diff --git a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py index 7005bc2e4411bf..024bb62bbe9ce9 100644 --- a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py +++ b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py @@ -68,7 +68,6 @@ def run_ingest( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint: - mock_checkpoint.return_value = mock_datahub_graph mocked_functions_reference( diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py index ef846f698f156e..806779475dea9d 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py @@ -47,7 +47,6 @@ def _generate_queries_cached_file(tmp_path: Path, queries_json_path: Path) -> No @patch("google.cloud.bigquery.Client") @patch("google.cloud.resourcemanager_v3.ProjectsClient") def test_queries_ingestion(project_client, client, pytestconfig, monkeypatch, tmp_path): - test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2" mcp_golden_path = f"{test_resources_dir}/bigquery_queries_mcps_golden.json" mcp_output_path = tmp_path / "bigquery_queries_mcps.json" diff --git a/metadata-ingestion/tests/integration/dremio/test_dremio.py b/metadata-ingestion/tests/integration/dremio/test_dremio.py index eb999367962817..cc3a7e19bc93e2 100644 --- a/metadata-ingestion/tests/integration/dremio/test_dremio.py +++ b/metadata-ingestion/tests/integration/dremio/test_dremio.py @@ -192,7 +192,6 @@ def create_mysql_source(headers): def upload_dataset(headers): - url = f"{DREMIO_HOST}/apiv2/source/s3/file_format/warehouse/sample.parquet" payload = {"ignoreOtherFileFormats": False, "type": "Parquet"} diff --git a/metadata-ingestion/tests/integration/kafka/kafka_to_file_oauth.yml b/metadata-ingestion/tests/integration/kafka/kafka_to_file_oauth.yml new file mode 100644 index 00000000000000..34cf8fd47e658c --- /dev/null +++ b/metadata-ingestion/tests/integration/kafka/kafka_to_file_oauth.yml @@ -0,0 +1,20 @@ +run_id: kafka-test + +source: + type: kafka + config: + connection: + bootstrap: "localhost:29092" + schema_registry_url: "http://localhost:28081" + consumer_config: + security.protocol: "SASL_PLAINTEXT" + sasl.mechanism: "OAUTHBEARER" + oauth_cb: "oauth:create_token" + domain: + "urn:li:domain:sales": + allow: + - "key_value_topic" +sink: + type: file + config: + filename: "./kafka_mces.json" diff --git a/metadata-ingestion/tests/integration/kafka/oauth.py b/metadata-ingestion/tests/integration/kafka/oauth.py new file mode 100644 index 00000000000000..28cfee521d6c0f --- /dev/null +++ b/metadata-ingestion/tests/integration/kafka/oauth.py @@ -0,0 +1,14 @@ +import logging +from typing import Any, Tuple + +logger = logging.getLogger(__name__) + +MESSAGE: str = "OAuth token `create_token` callback" + + +def create_token(*args: Any, **kwargs: Any) -> Tuple[str, int]: + logger.warning(MESSAGE) + return ( + "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJjbGllbnRfaWQiOiJrYWZrYV9jbGllbnQiLCJleHAiOjE2OTg3NjYwMDB9.dummy_sig_abcdef123456", + 3600, + ) diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py index dfdbea5de5cbfd..597889c8440b7a 100644 --- a/metadata-ingestion/tests/integration/kafka/test_kafka.py +++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py @@ -1,10 +1,14 @@ +import logging import subprocess import pytest +import yaml from freezegun import freeze_time from datahub.ingestion.api.source import SourceCapability -from datahub.ingestion.source.kafka import KafkaSource +from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.kafka.kafka import KafkaSource +from tests.integration.kafka import oauth # type: ignore from tests.test_helpers import mce_helpers, test_connection_helpers from tests.test_helpers.click_helpers import run_datahub_cmd from tests.test_helpers.docker_helpers import wait_for_port @@ -99,3 +103,36 @@ def test_kafka_test_connection(mock_kafka_service, config_dict, is_success): SourceCapability.SCHEMA_METADATA: "Failed to establish a new connection" }, ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_kafka_oauth_callback( + mock_kafka_service, test_resources_dir, pytestconfig, tmp_path, mock_time +): + # Run the metadata ingestion pipeline. + config_file = (test_resources_dir / "kafka_to_file_oauth.yml").resolve() + + log_file = tmp_path / "kafka_oauth_message.log" + + file_handler = logging.FileHandler( + str(log_file) + ) # Add a file handler to later validate a test-case + logging.getLogger().addHandler(file_handler) + + recipe: dict = {} + with open(config_file) as fp: + recipe = yaml.safe_load(fp) + + pipeline = Pipeline.create(recipe) + + pipeline.run() + + is_found: bool = False + with open(log_file, "r") as file: + for line_number, line in enumerate(file, 1): + if oauth.MESSAGE in line: + is_found = True + break + + assert is_found diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 7238a49cb37d2f..8bbf14709ff9fb 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -1047,7 +1047,6 @@ def test_independent_soft_deleted_looks( mocked_client = mock.MagicMock() with mock.patch("looker_sdk.init40") as mock_sdk: - mock_sdk.return_value = mocked_client setup_mock_look(mocked_client) setup_mock_soft_deleted_look(mocked_client) diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 94b3b103d0548c..ab55321a4d7342 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -12,16 +12,14 @@ from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.file import read_metadata_file +from datahub.ingestion.source.looker.looker_dataclasses import LookerModel from datahub.ingestion.source.looker.looker_template_language import ( SpecialVariable, load_and_preprocess_file, resolve_liquid_variable, ) -from datahub.ingestion.source.looker.lookml_source import ( - LookerModel, - LookerRefinementResolver, - LookMLSourceConfig, -) +from datahub.ingestion.source.looker.lookml_config import LookMLSourceConfig +from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver from datahub.metadata.schema_classes import ( DatasetSnapshotClass, MetadataChangeEventClass, @@ -833,7 +831,6 @@ def test_manifest_parser(pytestconfig: pytest.Config) -> None: @freeze_time(FROZEN_TIME) def test_duplicate_field_ingest(pytestconfig, tmp_path, mock_time): - test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" mce_out_file = "duplicate_ingest_mces_output.json" diff --git a/metadata-ingestion/tests/integration/okta/test_okta.py b/metadata-ingestion/tests/integration/okta/test_okta.py index 63ef8793cadddc..10148273c93666 100644 --- a/metadata-ingestion/tests/integration/okta/test_okta.py +++ b/metadata-ingestion/tests/integration/okta/test_okta.py @@ -58,14 +58,12 @@ def run_ingest( mocked_functions_reference, recipe, ): - with patch( "datahub.ingestion.source.identity.okta.OktaClient" ) as MockClient, patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint: - mock_checkpoint.return_value = mock_datahub_graph mocked_functions_reference(MockClient=MockClient) @@ -277,7 +275,6 @@ def overwrite_group_in_mocked_data(test_resources_dir, MockClient): def _init_mock_okta_client( test_resources_dir, MockClient, mock_users_json=None, mock_groups_json=None ): - okta_users_json_file = ( test_resources_dir / "okta_users.json" if mock_users_json is None diff --git a/metadata-ingestion/tests/integration/oracle/common.py b/metadata-ingestion/tests/integration/oracle/common.py index 79dbda8c30f896..9e2cc42ef10256 100644 --- a/metadata-ingestion/tests/integration/oracle/common.py +++ b/metadata-ingestion/tests/integration/oracle/common.py @@ -33,7 +33,6 @@ def scalar(self): @dataclass class MockConstraints: - constraint_name: str = "mock constraint name" constraint_type: str = "P" local_column: str = "mock column name" diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json b/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json index 8e06ba9a80906b..5881ad5de0bcb8 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json @@ -1024,6 +1024,32 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD)", "type": "TRANSFORMED" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD),client_director)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD),month_wid)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV),cd_agent_key)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD),client_manager_closing_month)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD),month_wid)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV),agent_key)" + ], + "confidenceScore": 1.0 + } ] } }, diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index f4613c524316e3..f22998b47b9008 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -712,7 +712,6 @@ def test_redshift_regular_case(): def test_redshift_native_query(): - table: powerbi_data_classes.Table = powerbi_data_classes.Table( expression=M_QUERIES[22], name="category", @@ -1101,7 +1100,6 @@ def test_double_quotes_in_alias(): @patch("datahub.ingestion.source.powerbi.m_query.parser.get_lark_parser") def test_m_query_timeout(mock_get_lark_parser): - q = 'let\n Source = Value.NativeQuery(Snowflake.Databases("0DD93C6BD5A6.snowflakecomputing.com","sales_analytics_warehouse_prod",[Role="sales_analytics_member_ad"]){[Name="SL_OPERATIONS"]}[Data], "select SALE_NO AS ""\x1b[4mSaleNo\x1b[0m""#(lf) ,CODE AS ""Code""#(lf) ,ENDDATE AS ""end_date""#(lf) from SL_OPERATIONS.SALE.REPORTS#(lf) where ENDDATE > \'2024-02-03\'", null, [EnableFolding=true]),\n #"selected Row" = Table.SelectRows(Source)\nin\n #"selected Row"' table: powerbi_data_classes.Table = powerbi_data_classes.Table( diff --git a/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py b/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py index 53e184515c1d8c..887f7fe4d6f44a 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py @@ -19,3 +19,13 @@ def test_simple_from(): assert len(tables) == 1 assert tables[0] == "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + +def test_drop_statement(): + expected: str = "SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + query: str = "DROP TABLE IF EXISTS #table1; DROP TABLE IF EXISTS #table1,#table2; DROP TABLE IF EXISTS table1; DROP TABLE IF EXISTS table1, #table2;SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + actual: str = native_sql_parser.remove_drop_statement(query) + + assert actual == expected diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 0f360d44c38cbe..edde11ff87d293 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -96,7 +96,6 @@ def read_mock_data(path: Union[Path, str]) -> dict: def register_mock_api( pytestconfig: pytest.Config, request_mock: Any, override_data: Optional[dict] = None ) -> None: - default_mock_data_path = ( pytestconfig.rootpath / "tests/integration/powerbi/mock_data/default_mock_response.json" @@ -467,7 +466,6 @@ def test_scan_all_workspaces( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -517,7 +515,6 @@ def test_extract_reports( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -1219,7 +1216,6 @@ def test_independent_datasets_extraction( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -1323,7 +1319,6 @@ def test_cll_extraction( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -1380,7 +1375,6 @@ def test_cll_extraction_flags( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - register_mock_api( pytestconfig=pytestconfig, request_mock=requests_mock, @@ -1392,7 +1386,6 @@ def test_cll_extraction_flags( ) with pytest.raises(Exception, match=pattern): - Pipeline.create( { "run_id": "powerbi-test", @@ -1559,7 +1552,6 @@ def test_powerbi_app_ingest( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - common_app_ingest( pytestconfig=pytestconfig, requests_mock=requests_mock, @@ -1590,7 +1582,6 @@ def test_powerbi_app_ingest_info_message( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - pipeline = common_app_ingest( pytestconfig=pytestconfig, requests_mock=requests_mock, diff --git a/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py b/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py index ee1aafb6cf32dc..95f096cc3def35 100644 --- a/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py +++ b/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py @@ -1011,7 +1011,6 @@ def default_config(): def test_qlik_sense_ingest( pytestconfig, tmp_path, requests_mock, mock_websocket_send_request ): - test_resources_dir = pytestconfig.rootpath / "tests/integration/qlik_sense" register_mock_api(request_mock=requests_mock) @@ -1051,7 +1050,6 @@ def test_qlik_sense_ingest( def test_platform_instance_ingest( pytestconfig, tmp_path, requests_mock, mock_websocket_send_request ): - test_resources_dir = pytestconfig.rootpath / "tests/integration/qlik_sense" register_mock_api(request_mock=requests_mock) diff --git a/metadata-ingestion/tests/integration/sigma/test_sigma.py b/metadata-ingestion/tests/integration/sigma/test_sigma.py index 6c01bf6dc80fe7..19fa1448fee598 100644 --- a/metadata-ingestion/tests/integration/sigma/test_sigma.py +++ b/metadata-ingestion/tests/integration/sigma/test_sigma.py @@ -420,7 +420,6 @@ def register_mock_api(request_mock: Any, override_data: dict = {}) -> None: @pytest.mark.integration def test_sigma_ingest(pytestconfig, tmp_path, requests_mock): - test_resources_dir = pytestconfig.rootpath / "tests/integration/sigma" register_mock_api(request_mock=requests_mock) @@ -464,7 +463,6 @@ def test_sigma_ingest(pytestconfig, tmp_path, requests_mock): @pytest.mark.integration def test_platform_instance_ingest(pytestconfig, tmp_path, requests_mock): - test_resources_dir = pytestconfig.rootpath / "tests/integration/sigma" register_mock_api(request_mock=requests_mock) @@ -510,7 +508,6 @@ def test_platform_instance_ingest(pytestconfig, tmp_path, requests_mock): @pytest.mark.integration def test_sigma_ingest_shared_entities(pytestconfig, tmp_path, requests_mock): - test_resources_dir = pytestconfig.rootpath / "tests/integration/sigma" override_data = { diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 8f45be96625a45..9e4bb2f0eb634f 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -441,7 +441,6 @@ def default_query_results( # noqa: C901 include_column_lineage=True, ), ): - return [ { "DOWNSTREAM_TABLE_NAME": f"TEST_DB.TEST_SCHEMA.TABLE_{op_idx}", diff --git a/metadata-ingestion/tests/integration/sql_server/docker-compose.yml b/metadata-ingestion/tests/integration/sql_server/docker-compose.yml index 1046321e4f7205..aed70503903c03 100644 --- a/metadata-ingestion/tests/integration/sql_server/docker-compose.yml +++ b/metadata-ingestion/tests/integration/sql_server/docker-compose.yml @@ -1,7 +1,7 @@ version: "3" services: testsqlserver: - image: "mcr.microsoft.com/mssql/server:latest" + image: "mcr.microsoft.com/mssql/server:2022-latest" platform: linux/amd64 container_name: "testsqlserver" environment: diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 558548bfc7a698..54821347fd28b8 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ba144ff3-f6f8-4a61-a8a5-5cf1ed172738", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-09-16 15:59:53.077000", - "date_modified": "2024-09-16 15:59:53.217000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -154,6 +154,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -231,10 +247,15 @@ "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -245,17 +266,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -341,10 +357,15 @@ "entityType": "container", "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -355,17 +376,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -451,10 +467,15 @@ "entityType": "container", "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -465,17 +486,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -561,10 +577,15 @@ "entityType": "container", "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -575,17 +596,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -671,10 +687,15 @@ "entityType": "container", "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -685,17 +706,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -781,10 +797,15 @@ "entityType": "container", "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -795,17 +816,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -891,10 +907,15 @@ "entityType": "container", "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -905,17 +926,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1001,10 +1017,15 @@ "entityType": "container", "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -1015,17 +1036,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1111,10 +1127,15 @@ "entityType": "container", "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -1125,17 +1146,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1217,22 +1233,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", @@ -1386,6 +1386,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", @@ -1459,22 +1475,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", @@ -1498,7 +1498,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -1515,7 +1515,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1525,14 +1525,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": {}, - "name": "Items", - "description": "Description for table Items of schema Foo.", + "name": "age_dist", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "DemoData.Foo.Items", + "schemaName": "DemoData.Foo.age_dist", "platform": "urn:li:dataPlatform:mssql", "version": 0, "created": { @@ -1551,7 +1550,7 @@ }, "fields": [ { - "fieldPath": "ID", + "fieldPath": "Age", "nullable": true, "type": { "type": { @@ -1563,14 +1562,147 @@ "isPartOfKey": false }, { - "fieldPath": "ItemName", + "fieldPath": "Count", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "Items", + "description": "Description for table Items of schema Foo.", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "DemoData.Foo.Items", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ItemName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", "recursive": false, "isPartOfKey": false } @@ -1942,6 +2074,183 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" + }, + "name": "PersonsView", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "DemoData.Foo.PersonsView", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "LastName", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "FirstName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", @@ -1973,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-09-16 15:59:53.010000", - "date_modified": "2024-09-16 15:59:53.010000" + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -1991,14 +2300,40 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "dataJobInfo", "aspect": { "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] + "customProperties": { + "procedure_depends_on": "{'DemoData.Foo.age_dist': 'USER_TABLE', 'DemoData.Foo.Items': 'USER_TABLE', 'DemoData.Foo.Persons': 'USER_TABLE', 'DemoData.Foo.SalesReason': 'USER_TABLE'}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", + "input parameters": "[]", + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" + }, + "externalUrl": "", + "name": "DemoData.Foo.NewProc", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -2084,10 +2419,15 @@ "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -2098,17 +2438,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -2194,10 +2529,15 @@ "entityType": "container", "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -2208,17 +2548,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -2300,22 +2635,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", @@ -2425,6 +2744,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", @@ -2502,10 +2837,15 @@ "entityType": "container", "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -2516,17 +2856,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -2612,10 +2947,15 @@ "entityType": "container", "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -2626,17 +2966,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -2722,10 +3057,15 @@ "entityType": "container", "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -2736,17 +3076,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -2832,10 +3167,15 @@ "entityType": "container", "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -2846,17 +3186,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -2942,10 +3277,15 @@ "entityType": "container", "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -2956,17 +3296,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -3052,10 +3387,15 @@ "entityType": "container", "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -3066,17 +3406,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -3162,10 +3497,15 @@ "entityType": "container", "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -3176,17 +3516,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -3272,10 +3607,15 @@ "entityType": "container", "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -3286,17 +3626,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -3382,10 +3717,15 @@ "entityType": "container", "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -3396,17 +3736,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -3488,22 +3823,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", @@ -3669,6 +3988,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", @@ -3725,14 +4060,154 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.ItemsNew,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.ItemsNew,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "ItemsNew", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "NewData.FooNew.ItemsNew", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ItemName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Price", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "SMALLMONEY", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.ItemsNew,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Schema" + "Table" ] } }, @@ -3743,24 +4218,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.ItemsNew,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -3769,6 +4228,10 @@ { "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + }, + { + "id": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "urn": "urn:li:container:f721da08adde46586c0f113287cb60d1" } ] } @@ -3781,7 +4244,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.ItemsNew,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.PersonsNew,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -3798,7 +4261,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.ItemsNew,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.PersonsNew,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -3808,13 +4271,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": {}, - "name": "ItemsNew", + "name": "PersonsNew", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "NewData.FooNew.ItemsNew", + "schemaName": "NewData.FooNew.PersonsNew", "platform": "urn:li:dataPlatform:mssql", "version": 0, "created": { @@ -3834,7 +4297,7 @@ "fields": [ { "fieldPath": "ID", - "nullable": true, + "nullable": false, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -3842,29 +4305,41 @@ }, "nativeDataType": "INTEGER", "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "LastName", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, "isPartOfKey": false }, { - "fieldPath": "ItemName", + "fieldPath": "FirstName", "nullable": true, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "Price", + "fieldPath": "Age", "nullable": true, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "SMALLMONEY", + "nativeDataType": "INTEGER", "recursive": false, "isPartOfKey": false } @@ -3882,7 +4357,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.ItemsNew,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.PersonsNew,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -3900,7 +4375,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.ItemsNew,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.PersonsNew,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -3925,7 +4400,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.PersonsNew,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -3942,7 +4417,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.PersonsNew,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -3951,14 +4426,17 @@ }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": {}, - "name": "PersonsNew", + "customProperties": { + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" + }, + "name": "View1", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "NewData.FooNew.PersonsNew", + "schemaName": "NewData.FooNew.View1", "platform": "urn:li:dataPlatform:mssql", "version": 0, "created": { @@ -3976,18 +4454,6 @@ } }, "fields": [ - { - "fieldPath": "ID", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER", - "recursive": false, - "isPartOfKey": true - }, { "fieldPath": "LastName", "nullable": false, @@ -4011,18 +4477,6 @@ "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", "recursive": false, "isPartOfKey": false - }, - { - "fieldPath": "Age", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER", - "recursive": false, - "isPartOfKey": false } ] } @@ -4038,13 +4492,13 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.PersonsNew,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Table" + "View" ] } }, @@ -4056,7 +4510,25 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.PersonsNew,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -4079,6 +4551,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", @@ -4156,10 +4644,15 @@ "entityType": "container", "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -4170,17 +4663,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -4266,10 +4754,15 @@ "entityType": "container", "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -4280,17 +4773,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -4376,10 +4864,15 @@ "entityType": "container", "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -4389,16 +4882,69 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "upstreamLineage", "aspect": { "json": { - "path": [ + "upstreams": [ { - "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", - "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "type": "VIEW" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),firstname)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),firstname)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),lastname)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),lastname)" + ], + "confidenceScore": 1.0 } ] } @@ -4441,6 +4987,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -4472,5 +5034,37 @@ "runId": "mssql-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index 161d40ea91d91a..3836e587ef8cf4 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ba144ff3-f6f8-4a61-a8a5-5cf1ed172738", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-09-16 15:59:53.077000", - "date_modified": "2024-09-16 15:59:53.217000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -154,6 +154,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -231,10 +247,15 @@ "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -245,17 +266,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -341,10 +357,15 @@ "entityType": "container", "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -355,17 +376,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -451,10 +467,15 @@ "entityType": "container", "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -465,17 +486,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -561,10 +577,15 @@ "entityType": "container", "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -575,17 +596,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -671,10 +687,15 @@ "entityType": "container", "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -685,17 +706,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -781,10 +797,15 @@ "entityType": "container", "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -795,17 +816,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -891,10 +907,15 @@ "entityType": "container", "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -905,17 +926,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1001,10 +1017,15 @@ "entityType": "container", "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -1015,17 +1036,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1111,10 +1127,15 @@ "entityType": "container", "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -1125,17 +1146,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1217,22 +1233,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", @@ -1386,6 +1386,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", @@ -1459,22 +1475,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", @@ -1498,7 +1498,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -1515,7 +1515,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1525,14 +1525,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": {}, - "name": "Items", - "description": "Description for table Items of schema Foo.", + "name": "age_dist", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "DemoData.Foo.Items", + "schemaName": "DemoData.Foo.age_dist", "platform": "urn:li:dataPlatform:mssql", "version": 0, "created": { @@ -1551,7 +1550,7 @@ }, "fields": [ { - "fieldPath": "ID", + "fieldPath": "Age", "nullable": true, "type": { "type": { @@ -1563,14 +1562,147 @@ "isPartOfKey": false }, { - "fieldPath": "ItemName", + "fieldPath": "Count", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "Items", + "description": "Description for table Items of schema Foo.", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "DemoData.Foo.Items", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ItemName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", "recursive": false, "isPartOfKey": false } @@ -1942,6 +2074,183 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" + }, + "name": "PersonsView", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "DemoData.Foo.PersonsView", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "LastName", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "FirstName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", @@ -1973,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-09-16 15:59:53.010000", - "date_modified": "2024-09-16 15:59:53.010000" + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -1991,14 +2300,40 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "dataJobInfo", "aspect": { "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] + "customProperties": { + "procedure_depends_on": "{'DemoData.Foo.age_dist': 'USER_TABLE', 'DemoData.Foo.Items': 'USER_TABLE', 'DemoData.Foo.Persons': 'USER_TABLE', 'DemoData.Foo.SalesReason': 'USER_TABLE'}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", + "input parameters": "[]", + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" + }, + "externalUrl": "", + "name": "DemoData.Foo.NewProc", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -2084,10 +2419,15 @@ "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -2098,17 +2438,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -2194,10 +2529,15 @@ "entityType": "container", "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -2208,17 +2548,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -2304,10 +2639,15 @@ "entityType": "container", "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -2317,16 +2657,20 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "upstreamLineage", "aspect": { "json": { - "path": [ + "upstreams": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "type": "VIEW" } ] } @@ -2369,6 +2713,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2400,5 +2760,21 @@ "runId": "mssql-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index 161d40ea91d91a..3836e587ef8cf4 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ba144ff3-f6f8-4a61-a8a5-5cf1ed172738", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-09-16 15:59:53.077000", - "date_modified": "2024-09-16 15:59:53.217000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -154,6 +154,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -231,10 +247,15 @@ "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -245,17 +266,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -341,10 +357,15 @@ "entityType": "container", "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -355,17 +376,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -451,10 +467,15 @@ "entityType": "container", "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -465,17 +486,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -561,10 +577,15 @@ "entityType": "container", "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -575,17 +596,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -671,10 +687,15 @@ "entityType": "container", "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -685,17 +706,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -781,10 +797,15 @@ "entityType": "container", "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -795,17 +816,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -891,10 +907,15 @@ "entityType": "container", "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -905,17 +926,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1001,10 +1017,15 @@ "entityType": "container", "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -1015,17 +1036,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1111,10 +1127,15 @@ "entityType": "container", "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -1125,17 +1146,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1217,22 +1233,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", @@ -1386,6 +1386,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", @@ -1459,22 +1475,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", @@ -1498,7 +1498,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -1515,7 +1515,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1525,14 +1525,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": {}, - "name": "Items", - "description": "Description for table Items of schema Foo.", + "name": "age_dist", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "DemoData.Foo.Items", + "schemaName": "DemoData.Foo.age_dist", "platform": "urn:li:dataPlatform:mssql", "version": 0, "created": { @@ -1551,7 +1550,7 @@ }, "fields": [ { - "fieldPath": "ID", + "fieldPath": "Age", "nullable": true, "type": { "type": { @@ -1563,14 +1562,147 @@ "isPartOfKey": false }, { - "fieldPath": "ItemName", + "fieldPath": "Count", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "Items", + "description": "Description for table Items of schema Foo.", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "DemoData.Foo.Items", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ItemName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", "recursive": false, "isPartOfKey": false } @@ -1942,6 +2074,183 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" + }, + "name": "PersonsView", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "DemoData.Foo.PersonsView", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "LastName", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "FirstName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", @@ -1973,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-09-16 15:59:53.010000", - "date_modified": "2024-09-16 15:59:53.010000" + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -1991,14 +2300,40 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "dataJobInfo", "aspect": { "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] + "customProperties": { + "procedure_depends_on": "{'DemoData.Foo.age_dist': 'USER_TABLE', 'DemoData.Foo.Items': 'USER_TABLE', 'DemoData.Foo.Persons': 'USER_TABLE', 'DemoData.Foo.SalesReason': 'USER_TABLE'}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", + "input parameters": "[]", + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" + }, + "externalUrl": "", + "name": "DemoData.Foo.NewProc", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -2084,10 +2419,15 @@ "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -2098,17 +2438,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -2194,10 +2529,15 @@ "entityType": "container", "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -2208,17 +2548,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -2304,10 +2639,15 @@ "entityType": "container", "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -2317,16 +2657,20 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "upstreamLineage", "aspect": { "json": { - "path": [ + "upstreams": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "type": "VIEW" } ] } @@ -2369,6 +2713,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2400,5 +2760,21 @@ "runId": "mssql-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index 29124f6fc156c4..ebcadcc11dcbfa 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ba144ff3-f6f8-4a61-a8a5-5cf1ed172738", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-09-16 15:59:53.077000", - "date_modified": "2024-09-16 15:59:53.217000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -154,6 +154,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -231,10 +247,15 @@ "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -245,17 +266,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -341,10 +357,15 @@ "entityType": "container", "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -355,17 +376,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -451,10 +467,15 @@ "entityType": "container", "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -465,17 +486,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -561,10 +577,15 @@ "entityType": "container", "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -575,17 +596,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -671,10 +687,15 @@ "entityType": "container", "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -685,17 +706,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -781,10 +797,15 @@ "entityType": "container", "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -795,17 +816,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -891,10 +907,15 @@ "entityType": "container", "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -905,17 +926,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1001,10 +1017,15 @@ "entityType": "container", "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -1015,17 +1036,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1111,10 +1127,15 @@ "entityType": "container", "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] } }, "systemMetadata": { @@ -1125,17 +1146,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" } }, "systemMetadata": { @@ -1217,22 +1233,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", @@ -1386,6 +1386,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", @@ -1459,22 +1475,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", @@ -1498,7 +1498,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -1515,7 +1515,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1525,14 +1525,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": {}, - "name": "Items", - "description": "Description for table Items of schema Foo.", + "name": "age_dist", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "demodata.foo.items", + "schemaName": "demodata.foo.age_dist", "platform": "urn:li:dataPlatform:mssql", "version": 0, "created": { @@ -1551,7 +1550,7 @@ }, "fields": [ { - "fieldPath": "ID", + "fieldPath": "Age", "nullable": true, "type": { "type": { @@ -1563,14 +1562,14 @@ "isPartOfKey": false }, { - "fieldPath": "ItemName", + "fieldPath": "Count", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "nativeDataType": "INTEGER", "recursive": false, "isPartOfKey": false } @@ -1586,10 +1585,2624 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "Items", + "description": "Description for table Items of schema Foo.", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "demodata.foo.items", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ItemName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "Persons", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "demodata.foo.persons", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "LastName", + "nullable": false, + "description": "Description for column LastName of table Persons of schema Foo.", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "FirstName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "SalesReason", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "demodata.foo.salesreason", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "TempID", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "SomeId", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "UNIQUEIDENTIFIER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(50) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + } + ], + "foreignKeys": [ + { + "name": "FK_TempSales_SalesReason", + "foreignFields": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),ID)" + ], + "sourceFields": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),TempID)" + ], + "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" + }, + "name": "PersonsView", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "demodata.foo.personsview", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "LastName", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "FirstName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "DemoData.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" + }, + "externalUrl": "", + "name": "DemoData.Foo.Proc.With.SpecialChar", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{'DemoData.Foo.age_dist': 'USER_TABLE', 'DemoData.Foo.Items': 'USER_TABLE', 'DemoData.Foo.Persons': 'USER_TABLE', 'DemoData.Foo.SalesReason': 'USER_TABLE'}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", + "input parameters": "[]", + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" + }, + "externalUrl": "", + "name": "DemoData.Foo.NewProc", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "DemoData", + "schema": "guest" + }, + "name": "guest", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "DemoData", + "schema": "INFORMATION_SCHEMA" + }, + "name": "INFORMATION_SCHEMA", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "DemoData", + "schema": "sys" + }, + "name": "sys", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData" + }, + "name": "NewData", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_accessadmin" + }, + "name": "db_accessadmin", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_backupoperator" + }, + "name": "db_backupoperator", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_datareader" + }, + "name": "db_datareader", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_datawriter" + }, + "name": "db_datawriter", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_ddladmin" + }, + "name": "db_ddladmin", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_denydatareader" + }, + "name": "db_denydatareader", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_denydatawriter" + }, + "name": "db_denydatawriter", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_owner" + }, + "name": "db_owner", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_securityadmin" + }, + "name": "db_securityadmin", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "dbo" + }, + "name": "dbo", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.dbo.productsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:269d0067d130eda0399a534fc787054c" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.dbo.productsnew,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "ProductsNew", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "newdata.dbo.productsnew", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ProductName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Price", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "MONEY", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.dbo.productsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.dbo.productsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + }, + { + "id": "urn:li:container:269d0067d130eda0399a534fc787054c", + "urn": "urn:li:container:269d0067d130eda0399a534fc787054c" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "FooNew" + }, + "name": "FooNew", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.itemsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.itemsnew,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "ItemsNew", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "newdata.foonew.itemsnew", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ItemName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Price", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "SMALLMONEY", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.itemsnew,PROD)", + "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { @@ -1606,19 +4219,19 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.itemsnew,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "urn": "urn:li:container:f721da08adde46586c0f113287cb60d1" } ] } @@ -1631,12 +4244,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:f721da08adde46586c0f113287cb60d1" } }, "systemMetadata": { @@ -1648,7 +4261,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1658,13 +4271,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": {}, - "name": "Persons", + "name": "PersonsNew", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "demodata.foo.persons", + "schemaName": "newdata.foonew.personsnew", "platform": "urn:li:dataPlatform:mssql", "version": 0, "created": { @@ -1697,7 +4310,6 @@ { "fieldPath": "LastName", "nullable": false, - "description": "Description for column LastName of table Persons of schema Foo.", "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -1745,7 +4357,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1763,19 +4375,19 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "urn": "urn:li:container:f721da08adde46586c0f113287cb60d1" } ] } @@ -1788,12 +4400,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:f721da08adde46586c0f113287cb60d1" } }, "systemMetadata": { @@ -1805,7 +4417,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1814,14 +4426,17 @@ }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": {}, - "name": "SalesReason", + "customProperties": { + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" + }, + "name": "View1", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "demodata.foo.salesreason", + "schemaName": "newdata.foonew.view1", "platform": "urn:li:dataPlatform:mssql", "version": 0, "created": { @@ -1840,53 +4455,29 @@ }, "fields": [ { - "fieldPath": "TempID", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER", - "recursive": false, - "isPartOfKey": true - }, - { - "fieldPath": "SomeId", + "fieldPath": "LastName", "nullable": false, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "UNIQUEIDENTIFIER", + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "Name", + "fieldPath": "FirstName", "nullable": true, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "NVARCHAR(50) COLLATE SQL_Latin1_General_CP1_CI_AS", + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", "recursive": false, "isPartOfKey": false } - ], - "foreignKeys": [ - { - "name": "FK_TempSales_SalesReason", - "foreignFields": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),ID)" - ], - "sourceFields": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),TempID)" - ], - "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" - } ] } } @@ -1901,13 +4492,13 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Table" + "View" ] } }, @@ -1919,39 +4510,14 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - }, - { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", "changeType": "UPSERT", - "aspectName": "dataFlowInfo", + "aspectName": "viewProperties", "aspect": { "json": { - "customProperties": {}, - "externalUrl": "", - "name": "DemoData.Foo.stored_procedures" + "materialized": false, + "viewLogic": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "viewLanguage": "SQL" } }, "systemMetadata": { @@ -1961,26 +4527,22 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", "changeType": "UPSERT", - "aspectName": "dataJobInfo", + "aspectName": "browsePathsV2", "aspect": { "json": { - "customProperties": { - "procedure_depends_on": "{}", - "depending_on_procedure": "{}", - "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", - "input parameters": "['@ID']", - "parameter @ID": "{'type': 'int'}", - "date_created": "2024-09-16 15:59:53.010000", - "date_modified": "2024-09-16 15:59:53.010000" - }, - "externalUrl": "", - "name": "DemoData.Foo.Proc.With.SpecialChar", - "type": { - "string": "MSSQL_STORED_PROCEDURE" - } + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + }, + { + "id": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "urn": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + ] } }, "systemMetadata": { @@ -1990,15 +4552,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityType": "container", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "container", "aspect": { "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -2009,7 +4569,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { @@ -2017,7 +4577,7 @@ "customProperties": { "platform": "mssql", "env": "PROD", - "database": "DemoData", + "database": "NewData", "schema": "guest" }, "name": "guest", @@ -2032,7 +4592,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2048,7 +4608,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -2064,7 +4624,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2082,12 +4642,17 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -2098,17 +4663,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -2119,7 +4679,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { @@ -2127,7 +4687,7 @@ "customProperties": { "platform": "mssql", "env": "PROD", - "database": "DemoData", + "database": "NewData", "schema": "INFORMATION_SCHEMA" }, "name": "INFORMATION_SCHEMA", @@ -2142,7 +4702,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2158,7 +4718,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -2174,7 +4734,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2192,12 +4752,17 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -2208,17 +4773,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" - } - ] + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" } }, "systemMetadata": { @@ -2229,7 +4789,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { @@ -2237,7 +4797,7 @@ "customProperties": { "platform": "mssql", "env": "PROD", - "database": "DemoData", + "database": "NewData", "schema": "sys" }, "name": "sys", @@ -2252,7 +4812,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2268,7 +4828,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -2284,7 +4844,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2302,12 +4862,17 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] } }, "systemMetadata": { @@ -2317,16 +4882,186 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "upstreamLineage", "aspect": { "json": { - "path": [ + "upstreams": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),Age)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),FirstName)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),FirstName)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),ID)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),ID)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),LastName)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),LastName)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),FirstName)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),FirstName)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),LastName)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),LastName)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD),Age)" + ], + "confidenceScore": 0.9 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),TempID)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),tempid)" + ], + "confidenceScore": 0.9 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),Name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),name)" + ], + "confidenceScore": 0.9 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),Age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" + ], + "confidenceScore": 0.35 } ] } @@ -2369,6 +5104,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/DemoData.Foo.NewProc.json b/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/DemoData.Foo.NewProc.json new file mode 100644 index 00000000000000..609e3a6f429452 --- /dev/null +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/DemoData.Foo.NewProc.json @@ -0,0 +1,57 @@ +[ +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD),age)" + ], + "confidenceScore": 0.2 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),tempid)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),tempid)" + ], + "confidenceScore": 0.2 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),name)" + ], + "confidenceScore": 0.2 + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/demodata.foo.proc2.json b/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/demodata.foo.proc2.json new file mode 100644 index 00000000000000..8ebd1c065ebf94 --- /dev/null +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/demodata.foo.proc2.json @@ -0,0 +1,57 @@ +[ +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,demodata.foo.stored_procedures,PROD),proc2)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD),age)" + ], + "confidenceScore": 0.2 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),tempid)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),tempid)" + ], + "confidenceScore": 0.2 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),name)" + ], + "confidenceScore": 0.2 + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/procedures/DemoData.Foo.NewProc.sql b/metadata-ingestion/tests/integration/sql_server/procedures/DemoData.Foo.NewProc.sql new file mode 100644 index 00000000000000..52a8d1327653b2 --- /dev/null +++ b/metadata-ingestion/tests/integration/sql_server/procedures/DemoData.Foo.NewProc.sql @@ -0,0 +1,37 @@ +CREATE PROCEDURE [Foo].[NewProc] + AS + BEGIN + --insert into items table from salesreason table + insert into Foo.Items (ID, ItemName) + SELECT TempID, Name + FROM Foo.SalesReason; + + + IF OBJECT_ID('Foo.age_dist', 'U') IS NULL + + BEGIN + -- Create and populate if table doesn't exist + SELECT Age, COUNT(*) as Count + INTO Foo.age_dist + FROM Foo.Persons + GROUP BY Age + END + ELSE + BEGIN + -- Update existing table + TRUNCATE TABLE Foo.age_dist; + + INSERT INTO Foo.age_dist (Age, Count) + SELECT Age, COUNT(*) as Count + FROM Foo.Persons + GROUP BY Age + END + + SELECT * INTO #TempTable FROM NewData.FooNew.PersonsNew + + UPDATE DemoData.Foo.Persons + SET Age = t.Age + FROM DemoData.Foo.Persons p + JOIN #TempTable t ON p.ID = t.ID + + END \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/procedures/demodata.foo.proc2.sql b/metadata-ingestion/tests/integration/sql_server/procedures/demodata.foo.proc2.sql new file mode 100644 index 00000000000000..69194a8d2c5464 --- /dev/null +++ b/metadata-ingestion/tests/integration/sql_server/procedures/demodata.foo.proc2.sql @@ -0,0 +1,36 @@ +CREATE PROCEDURE [foo].[proc2] + AS + BEGIN + --insert into items table from salesreason table + insert into foo.items (id, itemame) + SELECT tempid, name + FROM foo.salesreason; + + + IF OBJECT_ID('foo.age_dist', 'U') IS NULL + + BEGIN + -- Create and populate if table doesn't exist + SELECT age, COUNT(*) as count + INTO foo.age_dist + FROM foo.persons + GROUP BY age + END + ELSE + BEGIN + -- Update existing table + TRUNCATE TABLE foo.age_dist; + + INSERT INTO foo.age_dist (age, count) + SELECT age, COUNT(*) as count + FROM foo.persons + GROUP BY age + END + + SELECT * INTO #temptable FROM newdata.foonew.personsnew + + UPDATE demodata.foo.persons + SET age = t.age + FROM demodata.foo.persons p + JOIN #temptable t ON p.ID = t.ID + END \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql index 77ecabc5a3fffc..0c3c7ee2fd29e3 100644 --- a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql +++ b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql @@ -1,3 +1,4 @@ +DROP DATABASE IF EXISTS NewData; CREATE DATABASE NewData; GO USE NewData; @@ -14,7 +15,14 @@ CREATE TABLE FooNew.PersonsNew ( FirstName varchar(255), Age int ); +GO +CREATE VIEW FooNew.View1 AS +SELECT LastName, FirstName +FROM FooNew.PersonsNew +WHERE Age > 18 +GO +DROP DATABASE IF EXISTS DemoData; CREATE DATABASE DemoData; GO USE DemoData; @@ -32,6 +40,8 @@ CREATE TABLE Foo.Persons ( Age int ); GO +CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons; +GO CREATE TABLE Foo.SalesReason ( TempID int NOT NULL, @@ -45,11 +55,54 @@ CREATE TABLE Foo.SalesReason ) ; GO +DROP PROCEDURE IF EXISTS [Foo].[Proc.With.SpecialChar]; +GO CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT AS SELECT @ID AS ThatDB; GO +DROP PROCEDURE IF EXISTS [Foo].[NewProc]; +GO +CREATE PROCEDURE [Foo].[NewProc] + AS + BEGIN + --insert into items table from salesreason table + insert into Foo.Items (ID, ItemName) + SELECT TempID, Name + FROM Foo.SalesReason; + + + IF OBJECT_ID('Foo.age_dist', 'U') IS NULL + BEGIN + -- Create and populate if table doesn't exist + SELECT Age, COUNT(*) as Count + INTO Foo.age_dist + FROM Foo.Persons + GROUP BY Age + END + ELSE + BEGIN + -- Update existing table + TRUNCATE TABLE Foo.age_dist; + + INSERT INTO Foo.age_dist (Age, Count) + SELECT Age, COUNT(*) as Count + FROM Foo.Persons + GROUP BY Age + END + + SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew + + UPDATE DemoData.Foo.Persons + SET Age = t.Age + FROM DemoData.Foo.Persons p + JOIN #TEMPTABLE t ON p.ID = t.ID + + END +GO + +EXEC Foo.NewProc GO EXEC sys.sp_addextendedproperty @name = N'MS_Description', @@ -91,4 +144,4 @@ EXEC sp_attach_schedule GO EXEC dbo.sp_add_jobserver @job_name = N'Weekly Demo Data Backup' -GO +GO \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_with_lower_case_urn.yml b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_with_lower_case_urn.yml index ff1179034833f9..94128810f026b9 100644 --- a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_with_lower_case_urn.yml +++ b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_with_lower_case_urn.yml @@ -5,7 +5,6 @@ source: config: username: sa password: test!Password - database: DemoData host_port: localhost:21433 convert_urns_to_lowercase: true # use_odbc: True diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index 1f418ffbd32ea9..b969f77b4c3c18 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -1,9 +1,16 @@ import os +import pathlib import subprocess import time +from pathlib import Path import pytest +from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure +from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import ( + generate_procedure_lineage, +) +from datahub.sql_parsing.schema_resolver import SchemaResolver from tests.test_helpers import mce_helpers from tests.test_helpers.click_helpers import run_datahub_cmd from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port @@ -57,3 +64,50 @@ def test_mssql_ingest(mssql_runner, pytestconfig, tmp_path, mock_time, config_fi r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_modified'\]", ], ) + + +PROCEDURE_SQLS_DIR = pathlib.Path(__file__).parent / "procedures" +PROCEDURES_GOLDEN_DIR = pathlib.Path(__file__).parent / "golden_files/procedures/" +procedure_sqls = [sql_file.name for sql_file in PROCEDURE_SQLS_DIR.iterdir()] + + +@pytest.mark.parametrize("procedure_sql_file", procedure_sqls) +@pytest.mark.integration +def test_stored_procedure_lineage( + pytestconfig: pytest.Config, procedure_sql_file: str +) -> None: + sql_file_path = PROCEDURE_SQLS_DIR / procedure_sql_file + procedure_code = sql_file_path.read_text() + + # Procedure file is named as .. + splits = procedure_sql_file.split(".") + db = splits[0] + schema = splits[1] + name = splits[2] + + procedure = StoredProcedure( + db=db, + schema=schema, + name=name, + flow=None, # type: ignore # flow is not used in this test + code=procedure_code, + ) + data_job_urn = f"urn:li:dataJob:(urn:li:dataFlow:(mssql,{db}.{schema}.stored_procedures,PROD),{name})" + + schema_resolver = SchemaResolver(platform="mssql") + + mcps = list( + generate_procedure_lineage( + schema_resolver=schema_resolver, + procedure=procedure, + procedure_job_urn=data_job_urn, + is_temp_table=lambda name: "temp" in name.lower(), + ) + ) + mce_helpers.check_goldens_stream( + pytestconfig, + outputs=mcps, + golden_path=( + PROCEDURES_GOLDEN_DIR / Path(procedure_sql_file).with_suffix(".json") + ), + ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index edfc41616e44be..6c45b8a47de412 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -19,8 +19,7 @@ ) from tableauserverclient.models.reference_item import ResourceReference -from datahub.configuration.source_common import DEFAULT_ENV -from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mce_builder import DEFAULT_ENV, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau.tableau import ( @@ -277,7 +276,6 @@ def mock_sdk_client( datasources_side_effect: List[dict], sign_out_side_effect: List[dict], ) -> mock.MagicMock: - mock_client = mock.Mock() mocked_metadata = mock.Mock() mocked_metadata.query.side_effect = side_effect_query_metadata_response @@ -1229,7 +1227,6 @@ def test_permission_ingestion(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_graph): - with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, diff --git a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py index c078f1b77fd1be..b8b0563a1d24e5 100644 --- a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py +++ b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py @@ -282,7 +282,6 @@ def register_mock_data(workspace_client): def mock_hive_sql(query): - if query == "DESCRIBE EXTENDED `bronze_kambi`.`bet` betStatusId": return [ ("col_name", "betStatusId"), diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py index 73b7790b62e9e7..5042c78c2e7b91 100644 --- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py @@ -16,7 +16,6 @@ def run_test(): - with mock.patch("snowflake.connector.connect") as mock_connect: sf_connection = mock.MagicMock() sf_cursor = mock.MagicMock() diff --git a/metadata-ingestion/tests/test_helpers/docker_helpers.py b/metadata-ingestion/tests/test_helpers/docker_helpers.py index 20aec975787e4e..d0e943bbe63daf 100644 --- a/metadata-ingestion/tests/test_helpers/docker_helpers.py +++ b/metadata-ingestion/tests/test_helpers/docker_helpers.py @@ -4,10 +4,10 @@ import pytest -from datahub.testing.docker_utils import ( # noqa: F401 - docker_compose_runner, - is_responsive, - wait_for_port, +from datahub.testing.docker_utils import ( # noqa: F401,I250 + docker_compose_runner as docker_compose_runner, + is_responsive as is_responsive, + wait_for_port as wait_for_port, ) logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/tests/test_helpers/mce_helpers.py b/metadata-ingestion/tests/test_helpers/mce_helpers.py index 3b59481d8cb022..f4c629df7dba4e 100644 --- a/metadata-ingestion/tests/test_helpers/mce_helpers.py +++ b/metadata-ingestion/tests/test_helpers/mce_helpers.py @@ -17,15 +17,16 @@ Union, ) +import pytest + from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.sink.file import write_metadata_file from datahub.metadata.schema_classes import MetadataChangeEventClass +from datahub.metadata.urns import Urn from datahub.testing.compare_metadata_json import ( assert_metadata_files_equal, load_json_file, ) -from datahub.utilities.urns.urn import Urn -from tests.test_helpers.type_helpers import PytestConfig logger = logging.getLogger(__name__) @@ -77,7 +78,7 @@ def clean_nones(value): def check_golden_file( - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, output_path: Union[str, os.PathLike], golden_path: Union[str, os.PathLike], ignore_paths: Sequence[str] = (), @@ -98,7 +99,7 @@ def check_golden_file( def check_goldens_stream( - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, outputs: List, golden_path: Union[str, os.PathLike], ignore_paths: Sequence[str] = (), diff --git a/metadata-ingestion/tests/test_helpers/type_helpers.py b/metadata-ingestion/tests/test_helpers/type_helpers.py index 154960bbf7fc42..3a2215ed81ca99 100644 --- a/metadata-ingestion/tests/test_helpers/type_helpers.py +++ b/metadata-ingestion/tests/test_helpers/type_helpers.py @@ -1,12 +1,5 @@ from typing import Optional, TypeVar -# The current PytestConfig solution is somewhat ugly and not ideal. -# However, it is currently the best solution available, as the type itself is not -# exported: https://docs.pytest.org/en/stable/reference.html#config. -# As pytest's type support improves, this will likely change. -# TODO: revisit pytestconfig as https://github.com/pytest-dev/pytest/issues/7469 progresses. -from _pytest.config import Config as PytestConfig # noqa: F401 - _T = TypeVar("_T") diff --git a/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py b/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py index c9f16bbcef6fc4..a72087376b78a3 100644 --- a/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py +++ b/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py @@ -10,7 +10,6 @@ class MyTestModel(BaseModel): def test_base_model(): - test_base_model = MyTestModel( test_string_field="test_string_field", test_int_field=42, @@ -31,7 +30,6 @@ def test_base_model(): def test_dictwrapper(): - from datahub.metadata.schema_classes import DatasetPropertiesClass dataset_properties = DatasetPropertiesClass( @@ -58,7 +56,6 @@ def test_dictwrapper(): def test_raw_dictionary(): - test_object = { "test_string_field": "test_string_field", "test_int_field": 42, diff --git a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py index a84e373dbe72c2..6a03f511fa51c5 100644 --- a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py +++ b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py @@ -185,7 +185,6 @@ class TestModel(BaseModel): def test_platform_resource_filters(): - query = ( ElasticPlatformResourceQuery.create_from() .group(LogicalOperator.AND) diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py index cafca521ae0148..c5c4a378894c32 100644 --- a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py @@ -104,7 +104,6 @@ def test_incremental_table_lineage(tmp_path, pytestconfig): def test_incremental_table_lineage_empty_upstreams(tmp_path, pytestconfig): - urn = make_dataset_urn(platform, "dataset1") aspect = make_lineage_aspect( "dataset1", diff --git a/metadata-ingestion/tests/unit/api/test_pipeline.py b/metadata-ingestion/tests/unit/api/test_pipeline.py index 432d8e11c1c0b4..fe3d3160b729a1 100644 --- a/metadata-ingestion/tests/unit/api/test_pipeline.py +++ b/metadata-ingestion/tests/unit/api/test_pipeline.py @@ -33,7 +33,9 @@ class TestPipeline: @patch("confluent_kafka.Consumer", autospec=True) - @patch("datahub.ingestion.source.kafka.KafkaSource.get_workunits", autospec=True) + @patch( + "datahub.ingestion.source.kafka.kafka.KafkaSource.get_workunits", autospec=True + ) @patch("datahub.ingestion.sink.console.ConsoleSink.close", autospec=True) @freeze_time(FROZEN_TIME) def test_configure(self, mock_sink, mock_source, mock_consumer): @@ -198,7 +200,9 @@ def test_configure_with_rest_sink_with_additional_props_initializes_graph( assert pipeline.ctx.graph.config.token == pipeline.config.sink.config["token"] @freeze_time(FROZEN_TIME) - @patch("datahub.ingestion.source.kafka.KafkaSource.get_workunits", autospec=True) + @patch( + "datahub.ingestion.source.kafka.kafka.KafkaSource.get_workunits", autospec=True + ) def test_configure_with_file_sink_does_not_init_graph(self, mock_source, tmp_path): pipeline = Pipeline.create( { diff --git a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py index 415977b0f8467b..a1981ccf767916 100644 --- a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py @@ -144,7 +144,6 @@ def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: def test_lineage_for_external_bq_table(mock_datahub_graph_instance): - pipeline_context = PipelineContext(run_id="bq_gcs_lineage") pipeline_context.graph = mock_datahub_graph_instance @@ -239,7 +238,6 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: def test_lineage_for_external_bq_table_no_column_lineage(mock_datahub_graph_instance): - pipeline_context = PipelineContext(run_id="bq_gcs_lineage") pipeline_context.graph = mock_datahub_graph_instance diff --git a/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py index 63de742b201a97..3247a64631da76 100644 --- a/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py @@ -184,7 +184,6 @@ def test_bigquery_table_sanitasitation(): def test_unquote_and_decode_unicode_escape_seq(): - # Test with a string that starts and ends with quotes and has Unicode escape sequences input_string = '"Hello \\u003cWorld\\u003e"' expected_output = "Hello " diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source.py b/metadata-ingestion/tests/unit/glue/test_glue_source.py index 4df0c6d17b06cc..693fd6bc336fd3 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source.py @@ -34,7 +34,6 @@ run_and_get_pipeline, validate_all_providers_have_committed_successfully, ) -from tests.test_helpers.type_helpers import PytestConfig from tests.unit.glue.test_glue_source_stubs import ( databases_1, databases_2, @@ -174,7 +173,7 @@ def test_column_type(hive_column_type: str, expected_type: Type) -> None: @freeze_time(FROZEN_TIME) def test_glue_ingest( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, platform_instance: str, mce_file: str, mce_golden_file: str, @@ -410,7 +409,7 @@ def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): def test_glue_with_delta_schema_ingest( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, ) -> None: glue_source_instance = glue_source( platform_instance="delta_platform_instance", @@ -446,7 +445,7 @@ def test_glue_with_delta_schema_ingest( def test_glue_with_malformed_delta_schema_ingest( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, ) -> None: glue_source_instance = glue_source( platform_instance="delta_platform_instance", @@ -489,7 +488,7 @@ def test_glue_with_malformed_delta_schema_ingest( @freeze_time(FROZEN_TIME) def test_glue_ingest_include_table_lineage( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, mock_datahub_graph_instance: DataHubGraph, platform_instance: str, mce_file: str, @@ -584,7 +583,7 @@ def test_glue_ingest_include_table_lineage( @freeze_time(FROZEN_TIME) def test_glue_ingest_include_column_lineage( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, mock_datahub_graph_instance: DataHubGraph, platform_instance: str, mce_file: str, @@ -684,7 +683,7 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: @freeze_time(FROZEN_TIME) def test_glue_ingest_with_profiling( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, ) -> None: glue_source_instance = glue_source_with_profiling() mce_file = "glue_mces.json" diff --git a/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py index 2e3eb8fde1292b..941d13be0a6139 100644 --- a/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py +++ b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py @@ -221,7 +221,6 @@ def mock_redshift_connection() -> MagicMock: def mock_graph() -> DataHubGraph: - graph = MagicMock() graph._make_schema_resolver.return_value = SchemaResolver( diff --git a/metadata-ingestion/tests/unit/redshift/test_redshift_source.py b/metadata-ingestion/tests/unit/redshift/test_redshift_source.py index 8198caf50df7f4..f016312dfe47fb 100644 --- a/metadata-ingestion/tests/unit/redshift/test_redshift_source.py +++ b/metadata-ingestion/tests/unit/redshift/test_redshift_source.py @@ -1,15 +1,15 @@ from typing import Iterable -from datahub.emitter.mcp import ( - MetadataChangeProposalClass, - MetadataChangeProposalWrapper, -) +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.redshift.config import RedshiftConfig from datahub.ingestion.source.redshift.redshift import RedshiftSource from datahub.ingestion.source.redshift.redshift_schema import RedshiftTable -from datahub.metadata.schema_classes import MetadataChangeEventClass +from datahub.metadata.schema_classes import ( + MetadataChangeEventClass, + MetadataChangeProposalClass, +) def redshift_source_setup(custom_props_flag: bool) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/tests/unit/serde/test_serde.py b/metadata-ingestion/tests/unit/serde/test_serde.py index 727f2b10511b5e..a131ac9ce2a1bc 100644 --- a/metadata-ingestion/tests/unit/serde/test_serde.py +++ b/metadata-ingestion/tests/unit/serde/test_serde.py @@ -19,7 +19,6 @@ from datahub.metadata.schemas import getMetadataChangeEventSchema from tests.test_helpers import mce_helpers from tests.test_helpers.click_helpers import run_datahub_cmd -from tests.test_helpers.type_helpers import PytestConfig FROZEN_TIME = "2021-07-22 18:54:06" @@ -41,7 +40,7 @@ ], ) def test_serde_to_json( - pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str + pytestconfig: pytest.Config, tmp_path: pathlib.Path, json_filename: str ) -> None: golden_file = pytestconfig.rootpath / json_filename output_file = tmp_path / "output.json" @@ -73,7 +72,7 @@ def test_serde_to_json( ) @freeze_time(FROZEN_TIME) def test_serde_to_avro( - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, json_filename: str, ) -> None: # In this test, we want to read in from JSON -> MCE object. @@ -126,14 +125,14 @@ def test_serde_to_avro( ], ) @freeze_time(FROZEN_TIME) -def test_check_metadata_schema(pytestconfig: PytestConfig, json_filename: str) -> None: +def test_check_metadata_schema(pytestconfig: pytest.Config, json_filename: str) -> None: json_file_path = pytestconfig.rootpath / json_filename run_datahub_cmd(["check", "metadata-file", f"{json_file_path}"]) def test_check_metadata_rewrite( - pytestconfig: PytestConfig, tmp_path: pathlib.Path + pytestconfig: pytest.Config, tmp_path: pathlib.Path ) -> None: json_input = ( pytestconfig.rootpath / "tests/unit/serde/test_canonicalization_input.json" @@ -161,7 +160,7 @@ def test_check_metadata_rewrite( ], ) def test_check_mce_schema_failure( - pytestconfig: PytestConfig, json_filename: str + pytestconfig: pytest.Config, json_filename: str ) -> None: json_file_path = pytestconfig.rootpath / json_filename diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_shares.py similarity index 100% rename from metadata-ingestion/tests/unit/test_snowflake_shares.py rename to metadata-ingestion/tests/unit/snowflake/test_snowflake_shares.py diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py similarity index 99% rename from metadata-ingestion/tests/unit/test_snowflake_source.py rename to metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index 72b59a3a4e4938..161dfa2b4e78f3 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -5,7 +5,6 @@ from pydantic import ValidationError from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.oauth import OAuthConfiguration from datahub.configuration.pattern_utils import UUID_REGEX from datahub.ingestion.api.source import SourceCapability from datahub.ingestion.source.snowflake.constants import ( @@ -13,6 +12,7 @@ CLIENT_SESSION_KEEP_ALIVE, SnowflakeCloudProvider, ) +from datahub.ingestion.source.snowflake.oauth_config import OAuthConfiguration from datahub.ingestion.source.snowflake.snowflake_config import ( DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeV2Config, diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_mssql_casing_resolver.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_mssql_casing_resolver.json new file mode 100644 index 00000000000000..072260f5cd1651 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_mssql_casing_resolver.json @@ -0,0 +1,82 @@ +{ + "query_type": "SELECT", + "query_type_props": {}, + "query_fingerprint": "6a779a57ffb2598c301606d3a7d82142a7af8b102efa55a2c9a4e960fd55ac07", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.persons,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "age", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INTEGER" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.persons,PROD)", + "column": "Age" + } + ] + }, + { + "downstream": { + "table": null, + "column": "name", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR(16777216)" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.persons,PROD)", + "column": "Name" + } + ] + }, + { + "downstream": { + "table": null, + "column": "uppercased_col", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR(16777216)" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.persons,PROD)", + "column": "Uppercased_Col" + } + ] + }, + { + "downstream": { + "table": null, + "column": "count", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INTEGER" + }, + "upstreams": [] + } + ], + "debug_info": { + "confidence": 0.9, + "generalized_statement": "SELECT Age, name, UPPERCASED_COL, COUNT(*) AS Count FROM Foo.Persons GROUP BY Age" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_mssql_select_into.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_mssql_select_into.json new file mode 100644 index 00000000000000..ff8609922a9bd8 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_mssql_select_into.json @@ -0,0 +1,48 @@ +{ + "query_type": "SELECT", + "query_type_props": {}, + "query_fingerprint": "d18c43ed4d7f0e303b88c40ac078c5505be473edf0136e42a77dccc090d1eb8b", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.persons,PROD)" + ], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.age_dist,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.age_dist,PROD)", + "column": "AGE", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INTEGER" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.persons,PROD)", + "column": "Age" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.age_dist,PROD)", + "column": "Count", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INTEGER" + }, + "upstreams": [] + } + ], + "debug_info": { + "confidence": 0.9, + "generalized_statement": "SELECT age AS AGE, COUNT(*) AS Count INTO Foo.age_dist FROM Foo.Persons GROUP BY Age" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json new file mode 100644 index 00000000000000..bcf31f6be803a2 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json @@ -0,0 +1,12 @@ +{ + "query_type": "UNKNOWN", + "query_type_props": {}, + "query_fingerprint": null, + "in_tables": [], + "out_tables": [], + "column_lineage": null, + "debug_info": { + "confidence": 0.0, + "generalized_statement": null + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_split_statements.py b/metadata-ingestion/tests/unit/sql_parsing/test_split_statements.py new file mode 100644 index 00000000000000..06e0e84ede5547 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/test_split_statements.py @@ -0,0 +1,51 @@ +from datahub.sql_parsing.split_statements import split_statements + + +def test_split_statements_complex() -> None: + test_sql = """ + CREATE TABLE Users (Id INT); + -- Comment here + INSERT INTO Users VALUES (1); + BEGIN + UPDATE Users SET Id = 2; + /* Multi-line + comment */ + DELETE FROM /* inline DELETE comment */ Users; + END + GO + SELECT * FROM Users + """ + + statements = [statement.strip() for statement in split_statements(test_sql)] + assert statements == [ + "CREATE TABLE Users (Id INT)", + "-- Comment here", + "INSERT INTO Users VALUES (1)", + "BEGIN", + "UPDATE Users SET Id = 2", + "/* Multi-line\n comment */", + "DELETE FROM /* inline DELETE comment */ Users", + "END", + "GO", + "SELECT * FROM Users", + ] + + +def test_split_statements_cte() -> None: + # SQL example from https://stackoverflow.com/a/11562724 + test_sql = """\ +WITH T AS +( SELECT InvoiceNumber, + DocTotal, + SUM(Sale + VAT) OVER(PARTITION BY InvoiceNumber) AS NewDocTotal + FROM PEDI_InvoiceDetail +) +-- comment +/* multi-line +comment */ +UPDATE T +SET DocTotal = NewDocTotal""" + statements = [statement.strip() for statement in split_statements(test_sql)] + assert statements == [ + test_sql, + ] diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py index b1ad9eb5c15d76..2a771a9847abd8 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py @@ -958,7 +958,6 @@ def test_table_lineage_via_temp_table_disordered_add( @freeze_time(FROZEN_TIME) def test_basic_usage(pytestconfig: pytest.Config) -> None: - frozen_timestamp = parse_user_datetime(FROZEN_TIME) aggregator = SqlParsingAggregator( platform="redshift", diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 90cc863d6bd231..2df6f6fa26cb56 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -1268,3 +1268,58 @@ def test_bigquery_subquery_column_inference() -> None: dialect="bigquery", expected_file=RESOURCE_DIR / "test_bigquery_subquery_column_inference.json", ) + + +def test_sqlite_attach_database() -> None: + assert_sql_result( + """\ +ATTACH DATABASE ':memory:' AS aux1 +""", + dialect="sqlite", + expected_file=RESOURCE_DIR / "test_sqlite_attach_database.json", + allow_table_error=True, + ) + + +def test_mssql_casing_resolver() -> None: + assert_sql_result( + """\ +SELECT Age, name, UPPERCASED_COL, COUNT(*) as Count +FROM Foo.Persons +GROUP BY Age +""", + dialect="mssql", + default_db="NewData", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.persons,PROD)": { + "Age": "INTEGER", + "Name": "VARCHAR(16777216)", + "Uppercased_Col": "VARCHAR(16777216)", + }, + }, + expected_file=RESOURCE_DIR / "test_mssql_casing_resolver.json", + ) + + +def test_mssql_select_into() -> None: + assert_sql_result( + """\ +SELECT age as AGE, COUNT(*) as Count +INTO Foo.age_dist +FROM Foo.Persons +GROUP BY Age +""", + dialect="mssql", + default_db="NewData", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.persons,PROD)": { + "Age": "INTEGER", + "Name": "VARCHAR(16777216)", + }, + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foo.age_dist,PROD)": { + "AGE": "INTEGER", + "Count": "INTEGER", + }, + }, + expected_file=RESOURCE_DIR / "test_mssql_select_into.json", + ) diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py index 744d43373a0a1f..4e8ba8aa6b7770 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py @@ -4,11 +4,9 @@ import pytest import sqlglot +from datahub.sql_parsing.query_types import get_query_type_of_sql from datahub.sql_parsing.sql_parsing_common import QueryType -from datahub.sql_parsing.sqlglot_lineage import ( - _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT, - get_query_type_of_sql, -) +from datahub.sql_parsing.sqlglot_lineage import _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT from datahub.sql_parsing.sqlglot_utils import ( generalize_query, generalize_query_fast, diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py index be2d8bac12e386..b04d4b86d2e4bb 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py @@ -50,7 +50,6 @@ def test_change_percent( def test_filter_ignored_entity_types(): - assert filter_ignored_entity_types( [ "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)", diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py index 66564dc856abae..96ab8f7a01a386 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py @@ -10,7 +10,8 @@ from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.emitter.mce_builder import DEFAULT_ENV from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport diff --git a/metadata-ingestion/tests/unit/test_cassandra_source.py b/metadata-ingestion/tests/unit/test_cassandra_source.py index a4ca3a0a9ef3f6..75dedde76c7c89 100644 --- a/metadata-ingestion/tests/unit/test_cassandra_source.py +++ b/metadata-ingestion/tests/unit/test_cassandra_source.py @@ -56,7 +56,6 @@ def assert_field_paths_match( def test_cassandra_schema_conversion( schema: str, expected_field_paths: List[str] ) -> None: - schema_dict: Dict[str, List[Any]] = json.loads(schema) column_infos: List = schema_dict["column_infos"] diff --git a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py index b047cd16c52a97..3500636f00eddf 100644 --- a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py +++ b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py @@ -8,7 +8,7 @@ ) from datahub.ingestion.source.confluent_schema_registry import ConfluentSchemaRegistry -from datahub.ingestion.source.kafka import KafkaSourceConfig, KafkaSourceReport +from datahub.ingestion.source.kafka.kafka import KafkaSourceConfig, KafkaSourceReport class ConfluentSchemaRegistryTest(unittest.TestCase): diff --git a/metadata-ingestion/tests/unit/test_kafka_source.py b/metadata-ingestion/tests/unit/test_kafka_source.py index b4e37d288a3041..dfd32085b77055 100644 --- a/metadata-ingestion/tests/unit/test_kafka_source.py +++ b/metadata-ingestion/tests/unit/test_kafka_source.py @@ -23,7 +23,7 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.kafka import KafkaSource, KafkaSourceConfig +from datahub.ingestion.source.kafka.kafka import KafkaSource, KafkaSourceConfig from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.schema_classes import ( BrowsePathsClass, @@ -38,11 +38,13 @@ @pytest.fixture def mock_admin_client(): - with patch("datahub.ingestion.source.kafka.AdminClient", autospec=True) as mock: + with patch( + "datahub.ingestion.source.kafka.kafka.AdminClient", autospec=True + ) as mock: yield mock -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_source_configuration(mock_kafka): ctx = PipelineContext(run_id="test") kafka_source = KafkaSource( @@ -53,7 +55,7 @@ def test_kafka_source_configuration(mock_kafka): assert mock_kafka.call_count == 1 -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_source_workunits_wildcard_topic(mock_kafka, mock_admin_client): mock_kafka_instance = mock_kafka.return_value mock_cluster_metadata = MagicMock() @@ -74,7 +76,7 @@ def test_kafka_source_workunits_wildcard_topic(mock_kafka, mock_admin_client): assert len(workunits) == 4 -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_source_workunits_topic_pattern(mock_kafka, mock_admin_client): mock_kafka_instance = mock_kafka.return_value mock_cluster_metadata = MagicMock() @@ -108,7 +110,7 @@ def test_kafka_source_workunits_topic_pattern(mock_kafka, mock_admin_client): assert len(workunits) == 4 -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_source_workunits_with_platform_instance(mock_kafka, mock_admin_client): PLATFORM_INSTANCE = "kafka_cluster" PLATFORM = "kafka" @@ -160,7 +162,7 @@ def test_kafka_source_workunits_with_platform_instance(mock_kafka, mock_admin_cl assert f"/prod/{PLATFORM}/{PLATFORM_INSTANCE}" in browse_path_aspects[0].paths -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_source_workunits_no_platform_instance(mock_kafka, mock_admin_client): PLATFORM = "kafka" TOPIC_NAME = "test" @@ -204,7 +206,7 @@ def test_kafka_source_workunits_no_platform_instance(mock_kafka, mock_admin_clie assert f"/prod/{PLATFORM}" in browse_path_aspects[0].paths -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_close(mock_kafka, mock_admin_client): mock_kafka_instance = mock_kafka.return_value ctx = PipelineContext(run_id="test") @@ -223,7 +225,7 @@ def test_close(mock_kafka, mock_admin_client): "datahub.ingestion.source.confluent_schema_registry.SchemaRegistryClient", autospec=True, ) -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_source_workunits_schema_registry_subject_name_strategies( mock_kafka_consumer, mock_schema_registry_client, mock_admin_client ): @@ -415,7 +417,7 @@ def mock_get_latest_version(subject_name: str) -> Optional[RegisteredSchema]: "datahub.ingestion.source.confluent_schema_registry.SchemaRegistryClient", autospec=True, ) -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_ignore_warnings_on_schema_type( mock_kafka_consumer, mock_schema_registry_client, @@ -483,8 +485,8 @@ def mock_get_latest_version(subject_name: str) -> Optional[RegisteredSchema]: assert kafka_source.report.warnings -@patch("datahub.ingestion.source.kafka.AdminClient", autospec=True) -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.AdminClient", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_source_succeeds_with_admin_client_init_error( mock_kafka, mock_kafka_admin_client ): @@ -513,8 +515,8 @@ def test_kafka_source_succeeds_with_admin_client_init_error( assert len(workunits) == 2 -@patch("datahub.ingestion.source.kafka.AdminClient", autospec=True) -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.AdminClient", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_source_succeeds_with_describe_configs_error( mock_kafka, mock_kafka_admin_client ): @@ -550,7 +552,7 @@ def test_kafka_source_succeeds_with_describe_configs_error( "datahub.ingestion.source.confluent_schema_registry.SchemaRegistryClient", autospec=True, ) -@patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True) +@patch("datahub.ingestion.source.kafka.kafka.confluent_kafka.Consumer", autospec=True) def test_kafka_source_topic_meta_mappings( mock_kafka_consumer, mock_schema_registry_client, mock_admin_client ): diff --git a/metadata-ingestion/tests/unit/test_sql_common.py b/metadata-ingestion/tests/unit/test_sql_common.py index a98bf641711220..cfb8f55bd977f7 100644 --- a/metadata-ingestion/tests/unit/test_sql_common.py +++ b/metadata-ingestion/tests/unit/test_sql_common.py @@ -38,7 +38,7 @@ def test_generate_foreign_key(): "referred_columns": ["test_referred_column"], # type: ignore } foreign_key = source.get_foreign_key_metadata( - dataset_urn="test_urn", + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD)", schema="test_schema", fk_dict=fk_dict, inspector=mock.Mock(), @@ -48,7 +48,9 @@ def test_generate_foreign_key(): assert [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_referred_schema.test_table,PROD),test_referred_column)" ] == foreign_key.foreignFields - assert ["urn:li:schemaField:(test_urn,test_column)"] == foreign_key.sourceFields + assert [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD),test_column)" + ] == foreign_key.sourceFields def test_use_source_schema_for_foreign_key_if_not_specified(): @@ -60,7 +62,7 @@ def test_use_source_schema_for_foreign_key_if_not_specified(): "referred_columns": ["test_referred_column"], # type: ignore } foreign_key = source.get_foreign_key_metadata( - dataset_urn="test_urn", + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD)", schema="test_schema", fk_dict=fk_dict, inspector=mock.Mock(), @@ -70,7 +72,9 @@ def test_use_source_schema_for_foreign_key_if_not_specified(): assert [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.test_table,PROD),test_referred_column)" ] == foreign_key.foreignFields - assert ["urn:li:schemaField:(test_urn,test_column)"] == foreign_key.sourceFields + assert [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD),test_column)" + ] == foreign_key.sourceFields PLATFORM_FROM_SQLALCHEMY_URI_TEST_CASES: Dict[str, str] = { diff --git a/metadata-ingestion/tests/unit/test_parsing_util.py b/metadata-ingestion/tests/unit/utilities/test_parsing_util.py similarity index 100% rename from metadata-ingestion/tests/unit/test_parsing_util.py rename to metadata-ingestion/tests/unit/utilities/test_parsing_util.py diff --git a/metadata-ingestion/tests/unit/utilities/test_progress_timer.py b/metadata-ingestion/tests/unit/utilities/test_progress_timer.py new file mode 100644 index 00000000000000..139bad371bb9f4 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_progress_timer.py @@ -0,0 +1,53 @@ +from datetime import timedelta +from time import sleep + +from datahub.utilities.progress_timer import ProgressTimer + + +def test_progress_timer_basic(): + timer = ProgressTimer(report_every=timedelta(milliseconds=100)) + + # First call should not report since report_0=False by default + assert not timer.should_report() + + # Call before interval elapsed should not report + sleep(0.05) # 50ms + assert not timer.should_report() + + # Call after interval elapsed should report + sleep(0.1) # Additional 100ms + assert timer.should_report() + + # Next immediate call should not report + assert not timer.should_report() + + +def test_progress_timer_with_report_0(): + timer = ProgressTimer(report_every=timedelta(milliseconds=100), report_0=True) + + # First call should report since report_0=True + assert timer.should_report() + + # Next immediate call should not report + assert not timer.should_report() + + # Call after interval elapsed should report + sleep(0.1) # 100ms + assert timer.should_report() + + +def test_progress_timer_multiple_intervals(): + timer = ProgressTimer(report_every=timedelta(milliseconds=50)) + + # First call should not report + assert not timer.should_report() + + # Check multiple intervals + sleep(0.06) # 60ms - should report + assert timer.should_report() + + sleep(0.02) # 20ms - should not report + assert not timer.should_report() + + sleep(0.05) # 50ms - should report + assert timer.should_report() diff --git a/metadata-integration/java/acryl-spark-lineage/README.md b/metadata-integration/java/acryl-spark-lineage/README.md index bd0a58b635b483..267e979b0fa073 100644 --- a/metadata-integration/java/acryl-spark-lineage/README.md +++ b/metadata-integration/java/acryl-spark-lineage/README.md @@ -165,6 +165,7 @@ information like tokens. | spark.datahub.rest.server | | http://localhost:8080 | Datahub server url eg: | | spark.datahub.rest.token | | | Authentication token. | | spark.datahub.rest.disable_ssl_verification | | false | Disable SSL certificate validation. Caution: Only use this if you know what you are doing! | +| spark.datahub.rest.disable_chunked_encoding | | false | Disable Chunked Transfer Encoding. In some environment chunked encoding causes issues. With this config option it can be disabled. || | spark.datahub.rest.max_retries | | 0 | Number of times a request retried if failed | | spark.datahub.rest.retry_interval | | 10 | Number of seconds to wait between retries | | spark.datahub.file.filename | | | The file where metadata will be written if file emitter is set | diff --git a/metadata-integration/java/acryl-spark-lineage/build.gradle b/metadata-integration/java/acryl-spark-lineage/build.gradle index 6620c34021ac4a..3f83e5657bbf4d 100644 --- a/metadata-integration/java/acryl-spark-lineage/build.gradle +++ b/metadata-integration/java/acryl-spark-lineage/build.gradle @@ -1,7 +1,7 @@ plugins { id("com.palantir.git-version") apply false } -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'com.github.johnrengelman.shadow' apply plugin: 'signing' apply plugin: 'io.codearte.nexus-staging' @@ -51,8 +51,8 @@ dependencies { implementation project(':metadata-integration:java:openlineage-converter') - implementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') - implementation project(path: ':metadata-integration:java:openlineage-converter', configuration: 'shadow') + implementation project(path: ':metadata-integration:java:datahub-client') + implementation project(path: ':metadata-integration:java:openlineage-converter') //implementation "io.acryl:datahub-client:0.10.2" implementation "io.openlineage:openlineage-spark_2.12:$openLineageVersion" @@ -91,6 +91,8 @@ shadowJar { zip64 = true archiveClassifier = '' mergeServiceFiles() + project.configurations.implementation.canBeResolved = true + configurations = [project.configurations.implementation] def exclude_modules = project .configurations @@ -106,6 +108,8 @@ shadowJar { exclude(dependency { exclude_modules.contains(it.name) }) + exclude(dependency("org.slf4j::")) + exclude("org/apache/commons/logging/**") } // preventing java multi-release JAR leakage @@ -123,39 +127,36 @@ shadowJar { relocate 'com.sun.activation', 'io.acryl.shaded.com.sun.activation' relocate 'com.sun.codemodel', 'io.acryl.shaded.com.sun.codemodel' relocate 'com.sun.mail', 'io.acryl.shaded.com.sun.mail' - relocate 'com.fasterxml.jackson', 'datahub.spark2.shaded.jackson' - relocate 'org.slf4j', 'datahub.spark2.shaded.org.slf4j' // relocate 'org.apache.hc', 'io.acryl.shaded.http' - relocate 'org.apache.commons.codec', 'datahub.spark2.shaded.o.a.c.codec' - relocate 'org.apache.commons.compress', 'datahub.spark2.shaded.o.a.c.compress' - relocate 'org.apache.commons.lang3', 'datahub.spark2.shaded.o.a.c.lang3' + relocate 'org.apache.commons.codec', 'io.acryl.shaded.org.apache.commons.codec' + relocate 'org.apache.commons.compress', 'io.acryl.shaded.org.apache.commons.compress' + relocate 'org.apache.commons.lang3', 'io.acryl.shaded.org.apache.commons.lang3' relocate 'mozilla', 'datahub.spark2.shaded.mozilla' - relocate 'com.typesafe', 'datahub.spark2.shaded.typesafe' - relocate 'io.opentracing', 'datahub.spark2.shaded.io.opentracing' - relocate 'io.netty', 'datahub.spark2.shaded.io.netty' - relocate 'ch.randelshofer', 'datahub.spark2.shaded.ch.randelshofer' - relocate 'ch.qos', 'datahub.spark2.shaded.ch.qos' + relocate 'com.typesafe', 'io.acryl.shaded.com.typesafe' + relocate 'io.opentracing', 'io.acryl.shaded.io.opentracing' + relocate 'io.netty', 'io.acryl.shaded.io.netty' + relocate 'ch.randelshofer', 'io.acryl.shaded.ch.randelshofer' + relocate 'ch.qos', 'io.acryl.shaded.ch.qos' relocate 'org.springframework', 'io.acryl.shaded.org.springframework' relocate 'com.fasterxml.jackson', 'io.acryl.shaded.jackson' relocate 'org.yaml', 'io.acryl.shaded.org.yaml' // Required for shading snakeyaml relocate 'net.jcip.annotations', 'io.acryl.shaded.annotations' relocate 'javassist', 'io.acryl.shaded.javassist' relocate 'edu.umd.cs.findbugs', 'io.acryl.shaded.findbugs' - relocate 'org.antlr', 'io.acryl.shaded.org.antlr' - relocate 'antlr', 'io.acryl.shaded.antlr' + //relocate 'org.antlr', 'io.acryl.shaded.org.antlr' + //relocate 'antlr', 'io.acryl.shaded.antlr' relocate 'com.google.common', 'io.acryl.shaded.com.google.common' - relocate 'org.apache.commons', 'io.acryl.shaded.org.apache.commons' relocate 'org.reflections', 'io.acryl.shaded.org.reflections' relocate 'st4hidden', 'io.acryl.shaded.st4hidden' relocate 'org.stringtemplate', 'io.acryl.shaded.org.stringtemplate' relocate 'org.abego.treelayout', 'io.acryl.shaded.treelayout' - relocate 'org.slf4j', 'io.acryl.shaded.slf4j' relocate 'javax.annotation', 'io.acryl.shaded.javax.annotation' relocate 'com.github.benmanes.caffeine', 'io.acryl.shaded.com.github.benmanes.caffeine' relocate 'org.checkerframework', 'io.acryl.shaded.org.checkerframework' relocate 'com.google.errorprone', 'io.acryl.shaded.com.google.errorprone' relocate 'com.sun.jna', 'io.acryl.shaded.com.sun.jna' + } checkShadowJar { diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java index ee0938edb50454..b594f6bae954fa 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java @@ -120,7 +120,9 @@ public Optional initializeEmitter(Config sparkConf) { boolean disableSslVerification = sparkConf.hasPath(SparkConfigParser.DISABLE_SSL_VERIFICATION_KEY) && sparkConf.getBoolean(SparkConfigParser.DISABLE_SSL_VERIFICATION_KEY); - + boolean disableChunkedEncoding = + sparkConf.hasPath(SparkConfigParser.REST_DISABLE_CHUNKED_ENCODING) + && sparkConf.getBoolean(SparkConfigParser.REST_DISABLE_CHUNKED_ENCODING); int retry_interval_in_sec = sparkConf.hasPath(SparkConfigParser.RETRY_INTERVAL_IN_SEC) ? sparkConf.getInt(SparkConfigParser.RETRY_INTERVAL_IN_SEC) @@ -150,6 +152,7 @@ public Optional initializeEmitter(Config sparkConf) { .disableSslVerification(disableSslVerification) .maxRetries(max_retries) .retryIntervalSec(retry_interval_in_sec) + .disableChunkedEncoding(disableChunkedEncoding) .build(); return Optional.of(new RestDatahubEmitterConfig(restEmitterConf)); case "kafka": @@ -374,7 +377,8 @@ private static void initializeMetrics(OpenLineageConfig openLineageConfig) { String disabledFacets; if (openLineageConfig.getFacetsConfig() != null && openLineageConfig.getFacetsConfig().getDisabledFacets() != null) { - disabledFacets = String.join(";", openLineageConfig.getFacetsConfig().getDisabledFacets()); + disabledFacets = + String.join(";", openLineageConfig.getFacetsConfig().getEffectiveDisabledFacets()); } else { disabledFacets = ""; } diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java index 45ec5365d09b36..3860285083c4bb 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java @@ -30,6 +30,8 @@ public class SparkConfigParser { public static final String GMS_AUTH_TOKEN = "rest.token"; public static final String FILE_EMITTER_FILE_NAME = "file.filename"; public static final String DISABLE_SSL_VERIFICATION_KEY = "rest.disable_ssl_verification"; + public static final String REST_DISABLE_CHUNKED_ENCODING = "rest.disable_chunked_encoding"; + public static final String MAX_RETRIES = "rest.max_retries"; public static final String RETRY_INTERVAL_IN_SEC = "rest.retry_interval_in_sec"; public static final String KAFKA_MCP_TOPIC = "kafka.mcp_topic"; diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java index d46d741d155b8b..5f87df2a65d6c2 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java @@ -5,14 +5,13 @@ package io.openlineage.spark.agent.util; -import static io.openlineage.spark.agent.lifecycle.ExecutionContext.CAMEL_TO_SNAKE_CASE; - import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import datahub.spark.conf.SparkLineageConf; import io.datahubproject.openlineage.dataset.HdfsPathDataset; import io.openlineage.client.OpenLineage; import io.openlineage.spark.agent.Versions; +import io.openlineage.spark.api.naming.NameNormalizer; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; @@ -21,7 +20,6 @@ import java.util.Collection; import java.util.Collections; import java.util.List; -import java.util.Locale; import java.util.Objects; import java.util.Optional; import java.util.UUID; @@ -186,7 +184,7 @@ public static OpenLineage.ParentRunFacet parentRunFacet( .run(new OpenLineage.ParentRunFacetRunBuilder().runId(parentRunId).build()) .job( new OpenLineage.ParentRunFacetJobBuilder() - .name(parentJob.replaceAll(CAMEL_TO_SNAKE_CASE, "_$1").toLowerCase(Locale.ROOT)) + .name(NameNormalizer.normalize(parentJob)) .namespace(parentJobNamespace) .build()) .build(); @@ -287,8 +285,6 @@ public static boolean safeIsDefinedAt(PartialFunction pfn, Object x) { * @param pfn * @param x * @return - * @param - * @param */ public static List safeApply(PartialFunction> pfn, D x) { try { diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java index 62005bf15f8505..6ef7403362a909 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java @@ -7,6 +7,7 @@ import java.util.Arrays; import java.util.Objects; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Stream; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.reflect.FieldUtils; @@ -18,6 +19,7 @@ import org.apache.spark.rdd.MapPartitionsRDD; import org.apache.spark.rdd.ParallelCollectionRDD; import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.execution.datasources.FilePartition; import org.apache.spark.sql.execution.datasources.FileScanRDD; import scala.Tuple2; import scala.collection.immutable.Seq; @@ -90,7 +92,7 @@ public boolean isDefinedAt(Object rdd) { @SuppressWarnings("PMD.AvoidLiteralsInIfCondition") public Stream extract(FileScanRDD rdd) { return ScalaConversionUtils.fromSeq(rdd.filePartitions()).stream() - .flatMap(fp -> Arrays.stream(fp.files())) + .flatMap((FilePartition fp) -> Arrays.stream(fp.files())) .map( f -> { if ("3.4".compareTo(package$.MODULE$.SPARK_VERSION()) <= 0) { @@ -115,11 +117,15 @@ public boolean isDefinedAt(Object rdd) { @Override public Stream extract(ParallelCollectionRDD rdd) { + int SEQ_LIMIT = 1000; + AtomicBoolean loggingDone = new AtomicBoolean(false); try { Object data = FieldUtils.readField(rdd, "data", true); log.debug("ParallelCollectionRDD data: {}", data); - if (data instanceof Seq) { - return ScalaConversionUtils.fromSeq((Seq) data).stream() + if ((data instanceof Seq) && ((Seq) data).head() instanceof Tuple2) { + // exit if the first element is invalid + Seq data_slice = (Seq) ((Seq) data).slice(0, SEQ_LIMIT); + return ScalaConversionUtils.fromSeq(data_slice).stream() .map( el -> { Path path = null; @@ -127,9 +133,9 @@ public Stream extract(ParallelCollectionRDD rdd) { // we're able to extract path path = parentOf(((Tuple2) el)._1.toString()); log.debug("Found input {}", path); - } else { - // Change to debug to silence error - log.debug("unable to extract Path from {}", el.getClass().getCanonicalName()); + } else if (!loggingDone.get()) { + log.warn("unable to extract Path from {}", el.getClass().getCanonicalName()); + loggingDone.set(true); } return path; }) diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index d9087347e1b5c6..1bdc848d0385b1 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -1,6 +1,6 @@ plugins { id("com.palantir.git-version") apply false - id 'java' + id 'java-library' id 'com.github.johnrengelman.shadow' id 'jacoco' id 'signing' @@ -12,11 +12,13 @@ apply from: "../versioning.gradle" import org.apache.tools.ant.filters.ReplaceTokens -jar.enabled = false // Since we only want to build shadow jars, disabling the regular jar creation +jar { + archiveClassifier = "lib" +} dependencies { - implementation project(':entity-registry') - implementation project(':metadata-integration:java:datahub-event') + api project(':entity-registry') + api project(':metadata-integration:java:datahub-event') implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } @@ -33,7 +35,7 @@ dependencies { implementation externalDependency.jacksonDataBind runtimeOnly externalDependency.jna - implementation externalDependency.slf4jApi + api externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok // VisibleForTesting @@ -78,6 +80,11 @@ shadowJar { // https://github.com/johnrengelman/shadow/issues/729 exclude('module-info.class', 'META-INF/versions/**', '**/LICENSE', '**/LICENSE*.txt', '**/NOTICE', '**/NOTICE.txt', 'licenses/**', 'log4j2.*', 'log4j.*') + dependencies { + exclude(dependency("org.slf4j::")) + exclude(dependency("antlr::")) + exclude("org/apache/commons/logging/**") + } mergeServiceFiles() // we relocate namespaces manually, because we want to know exactly which libs we are exposing and why // we can move to automatic relocation using ConfigureShadowRelocation after we get to a good place on these first @@ -88,15 +95,20 @@ shadowJar { relocate 'javassist', 'datahub.shaded.javassist' relocate 'edu.umd.cs.findbugs', 'datahub.shaded.findbugs' relocate 'org.antlr', 'datahub.shaded.org.antlr' - relocate 'antlr', 'datahub.shaded.antlr' + //relocate 'antlr', 'datahub.shaded.antlr' relocate 'com.google.common', 'datahub.shaded.com.google.common' - relocate 'org.apache.commons', 'datahub.shaded.org.apache.commons' + relocate 'org.apache.commons.codec', 'datahub.shaded.org.apache.commons.codec' + relocate 'org.apache.commons.compress', 'datahub.shaded.org.apache.commons.compress' + relocate 'org.apache.commons.lang3', 'datahub.shaded.org.apache.commons.lang3' + relocate 'org.apache.commons.lang', 'datahub.shaded.org.apache.commons.lang' + relocate 'org.apache.commons.cli', 'datahub.shaded.org.apache.commons.cli' + relocate 'org.apache.commons.text', 'datahub.shaded.org.apache.commons.text' + relocate 'org.apache.commons.io', 'datahub.shaded.org.apache.commons.io' relocate 'org.apache.maven', 'datahub.shaded.org.apache.maven' relocate 'org.reflections', 'datahub.shaded.org.reflections' relocate 'st4hidden', 'datahub.shaded.st4hidden' relocate 'org.stringtemplate', 'datahub.shaded.org.stringtemplate' relocate 'org.abego.treelayout', 'datahub.shaded.treelayout' - relocate 'org.slf4j', 'datahub.shaded.slf4j' relocate 'javax.annotation', 'datahub.shaded.javax.annotation' relocate 'com.github.benmanes.caffeine', 'datahub.shaded.com.github.benmanes.caffeine' relocate 'org.checkerframework', 'datahub.shaded.org.checkerframework' diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java index 71a4b93baf48f4..50c0277c98b03b 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java @@ -48,7 +48,6 @@ public boolean retryRequest( @Override public boolean retryRequest(HttpResponse response, int execCount, HttpContext context) { - log.warn("Retrying request due to error: {}", response); return super.retryRequest(response, execCount, context); } } diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java index e1017372be124b..d70c5baf10879d 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java @@ -1,6 +1,7 @@ package datahub.client.rest; import static com.linkedin.metadata.Constants.*; +import static org.apache.hc.core5.http.HttpHeaders.*; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.core.StreamReadConstraints; @@ -18,6 +19,7 @@ import datahub.event.UpsertAspectRequest; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.security.KeyManagementException; import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; @@ -26,6 +28,7 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; import javax.annotation.concurrent.ThreadSafe; @@ -97,17 +100,20 @@ public RestEmitter(RestEmitterConfig config) { this.config = config; HttpAsyncClientBuilder httpClientBuilder = this.config.getAsyncHttpClientBuilder(); httpClientBuilder.setRetryStrategy(new DatahubHttpRequestRetryStrategy()); - - // Override httpClient settings with RestEmitter configs if present - if (config.getTimeoutSec() != null) { - httpClientBuilder.setDefaultRequestConfig( - RequestConfig.custom() - .setConnectionRequestTimeout( - config.getTimeoutSec() * 1000, java.util.concurrent.TimeUnit.MILLISECONDS) - .setResponseTimeout( - config.getTimeoutSec() * 1000, java.util.concurrent.TimeUnit.MILLISECONDS) - .build()); + if ((config.getTimeoutSec() != null) || (config.isDisableChunkedEncoding())) { + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); + // Override httpClient settings with RestEmitter configs if present + if (config.getTimeoutSec() != null) { + requestConfigBuilder + .setConnectionRequestTimeout(config.getTimeoutSec() * 1000, TimeUnit.MILLISECONDS) + .setResponseTimeout(config.getTimeoutSec() * 1000, TimeUnit.MILLISECONDS); + } + if (config.isDisableChunkedEncoding()) { + requestConfigBuilder.setContentCompressionEnabled(false); + } + httpClientBuilder.setDefaultRequestConfig(requestConfigBuilder.build()); } + PoolingAsyncClientConnectionManagerBuilder poolingAsyncClientConnectionManagerBuilder = PoolingAsyncClientConnectionManagerBuilder.create(); @@ -223,8 +229,13 @@ private Future postGeneric( if (this.config.getToken() != null) { simpleRequestBuilder.setHeader("Authorization", "Bearer " + this.config.getToken()); } + if (this.config.isDisableChunkedEncoding()) { + byte[] payloadBytes = payloadJson.getBytes(StandardCharsets.UTF_8); + simpleRequestBuilder.setBody(payloadBytes, ContentType.APPLICATION_JSON); + } else { + simpleRequestBuilder.setBody(payloadJson, ContentType.APPLICATION_JSON); + } - simpleRequestBuilder.setBody(payloadJson, ContentType.APPLICATION_JSON); AtomicReference responseAtomicReference = new AtomicReference<>(); CountDownLatch responseLatch = new CountDownLatch(1); FutureCallback httpCallback = diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java index e28ad4ed660f0b..55c11aab0ebf3c 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java @@ -30,6 +30,8 @@ public class RestEmitterConfig { Integer timeoutSec; @Builder.Default boolean disableSslVerification = false; + @Builder.Default boolean disableChunkedEncoding = false; + @Builder.Default int maxRetries = 0; @Builder.Default int retryIntervalSec = 10; diff --git a/metadata-integration/java/openlineage-converter/build.gradle b/metadata-integration/java/openlineage-converter/build.gradle index 2e04881ab5ccda..d149104f089b36 100644 --- a/metadata-integration/java/openlineage-converter/build.gradle +++ b/metadata-integration/java/openlineage-converter/build.gradle @@ -1,4 +1,4 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'com.github.johnrengelman.shadow' apply plugin: 'signing' apply plugin: 'maven-publish' diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MCLKafkaListenerRegistrar.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MCLKafkaListenerRegistrar.java index fb2880f617d301..c909b0034a9125 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MCLKafkaListenerRegistrar.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MCLKafkaListenerRegistrar.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.kafka; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.MCL_EVENT_CONSUMER_NAME; + import com.linkedin.metadata.kafka.config.MetadataChangeLogProcessorCondition; import com.linkedin.metadata.kafka.hook.MetadataChangeLogHook; import com.linkedin.mxe.Topics; @@ -39,7 +41,7 @@ public class MCLKafkaListenerRegistrar implements InitializingBean { @Autowired private KafkaListenerEndpointRegistry kafkaListenerEndpointRegistry; @Autowired - @Qualifier("kafkaEventConsumer") + @Qualifier(MCL_EVENT_CONSUMER_NAME) private KafkaListenerContainerFactory kafkaListenerContainerFactory; @Value("${METADATA_CHANGE_LOG_KAFKA_CONSUMER_GROUP_ID:generic-mae-consumer-job-client}") diff --git a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java index 1b3d19915b439e..5d2f6452e69197 100644 --- a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java +++ b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.kafka; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.DEFAULT_EVENT_CONSUMER_NAME; + import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.Timer; @@ -60,7 +62,7 @@ public class MetadataChangeEventsProcessor { "${METADATA_CHANGE_EVENT_NAME:${KAFKA_MCE_TOPIC_NAME:" + Topics.METADATA_CHANGE_EVENT + "}}", - containerFactory = "kafkaEventConsumer") + containerFactory = DEFAULT_EVENT_CONSUMER_NAME) @Deprecated public void consume(final ConsumerRecord consumerRecord) { try (Timer.Context i = MetricUtils.timer(this.getClass(), "consume").time()) { diff --git a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java index 22c2b4b9c04503..ef87afdef46cb7 100644 --- a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java +++ b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java @@ -4,6 +4,7 @@ import static com.linkedin.metadata.Constants.MDC_CHANGE_TYPE; import static com.linkedin.metadata.Constants.MDC_ENTITY_TYPE; import static com.linkedin.metadata.Constants.MDC_ENTITY_URN; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.MCP_EVENT_CONSUMER_NAME; import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; @@ -116,7 +117,7 @@ public void registerConsumerThrottle() { @KafkaListener( id = CONSUMER_GROUP_ID_VALUE, topics = "${METADATA_CHANGE_PROPOSAL_TOPIC_NAME:" + Topics.METADATA_CHANGE_PROPOSAL + "}", - containerFactory = "kafkaEventConsumer") + containerFactory = MCP_EVENT_CONSUMER_NAME) public void consume(final ConsumerRecord consumerRecord) { try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "consume").time()) { kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); diff --git a/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/PlatformEventProcessor.java b/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/PlatformEventProcessor.java index 358a2ac0c2ee33..5d11697bed93d2 100644 --- a/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/PlatformEventProcessor.java +++ b/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/PlatformEventProcessor.java @@ -1,5 +1,7 @@ package com.datahub.event; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.PE_EVENT_CONSUMER_NAME; + import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.Timer; @@ -56,7 +58,7 @@ public PlatformEventProcessor( @KafkaListener( id = "${PLATFORM_EVENT_KAFKA_CONSUMER_GROUP_ID:generic-platform-event-job-client}", topics = {"${PLATFORM_EVENT_TOPIC_NAME:" + Topics.PLATFORM_EVENT + "}"}, - containerFactory = "kafkaEventConsumer") + containerFactory = PE_EVENT_CONSUMER_NAME) public void consume(final ConsumerRecord consumerRecord) { try (Timer.Context i = MetricUtils.timer(this.getClass(), "consume").time()) { diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java index 60f3e1b4fef76f..9b476483a2baf8 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java @@ -8,4 +8,13 @@ public class ConsumerConfiguration { private int maxPartitionFetchBytes; private boolean stopOnDeserializationError; private boolean healthCheckEnabled; + + private ConsumerOptions mcp; + private ConsumerOptions mcl; + private ConsumerOptions pe; + + @Data + public static class ConsumerOptions { + private String autoOffsetReset; + } } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/KafkaConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/KafkaConfiguration.java index b03aedc1a7b5eb..ae0d3a3bb4647a 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/KafkaConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/KafkaConfiguration.java @@ -20,6 +20,10 @@ public class KafkaConfiguration { "spring.deserializer.key.delegate.class"; public static final String VALUE_DESERIALIZER_DELEGATE_CLASS = "spring.deserializer.value.delegate.class"; + public static final String MCP_EVENT_CONSUMER_NAME = "mcpEventConsumer"; + public static final String MCL_EVENT_CONSUMER_NAME = "mclEventConsumer"; + public static final String PE_EVENT_CONSUMER_NAME = "platformEventConsumer"; + public static final String DEFAULT_EVENT_CONSUMER_NAME = "kafkaEventConsumer"; private String bootstrapServers; diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 8010ae187b6c8b..4945b36a251c26 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -289,6 +289,13 @@ kafka: maxPartitionFetchBytes: ${KAFKA_CONSUMER_MAX_PARTITION_FETCH_BYTES:5242880} # the max bytes consumed per partition stopOnDeserializationError: ${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:true} # Stops kafka listener container on deserialization error, allows user to fix problems before moving past problematic offset. If false will log and move forward past the offset healthCheckEnabled: ${KAFKA_CONSUMER_HEALTH_CHECK_ENABLED:true} # Sets the health indicator to down when a message listener container has stopped due to a deserialization failure, will force consumer apps to restart through k8s and docker-compose health mechanisms + mcp: + autoOffsetReset: ${KAFKA_CONSUMER_MCP_AUTO_OFFSET_RESET:earliest} + mcl: + autoOffsetReset: ${KAFKA_CONSUMER_MCL_AUTO_OFFSET_RESET:earliest} + pe: + autoOffsetReset: ${KAFKA_CONSUMER_PE_AUTO_OFFSET_RESET:latest} + schemaRegistry: type: ${SCHEMA_REGISTRY_TYPE:KAFKA} # INTERNAL or KAFKA or AWS_GLUE url: ${KAFKA_SCHEMAREGISTRY_URL:http://localhost:8081} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java index 750af8ec488df3..a1ee4df360b7ec 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java @@ -1,10 +1,18 @@ package com.linkedin.gms.factory.kafka; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.DEFAULT_EVENT_CONSUMER_NAME; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.MCL_EVENT_CONSUMER_NAME; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.MCP_EVENT_CONSUMER_NAME; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.PE_EVENT_CONSUMER_NAME; + import com.linkedin.gms.factory.config.ConfigurationProvider; +import com.linkedin.metadata.config.kafka.ConsumerConfiguration; import com.linkedin.metadata.config.kafka.KafkaConfiguration; import java.time.Duration; import java.util.Arrays; +import java.util.HashMap; import java.util.Map; +import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; import org.apache.avro.generic.GenericRecord; import org.apache.kafka.clients.consumer.ConsumerConfig; @@ -23,7 +31,6 @@ @Slf4j @Configuration public class KafkaEventConsumerFactory { - private int kafkaEventConsumerConcurrency; @Bean(name = "kafkaConsumerFactory") @@ -87,15 +94,82 @@ private static Map buildCustomizedProperties( return customizedProperties; } - @Bean(name = "kafkaEventConsumer") + @Bean(name = PE_EVENT_CONSUMER_NAME) + protected KafkaListenerContainerFactory platformEventConsumer( + @Qualifier("kafkaConsumerFactory") + DefaultKafkaConsumerFactory kafkaConsumerFactory, + @Qualifier("configurationProvider") ConfigurationProvider configurationProvider) { + + return buildDefaultKafkaListenerContainerFactory( + PE_EVENT_CONSUMER_NAME, + kafkaConsumerFactory, + configurationProvider.getKafka().getConsumer().isStopOnDeserializationError(), + configurationProvider.getKafka().getConsumer().getPe()); + } + + @Bean(name = MCP_EVENT_CONSUMER_NAME) + protected KafkaListenerContainerFactory mcpEventConsumer( + @Qualifier("kafkaConsumerFactory") + DefaultKafkaConsumerFactory kafkaConsumerFactory, + @Qualifier("configurationProvider") ConfigurationProvider configurationProvider) { + + return buildDefaultKafkaListenerContainerFactory( + MCP_EVENT_CONSUMER_NAME, + kafkaConsumerFactory, + configurationProvider.getKafka().getConsumer().isStopOnDeserializationError(), + configurationProvider.getKafka().getConsumer().getMcp()); + } + + @Bean(name = MCL_EVENT_CONSUMER_NAME) + protected KafkaListenerContainerFactory mclEventConsumer( + @Qualifier("kafkaConsumerFactory") + DefaultKafkaConsumerFactory kafkaConsumerFactory, + @Qualifier("configurationProvider") ConfigurationProvider configurationProvider) { + + return buildDefaultKafkaListenerContainerFactory( + MCL_EVENT_CONSUMER_NAME, + kafkaConsumerFactory, + configurationProvider.getKafka().getConsumer().isStopOnDeserializationError(), + configurationProvider.getKafka().getConsumer().getMcl()); + } + + @Bean(name = DEFAULT_EVENT_CONSUMER_NAME) protected KafkaListenerContainerFactory kafkaEventConsumer( @Qualifier("kafkaConsumerFactory") DefaultKafkaConsumerFactory kafkaConsumerFactory, @Qualifier("configurationProvider") ConfigurationProvider configurationProvider) { + return buildDefaultKafkaListenerContainerFactory( + DEFAULT_EVENT_CONSUMER_NAME, + kafkaConsumerFactory, + configurationProvider.getKafka().getConsumer().isStopOnDeserializationError(), + null); + } + + private KafkaListenerContainerFactory buildDefaultKafkaListenerContainerFactory( + String consumerFactoryName, + DefaultKafkaConsumerFactory kafkaConsumerFactory, + boolean isStopOnDeserializationError, + @Nullable ConsumerConfiguration.ConsumerOptions consumerOptions) { + + final DefaultKafkaConsumerFactory factoryWithOverrides; + if (consumerOptions != null) { + // Copy the base config + Map props = new HashMap<>(kafkaConsumerFactory.getConfigurationProperties()); + // Override just the auto.offset.reset + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, consumerOptions.getAutoOffsetReset()); + factoryWithOverrides = + new DefaultKafkaConsumerFactory<>( + props, + kafkaConsumerFactory.getKeyDeserializer(), + kafkaConsumerFactory.getValueDeserializer()); + } else { + factoryWithOverrides = kafkaConsumerFactory; + } + ConcurrentKafkaListenerContainerFactory factory = new ConcurrentKafkaListenerContainerFactory<>(); - factory.setConsumerFactory(kafkaConsumerFactory); + factory.setConsumerFactory(factoryWithOverrides); factory.setContainerCustomizer(new ThreadPoolContainerCustomizer()); factory.setConcurrency(kafkaEventConsumerConcurrency); @@ -103,7 +177,7 @@ protected KafkaListenerContainerFactory kafkaEventConsumer( use DefaultErrorHandler (does back-off retry and then logs) rather than stopping the container. Stopping the container prevents lost messages until the error can be examined, disabling this will allow progress, but may lose data */ - if (configurationProvider.getKafka().getConsumer().isStopOnDeserializationError()) { + if (isStopOnDeserializationError) { CommonDelegatingErrorHandler delegatingErrorHandler = new CommonDelegatingErrorHandler(new DefaultErrorHandler()); delegatingErrorHandler.addDelegate( @@ -111,9 +185,9 @@ use DefaultErrorHandler (does back-off retry and then logs) rather than stopping factory.setCommonErrorHandler(delegatingErrorHandler); } log.info( - String.format( - "Event-based KafkaListenerContainerFactory built successfully. Consumer concurrency = %s", - kafkaEventConsumerConcurrency)); + "Event-based {} KafkaListenerContainerFactory built successfully. Consumer concurrency = {}", + consumerFactoryName, + kafkaEventConsumerConcurrency); return factory; } diff --git a/metadata-service/schema-registry-servlet/src/test/java/io/datahubproject/openapi/test/SchemaRegistryControllerTest.java b/metadata-service/schema-registry-servlet/src/test/java/io/datahubproject/openapi/test/SchemaRegistryControllerTest.java index 664766f204e460..e8deed00672da7 100644 --- a/metadata-service/schema-registry-servlet/src/test/java/io/datahubproject/openapi/test/SchemaRegistryControllerTest.java +++ b/metadata-service/schema-registry-servlet/src/test/java/io/datahubproject/openapi/test/SchemaRegistryControllerTest.java @@ -1,6 +1,7 @@ package io.datahubproject.openapi.test; import static com.linkedin.metadata.Constants.*; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.DEFAULT_EVENT_CONSUMER_NAME; import static org.testng.Assert.*; import com.linkedin.common.urn.Urn; @@ -199,7 +200,7 @@ public void testPEConsumption() @KafkaListener( id = "test-mcp-consumer", topics = Topics.METADATA_CHANGE_PROPOSAL, - containerFactory = "kafkaEventConsumer", + containerFactory = DEFAULT_EVENT_CONSUMER_NAME, properties = {"auto.offset.reset:earliest"}) public void receiveMCP(ConsumerRecord consumerRecord) { @@ -216,7 +217,7 @@ public void receiveMCP(ConsumerRecord consumerRecord) { @KafkaListener( id = "test-mcl-consumer", topics = Topics.METADATA_CHANGE_LOG_VERSIONED, - containerFactory = "kafkaEventConsumer", + containerFactory = DEFAULT_EVENT_CONSUMER_NAME, properties = {"auto.offset.reset:earliest"}) public void receiveMCL(ConsumerRecord consumerRecord) { @@ -232,7 +233,7 @@ public void receiveMCL(ConsumerRecord consumerRecord) { @KafkaListener( id = "test-pe-consumer", topics = Topics.PLATFORM_EVENT, - containerFactory = "kafkaEventConsumer", + containerFactory = DEFAULT_EVENT_CONSUMER_NAME, properties = {"auto.offset.reset:earliest"}) public void receivePE(ConsumerRecord consumerRecord) { diff --git a/smoke-test/tests/cypress/yarn.lock b/smoke-test/tests/cypress/yarn.lock index 2433e9f8fae08e..c6116609b11467 100644 --- a/smoke-test/tests/cypress/yarn.lock +++ b/smoke-test/tests/cypress/yarn.lock @@ -510,9 +510,9 @@ core-util-is@1.0.2: integrity sha1-tf1UIgqivFq1eqtxQMlAdUUDwac= cross-spawn@^7.0.0, cross-spawn@^7.0.2: - version "7.0.3" - resolved "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz" - integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w== + version "7.0.6" + resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f" + integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA== dependencies: path-key "^3.1.0" shebang-command "^2.0.0"